diff --git a/PG-PuReMD/src/analyze.c b/PG-PuReMD/src/analyze.c
index 8b8b42673be48103e8bf1247dea700e2c5cc4958..f395ad05b30aa2d2ff1054498db71b1ff96f30c4 100644
--- a/PG-PuReMD/src/analyze.c
+++ b/PG-PuReMD/src/analyze.c
@@ -158,7 +158,9 @@ void Analyze_Fragments( reax_system *system, control_params *control,
         }
     }
 
     fprintf( fout, "\n" );
+#if defined(DEBUG)
     fflush( fout );
+#endif
 }
diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
index dd791df7996fdd2bd7b487f629eebb1adf4c64dc..a1ca790f8752eeaf2ee0e7283c277fde34f8508e 100644
--- a/PG-PuReMD/src/cuda/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -85,7 +85,6 @@ void Cuda_Allocate_Grid( reax_system *system )
 //
 //    k_init_nbrs <<< blocks, block_size >>>
 //        ( nbrs_x, host->max_nbrs );
-//    cudaDeviceSynchronize( );
 //    cudaCheckError( );
 //
 //    cuda_malloc( (void **)& device->cells, sizeof(grid_cell) * total,
@@ -406,9 +405,15 @@ void Cuda_Allocate_Workspace_Part2( reax_system *system, control_params *control
         storage *workspace, int total_cap )
 {
     int total_real, total_rvec;
+#if defined(DUAL_SOLVER)
+    int total_rvec2;
+#endif
 
     total_real = sizeof(real) * total_cap;
     total_rvec = sizeof(rvec) * total_cap;
+#if defined(DUAL_SOLVER)
+    total_rvec2 = sizeof(rvec2) * total_cap;
+#endif
 
     /* bond order related storage */
     cuda_malloc( (void **) &workspace->total_bond_order, total_real, TRUE, "total_bo" );
@@ -431,22 +436,22 @@ void Cuda_Allocate_Workspace_Part2( reax_system *system, control_params *control
     /* charge matrix storage */
     if ( control->cm_solver_pre_comp_type == JACOBI_PC )
     {
-        cuda_malloc( (void **) &workspace->Hdia_inv, sizeof(real) * total_cap, TRUE, "Hdia_inv" );
+        cuda_malloc( (void **) &workspace->Hdia_inv, total_real, TRUE, "Hdia_inv" );
     }
 
     if ( control->cm_solver_pre_comp_type == ICHOLT_PC
             || control->cm_solver_pre_comp_type == ILUT_PC
             || control->cm_solver_pre_comp_type == ILUTP_PC
             || control->cm_solver_pre_comp_type == FG_ILUT_PC )
     {
-        cuda_malloc( (void **) &workspace->droptol, sizeof(real) * total_cap, TRUE, "droptol" );
+        cuda_malloc( (void **) &workspace->droptol, total_real, TRUE, "droptol" );
     }
 
-    cuda_malloc( (void **) &workspace->b_s, sizeof(real) * total_cap, TRUE, "b_s" );
-    cuda_malloc( (void **) &workspace->b_t, sizeof(real) * total_cap, TRUE, "b_t" );
-    cuda_malloc( (void **) &workspace->s, sizeof(real) * total_cap, TRUE, "s" );
-    cuda_malloc( (void **) &workspace->t, sizeof(real) * total_cap, TRUE, "t" );
+    cuda_malloc( (void **) &workspace->b_s, total_real, TRUE, "b_s" );
+    cuda_malloc( (void **) &workspace->b_t, total_real, TRUE, "b_t" );
+    cuda_malloc( (void **) &workspace->s, total_real, TRUE, "s" );
+    cuda_malloc( (void **) &workspace->t, total_real, TRUE, "t" );
 #if defined(DUAL_SOLVER)
-    cuda_malloc( (void **) &workspace->b, sizeof(rvec2) * total_cap, TRUE, "b" );
-    cuda_malloc( (void **) &workspace->x, sizeof(rvec2) * total_cap, TRUE, "x" );
+    cuda_malloc( (void **) &workspace->b, total_rvec2, TRUE, "b" );
+    cuda_malloc( (void **) &workspace->x, total_rvec2, TRUE, "x" );
 #endif
 
     switch ( control->cm_solver_type )
@@ -454,9 +459,9 @@ void Cuda_Allocate_Workspace_Part2( reax_system *system, control_params *control
         case GMRES_S:
        case GMRES_H_S:
             cuda_malloc( (void **) &workspace->b_prc,
-                    sizeof(real) * total_cap, TRUE, "b_prc" );
+                    total_real, TRUE, "b_prc" );
             cuda_malloc( (void **) &workspace->b_prm,
-                    sizeof(real) * total_cap, TRUE, "b_prm" );
+                    total_real, TRUE, "b_prm" );
             cuda_malloc( (void **) &workspace->y,
                     (control->cm_solver_restart + 1) * sizeof(real), TRUE, "y" );
             cuda_malloc( (void **) &workspace->z,
@@ -474,45 +479,45 @@ void Cuda_Allocate_Workspace_Part2( reax_system *system, control_params *control
             break;
 
         case SDM_S:
-            cuda_malloc( (void **) &workspace->r, sizeof(real) * total_cap, TRUE, "r" );
-            cuda_malloc( (void **) &workspace->d, sizeof(real) * total_cap, TRUE, "d" );
-            cuda_malloc( (void **) &workspace->q, sizeof(real) * total_cap, TRUE, "q" );
-            cuda_malloc( (void **) &workspace->p, sizeof(real) * total_cap, TRUE, "p" );
+            cuda_malloc( (void **) &workspace->r, total_real, TRUE, "r" );
+            cuda_malloc( (void **) &workspace->d, total_real, TRUE, "d" );
+            cuda_malloc( (void **) &workspace->q, total_real, TRUE, "q" );
+            cuda_malloc( (void **) &workspace->p, total_real, TRUE, "p" );
             break;
 
         case CG_S:
-            cuda_malloc( (void **) &workspace->r, sizeof(real) * total_cap, TRUE, "r" );
-            cuda_malloc( (void **) &workspace->d, sizeof(real) * total_cap, TRUE, "d" );
-            cuda_malloc( (void **) &workspace->q, sizeof(real) * total_cap, TRUE, "q" );
-            cuda_malloc( (void **) &workspace->p, sizeof(real) * total_cap, TRUE, "p" );
+            cuda_malloc( (void **) &workspace->r, total_real, TRUE, "r" );
+            cuda_malloc( (void **) &workspace->d, total_real, TRUE, "d" );
+            cuda_malloc( (void **) &workspace->q, total_real, TRUE, "q" );
+            cuda_malloc( (void **) &workspace->p, total_real, TRUE, "p" );
 #if defined(DUAL_SOLVER)
-            cuda_malloc( (void **) &workspace->r2, sizeof(rvec2) * total_cap, TRUE, "r2" );
-            cuda_malloc( (void **) &workspace->d2, sizeof(rvec2) * total_cap, TRUE, "d2" );
-            cuda_malloc( (void **) &workspace->q2, sizeof(rvec2) * total_cap, TRUE, "q2" );
-            cuda_malloc( (void **) &workspace->p2, sizeof(rvec2) * total_cap, TRUE, "p2" );
+            cuda_malloc( (void **) &workspace->r2, total_rvec2, TRUE, "r2" );
+            cuda_malloc( (void **) &workspace->d2, total_rvec2, TRUE, "d2" );
+            cuda_malloc( (void **) &workspace->q2, total_rvec2, TRUE, "q2" );
+            cuda_malloc( (void **) &workspace->p2, total_rvec2, TRUE, "p2" );
 #endif
             break;
 
         case BiCGStab_S:
-            cuda_malloc( (void **) &workspace->y, sizeof(real) * total_cap, TRUE, "y" );
-            cuda_malloc( (void **) &workspace->g, sizeof(real) * total_cap, TRUE, "g" );
-            cuda_malloc( (void **) &workspace->z, sizeof(real) * total_cap, TRUE, "z" );
-            cuda_malloc( (void **) &workspace->r, sizeof(real) * total_cap, TRUE, "r" );
-            cuda_malloc( (void **) &workspace->d, sizeof(real) * total_cap, TRUE, "d" );
-            cuda_malloc( (void **) &workspace->q, sizeof(real) * total_cap, TRUE, "q" );
-            cuda_malloc( (void **) &workspace->p, sizeof(real) * total_cap, TRUE, "p" );
-            cuda_malloc( (void **) &workspace->r_hat, sizeof(real) * total_cap, TRUE, "r_hat" );
-            cuda_malloc( (void **) &workspace->q_hat, sizeof(real) * total_cap, TRUE, "q_hat" );
+            cuda_malloc( (void **) &workspace->y, total_real, TRUE, "y" );
+            cuda_malloc( (void **) &workspace->g, total_real, TRUE, "g" );
+            cuda_malloc( (void **) &workspace->z, total_real, TRUE, "z" );
+            cuda_malloc( (void **) &workspace->r, total_real, TRUE, "r" );
+            cuda_malloc( (void **) &workspace->d, total_real, TRUE, "d" );
+            cuda_malloc( (void **) &workspace->q, total_real, TRUE, "q" );
+            cuda_malloc( (void **) &workspace->p, total_real, TRUE, "p" );
+            cuda_malloc( (void **) &workspace->r_hat, total_real, TRUE, "r_hat" );
+            cuda_malloc( (void **) &workspace->q_hat, total_real, TRUE, "q_hat" );
 #if defined(DUAL_SOLVER)
-            cuda_malloc( (void **) &workspace->y2, sizeof(rvec2) * total_cap, TRUE, "y" );
-            cuda_malloc( (void **) &workspace->g2, sizeof(rvec2) * total_cap, TRUE, "g" );
-            cuda_malloc( (void **) &workspace->z2, sizeof(rvec2) * total_cap, TRUE, "z" );
-            cuda_malloc( (void **) &workspace->r2, sizeof(rvec2) * total_cap, TRUE, "r" );
-            cuda_malloc( (void **) &workspace->d2, sizeof(rvec2) * total_cap, TRUE, "d" );
-            cuda_malloc( (void **) &workspace->q2, sizeof(rvec2) * total_cap, TRUE, "q" );
-            cuda_malloc( (void **) &workspace->p2, sizeof(rvec2) * total_cap, TRUE, "p" );
-            cuda_malloc( (void **) &workspace->r_hat2, sizeof(rvec2) * total_cap, TRUE, "r_hat" );
-            cuda_malloc( (void **) &workspace->q_hat2, sizeof(rvec2) * total_cap, TRUE, "q_hat" );
+            cuda_malloc( (void **) &workspace->y2, total_rvec2, TRUE, "y" );
+            cuda_malloc( (void **) &workspace->g2, total_rvec2, TRUE, "g" );
+            cuda_malloc( (void **) &workspace->z2, total_rvec2, TRUE, "z" );
+            cuda_malloc( (void **) &workspace->r2, total_rvec2, TRUE, "r" );
+            cuda_malloc( (void **) &workspace->d2, total_rvec2, TRUE, "d" );
+            cuda_malloc( (void **) &workspace->q2, total_rvec2, TRUE, "q" );
+            cuda_malloc( (void **) &workspace->p2, total_rvec2, TRUE, "p" );
+            cuda_malloc( (void **) &workspace->r_hat2, total_rvec2, TRUE, "r_hat" );
+            cuda_malloc( (void **) &workspace->q_hat2, total_rvec2, TRUE, "q_hat" );
 #endif
             break;
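The hunks above hoist the repeated sizeof(real) * total_cap and sizeof(rvec2) * total_cap expressions into byte counts computed once per call, with the rvec2 count declared and assigned only when DUAL_SOLVER is enabled so it never sits unused. A minimal standalone sketch of the same idea follows; it is illustrative code, not part of the patch (real and rvec2 stand in for the PuReMD typedefs, and storing the counts in size_t, unlike the patch's int, also guards against overflow at very large capacities):

    #include <cstdio>
    #include <cuda_runtime.h>

    typedef double real;
    typedef double rvec2[2];

    int main( void )
    {
        int total_cap = 1024;
        /* compute byte counts once, reuse for every allocation */
        size_t total_real = sizeof(real) * total_cap;
        size_t total_rvec2 = sizeof(rvec2) * total_cap;
        real *b_s;
        rvec2 *b;

        cudaMalloc( (void **) &b_s, total_real );
        cudaMalloc( (void **) &b, total_rvec2 );
        printf( "allocated %zu and %zu bytes\n", total_real, total_rvec2 );

        cudaFree( b_s );
        cudaFree( b );
        return 0;
    }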
diff --git a/PG-PuReMD/src/cuda/cuda_bond_orders.cu b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
index 0c11e58fbaf40d8237c16e28d5aea1b966a14919..43d8f988dd28191d46f4d8b2818bc0a02b1e5247 100644
--- a/PG-PuReMD/src/cuda/cuda_bond_orders.cu
+++ b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
@@ -767,7 +767,6 @@ void Cuda_Total_Forces( reax_system *system, control_params *control,
           (control_params *) control->d_control_params,
           (simulation_data *)data->d_simulation_data, spad_rvec, system->N );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     if ( control->virial != 0 )
@@ -775,19 +774,16 @@ void Cuda_Total_Forces( reax_system *system, control_params *control,
         /* reduction for ext_press */
         k_reduction_rvec <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec) * (DEF_BLOCK_SIZE / 32) >>>
             ( spad_rvec, &spad_rvec[system->N], system->N );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         k_reduction_rvec <<< 1, ((blocks + 31) / 32) * 32, sizeof(rvec) * ((blocks + 31) / 32) >>>
             ( &spad_rvec[system->N],
               &((simulation_data *)data->d_simulation_data)->my_ext_press, blocks );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
     }
 
     /* post processing for the atomic forces */
     k_total_forces_part2 <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, *(lists[BONDS]), *(workspace->d_workspace), system->N );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -801,6 +797,5 @@ void Cuda_Total_Forces_PURE( reax_system *system, storage *workspace )
     k_total_forces_pure <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->n, *(workspace->d_workspace) );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
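This file shows the pattern repeated throughout the CUDA sources in this patch: the blocking cudaDeviceSynchronize( ) after each launch is dropped, because kernels and memory copies issued to the same stream already execute in order, and error detection moves into cudaCheckError( ) (reworked in cuda_utils.h later in the patch). A self-contained sketch of why the intermediate syncs are unnecessary for correctness; this is illustrative code, not from the repository:

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void k_scale( float *x, int n )
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if ( i < n )
        {
            x[i] *= 2.0f;
        }
    }

    int main( void )
    {
        const int n = 256;
        float h_x[n], *d_x;
        for ( int i = 0; i < n; ++i ) h_x[i] = 1.0f;

        cudaMalloc( (void **) &d_x, sizeof(float) * n );
        cudaMemcpy( d_x, h_x, sizeof(float) * n, cudaMemcpyHostToDevice );

        /* two dependent launches with no device-wide sync between them:
         * the stream guarantees the second kernel sees the first's writes */
        k_scale <<< 1, n >>> ( d_x, n );
        k_scale <<< 1, n >>> ( d_x, n );

        /* the D2H copy is ordered after both kernels in the same stream */
        cudaMemcpy( h_x, d_x, sizeof(float) * n, cudaMemcpyDeviceToHost );
        printf( "x[0] = %.1f (expect 4.0)\n", h_x[0] );

        cudaFree( d_x );
        return 0;
    }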
diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
index a8d4bc86feff484d8e143167e27837b849870a35..6688eb854b91e42efb7e511295b9b133a00d5ac8 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -63,7 +63,6 @@ static void jacobi( reax_system const * const system,
     k_jacobi <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->reax_param.d_sbp,
           *(workspace->d_workspace), system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -133,7 +132,6 @@ void Sort_Matrix_Rows( sparse_matrix * const A, reax_system const * const system
         cub::DeviceRadixSort::SortPairs( d_temp_storage, temp_storage_bytes,
                 &d_j_temp[start[i]], &A->j[start[i]],
                 &d_val_temp[start[i]], &A->val[start[i]], num_entries );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
     }
 
@@ -245,7 +243,6 @@ static void Spline_Extrapolate_Charges_QEq( reax_system const * const system,
         ( system->d_my_atoms, system->reax_param.d_sbp,
           (control_params *)control->d_control_params,
           *(workspace->d_workspace), system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -421,7 +418,6 @@ static void Extrapolate_Charges_QEq_Part2( reax_system const * const system,
     k_extrapolate_charges_qeq_part2 <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, *(workspace->d_workspace), u, spad, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     copy_host_device( q, spad, sizeof(real) * system->n,
@@ -463,7 +459,6 @@ static void Update_Ghost_Atom_Charges( reax_system const * const system,
     k_update_ghost_atom_charges <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, spad, system->n, system->N );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -502,13 +497,11 @@ static void Calculate_Charges_QEq( reax_system const * const system,
     k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * (DEF_BLOCK_SIZE / 32) >>>
         ( workspace->d_workspace->x, spad_rvec2, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     k_reduction_rvec2 <<< 1, ((blocks + 31) / 32) * 32,
                       sizeof(rvec2) * ((blocks + 31) / 32) >>>
         ( spad_rvec2, &spad_rvec2[blocks], blocks );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     copy_host_device( &my_sum, &spad_rvec2[blocks],
diff --git a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
index 01d8b831a2f3528cbdb5b8d097d894f77fc41e58..2dcd614bcc2c8ced5fb9a2486fa0827eca6ee9e8 100644
--- a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
@@ -320,7 +320,6 @@ void Vector_MakeZero( real * const v, unsigned int k )
     k_vector_makezero <<< blocks, DEF_BLOCK_SIZE >>>
         ( v, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -343,7 +342,6 @@ void Vector_Copy( real * const dest, real const * const v,
     k_vector_copy <<< blocks, DEF_BLOCK_SIZE >>>
         ( dest, v, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -366,7 +364,6 @@ void Vector_Copy_rvec2( rvec2 * const dest, rvec2 const * const v,
     k_vector_copy_rvec2 <<< blocks, DEF_BLOCK_SIZE >>>
         ( dest, v, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -381,7 +378,6 @@ void Vector_Copy_From_rvec2( real * const dst, rvec2 const * const src,
     k_vector_copy_from_rvec2 <<< blocks, DEF_BLOCK_SIZE >>>
         ( dst, src, index, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -396,7 +392,6 @@ void Vector_Copy_To_rvec2( rvec2 * const dst, real const * const src,
     k_vector_copy_to_rvec2 <<< blocks, DEF_BLOCK_SIZE >>>
         ( dst, src, index, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -420,7 +415,6 @@ void Vector_Scale( real * const dest, real c, real const * const v,
     k_vector_scale <<< blocks, DEF_BLOCK_SIZE >>>
         ( dest, c, v, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -445,7 +439,6 @@ void Vector_Sum( real * const dest, real c, real const * const v,
     k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>>
         ( dest, c, v, d, y, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -460,7 +453,6 @@ void Vector_Sum_rvec2( rvec2 * const dest, real c0, real c1, rvec2 const * const
     k_vector_sum_rvec2 <<< blocks, DEF_BLOCK_SIZE >>>
         ( dest, c0, c1, v, d0, d1, y, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -485,7 +477,6 @@ void Vector_Add( real * const dest, real c, real const * const v,
     k_vector_add <<< blocks, DEF_BLOCK_SIZE >>>
         ( dest, c, v, k );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
@@ void Vector_Add_rvec2( rvec2 * const dest, real c0, real c1, rvec2 const * const k_vector_add_rvec2 <<< blocks, DEF_BLOCK_SIZE >>> ( dest, c0, c1, v, k ); - cudaDeviceSynchronize( ); cudaCheckError( ); } @@ -533,7 +523,6 @@ void Vector_Mult( real * const dest, real const * const v1, k_vector_mult <<< blocks, DEF_BLOCK_SIZE >>> ( dest, v1, v2, k ); - cudaDeviceSynchronize( ); cudaCheckError( ); } @@ -556,7 +545,6 @@ void Vector_Mult_rvec2( rvec2 * const dest, rvec2 const * const v1, k_vector_mult_rvec2 <<< blocks, DEF_BLOCK_SIZE >>> ( dest, v1, v2, k ); - cudaDeviceSynchronize( ); cudaCheckError( ); } @@ -687,13 +675,11 @@ void Dot_local_rvec2( control_params const * const control, k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>> ( spad, &spad[k], k ); - cudaDeviceSynchronize( ); cudaCheckError( ); k_reduction_rvec2 <<< 1, ((blocks + 31) / 32) * 32, sizeof(rvec2) * ((blocks + 31) / 32) >>> ( &spad[k], &spad[k + blocks], blocks ); - cudaDeviceSynchronize( ); cudaCheckError( ); //TODO: keep result of reduction on devie and pass directly to CUDA-aware MPI diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu index 1da8cb67c990b80a013e423fa82b9f206194b54a..6d6f5aec22d3fe984d80863de630b4cf796f1eab 100644 --- a/PG-PuReMD/src/cuda/cuda_forces.cu +++ b/PG-PuReMD/src/cuda/cuda_forces.cu @@ -1203,7 +1203,6 @@ static int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params Cuda_Estimate_Valence_Angles <<< control->blocks_n, control->block_size_n >>> ( system->d_my_atoms, (control_params *)control->d_control_params, *(lists[BONDS]), system->n, system->N, thbody ); - cudaDeviceSynchronize( ); cudaCheckError( ); Cuda_Reduction_Sum( thbody, system->d_total_thbodies, system->total_bonds ); @@ -1252,7 +1251,6 @@ static void Print_Forces( reax_system *system ) k_print_forces <<< blocks, DEF_BLOCK_SIZE >>> ( system->d_my_atoms, workspace->d_workspace->f, system->n ); - cudaDeviceSynchronize( ); cudaCheckError( ); } @@ -1266,7 +1264,6 @@ static void Print_HBonds( reax_system *system, int step ) k_print_hbonds <<< blocks, DEF_BLOCK_SIZE >>> ( system->d_my_atoms, *(lists[HBONDS]), system->n, system->my_rank, step ); - cudaDeviceSynchronize( ); cudaCheckError( ); } #endif @@ -1288,7 +1285,6 @@ void Cuda_Init_Neighbor_Indices( reax_system *system, reax_list *far_nbr_list ) /* init end_indices */ k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>> ( system->d_far_nbrs, far_nbr_list->index, far_nbr_list->end_index, far_nbr_list->n ); - cudaDeviceSynchronize( ); cudaCheckError( ); } @@ -1312,7 +1308,6 @@ void Cuda_Init_HBond_Indices( reax_system *system, storage *workspace, /* init Hindices */ k_init_hindex <<< blocks, DEF_BLOCK_SIZE >>> ( system->d_my_atoms, system->total_cap ); - cudaDeviceSynchronize( ); cudaCheckError( ); /* init indices and end_indices */ @@ -1321,7 +1316,6 @@ void Cuda_Init_HBond_Indices( reax_system *system, storage *workspace, k_init_hbond_indices <<< blocks, DEF_BLOCK_SIZE >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->d_hbonds, temp, hbond_list->index, hbond_list->end_index, system->total_cap ); - cudaDeviceSynchronize( ); cudaCheckError( ); } @@ -1342,7 +1336,6 @@ void Cuda_Init_Bond_Indices( reax_system *system, reax_list * bond_list ) /* init end_indices */ k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>> ( system->d_bonds, bond_list->index, bond_list->end_index, system->total_cap ); - cudaDeviceSynchronize( ); cudaCheckError( ); } @@ -1365,7 +1358,6 @@ void Cuda_Init_Sparse_Matrix_Indices( 
@@ -1365,7 +1358,6 @@ void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, sparse_matrix *H )
     /* init end_indices */
     k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_cm_entries, H->start, H->end, H->n_max );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -1411,7 +1403,6 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control,
                 workspace->d_workspace->H.n_max, system->d_cm_entries,
                 system->d_max_cm_entries );
     }
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     Cuda_Reduction_Sum( system->d_max_cm_entries,
@@ -1432,7 +1423,6 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control,
               system->n, system->N, system->total_cap,
               system->d_bonds, system->d_max_bonds,
               system->d_hbonds, system->d_max_hbonds );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -1475,7 +1465,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace,
         reax_list **lists, output_controls *out_control )
 {
-    int renbr, blocks, ret, ret_bonds, ret_hbonds, ret_cm;
+    int renbr, blocks, ret, realloc_bonds, realloc_hbonds, realloc_cm;
     static int dist_done = FALSE, cm_done = FALSE, bonds_done = FALSE;
 #if defined(LOG_PERFORMANCE)
     double time;
 #endif
@@ -1492,7 +1482,6 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
 //            (((system->N - system->n) % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 //    k_init_bond_mark <<< blocks, DEF_BLOCK_SIZE >>>
 //        ( system->n, (system->N - system->n), workspace->d_workspace->bond_mark );
-//    cudaDeviceSynchronize( );
 //    cudaCheckError( );
 
     /* reset reallocation flags on device */
@@ -1507,7 +1496,6 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     {
         k_init_distance <<< control->blocks_n, control->block_size_n >>>
             ( system->d_my_atoms, *(lists[FAR_NBRS]), system->N );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         dist_done = TRUE;
@@ -1563,7 +1551,6 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
                         system->d_max_cm_entries, system->d_realloc_cm_entries );
             }
         }
-        cudaDeviceSynchronize( );
         cudaCheckError( );
     }
 
@@ -1581,7 +1568,6 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
                 workspace->d_LR, system->n, system->N,
                 system->reax_param.num_atom_types,
                 system->d_max_bonds, system->d_realloc_bonds,
                 system->d_max_hbonds, system->d_realloc_hbonds );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
     }
 
@@ -1590,21 +1576,21 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
 #endif
 
     /* check reallocation flags on device */
-    copy_host_device( &ret_cm, system->d_realloc_cm_entries, sizeof(int),
+    copy_host_device( &realloc_cm, system->d_realloc_cm_entries, sizeof(int),
             cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_cm_entries" );
-    copy_host_device( &ret_bonds, system->d_realloc_bonds, sizeof(int),
+    copy_host_device( &realloc_bonds, system->d_realloc_bonds, sizeof(int),
             cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_bonds" );
-    copy_host_device( &ret_hbonds, system->d_realloc_hbonds, sizeof(int),
+    copy_host_device( &realloc_hbonds, system->d_realloc_hbonds, sizeof(int),
            cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_hbonds" );
 
-    ret = (ret_cm == FALSE && ret_bonds == FALSE && ret_hbonds == FALSE)
+    ret = (realloc_cm == FALSE && realloc_bonds == FALSE && realloc_hbonds == FALSE)
         ? SUCCESS : FAILURE;
 
-    if ( ret_cm == FALSE )
+    if ( realloc_cm == FALSE )
     {
         cm_done = TRUE;
     }
-    if ( ret_bonds == FALSE && ret_hbonds == FALSE )
+    if ( realloc_bonds == FALSE && realloc_hbonds == FALSE )
     {
         bonds_done = TRUE;
     }
@@ -1613,7 +1599,6 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     {
         k_update_sym_dbond_indices <<< control->blocks_n, control->block_size_n >>>
             ( *(lists[BONDS]), system->N );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         if ( control->hbond_cut > 0.0 && system->numH > 0 )
@@ -1624,13 +1609,11 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
             /* make hbond_list symmetric */
             k_update_sym_hbond_indices <<< blocks, DEF_BLOCK_SIZE >>>
                 ( system->d_my_atoms, *(lists[HBONDS]), system->N );
-            cudaDeviceSynchronize( );
             cudaCheckError( );
         }
 
         /* update bond_mark */
 //        k_bond_mark <<< control->blocks_n, control->block_size_n >>>
-//        cudaDeviceSynchronize( );
 //        cudaCheckError( );
 
         dist_done = FALSE;
@@ -1640,11 +1623,13 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     else
     {
         Cuda_Estimate_Storages( system, control, workspace, lists,
-                ret_bonds, ret_hbonds, ret_cm, data->step - data->prev_steps );
+                realloc_bonds, realloc_hbonds, realloc_cm,
+                data->step - data->prev_steps );
 
-        workspace->d_workspace->realloc.bonds = ret_bonds;
-        workspace->d_workspace->realloc.hbonds = ret_hbonds;
-        workspace->d_workspace->realloc.cm = ret_cm;
+        /* schedule reallocations after updating allocation sizes */
+        workspace->d_workspace->realloc.bonds = realloc_bonds;
+        workspace->d_workspace->realloc.hbonds = realloc_hbonds;
+        workspace->d_workspace->realloc.cm = realloc_cm;
     }
 
     return ret;
@@ -1692,7 +1677,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     Cuda_BO_Part1 <<< control->blocks_n, control->block_size_n >>>
         ( system->d_my_atoms, system->reax_param.d_sbp,
           *(workspace->d_workspace), system->N );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     Cuda_BO_Part2 <<< control->blocks_n, control->block_size_n >>>
@@ -1700,18 +1684,15 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
           system->reax_param.d_tbp, *(workspace->d_workspace),
           *(lists[BONDS]), system->reax_param.num_atom_types, system->N );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     Cuda_BO_Part3 <<< control->blocks_n, control->block_size_n >>>
         ( *(workspace->d_workspace), *(lists[BONDS]), system->N );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     Cuda_BO_Part4 <<< control->blocks_n, control->block_size_n >>>
         ( system->d_my_atoms, system->reax_param.d_gp,
           system->reax_param.d_sbp, *(workspace->d_workspace), system->N );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
     /* 2. Bond Energy Interactions */
@@ -1722,7 +1703,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp,
           system->reax_param.d_tbp, *(workspace->d_workspace), *(lists[BONDS]),
           system->n, system->reax_param.num_atom_types, spad );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* reduction for E_BE */
@@ -1741,12 +1721,10 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
           system->reax_param.d_sbp, system->reax_param.d_tbp,
           *(workspace->d_workspace), *(lists[BONDS]), system->n,
           system->reax_param.num_atom_types, spad, &spad[system->n], &spad[2 * system->n] );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     Cuda_Atom_Energy_Part2 <<< control->blocks, control->block_size >>>
         ( *(lists[BONDS]), *(workspace->d_workspace), system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     if ( update_energy == TRUE )
@@ -1795,7 +1773,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
           *(workspace->d_workspace), *(lists[BONDS]), *(lists[THREE_BODIES]),
           system->n, system->N, system->reax_param.num_atom_types,
           spad, &spad[system->N], &spad[2 * system->N], (rvec *) (&spad[3 * system->N]) );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     if ( update_energy == TRUE )
@@ -1824,7 +1801,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         k_reduction_rvec <<< control->blocks_n, control->block_size_n,
                          sizeof(rvec) * (control->block_size_n / 32) >>>
             ( rvec_spad, &rvec_spad[system->N], system->N );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         k_reduction_rvec <<< 1, control->blocks_pow_2_n,
@@ -1832,7 +1808,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
             ( &rvec_spad[system->N],
               &((simulation_data *)data->d_simulation_data)->my_ext_press,
               control->blocks_n );
-        cudaDeviceSynchronize ();
         cudaCheckError( );
 //        Cuda_Reduction_Sum( rvec_spad,
 //                &((simulation_data *)data->d_simulation_data)->my_ext_press,
@@ -1842,7 +1817,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     Cuda_Valence_Angles_Part2 <<< control->blocks_n, control->block_size_n >>>
         ( system->d_my_atoms, (control_params *) control->d_control_params,
           *(workspace->d_workspace), *(lists[BONDS]), system->N );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
     /* 5. Torsion Angles Interactions */
@@ -1855,7 +1829,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
           *(lists[THREE_BODIES]), *(workspace->d_workspace), system->n,
           system->reax_param.num_atom_types,
           spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     if ( update_energy == TRUE )
@@ -1879,7 +1852,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         k_reduction_rvec <<< control->blocks, control->block_size,
                          sizeof(rvec) * (control->block_size / 32) >>>
             ( rvec_spad, &rvec_spad[system->n], system->n );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         k_reduction_rvec <<< 1, control->blocks_pow_2,
@@ -1887,7 +1859,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
             ( &rvec_spad[system->n],
               &((simulation_data *)data->d_simulation_data)->my_ext_press,
               control->blocks );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 //        Cuda_Reduction_Sum( rvec_spad,
 //                &((simulation_data *)data->d_simulation_data)->my_ext_press,
@@ -1897,7 +1868,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     Cuda_Torsion_Angles_Part2 <<< control->blocks_n, control->block_size_n >>>
         ( system->d_my_atoms, *(workspace->d_workspace),
           *(lists[BONDS]), system->N );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* 6. Hydrogen Bonds Interactions */
@@ -1920,7 +1890,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
               *(lists[FAR_NBRS]), *(lists[BONDS]), *(lists[HBONDS]),
               system->n, system->reax_param.num_atom_types,
               spad, (rvec *) (&spad[system->n]), system->my_rank );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         if ( update_energy == TRUE )
@@ -1939,7 +1908,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         k_reduction_rvec <<< control->blocks, control->block_size,
                          sizeof(rvec) * (control->block_size / 32) >>>
             ( rvec_spad, &rvec_spad[system->n], system->n );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         k_reduction_rvec <<< 1, control->blocks_pow_2,
@@ -1947,7 +1915,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
             ( &rvec_spad[system->n],
               &((simulation_data *)data->d_simulation_data)->my_ext_press,
               control->blocks );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 //        Cuda_Reduction_Sum( rvec_spad,
 //                &((simulation_data *)data->d_simulation_data)->my_ext_press,
@@ -1957,7 +1924,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         Cuda_Hydrogen_Bonds_Part2 <<< control->blocks, control->block_size >>>
             ( system->d_my_atoms, *(workspace->d_workspace),
               *(lists[BONDS]), system->n );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
 //        hnbrs_blocks = (system->n * HB_POST_PROC_KER_THREADS_PER_ATOM / HB_POST_PROC_BLOCK_SIZE) +
@@ -1968,7 +1934,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
 //        Cuda_Hydrogen_Bonds_Part3_opt <<< hnbrs_blocks, HB_POST_PROC_BLOCK_SIZE,
 //                HB_POST_PROC_BLOCK_SIZE * sizeof(rvec) >>>
 //            ( system->d_my_atoms, *(workspace->d_workspace), *(lists[HBONDS]), system->n );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
     }
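Several hunks above launch k_reduction_rvec twice: once to collapse per-atom values into one partial sum per thread block, then once more with a single block to combine the partials. A simplified scalar sketch of that two-pass shape follows; k_block_sum is a stand-in for illustration, not the PuReMD kernel:

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void k_block_sum( const float *in, float *out, int n )
    {
        extern __shared__ float s[];
        int i = blockIdx.x * blockDim.x + threadIdx.x;

        s[threadIdx.x] = (i < n) ? in[i] : 0.0f;
        __syncthreads( );

        /* tree reduction within the block */
        for ( int stride = blockDim.x / 2; stride > 0; stride >>= 1 )
        {
            if ( threadIdx.x < stride )
            {
                s[threadIdx.x] += s[threadIdx.x + stride];
            }
            __syncthreads( );
        }

        if ( threadIdx.x == 0 )
        {
            out[blockIdx.x] = s[0];
        }
    }

    int main( void )
    {
        const int n = 1 << 16, block = 256, blocks = (n + block - 1) / block;
        float *d_in, *d_part, *d_tot, total;
        float *h = new float[n];
        for ( int i = 0; i < n; ++i ) h[i] = 1.0f;

        cudaMalloc( (void **) &d_in, sizeof(float) * n );
        cudaMalloc( (void **) &d_part, sizeof(float) * blocks );
        cudaMalloc( (void **) &d_tot, sizeof(float) );
        cudaMemcpy( d_in, h, sizeof(float) * n, cudaMemcpyHostToDevice );

        /* pass 1: one partial per block; pass 2: one block over the partials */
        k_block_sum <<< blocks, block, sizeof(float) * block >>> ( d_in, d_part, n );
        k_block_sum <<< 1, block, sizeof(float) * block >>> ( d_part, d_tot, blocks );

        cudaMemcpy( &total, d_tot, sizeof(float), cudaMemcpyDeviceToHost );
        printf( "sum = %.0f (expect %d)\n", total, n );

        delete [] h;
        cudaFree( d_in );
        cudaFree( d_part );
        cudaFree( d_tot );
        return 0;
    }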
diff --git a/PG-PuReMD/src/cuda/cuda_integrate.cu b/PG-PuReMD/src/cuda/cuda_integrate.cu
index 1c3f30682893640e272135c788ddce273fab919f..8478249e5e7da1e4d2744fb763eca204d4dd3565 100644
--- a/PG-PuReMD/src/cuda/cuda_integrate.cu
+++ b/PG-PuReMD/src/cuda/cuda_integrate.cu
@@ -191,7 +191,6 @@ void Velocity_Verlet_Part1( reax_system *system, real dt )
     k_velocity_verlet_part1 <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -205,7 +204,6 @@ void Velocity_Verlet_Part2( reax_system *system, real dt )
     k_velocity_verlet_part2 <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -219,7 +217,6 @@ void Velocity_Verlet_Nose_Hoover_NVT_Part1( reax_system *system, real dt )
     k_velocity_verlet_nose_hoover_nvt <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -234,7 +231,6 @@ void Velocity_Verlet_Nose_Hoover_NVT_Part2( reax_system *system, storage *worksp
     k_velocity_verlet_nose_hoover_nvt <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, workspace->v_const,
           system->reax_param.d_sbp, dt, v_xi, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -251,7 +247,6 @@ real Velocity_Verlet_Nose_Hoover_NVT_Part3( reax_system *system, storage *worksp
     k_velocity_verlet_nose_hoover_nvt_part3 <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, workspace->v_const, system->reax_param.d_sbp,
           dt, v_xi_old, d_my_ekin, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     Cuda_Reduction_Sum( d_my_ekin, d_total_my_ekin, system->n );
@@ -273,7 +268,6 @@ static void Cuda_Scale_Velocities_Berendsen_NVT( reax_system *system, real lambd
     k_scale_velocites_berendsen_nvt <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, lambda, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -287,7 +281,6 @@ void Cuda_Scale_Velocities_NPT( reax_system *system, real lambda, rvec mu )
     k_scale_velocities_npt <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, lambda, mu[0], mu[1], mu[2], system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
diff --git a/PG-PuReMD/src/cuda/cuda_neighbors.cu b/PG-PuReMD/src/cuda/cuda_neighbors.cu
index 62878b2fef5df0326a0ee5dcdd320e6b29c14f2c..d937b35191273a00bff94ba9873b01638c842b77 100644
--- a/PG-PuReMD/src/cuda/cuda_neighbors.cu
+++ b/PG-PuReMD/src/cuda/cuda_neighbors.cu
@@ -587,7 +587,6 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system,
           system->d_my_grid, *(lists[FAR_NBRS]),
           system->n, system->N, system->d_far_nbrs,
           system->d_max_far_nbrs, system->d_realloc_far_nbrs );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* multiple threads per atom implementation */
@@ -598,7 +597,6 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system,
 //            sizeof(int) * 2 * NBRS_BLOCK_SIZE >>>
 //        ( system->d_my_atoms, system->my_ext_box, system->d_my_grid,
 //          *(lists[FAR_NBRS]), system->n, system->N );
-//    cudaDeviceSynchronize( );
 //    cudaCheckError( );
 
     /* check reallocation flag on device */
@@ -635,7 +633,6 @@ void Cuda_Estimate_Num_Neighbors( reax_system *system, simulation_data *data )
         ( system->d_my_atoms, system->my_ext_box, system->d_my_grid,
           system->n, system->N, system->total_cap,
           system->d_far_nbrs, system->d_max_far_nbrs );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     Cuda_Reduction_Sum( system->d_max_far_nbrs, system->d_total_far_nbrs,
diff --git a/PG-PuReMD/src/cuda/cuda_nonbonded.cu b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
index 37e0e28b1912c155efcf7df1482f28c3aa016868..ab1c21224ca9363914d1210055cb05c8c073603b 100644
--- a/PG-PuReMD/src/cuda/cuda_nonbonded.cu
+++ b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
@@ -680,7 +680,6 @@ static void Cuda_Compute_Polarization_Energy( reax_system *system, storage *work
     k_compute_polarization_energy <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, system->n, spad );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     Cuda_Reduction_Sum( spad,
@@ -720,7 +719,6 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control,
 //              *(workspace->d_workspace), *(lists[FAR_NBRS]),
 //              system->n, system->reax_param.num_atom_types,
 //              spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
     }
     else
@@ -734,7 +732,6 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control,
               data->step, data->prev_steps,
               out_control->energy_update_freq,
               spad, &spad[system->n], (rvec *) (&spad[2 * system->n]));
-        cudaDeviceSynchronize( );
         cudaCheckError( );
     }
 
@@ -759,7 +756,6 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control,
         k_reduction_rvec <<< control->blocks, control->block_size,
                          sizeof(rvec) * (control->block_size / 32) >>>
             ( spad_rvec, &spad_rvec[system->n], system->n );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
 
         k_reduction_rvec <<< 1, control->blocks_pow_2,
@@ -767,7 +763,6 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control,
             ( &spad_rvec[system->n],
               &((simulation_data *)data->d_simulation_data)->my_ext_press,
               control->blocks );
-        cudaDeviceSynchronize( );
         cudaCheckError( );
     }
diff --git a/PG-PuReMD/src/cuda/cuda_post_evolve.cu b/PG-PuReMD/src/cuda/cuda_post_evolve.cu
index 7d419aa49db0fecca903f3ae3d151bdca96aed07..ea0e3f163342ff7f4d2611aa2ac77df2888cf7d8 100644
--- a/PG-PuReMD/src/cuda/cuda_post_evolve.cu
+++ b/PG-PuReMD/src/cuda/cuda_post_evolve.cu
@@ -35,6 +35,5 @@ extern "C" void Cuda_Remove_CoM_Velocities( reax_system *system,
 {
     k_remove_center_of_mass_velocities <<< control->blocks, control->block_size >>>
         ( system->d_my_atoms, (simulation_data *)data->d_simulation_data, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
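cuda_reduction.cu below relies on CUB's two-phase calling convention: the first cub::DeviceReduce / cub::DeviceScan call receives a NULL temporary buffer and only reports the required scratch size, and an identical second call performs the actual work. The removed syncs were never required between the phases, since both calls are issued to the same stream. A minimal sketch of the convention (illustrative code; error handling omitted):

    #include <cstdio>
    #include <cuda_runtime.h>
    #include <cub/cub.cuh>

    int main( void )
    {
        const int n = 1000;
        int *d_in, *d_out, h_out;
        void *d_temp = NULL;
        size_t temp_bytes = 0;

        int *h_in = new int[n];
        for ( int i = 0; i < n; ++i ) h_in[i] = 1;
        cudaMalloc( (void **) &d_in, sizeof(int) * n );
        cudaMalloc( (void **) &d_out, sizeof(int) );
        cudaMemcpy( d_in, h_in, sizeof(int) * n, cudaMemcpyHostToDevice );

        /* phase 1: d_temp == NULL, so this only sets temp_bytes */
        cub::DeviceReduce::Sum( d_temp, temp_bytes, d_in, d_out, n );
        cudaMalloc( &d_temp, temp_bytes );

        /* phase 2: the same call with real scratch space runs the reduction */
        cub::DeviceReduce::Sum( d_temp, temp_bytes, d_in, d_out, n );

        cudaMemcpy( &h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost );
        printf( "sum = %d (expect %d)\n", h_out, n );

        delete [] h_in;
        cudaFree( d_temp );
        cudaFree( d_in );
        cudaFree( d_out );
        return 0;
    }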
diff --git a/PG-PuReMD/src/cuda/cuda_reduction.cu b/PG-PuReMD/src/cuda/cuda_reduction.cu
index 71dce725d14ae8576bad491a8b75e66a801e1d8c..16ed9d253a1113cf30d2f8689233f323b33304c2 100644
--- a/PG-PuReMD/src/cuda/cuda_reduction.cu
+++ b/PG-PuReMD/src/cuda/cuda_reduction.cu
@@ -37,7 +37,6 @@ void Cuda_Reduction_Sum( int *d_array, int *d_dest, size_t n )
     /* determine temporary device storage requirements */
     cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
             d_array, d_dest, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* allocate temporary storage */
@@ -47,7 +46,6 @@ void Cuda_Reduction_Sum( int *d_array, int *d_dest, size_t n )
     /* run sum-reduction */
     cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
             d_array, d_dest, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* deallocate temporary storage */
@@ -67,7 +65,6 @@ void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n )
     /* determine temporary device storage requirements */
     cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
             d_array, d_dest, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* allocate temporary storage */
@@ -77,7 +74,6 @@ void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n )
     /* run sum-reduction */
     cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
             d_array, d_dest, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* deallocate temporary storage */
@@ -99,7 +95,6 @@ void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n )
 //    /* determine temporary device storage requirements */
 //    cub::DeviceReduce::Reduce( d_temp_storage, temp_storage_bytes,
 //            d_array, d_dest, n, sum_op, init );
-//    cudaDeviceSynchronize( );
 //    cudaCheckError( );
 //
 //    /* allocate temporary storage */
@@ -109,7 +104,6 @@ void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n )
 //    /* run sum-reduction */
 //    cub::DeviceReduce::Reduce( d_temp_storage, temp_storage_bytes,
 //            d_array, d_dest, n, sum_op, init );
-//    cudaDeviceSynchronize( );
 //    cudaCheckError( );
 //
 //    /* deallocate temporary storage */
@@ -129,7 +123,6 @@ void Cuda_Reduction_Max( int *d_array, int *d_dest, size_t n )
     /* determine temporary device storage requirements */
     cub::DeviceReduce::Max( d_temp_storage, temp_storage_bytes,
             d_array, d_dest, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* allocate temporary storage */
@@ -139,7 +132,6 @@ void Cuda_Reduction_Max( int *d_array, int *d_dest, size_t n )
     /* run exclusive prefix sum */
     cub::DeviceReduce::Max( d_temp_storage, temp_storage_bytes,
             d_array, d_dest, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* deallocate temporary storage */
@@ -159,7 +151,6 @@ void Cuda_Scan_Excl_Sum( int *d_src, int *d_dest, size_t n )
     /* determine temporary device storage requirements */
     cub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes,
             d_src, d_dest, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* allocate temporary storage */
@@ -169,7 +160,6 @@ void Cuda_Scan_Excl_Sum( int *d_src, int *d_dest, size_t n )
     /* run exclusive prefix sum */
     cub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes,
             d_src, d_dest, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* deallocate temporary storage */
diff --git a/PG-PuReMD/src/cuda/cuda_reset_tools.cu b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
index 5d1af4df2d6fae5a376665a7d87bcfd2f360944e..99d55dd095d2431527931b262cdb2a7ed8bd47a5 100644
--- a/PG-PuReMD/src/cuda/cuda_reset_tools.cu
+++ b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
@@ -62,7 +62,6 @@ void Cuda_Reset_Workspace( reax_system *system, storage *workspace )
     k_reset_workspace <<< blocks, DEF_BLOCK_SIZE >>>
         ( *(workspace->d_workspace), system->total_cap );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -79,7 +78,6 @@ void Cuda_Reset_Atoms_HBond_Indices( reax_system* system, control_params *contro
     k_reset_hindex <<< control->blocks_n, control->block_size_n >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, hindex, system->N );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     Cuda_Reduction_Sum( hindex, system->d_numH, system->N );
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
index 39edee8b1cf89af02f01a391e365065e4e9ca526..2964a08199a509727bd33bc8c5eb158574bf3064 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
@@ -532,7 +532,6 @@ void dual_jacobi_apply( real const * const Hdia_inv, rvec2 const * const y,
     k_dual_jacobi_apply <<< blocks, DEF_BLOCK_SIZE >>>
         ( Hdia_inv, y, x, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -547,7 +546,6 @@ void jacobi_apply( real const * const Hdia_inv, real const * const y,
     k_jacobi_apply <<< blocks, DEF_BLOCK_SIZE >>>
         ( Hdia_inv, y, x, n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -638,7 +636,6 @@ static void Dual_Sparse_MatVec_local( control_params const * const control,
         k_dual_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
             ( A->start, A->end, A->j, A->val, x, b, A->n );
     }
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
 
@@ -818,7 +815,6 @@ static void Sparse_MatVec_local( control_params const * const control,
         k_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
             ( A->start, A->end, A->j, A->val, x, b, A->n );
     }
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 }
diff --git a/PG-PuReMD/src/cuda/cuda_system_props.cu b/PG-PuReMD/src/cuda/cuda_system_props.cu
index e13bb8b9c135ea93d4404db0d76c9caf5f570738..88d0e1a84a296cb4b7b17fbc1b4429c8123a749e 100644
--- a/PG-PuReMD/src/cuda/cuda_system_props.cu
+++ b/PG-PuReMD/src/cuda/cuda_system_props.cu
@@ -576,13 +576,11 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control,
     k_center_of_mass_blocks_xcm <<< control->blocks, control->block_size,
                                 sizeof(rvec) * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     k_reduction_rvec <<< 1, control->blocks_pow_2,
                      sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
         ( spad, &spad[control->blocks], control->blocks );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     copy_host_device( xcm, &spad[control->blocks], sizeof(rvec),
@@ -595,13 +593,11 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control,
     k_center_of_mass_blocks_vcm <<< control->blocks, control->block_size,
                                 sizeof(rvec) * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     k_reduction_rvec <<< 1, control->blocks_pow_2,
                      sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
         ( spad, &spad[control->blocks], control->blocks );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     copy_host_device( vcm, &spad[control->blocks], sizeof(rvec),
@@ -614,13 +610,11 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control,
     k_center_of_mass_blocks_amcm <<< control->blocks, control->block_size,
                                  sizeof(rvec) * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     k_reduction_rvec <<< 1, control->blocks_pow_2,
                      sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
         ( spad, &spad[control->blocks], control->blocks );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     copy_host_device( amcm, &spad[control->blocks], sizeof(rvec),
@@ -644,28 +638,24 @@ static void Cuda_Compute_Inertial_Tensor( reax_system *system, control_params *c
             sizeof(real) * 2 * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad,
           my_xcm[0], my_xcm[1], my_xcm[2], system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     k_compute_inertial_tensor_xz_yy <<< control->blocks, control->block_size,
             sizeof(real) * 2 * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad,
           my_xcm[0], my_xcm[1], my_xcm[2], system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     k_compute_inertial_tensor_yz_zz <<< control->blocks, control->block_size,
             sizeof(real) * 2 * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad,
           my_xcm[0], my_xcm[1], my_xcm[2], system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* reduction of block-level partial sums for inertial tensor */
     k_compute_inertial_tensor_blocks <<< 1, control->blocks_pow_2,
             sizeof(real) * 6 * control->blocks_pow_2 >>>
         ( spad, &spad[6 * control->blocks], control->blocks );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     copy_host_device( t, &spad[6 * control->blocks],
@@ -710,7 +700,6 @@ extern "C" void Cuda_Compute_Kinetic_Energy( reax_system *system,
     k_compute_kinetic_energy <<< control->blocks, control->block_size,
                              sizeof(real) * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, block_energy, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* note: above kernel sums the kinetic energy contribution within blocks,
@@ -751,7 +740,6 @@ void Cuda_Compute_Total_Mass( reax_system *system, control_params *control,
     k_compute_total_mass <<< control->blocks, control->block_size,
                          sizeof(real) * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad_real, system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* note: above kernel sums the mass contribution within blocks,
@@ -905,14 +893,12 @@ void Cuda_Compute_Pressure( reax_system* system, control_params *control,
     k_reduction_rvec <<< control->blocks, control->block_size,
                      sizeof(rvec) * (control->block_size / 32) >>>
         ( rvec_spad, &rvec_spad[system->n], system->n );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     k_reduction_rvec <<< 1, control->blocks_pow_2,
                      sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
         ( &rvec_spad[system->n], &rvec_spad[system->n + control->blocks],
           control->blocks );
-    cudaDeviceSynchronize( );
     cudaCheckError( );
 
     copy_host_device( &int_press, &rvec_spad[system->n + control->blocks],
diff --git a/PG-PuReMD/src/cuda/cuda_utils.h b/PG-PuReMD/src/cuda/cuda_utils.h
index b40fa6d1896da3d5760249c843104f20fdad2c1b..5d039c57a72d1ae5a3d89f86479e065763830560 100644
--- a/PG-PuReMD/src/cuda/cuda_utils.h
+++ b/PG-PuReMD/src/cuda/cuda_utils.h
@@ -22,26 +22,33 @@ void Cuda_Print_Mem_Usage( );
 
 #define cudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
 static inline void __cudaCheckError( const char *file, const int line )
 {
+#if defined(DEBUG)
     cudaError err;
-    err = cudaGetLastError();
 
+#if defined(DEBUG_FOCUS)
+    /* Block until tasks in stream are complete in order to enable
+     * more pinpointed error checking. However, this will affect performance. */
+    err = cudaDeviceSynchronize( );
     if ( cudaSuccess != err )
     {
-        fprintf( stderr, "[ERROR] runtime error encountered: %s:%d\n", file, line );
+        fprintf( stderr, "[ERROR] runtime error encountered with cudaDeviceSynchronize( ) at: %s:%d\n", file, line );
         fprintf( stderr, "  [INFO] CUDA API error code: %d\n", err );
+        fprintf( stderr, "  [INFO] CUDA API error name: %s\n", cudaGetErrorName( err ) );
+        fprintf( stderr, "  [INFO] CUDA API error text: %s\n", cudaGetErrorString( err ) );
         exit( RUNTIME_ERROR );
     }
+#endif
 
-#if defined(DEBUG_FOCUS)
-    /* More careful checking. However, this will affect performance. */
-    err = cudaDeviceSynchronize( );
-    if( cudaSuccess != err )
+    err = cudaGetLastError();
+    if ( cudaSuccess != err )
     {
-        exit( RUNTIME_ERROR );
+        fprintf( stderr, "[ERROR] runtime error encountered: %s:%d\n", file, line );
+        fprintf( stderr, "  [INFO] CUDA API error code: %d\n", err );
+        fprintf( stderr, "  [INFO] CUDA API error name: %s\n", cudaGetErrorName( err ) );
+        fprintf( stderr, "  [INFO] CUDA API error text: %s\n", cudaGetErrorString( err ) );
+        exit( RUNTIME_ERROR );
     }
 #endif
-
-    return;
 }
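With the rework above, cudaCheckError( ) compiles to an empty function unless DEBUG is defined; under DEBUG it polls cudaGetLastError( ), and under DEBUG_FOCUS it additionally synchronizes so asynchronous failures are attributed to the offending launch. A standalone sketch of the same gating; checkLast and k_noop are names local to this example, not PuReMD identifiers. Compile with -DDEBUG to enable the checks:

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    #define checkLast() check_last_( __FILE__, __LINE__ )

    static inline void check_last_( const char *file, int line )
    {
    #if defined(DEBUG)
        cudaError_t err = cudaGetLastError( );
        if ( err != cudaSuccess )
        {
            fprintf( stderr, "[ERROR] %s:%d: %s (%s)\n", file, line,
                    cudaGetErrorName( err ), cudaGetErrorString( err ) );
            exit( EXIT_FAILURE );
        }
    #else
        (void) file;
        (void) line;
    #endif
    }

    __global__ void k_noop( ) { }

    int main( void )
    {
        k_noop <<< 1, 1 >>> ( );
        checkLast( );

        /* an invalid launch configuration surfaces at the next check */
        k_noop <<< 1, 0 >>> ( );
        checkLast( );
        return 0;
    }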
diff --git a/PG-PuReMD/src/geo_tools.c b/PG-PuReMD/src/geo_tools.c
index 7ec19c85dad48eb2ae3f3ede092f548b21f1bdda..3b3b0c1a2cef4f9eb947fcf5b88925e233deffbb 100644
--- a/PG-PuReMD/src/geo_tools.c
+++ b/PG-PuReMD/src/geo_tools.c
@@ -532,7 +532,9 @@ void Write_PDB_File( reax_system * const system, reax_list * const bond_list,
                 system->big_box.box_norms[2],
                 RAD2DEG(alpha), RAD2DEG(beta), RAD2DEG(gamma), " ", 0 );
         fprintf( out_control->log, "Box written\n" );
+#if defined(DEBUG)
         fflush( out_control->log );
+#endif
     }
 
     /* write atom lines to buffer */
diff --git a/PG-PuReMD/src/io_tools.c b/PG-PuReMD/src/io_tools.c
index a139964e61b5bfd15e1bd0176d8a5dab44acf9d7..d97655b798ff1abe1a2b052ca75aab06873ccc9a 100644
--- a/PG-PuReMD/src/io_tools.c
+++ b/PG-PuReMD/src/io_tools.c
@@ -81,8 +81,8 @@ void Init_Output_Files( reax_system *system, control_params *control,
             fprintf( out_control->out, "%-6s%24s%24s%24s%13s%16s%13s\n",
                     "step", "total energy", "potential", "kinetic",
                     "T(K)", "V(A^3)", "P(GPa)" );
-#endif
             fflush( out_control->out );
+#endif
 
         /* potential file */
         sprintf( temp, "%s.pot", control->sim_name );
@@ -101,8 +101,8 @@ void Init_Output_Files( reax_system *system, control_params *control,
                     "step", "ebond", "eatom", "elp",
                     "eang", "ecoa", "ehb", "etor", "econj",
                     "evdw", "ecoul", "epol" );
-#endif
             fflush( out_control->pot );
+#endif
 
 #if defined(LOG_PERFORMANCE)
         /* log file */
@@ -114,7 +114,9 @@ void Init_Output_Files( reax_system *system, control_params *control,
                     "step", "total", "comm", "nbrs", "init", "bonded", "nonb",
                     "charges", "siters", "retries" );
+#if defined(DEBUG)
             fflush( out_control->log );
+#endif
 #endif
     }
 
@@ -130,7 +132,9 @@ void Init_Output_Files( reax_system *system, control_params *control,
             fprintf( out_control->prs, "%8s%13s%13s%13s%13s%13s%13s%13s\n",
                     "step", "Pint/norm[x]", "Pint/norm[y]", "Pint/norm[z]",
                     "Pext/Ptot[x]", "Pext/Ptot[y]", "Pext/Ptot[z]", "Pkin/V" );
+#if defined(DEBUG)
             fflush( out_control->prs );
+#endif
         }
 
         /* electric dipole moment analysis file */
@@ -143,7 +147,9 @@ void Init_Output_Files( reax_system *system, control_params *control,
             fprintf( out_control->dpl, "%6s%20s%30s",
                     "step", "molecule count", "avg dipole moment norm" );
+#if defined(DEBUG)
             fflush( out_control->dpl );
+#endif
         }
 
         /* diffusion coef analysis file */
@@ -155,7 +161,9 @@ void Init_Output_Files( reax_system *system, control_params *control,
             fprintf( out_control->drft, "%7s%20s%20s\n",
                     "step", "type count", "avg disp^2" );
+#if defined(DEBUG)
             fflush( out_control->drft );
+#endif
         }
     }
 
@@ -1295,10 +1303,10 @@ void Output_Results( reax_system *system, control_params *control,
                 data->sys_en.e_hb, data->sys_en.e_tor, data->sys_en.e_con,
                 data->sys_en.e_vdW, data->sys_en.e_ele, data->sys_en.e_pol);
-#endif //DEBUG
 
         fflush( out_control->out );
         fflush( out_control->pot );
+#endif //DEBUG
 
 #if defined(LOG_PERFORMANCE)
         t_elapsed = Get_Elapsed_Time( data->timing.total );
@@ -1346,7 +1354,9 @@ void Output_Results( reax_system *system, control_params *control,
 //                data->timing.cm_solver_orthog * denom,
 //                data->timing.cm_solver_tri_solve * denom );
+#if defined(DEBUG)
         fflush( out_control->log );
+#endif
 #endif //LOG_PERFORMANCE
 
     if ( control->virial )
@@ -1365,7 +1375,9 @@ void Output_Results( reax_system *system, control_params *control,
                 data->tot_press[0], data->tot_press[1], data->tot_press[2],
                 system->big_box.V );
+#if defined(DEBUG)
         fflush( out_control->prs );
+#endif
     }
 }
diff --git a/PG-PuReMD/src/lin_alg.c b/PG-PuReMD/src/lin_alg.c
index 8001090c5466b99a7a8ed15c202e1a41fc0edbf4..7d470e3e9d1c40471e3162ba3acac70ecb205877 100644
--- a/PG-PuReMD/src/lin_alg.c
+++ b/PG-PuReMD/src/lin_alg.c
@@ -964,7 +964,7 @@ real sparse_approx_inverse( reax_system const * const system,
     t_start = Get_Time( );
     Dist( system, mpi_data, row_nnz, REAL_PTR_TYPE, MPI_INT );
     t_comm += Get_Time( ) - t_start;
-    fprintf( stdout,"SAI after Dist call\n");
+    fprintf( stdout,"SAI after Dist call\n" );
     fflush( stdout );
 
     out_bufs = mpi_data->out_nt_buffers;
@@ -992,7 +992,7 @@ real sparse_approx_inverse( reax_system const * const system,
             j_recv[d] = (int *) malloc( sizeof(int) * cnt );
             val_recv[d] = (real *) malloc( sizeof(real) * cnt );
 
-            fprintf( stdout,"Dist communication receive phase direction %d will receive %d\n", d, cnt);
+            fprintf( stdout, "Dist communication receive phase direction %d will receive %d\n", d, cnt );
             fflush( stdout );
             t_start = Get_Time( );
             ret = MPI_Irecv( j_recv + d, cnt, MPI_INT, nbr->receive_rank,
@@ -1023,7 +1023,7 @@ real sparse_approx_inverse( reax_system const * const system,
                 }
                 // row_nnz[ out_bufs[d].index[i] ];
             }
-            fprintf( stdout,"Dist communication send phase direction %d should send %d\n", d, cnt);
+            fprintf( stdout,"Dist communication send phase direction %d should send %d\n", d, cnt );
             fflush( stdout );
 
             if ( cnt )
@@ -1050,18 +1050,18 @@ real sparse_approx_inverse( reax_system const * const system,
                 ret = MPI_Send( j_send, cnt, MPI_INT, nbr->rank,
                         d, mpi_data->comm_mesh3D );
                 Check_MPI_Error( ret, __FILE__, __LINE__ );
-                fprintf( stdout,"Dist communication send phase direction %d cnt = %d\n", d, cnt);
+                fprintf( stdout,"Dist communication send phase direction %d cnt = %d\n", d, cnt );
                 fflush( stdout );
                 ret = MPI_Send( val_send, cnt, MPI_DOUBLE, nbr->rank,
                         d, mpi_data->comm_mesh3D );
                 Check_MPI_Error( ret, __FILE__, __LINE__ );
-                fprintf( stdout,"Dist communication send phase direction %d cnt = %d\n", d, cnt);
+                fprintf( stdout,"Dist communication send phase direction %d cnt = %d\n", d, cnt );
                 fflush( stdout );
                 t_comm += Get_Time( ) - t_start;
             }
         }
     }
-    fprintf( stdout," Dist communication for sending row info before waitany\n");
+    fprintf( stdout," Dist communication for sending row info before waitany\n" );
     fflush( stdout );
     ///////////////////////
     for ( d = 0; d < count; ++d )
@@ -1097,7 +1097,7 @@ real sparse_approx_inverse( reax_system const * const system,
         }
     }
     //////////////////////
-    fprintf( stdout," wow wow wow, Dist communication for sending row info worked\n");
+    fprintf( stdout, "Dist communication for sending row info worked\n" );
     fflush( stdout );
 
     //TODO: size?
     X = (int *) malloc( sizeof(int) * (system->bigN + 1) );
@@ -1180,7 +1180,7 @@ real sparse_approx_inverse( reax_system const * const system,
                 local_pos = A_spar_patt->j[ A_spar_patt->start[i] + d_j ];
                 if ( local_pos < 0 || local_pos >= system->N )
                 {
-                    fprintf( stderr, "THE LOCAL POSITION OF THE ATOM IS NOT VALID, STOP THE EXECUTION\n");
+                    fprintf( stderr, "THE LOCAL POSITION OF THE ATOM IS NOT VALID, STOP THE EXECUTION\n" );
                     fflush( stderr );
                 }
diff --git a/PG-PuReMD/src/nonbonded.c b/PG-PuReMD/src/nonbonded.c
index f2bad81a279f47236feafc7193144fad819e0595..cc86a3a2c60d165abe045308b74f20d7a7767c47 100644
--- a/PG-PuReMD/src/nonbonded.c
+++ b/PG-PuReMD/src/nonbonded.c
@@ -182,7 +182,8 @@ void vdW_Coulomb_Energy( reax_system const * const system,
 #if defined(DEBUG_FOCUS)
             fprintf( stderr, "%6d%6d%24.12f%24.12f%24.12f%24.12f\n", i + 1, j + 1,
-                    e_base, de_base, e_core, de_core ); fflush( stderr );
+                    e_base, de_base, e_core, de_core );
+            fflush( stderr );
 #endif
 
             /* Coulomb Calculations */
@@ -198,7 +199,8 @@ void vdW_Coulomb_Energy( reax_system const * const system,
 #if defined(DEBUG_FOCUS)
             fprintf( stderr, "%6d%6d%24.12f%24.12f\n",
-                    i + 1, j + 1, e_clb, de_clb ); fflush( stderr );
+                    i + 1, j + 1, e_clb, de_clb );
+            fflush( stderr );
 #endif
 
             if ( control->virial == 0 )
diff --git a/PG-PuReMD/src/restart.c b/PG-PuReMD/src/restart.c
index 81ded98bee6acd1509e16e3e58e0953144ae8b63..ef932e36a61729d8642c6e9a0c4bdd69d638be5c 100644
--- a/PG-PuReMD/src/restart.c
+++ b/PG-PuReMD/src/restart.c
@@ -147,7 +147,9 @@ void Write_Restart_File( reax_system *system, control_params *control,
                 system->big_box.box[1][2],
                 system->big_box.box[2][0], system->big_box.box[2][1],
                 system->big_box.box[2][2]);
-        fflush(fres);
+#if defined(DEBUG)
+        fflush( fres );
+#endif
 
         buffer_req = system->bigN * RESTART_LINE_LEN + 1;
     }
@@ -312,7 +314,8 @@ void Read_Binary_Restart_File( const char * const res_file, reax_system *system,
                     " name = %8s, x = (%f, %f, %f), v = (%f, %f, %f)\n",
                     top, p_atom->orig_id, p_atom->type, p_atom->name,
                     p_atom->x[0], p_atom->x[1], p_atom->x[2],
-                    p_atom->v[0], p_atom->v[1], p_atom->v[2] ); fflush( stderr );
+                    p_atom->v[0], p_atom->v[1], p_atom->v[2] );
+            fflush( stderr );
 #endif
 
             top++;
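The I/O-side changes in this patch wrap most fflush calls in #if defined(DEBUG), so release builds let stdio buffer output and flush it at fclose/exit rather than forcing a flush after every record. A tiny sketch of the same pattern behind a helper macro; FLUSH_DEBUG is an illustrative name, not a PuReMD identifier:

    #include <stdio.h>

    #if defined(DEBUG)
      #define FLUSH_DEBUG( fp ) fflush( fp )
    #else
      #define FLUSH_DEBUG( fp ) ((void) 0)
    #endif

    int main( void )
    {
        fprintf( stdout, "step %d complete\n", 0 );
        /* flushed immediately only in debug builds; release builds rely
         * on normal stdio buffering */
        FLUSH_DEBUG( stdout );
        return 0;
    }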