From 036214bea373f1ff103f448ec7bedacebdc6087c Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@cse.msu.edu> Date: Mon, 21 Nov 2016 11:11:36 -0500 Subject: [PATCH] PG-PuReMD: code clean-up. --- PG-PuReMD/src/cuda_linear_solvers.cu | 197 +++++++++++++++++--------- PG-PuReMD/src/cuda_qEq.cu | 81 +++++++---- PG-PuReMD/src/linear_solvers.c | 151 ++++++++++++++++---- PG-PuReMD/src/parallelreax.c | 203 ++++++++++++++++----------- PG-PuReMD/src/qEq.c | 51 +++++-- 5 files changed, 462 insertions(+), 221 deletions(-) diff --git a/PG-PuReMD/src/cuda_linear_solvers.cu b/PG-PuReMD/src/cuda_linear_solvers.cu index 1b1f510c..0bc4d4e3 100644 --- a/PG-PuReMD/src/cuda_linear_solvers.cu +++ b/PG-PuReMD/src/cuda_linear_solvers.cu @@ -28,225 +28,287 @@ #include "matvec.h" - -void get_from_device (real *host, real *device, unsigned int bytes, char *msg) +void get_from_device(real *host, real *device, unsigned int bytes, char *msg) { - copy_host_device (host, device, bytes, cudaMemcpyDeviceToHost, msg); + copy_host_device( host, device, bytes, cudaMemcpyDeviceToHost, msg ); } -void put_on_device (real *host, real *device, unsigned int bytes, char *msg) + +void put_on_device(real *host, real *device, unsigned int bytes, char *msg) { - copy_host_device (host, device, bytes, cudaMemcpyHostToDevice, msg); + copy_host_device( host, device, bytes, cudaMemcpyHostToDevice, msg ); } -void Cuda_Vector_Sum (real *res, real a, real *x, real b, real *y, int count) + +void Cuda_Vector_Sum(real *res, real a, real *x, real b, real *y, int count) { //res = ax + by //use the cublas here int blocks; + blocks = (count / DEF_BLOCK_SIZE) + ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>> ( res, a, x, b, y, count ); - cudaThreadSynchronize (); - cudaCheckError (); + + cudaThreadSynchronize(); + cudaCheckError(); } -void Cuda_CG_Preconditioner (real *res, real *a, real *b, int count) + +void Cuda_CG_Preconditioner(real *res, real *a, real *b, int count) { //res = a*b - vector multiplication //use the cublas here. int blocks; + blocks = (count / DEF_BLOCK_SIZE) + ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_vector_mul <<< blocks, DEF_BLOCK_SIZE >>> ( res, a, b, count ); - cudaThreadSynchronize (); + + cudaThreadSynchronize(); } -CUDA_GLOBAL void k_diagnol_preconditioner (storage p_workspace, rvec2 *b, int n) + +CUDA_GLOBAL void k_diagnol_preconditioner(storage p_workspace, rvec2 *b, int n) { storage *workspace = &( p_workspace ); int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n) return; + + if (j >= n) + { + return; + } //for( j = 0; j < system->n; ++j ) { // residual workspace->r2[j][0] = b[j][0] - workspace->q2[j][0]; workspace->r2[j][1] = b[j][1] - workspace->q2[j][1]; + // apply diagonal pre-conditioner workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; //} } -void Cuda_CG_Diagnol_Preconditioner (storage *workspace, rvec2 *b, int n) + +void Cuda_CG_Diagnol_Preconditioner(storage *workspace, rvec2 *b, int n) { int blocks; blocks = (n / DEF_BLOCK_SIZE) + (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_diagnol_preconditioner <<< blocks, DEF_BLOCK_SIZE >>> (*workspace, b, n); - cudaThreadSynchronize (); - cudaCheckError (); + + cudaThreadSynchronize(); + cudaCheckError(); } -CUDA_GLOBAL void k_dual_cg_preconditioner (storage p_workspace, rvec2 *x, + +CUDA_GLOBAL void k_dual_cg_preconditioner(storage p_workspace, rvec2 *x, real alpha_0, real alpha_1, int n, rvec2 *my_dot) { storage *workspace = &( p_workspace ); rvec2 alpha; + int j = blockIdx.x * blockDim.x + threadIdx.x; + alpha[0] = alpha_0; alpha[1] = alpha_1; - int j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= n) return; + if (j >= n) + { + return; + } + my_dot[j][0] = my_dot[j][1] = 0.0; //for( j = 0; j < system->n; ++j ) { // update x x[j][0] += alpha[0] * workspace->d2[j][0]; x[j][1] += alpha[1] * workspace->d2[j][1]; + // update residual workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; + // apply diagonal pre-conditioner workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; + // dot product: r.p my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0]; my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1]; //} } -void Cuda_DualCG_Preconditioer (storage *workspace, rvec2 *x, rvec2 alpha, int n, rvec2 result) + +void Cuda_DualCG_Preconditioer(storage *workspace, rvec2 *x, rvec2 alpha, int n, rvec2 result) { int blocks; rvec2 *tmp = (rvec2 *) scratch; - cuda_memset (tmp, 0, sizeof (rvec2) * ( 2 * n + 1), "cuda_dualcg_preconditioner"); + cuda_memset( tmp, 0, sizeof (rvec2) * ( 2 * n + 1), "cuda_dualcg_preconditioner" ); blocks = (n / DEF_BLOCK_SIZE) + (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_dual_cg_preconditioner <<< blocks, DEF_BLOCK_SIZE >>> (*workspace, x, alpha[0], alpha[1], n, tmp); - cudaThreadSynchronize (); - cudaCheckError (); + + cudaThreadSynchronize(); + cudaCheckError(); //Reduction to calculate my_dot k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>> ( tmp, tmp + n, n); - cudaThreadSynchronize (); - cudaCheckError (); + + cudaThreadSynchronize(); + cudaCheckError(); k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>> ( tmp + n, tmp + 2*n, blocks); - cudaThreadSynchronize (); - cudaCheckError (); - copy_host_device (result, (tmp + 2*n), sizeof (rvec2), cudaMemcpyDeviceToHost, "my_dot"); + cudaThreadSynchronize(); + cudaCheckError(); + + copy_host_device( result, (tmp + 2*n), sizeof (rvec2), cudaMemcpyDeviceToHost, "my_dot" ); } -void Cuda_Norm (rvec2 *arr, int n, rvec2 result) + +void Cuda_Norm(rvec2 *arr, int n, rvec2 result) { int blocks; rvec2 *tmp = (rvec2 *) scratch; blocks = (n / DEF_BLOCK_SIZE) + (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>> (arr, tmp, n, INITIAL); - cudaThreadSynchronize (); - cudaCheckError (); + + cudaThreadSynchronize(); + cudaCheckError(); k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>> (tmp, tmp + BLOCKS_POW_2, blocks, FINAL ); - cudaThreadSynchronize (); - cudaCheckError (); - copy_host_device (result, tmp + BLOCKS_POW_2, sizeof (rvec2), - cudaMemcpyDeviceToHost, "cuda_norm_rvec2"); + cudaThreadSynchronize(); + cudaCheckError(); + + copy_host_device( result, tmp + BLOCKS_POW_2, sizeof (rvec2), + cudaMemcpyDeviceToHost, "cuda_norm_rvec2" ); } -void Cuda_Dot (rvec2 *a, rvec2 *b, rvec2 result, int n) + +void Cuda_Dot(rvec2 *a, rvec2 *b, rvec2 result, int n) { int blocks; rvec2 *tmp = (rvec2 *) scratch; blocks = (n / DEF_BLOCK_SIZE) + (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_dot_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>> ( a, b, tmp, n ); - cudaThreadSynchronize (); - cudaCheckError (); + + cudaThreadSynchronize(); + cudaCheckError(); k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>> //k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * BLOCKS_POW_2 >>> ( tmp, tmp + BLOCKS_POW_2, blocks, FINAL ); - cudaThreadSynchronize (); - cudaCheckError (); - copy_host_device (result, tmp + BLOCKS_POW_2, sizeof (rvec2), - cudaMemcpyDeviceToHost, "cuda_dot"); + cudaThreadSynchronize(); + cudaCheckError(); + + copy_host_device( result, tmp + BLOCKS_POW_2, sizeof (rvec2), + cudaMemcpyDeviceToHost, "cuda_dot" ); } -void Cuda_Vector_Sum_Rvec2 (rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n) + +void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n) { int blocks; blocks = (n / DEF_BLOCK_SIZE) + (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_rvec2_pbetad <<< blocks, DEF_BLOCK_SIZE >>> ( x, a, b[0], b[1], c, n); - cudaThreadSynchronize (); - cudaCheckError (); + + cudaThreadSynchronize(); + cudaCheckError(); } -CUDA_GLOBAL void k_rvec2_to_real_copy ( real *dst, rvec2 *src, int index, int n) + +CUDA_GLOBAL void k_rvec2_to_real_copy( real *dst, rvec2 *src, int index, int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; + + if (i >= n) + { + return; + } dst[i] = src[i][index]; } -void Cuda_RvecCopy_From (real *dst, rvec2 *src, int index, int n) + +void Cuda_RvecCopy_From(real *dst, rvec2 *src, int index, int n) { int blocks; + blocks = (n / DEF_BLOCK_SIZE) + (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_rvec2_to_real_copy <<< blocks, DEF_BLOCK_SIZE >>> ( dst, src, index, n); - cudaThreadSynchronize (); - cudaCheckError (); + + cudaThreadSynchronize(); + cudaCheckError(); } -CUDA_GLOBAL void k_real_to_rvec2_copy ( rvec2 *dst, real *src, int index, int n) + +CUDA_GLOBAL void k_real_to_rvec2_copy( rvec2 *dst, real *src, int index, int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; + + if (i >= n) + { + return; + } dst[i][index] = src[i]; } -void Cuda_RvecCopy_To (rvec2 *dst, real *src, int index, int n) + +void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n) { int blocks; + blocks = (n / DEF_BLOCK_SIZE) + (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1); + k_real_to_rvec2_copy <<< blocks, DEF_BLOCK_SIZE >>> ( dst, src, index, n); - cudaThreadSynchronize (); - cudaCheckError (); + + cudaThreadSynchronize(); + cudaCheckError(); } -void Cuda_Dual_Matvec (sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size) + +void Cuda_Dual_Matvec(sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size) { int blocks; + blocks = (n / DEF_BLOCK_SIZE) + (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1); - cuda_memset (b, 0, sizeof (rvec2) * size, "dual_matvec:result"); + cuda_memset( b, 0, sizeof (rvec2) * size, "dual_matvec:result" ); //One thread per row implementation //k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>> @@ -258,21 +320,24 @@ void Cuda_Dual_Matvec (sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size) #if defined(__SM_35__) k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>> #else - k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, - sizeof (rvec2) * MATVEC_BLOCK_SIZE >>> + k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, + sizeof (rvec2) * MATVEC_BLOCK_SIZE >>> #endif (*H, a, b, n); - cudaThreadSynchronize (); - cudaCheckError (); + + cudaThreadSynchronize(); + cudaCheckError(); } -void Cuda_Matvec (sparse_matrix *H, real *a, real *b, int n, int size) + +void Cuda_Matvec(sparse_matrix *H, real *a, real *b, int n, int size) { int blocks; + blocks = (n / DEF_BLOCK_SIZE) + (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1); - cuda_memset (b, 0, sizeof (real) * size, "dual_matvec:result"); + cuda_memset( b, 0, sizeof (real) * size, "dual_matvec:result" ); //one thread per row implementation //k_matvec <<< blocks, DEF_BLOCK_SIZE >>> @@ -283,11 +348,11 @@ void Cuda_Matvec (sparse_matrix *H, real *a, real *b, int n, int size) #if defined(__SM_35__) k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>> #else - k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, + k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, sizeof (real) * MATVEC_BLOCK_SIZE>>> #endif (*H, a, b, n); - cudaThreadSynchronize (); - cudaCheckError (); -} + cudaThreadSynchronize(); + cudaCheckError(); +} diff --git a/PG-PuReMD/src/cuda_qEq.cu b/PG-PuReMD/src/cuda_qEq.cu index b2094583..9405df86 100644 --- a/PG-PuReMD/src/cuda_qEq.cu +++ b/PG-PuReMD/src/cuda_qEq.cu @@ -27,15 +27,19 @@ #include "validation.h" + CUDA_GLOBAL void ker_init_matvec( reax_atom *my_atoms, single_body_parameters *sbp, storage p_workspace, int n ) { storage *workspace = &( p_workspace ); reax_atom *atom; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; + + if (i >= n) + { + return; + } //for( i = 0; i < system->n; ++i ) { atom = &( my_atoms[i] ); @@ -54,7 +58,8 @@ CUDA_GLOBAL void ker_init_matvec( reax_atom *my_atoms, //} } -void Cuda_Init_MatVec ( reax_system *system, storage *workspace ) + +void Cuda_Init_MatVec( reax_system *system, storage *workspace ) { int blocks; @@ -64,39 +69,48 @@ void Cuda_Init_MatVec ( reax_system *system, storage *workspace ) ker_init_matvec <<< blocks, DEF_BLOCK_SIZE >>> ( system->d_my_atoms, system->reax_param.d_sbp, *dev_workspace, system->n ); - cudaThreadSynchronize (); - cudaCheckError (); + + cudaThreadSynchronize(); + cudaCheckError(); } -void cuda_charges_x (reax_system *system, rvec2 my_sum) + +void cuda_charges_x(reax_system *system, rvec2 my_sum) { int blocks; rvec2 *output = (rvec2 *) scratch; - cuda_memset (output, 0, sizeof (rvec2) * 2 * system->n, "cuda_charges_x:q"); + cuda_memset( output, 0, sizeof (rvec2) * 2 * system->n, "cuda_charges_x:q" ); blocks = system->n / DEF_BLOCK_SIZE + (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>> ( dev_workspace->x, output, system->n ); - cudaThreadSynchronize (); - cudaCheckError (); + + cudaThreadSynchronize(); + cudaCheckError(); k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>> ( output, output + system->n, blocks ); - cudaThreadSynchronize (); - cudaCheckError (); - copy_host_device (my_sum, output + system->n, sizeof (rvec2), cudaMemcpyDeviceToHost, "charges:x"); + cudaThreadSynchronize(); + cudaCheckError(); + + copy_host_device( my_sum, output + system->n, sizeof (rvec2), cudaMemcpyDeviceToHost, "charges:x" ); } + CUDA_GLOBAL void ker_calculate_st (reax_atom *my_atoms, storage p_workspace, real u, real *q, int n) { storage *workspace = &( p_workspace ); reax_atom *atom; int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) return; + + if (i >= n) + { + return; + } //for( i = 0; i < system->n; ++i ) { atom = &( my_atoms[i] ); @@ -117,7 +131,6 @@ CUDA_GLOBAL void ker_calculate_st (reax_atom *my_atoms, storage p_workspace, atom->t[0] = workspace->x[i][1]; //} } - //TODO if we use the function argument (output), we are getting //TODO Address not mapped/Invalid permissions error with segmentation fault //TODO so using the local argument, which is a global variable anyways. @@ -126,24 +139,27 @@ CUDA_GLOBAL void ker_calculate_st (reax_atom *my_atoms, storage p_workspace, //TODO //TODO + extern "C" void cuda_charges_st (reax_system *system, storage *workspace, real *output, real u) { int blocks; real *tmp = (real *) scratch; real *tmp_output = (real *) host_scratch; - cuda_memset (tmp, 0, sizeof (real) * system->n, "charges:q"); - memset (tmp_output, 0, sizeof (real) * system->n); + cuda_memset( tmp, 0, sizeof (real) * system->n, "charges:q" ); + memset( tmp_output, 0, sizeof (real) * system->n ); blocks = system->n / DEF_BLOCK_SIZE + (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); + ker_calculate_st <<< blocks, DEF_BLOCK_SIZE >>> ( system->d_my_atoms, *dev_workspace, u, tmp, system->n); - cudaThreadSynchronize (); - cudaCheckError (); - copy_host_device (output, tmp, sizeof (real) * system->n, - cudaMemcpyDeviceToHost, "charges:q"); + cudaThreadSynchronize(); + cudaCheckError(); + + copy_host_device( output, tmp, sizeof (real) * system->n, + cudaMemcpyDeviceToHost, "charges:q" ); } //TODO //TODO @@ -153,25 +169,34 @@ extern "C" void cuda_charges_st (reax_system *system, storage *workspace, real * //TODO //TODO -CUDA_GLOBAL void ker_update_q (reax_atom *my_atoms, real *q, int n, int N) + +CUDA_GLOBAL void ker_update_q(reax_atom *my_atoms, real *q, int n, int N) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= (N-n)) return; + + if (i >= (N-n)) + { + return; + } //for( i = system->n; i < system->N; ++i ) my_atoms[i + n].q = q[i + n]; } -void cuda_charges_updateq (reax_system *system, real *q) + +void cuda_charges_updateq(reax_system *system, real *q) { int blocks; real *dev_q = (real *) scratch; - copy_host_device (q, dev_q, system->N * sizeof (real), - cudaMemcpyHostToDevice, "charges:q"); - blocks = (system->N - system->n) / DEF_BLOCK_SIZE + + + copy_host_device( q, dev_q, system->N * sizeof (real), + cudaMemcpyHostToDevice, "charges:q" ); + blocks = (system->N - system->n) / DEF_BLOCK_SIZE + (( (system->N - system->n) % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); + ker_update_q <<< blocks, DEF_BLOCK_SIZE >>> ( system->d_my_atoms, dev_q, system->n, system->N); - cudaThreadSynchronize (); - cudaCheckError (); + + cudaThreadSynchronize(); + cudaCheckError(); } diff --git a/PG-PuReMD/src/linear_solvers.c b/PG-PuReMD/src/linear_solvers.c index 739c7702..f4ecf2d8 100644 --- a/PG-PuReMD/src/linear_solvers.c +++ b/PG-PuReMD/src/linear_solvers.c @@ -40,7 +40,9 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) real H; for ( i = 0; i < N; ++i ) + { b[i][0] = b[i][1] = 0; + } /* perform multiplication */ for ( i = 0; i < A->n; ++i ) @@ -96,18 +98,20 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, #ifdef HAVE_CUDA check_zeros_host (x, system->N, "x"); #endif + Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer ); + #ifdef HAVE_CUDA check_zeros_host (x, system->N, "x"); #endif dual_Sparse_MatVec( H, x, workspace->q2, N ); - // if (data->step > 0) return; // tryQEq Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) Update_Timing_Info( &t_start, &matvec_time ); @@ -118,6 +122,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, /* residual */ workspace->r2[j][0] = b[j][0] - workspace->q2[j][0]; workspace->r2[j][1] = b[j][1] - workspace->q2[j][1]; + /* apply diagonal pre-conditioner */ workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; @@ -148,12 +153,14 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, //fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] ); MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm ); //fprintf( stderr, "HOST:sig_new: %f %f\n", sig_new[0], sig_new[1] ); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &dot_time ); + } #endif - for ( i = 1; i < 300; ++i ) { Dist(system, mpi_data, workspace->d2, mpi_data->mpi_rvec2, scale, rvec2_packer); @@ -163,6 +170,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, // tryQEq Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) Update_Timing_Info( &t_start, &matvec_time ); @@ -200,12 +208,17 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, sig_old[1] = sig_new[1]; MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm ); //fprintf( stderr, "HOST:sig_new: %f %f\n", sig_new[0], sig_new[1] ); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) Update_Timing_Info( &t_start, &dot_time ); #endif + if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol ) + { break; + } + beta[0] = sig_new[0] / sig_old[0]; beta[1] = sig_new[1] / sig_old[1]; for ( j = 0; j < system->n; ++j ) @@ -216,31 +229,39 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, } } - if ( SQRT(sig_new[0]) / b_norm[0] <= tol ) { for ( j = 0; j < n; ++j ) + { workspace->t[j] = workspace->x[j][1]; + } matvecs = CG( system, workspace, H, workspace->b_t, tol, workspace->t, mpi_data, fout ); fprintf (stderr, " CG1: iterations --> %d \n", matvecs ); for ( j = 0; j < n; ++j ) + { workspace->x[j][1] = workspace->t[j]; + } } else if ( SQRT(sig_new[1]) / b_norm[1] <= tol ) { for ( j = 0; j < n; ++j ) + { workspace->s[j] = workspace->x[j][0]; + } matvecs = CG( system, workspace, H, workspace->b_s, tol, workspace->s, mpi_data, fout ); fprintf (stderr, " CG2: iterations --> %d \n", matvecs ); for ( j = 0; j < system->n; ++j ) + { workspace->x[j][0] = workspace->s[j]; + } } - if ( i >= 300 ) + { fprintf( stderr, "CG convergence failed!\n" ); + } #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) @@ -252,10 +273,10 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, } - #ifdef HAVE_CUDA int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, - rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout, simulation_data *data ) + rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout, + simulation_data *data ) { int i, j, n, N, matvecs, scale; rvec2 tmp, alpha, beta; @@ -263,7 +284,6 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2 sig_old, sig_new; MPI_Comm comm; rvec2 *spad = (rvec2 *) host_scratch; - int a; n = system->n; @@ -286,13 +306,13 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, // Dist( system, mpi_data, workspace->x, mpi_data->mpi_rvec2, scale, rvec2_packer ); //#endif -// check_zeros_device (x, system->N, "x"); +// check_zeros_device( x, system->N, "x" ); - get_from_device (spad, x, sizeof (rvec2) * system->total_cap, "CG:x:get"); + get_from_device( spad, x, sizeof (rvec2) * system->total_cap, "CG:x:get" ); Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer ); - put_on_device (spad, x, sizeof (rvec2) * system->total_cap, "CG:x:put"); + put_on_device( spad, x, sizeof (rvec2) * system->total_cap, "CG:x:put" ); -// check_zeros_device (x, system->N, "x"); +// check_zeros_device( x, system->N, "x" ); // compare_rvec2 (workspace->x, x, N, "x"); // if (data->step > 0) { @@ -308,6 +328,7 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, //#endif //originally we were using only H->n which was system->n (init_md.c) //Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap); + Cuda_Dual_Matvec ( H, x, dev_workspace->q2, system->N, system->total_cap); // compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2"); @@ -319,6 +340,7 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, //#ifdef __CUDA_DEBUG__ // Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker); //#endif + get_from_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "CG:q2:get" ); Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker); put_on_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "CG:q2:put" ); @@ -338,7 +360,9 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, // workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; // } //#endif + Cuda_CG_Diagnol_Preconditioner (dev_workspace, b, system->n); + // compare_rvec2 (workspace->r2, dev_workspace->r2, n, "r2"); // compare_rvec2 (workspace->d2, dev_workspace->d2, n, "d2"); @@ -354,6 +378,7 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, my_sum[0] = my_sum[1] = 0; Cuda_Norm (b, n, my_sum); + // fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]); MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm ); @@ -373,12 +398,18 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, my_dot[0] = my_dot[1] = 0; Cuda_Dot (dev_workspace->r2, dev_workspace->d2, my_dot, n); + // fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] ); + MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm ); + //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] ); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &dot_time ); + } #endif for ( i = 1; i < 300; ++i ) @@ -387,15 +418,19 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, //#ifdef __CUDA_DEBUG__ // Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer); //#endif - get_from_device (spad, dev_workspace->d2, sizeof (rvec2) * system->total_cap, "cg:d2:get"); - Dist(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer); - put_on_device (spad, dev_workspace->d2, sizeof (rvec2) * system->total_cap, "cg:d2:put"); + + get_from_device( spad, dev_workspace->d2, sizeof (rvec2) * system->total_cap, "cg:d2:get" ); + Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer ); + put_on_device( spad, dev_workspace->d2, sizeof (rvec2) * system->total_cap, "cg:d2:put" ); + //print_device_rvec2 (dev_workspace->d2, N); //#ifdef __CUDA_DEBUG__ // dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N ); //#endif + Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N, system->total_cap ); + /* fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n ); fprintf (stderr, " ******* HOST SPARSE MATRIX ******** \n"); @@ -409,20 +444,23 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, */ //compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2"); - // tryQEq // MVAPICH2 //#ifdef __CUDA_DEBUG__ // Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker); //#endif - get_from_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "cg:q2:get"); - Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker); - put_on_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "cg:q2:put"); + + get_from_device( spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "cg:q2:get" ); + Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker ); + put_on_device( spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "cg:q2:put" ); // compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2"); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &matvec_time ); + } #endif /* dot product: d.q */ @@ -434,6 +472,7 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, // } // fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] ); //#endif + my_dot[0] = my_dot[1] = 0; Cuda_Dot (dev_workspace->d2, dev_workspace->q2, my_dot, n); //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] ); @@ -462,8 +501,10 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, // } // fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] ); //#endif + my_dot[0] = my_dot[1] = 0; - Cuda_DualCG_Preconditioer (dev_workspace, x, alpha, system->n, my_dot); + Cuda_DualCG_Preconditioer( dev_workspace, x, alpha, system->n, my_dot ); + //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] ); // compare_rvec2 (workspace->x, dev_workspace->x, N, "x"); @@ -473,13 +514,20 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, sig_old[0] = sig_new[0]; sig_old[1] = sig_new[1]; MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm ); + //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] ); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &dot_time ); + } #endif + if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol ) + { break; + } beta[0] = sig_new[0] / sig_old[0]; beta[1] = sig_new[1] / sig_old[1]; @@ -491,8 +539,9 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, // workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1]; // } //#endif - Cuda_Vector_Sum_Rvec2 (dev_workspace->d2, dev_workspace->p2, beta, - dev_workspace->d2, system->n); + + Cuda_Vector_Sum_Rvec2( dev_workspace->d2, dev_workspace->p2, beta, + dev_workspace->d2, system->n ); // compare_rvec2 (workspace->d2, dev_workspace->d2, N, "q2"); } @@ -503,44 +552,54 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, //for( j = 0; j < n; ++j ) // workspace->t[j] = workspace->x[j][1]; //fprintf (stderr, "Getting started with Cuda_CG1 \n"); - Cuda_RvecCopy_From (dev_workspace->t, dev_workspace->x, 1, system->n); + + Cuda_RvecCopy_From( dev_workspace->t, dev_workspace->x, 1, system->n ); //compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t"); //compare_array (workspace->t, dev_workspace->t, system->n, "t"); matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_t, tol, dev_workspace->t, - mpi_data, fout ); + mpi_data, fout ); + //fprintf (stderr, " Cuda_CG1: iterations --> %d \n", matvecs ); //for( j = 0; j < n; ++j ) // workspace->x[j][1] = workspace->t[j]; - Cuda_RvecCopy_To (dev_workspace->x, dev_workspace->t, 1, system->n); + + Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->t, 1, system->n ); } else if ( SQRT(sig_new[1]) / b_norm[1] <= tol ) { //for( j = 0; j < n; ++j ) // workspace->s[j] = workspace->x[j][0]; - Cuda_RvecCopy_From (dev_workspace->s, dev_workspace->x, 0, system->n); + + Cuda_RvecCopy_From( dev_workspace->s, dev_workspace->x, 0, system->n ); //compare_array (workspace->s, dev_workspace->s, system->n, "s"); //compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s"); //fprintf (stderr, "Getting started with Cuda_CG2 \n"); + matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_s, tol, dev_workspace->s, - mpi_data, fout ); + mpi_data, fout ); + //fprintf (stderr, " Cuda_CG2: iterations --> %d \n", matvecs ); //for( j = 0; j < system->n; ++j ) // workspace->x[j][0] = workspace->s[j]; - Cuda_RvecCopy_To (dev_workspace->x, dev_workspace->s, 0, system->n); - } + Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->s, 0, system->n ); + } if ( i >= 300 ) + { fprintf( stderr, "Dual CG convergence failed! -> %d\n", i ); + } #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { fprintf( fout, "QEq %d + %d iters. matvecs: %f dot: %f\n", - i + 1, matvecs, matvec_time, dot_time ); + i + 1, matvecs, matvec_time, dot_time ); + } #endif return (i + 1) + matvecs; @@ -554,7 +613,9 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N ) real H; for ( i = 0; i < N; ++i ) + { b[i] = 0; + } /* perform multiplication */ for ( i = 0; i < A->n; ++i ) @@ -583,34 +644,47 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H, scale = sizeof(real) / sizeof(void); Dist( system, mpi_data, x, MPI_DOUBLE, scale, real_packer ); Sparse_MatVec( H, x, workspace->q, system->N ); + // tryQEq Coll( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker ); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &matvec_time ); + } #endif Vector_Sum( workspace->r , 1., b, -1., workspace->q, system->n ); for ( j = 0; j < system->n; ++j ) + { workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition + } b_norm = Parallel_Norm( b, system->n, mpi_data->world ); sig_new = Parallel_Dot(workspace->r, workspace->d, system->n, mpi_data->world); sig0 = sig_new; + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &dot_time ); + } #endif for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) { Dist( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer ); Sparse_MatVec( H, workspace->d, workspace->q, system->N ); + //tryQEq Coll(system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &matvec_time ); + } #endif tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world); @@ -619,16 +693,21 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H, Vector_Add( workspace->r, -alpha, workspace->q, system->n ); /* pre-conditioning */ for ( j = 0; j < system->n; ++j ) + { workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; + } sig_old = sig_new; sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world); //fprintf (stderr, "Host : sig_new: %f \n", sig_new ); beta = sig_new / sig_old; Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n ); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &dot_time ); + } #endif } @@ -662,12 +741,15 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, //MVAPICH2 put_on_device (spad, x, sizeof (real) * system->total_cap , "cuda_cg:x:put"); Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap ); + // tryQEq // MVAPICH2 get_from_device (spad, dev_workspace->q, sizeof (real) * system->total_cap, "cuda_cg:q:get" ); Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker ); + //MVAPICH2 put_on_device (spad, dev_workspace->q, sizeof (real) * system->total_cap, "cuda_cg:q:put" ); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) Update_Timing_Info( &t_start, &matvec_time ); @@ -688,27 +770,33 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, sig_new = Parallel_Dot(spad, spad + system->total_cap, system->n, mpi_data->world); sig0 = sig_new; + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &dot_time ); + } #endif for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) { - //MVAPICH2 get_from_device (spad, dev_workspace->d, sizeof (real) * system->total_cap, "cuda_cg:d:get"); Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer ); put_on_device (spad, dev_workspace->d, sizeof (real) * system->total_cap, "cuda_cg:d:put"); Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap ); + //tryQEq get_from_device (spad, dev_workspace->q, sizeof (real) * system->total_cap, "cuda_cg:q:get" ); Coll(system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker); put_on_device (spad, dev_workspace->q, sizeof (real) * system->total_cap , "cuda_cg:q:get"); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &matvec_time ); + } #endif //TODO do the parallel dot on the device for the local sum @@ -737,9 +825,12 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, beta = sig_new / sig_old; Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta, dev_workspace->d, system->n ); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &dot_time ); + } #endif } diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c index 40cc9c3b..5020d59b 100644 --- a/PG-PuReMD/src/parallelreax.c +++ b/PG-PuReMD/src/parallelreax.c @@ -52,7 +52,6 @@ LR_lookup_table *d_LR; //////////////////////////// //CUDA SPECIFIC DECLARATIONS //////////////////////////// - reax_list **dev_lists; storage *dev_workspace; void *scratch; @@ -64,9 +63,8 @@ int MATVEC_BLOCKS; void Read_System( char *geo_file, char *ffield_file, char *control_file, - reax_system *system, control_params *control, - simulation_data *data, storage *workspace, - output_controls *out_control, mpi_datatypes *mpi_data ) + reax_system *system, control_params *control, simulation_data *data, + storage *workspace, output_controls *out_control, mpi_datatypes *mpi_data ) { /* ffield file */ Read_Force_Field( ffield_file, &(system->reax_param), control ); @@ -76,9 +74,13 @@ void Read_System( char *geo_file, char *ffield_file, char *control_file, /* geo file */ if ( control->geo_format == CUSTOM ) + { Read_Geo( geo_file, system, control, data, workspace, mpi_data ); + } else if ( control->geo_format == PDB ) + { Read_PDB( geo_file, system, control, data, workspace, mpi_data ); + } else if ( control->geo_format == ASCII_RESTART ) { Read_Restart( geo_file, system, control, data, workspace, mpi_data ); @@ -98,9 +100,8 @@ void Read_System( char *geo_file, char *ffield_file, char *control_file, void Post_Evolve( reax_system* system, control_params* control, - simulation_data* data, storage* workspace, - reax_list** lists, output_controls *out_control, - mpi_datatypes *mpi_data ) + simulation_data* data, storage* workspace, reax_list** lists, + output_controls *out_control, mpi_datatypes *mpi_data ) { int i; rvec diff, cross; @@ -131,9 +132,8 @@ void Post_Evolve( reax_system* system, control_params* control, #ifdef HAVE_CUDA void Cuda_Post_Evolve( reax_system* system, control_params* control, - simulation_data* data, storage* workspace, - reax_list** lists, output_controls *out_control, - mpi_datatypes *mpi_data ) + simulation_data* data, storage* workspace, reax_list** lists, + output_controls *out_control, mpi_datatypes *mpi_data ) { /* remove trans & rot velocity of the center of mass from system */ if ( control->ensemble != NVE && control->remove_CoM_vel && @@ -152,20 +152,20 @@ void Cuda_Post_Evolve( reax_system* system, control_params* control, #ifdef HAVE_CUDA -void init_blocks (reax_system *system) +void init_blocks(reax_system *system) { - compute_blocks (&BLOCKS, &BLOCK_SIZE, system->n); - compute_nearest_pow_2 (BLOCKS, &BLOCKS_POW_2); + compute_blocks( &BLOCKS, &BLOCK_SIZE, system->n ); + compute_nearest_pow_2( BLOCKS, &BLOCKS_POW_2 ); - compute_blocks (&BLOCKS_N, &BLOCK_SIZE, system->N); - compute_nearest_pow_2 (BLOCKS_N, &BLOCKS_POW_2_N); + compute_blocks( &BLOCKS_N, &BLOCK_SIZE, system->N ); + compute_nearest_pow_2( BLOCKS_N, &BLOCKS_POW_2_N ); - compute_matvec_blocks (&MATVEC_BLOCKS, system->N); - //#if defined(__CUDA_DEBUG_LOG__) - //fprintf (stderr, " MATVEC_BLOCKS: %d BLOCKSIZE: %d - N:%d \n", - // MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, system->N ); - //#endif + compute_matvec_blocks( &MATVEC_BLOCKS, system->N ); +#if defined(__CUDA_DEBUG_LOG__) + fprintf( stderr, " MATVEC_BLOCKS: %d BLOCKSIZE: %d - N:%d \n", + MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, system->N ); +#endif } #endif @@ -218,7 +218,7 @@ int main( int argc, char* argv[] ) fprintf (stderr, " General block size: %d \n", DEF_BLOCK_SIZE); #endif - /* allocated main datastructures */ + /* allocate main data structures */ system = (reax_system *) smalloc( sizeof(reax_system), "system" ); control = (control_params *) smalloc( sizeof(control_params), "control" ); data = (simulation_data *) smalloc( sizeof(simulation_data), "data" ); @@ -236,13 +236,11 @@ int main( int argc, char* argv[] ) lists[i]->end_index = NULL; lists[i]->select.v = NULL; } - out_control = (output_controls *) - smalloc( sizeof(output_controls), "out_control" ); + out_control = (output_controls *) smalloc( sizeof(output_controls), "out_control" ); mpi_data = (mpi_datatypes *) smalloc( sizeof(mpi_datatypes), "mpi_data" ); /* allocate the cuda auxiliary data structures */ dev_workspace = (storage *) smalloc( sizeof(storage), "dev_workspace" ); - dev_lists = (reax_list **) smalloc ( LIST_N * sizeof (reax_list *), "dev_lists" ); for ( i = 0; i < LIST_N; ++i ) { @@ -253,8 +251,7 @@ int main( int argc, char* argv[] ) /* Initialize member variables */ system->init_thblist = FALSE; - - /* setup the parallel environment */ + /* setup MPI environment */ MPI_Init( &argc, &argv ); MPI_Comm_size( MPI_COMM_WORLD, &(control->nprocs) ); MPI_Comm_rank( MPI_COMM_WORLD, &(system->my_rank) ); @@ -263,38 +260,46 @@ int main( int argc, char* argv[] ) /* setup the CUDA Device for this process can be on the same machine * or on a different machine, for now use the rank to compute the device - * This will only work on a single machine with 2 GPUs*/ - - Setup_Cuda_Environment (system->my_rank, control->nprocs, control->gpus_per_node); + * This will only work on a single machine with 2 GPUs */ + Setup_Cuda_Environment( system->my_rank, control->nprocs, control->gpus_per_node ); //Cleanup_Cuda_Environment (); + // +#if defined(DEBUG) print_device_mem_usage (); - //fprintf( stderr, "p%d: Total number of GPUs on this node -- %d\n", system->my_rank, my_device_id); + fprintf( stderr, "p%d: Total number of GPUs on this node -- %d\n", system->my_rank, my_device_id); +#endif - /* read system description files */ + /* read system config files */ Read_System( argv[1], argv[2], argv[3], system, control, - data, workspace, out_control, mpi_data ); + data, workspace, out_control, mpi_data ); + #if defined(DEBUG) fprintf( stderr, "p%d: read simulation info\n", system->my_rank ); MPI_Barrier( MPI_COMM_WORLD ); #endif /* init the blocks sizes for cuda kernels */ - init_blocks (system); + init_blocks( system ); /* measure total simulation time after input is read */ if ( system->my_rank == MASTER_NODE ) + { t_start = Get_Time( ); + } - /* initialize datastructures */ + /* initialize data structures */ Cuda_Initialize( system, control, data, workspace, lists, out_control, mpi_data ); + #if defined(__CUDA_DEBUG__) Pure_Initialize( system, control, data, workspace, lists, out_control, mpi_data ); #endif +#if defined(DEBUG) print_device_mem_usage (); +#endif /* init the blocks sizes for cuda kernels */ - init_blocks (system); + init_blocks( system ); #if defined(DEBUG) fprintf( stderr, "p%d: initializated data structures\n", system->my_rank ); @@ -304,49 +309,60 @@ int main( int argc, char* argv[] ) // compute f_0 Comm_Atoms( system, control, data, workspace, lists, mpi_data, 1 ); - Sync_Atoms ( system ); - Sync_Grid (&system->my_grid, &system->d_my_grid); + Sync_Atoms( system ); + Sync_Grid( &system->my_grid, &system->d_my_grid ); init_blocks (system); + #if defined(__CUDA_DENUG_LOG__) fprintf( stderr, "p%d: Comm_Atoms synchronized \n", system->my_rank ); #endif //Second step Cuda_Reset ( system, control, data, workspace, lists ); + #if defined(__CUDA_DEBUG__) Reset( system, control, data, workspace, lists ); #endif - //fprintf( stderr, "p%d: Cuda_Reset done...\n", system->my_rank ); - +#if defined(DEBUG) + fprintf( stderr, "p%d: Cuda_Reset done...\n", system->my_rank ); +#endif //Third Step Cuda_Generate_Neighbor_Lists( system, data, workspace, lists ); + #if defined(__CUDA_DEBUG__) Generate_Neighbor_Lists( system, data, workspace, lists ); #endif -#if defined(__CUDA_DENUG_LOG__) + +#if defined(DEBUG) fprintf (stderr, "p%d: Cuda_Generate_Neighbor_Lists done...\n", system->my_rank ); #endif //Fourth Step -#if defined(__CUDA_DEBUG__) +#if defined(DEBUG) fprintf (stderr, " Host Compute Forces begin.... \n"); +#endif + +#if defined(__CUDA_DEBUG__) Compute_Forces( system, control, data, workspace, lists, out_control, mpi_data ); #endif - Cuda_Compute_Forces( system, control, data, workspace, - lists, out_control, mpi_data ); -#if defined(__CUDA_DENUG_LOG__) + + Cuda_Compute_Forces( system, control, data, workspace, lists, + out_control, mpi_data ); + +#if defined(DEBUG) fprintf (stderr, "p%d: Cuda_Compute_Forces done...\n", system->my_rank ); #endif - #if defined (__CUDA_DEBUG__) Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); #endif + Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); -#if defined(__CUDA_DENUG_LOG__) + +#if defined(DEBUG) fprintf (stderr, "p%d: Cuda_Compute_Kinetic_Energy done ... \n", system->my_rank); #endif @@ -356,7 +372,9 @@ int main( int argc, char* argv[] ) #if !defined(__CUDA_DEBUG__) Output_Results( system, control, data, lists, out_control, mpi_data ); - //fprintf (stderr, "p%d: Output_Results done ... \n", system->my_rank); +#endif +#if defined(DEBUG) + fprintf (stderr, "p%d: Output_Results done ... \n", system->my_rank); #endif #if defined(DEBUG) @@ -368,26 +386,39 @@ int main( int argc, char* argv[] ) for ( ++data->step; data->step <= control->nsteps; data->step++ ) { if ( control->T_mode ) + { Temperature_Control( control, data ); + } - //t_begin = Get_Time (); +#if defined(DEBUG) + t_begin = Get_Time(); +#endif #if defined(__CUDA_DEBUG__) Evolve( system, control, data, workspace, lists, out_control, mpi_data ); #endif + Cuda_Evolve( system, control, data, workspace, lists, out_control, mpi_data ); - //t_end = Get_Timing_Info (t_begin); - //fprintf (stderr, " Evolve time: %f \n", t_end); +#if defined(DEBUG) + t_end = Get_Timing_Info( t_begin ); + fprintf( stderr, " Evolve time: %f \n", t_end ); +#endif + +#if defined(DEBUG) + t_begin = Get_Time(); +#endif - //t_begin = Get_Time (); Cuda_Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data); + #if defined(__CUDA_DEBUG__) Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data); #endif - //t_end = Get_Timing_Info (t_begin); - //fprintf (stderr, " Post Evolve time: %f \n", t_end); +#if defined(DEBUG) + t_end = Get_Timing_Info( t_begin ); + fprintf( stderr, " Post Evolve time: %f \n", t_end ); +#endif #if !defined(__CUDA_DEBUG__) Output_Results( system, control, data, lists, out_control, mpi_data ); @@ -403,6 +434,7 @@ int main( int argc, char* argv[] ) // else if( out_control->restart_format == WRITE_BINARY ) // Write_Binary_Restart( system, control, data, out_control, mpi_data ); // } + #if defined(DEBUG) fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step ); MPI_Barrier( MPI_COMM_WORLD ); @@ -410,32 +442,22 @@ int main( int argc, char* argv[] ) } - //vaildate the results in debug mode #if defined(__CUDA_DEBUG__) + // vaildate the results in debug mode validate_device (system, data, workspace, lists); #endif #else - /* allocated main datastructures */ - system = (reax_system *) - smalloc( sizeof(reax_system), "system" ); - control = (control_params *) - smalloc( sizeof(control_params), "control" ); - data = (simulation_data *) - smalloc( sizeof(simulation_data), "data" ); - workspace = (storage *) - smalloc( sizeof(storage), "workspace" ); - - lists = (reax_list **) - smalloc( LIST_N * sizeof(reax_list*), "lists" ); + /* allocate main data structures */ + system = (reax_system *) smalloc( sizeof(reax_system), "system" ); + control = (control_params *) smalloc( sizeof(control_params), "control" ); + data = (simulation_data *) smalloc( sizeof(simulation_data), "data" ); + workspace = (storage *) smalloc( sizeof(storage), "workspace" ); + + lists = (reax_list **) smalloc( LIST_N * sizeof(reax_list*), "lists" ); for ( i = 0; i < LIST_N; ++i ) { -/* - lists[i] = (reax_list *) - smalloc( sizeof(reax_list), "lists[i]" ); - lists[i]->allocated = 0;*/ - lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" ); lists[i]->allocated = 0; @@ -446,14 +468,11 @@ int main( int argc, char* argv[] ) lists[i]->end_index = NULL; lists[i]->select.v = NULL; } - out_control = (output_controls *) - smalloc( sizeof(output_controls), "out_control" ); - mpi_data = (mpi_datatypes *) - smalloc( sizeof(mpi_datatypes), "mpi_data" ); + out_control = (output_controls *) smalloc( sizeof(output_controls), "out_control" ); + mpi_data = (mpi_datatypes *) smalloc( sizeof(mpi_datatypes), "mpi_data" ); /* allocate the cuda auxiliary data structures */ dev_workspace = (storage *) smalloc( sizeof(storage), "dev_workspace" ); - dev_lists = (reax_list **) smalloc ( LIST_N * sizeof (reax_list *), "dev_lists" ); for ( i = 0; i < LIST_N; ++i ) { @@ -464,17 +483,18 @@ int main( int argc, char* argv[] ) /* Initialize member variables */ system->init_thblist = FALSE; - /* setup the parallel environment */ + /* setup MPI environment */ MPI_Init( &argc, &argv ); MPI_Comm_size( MPI_COMM_WORLD, &(control->nprocs) ); MPI_Comm_rank( MPI_COMM_WORLD, &(system->my_rank) ); system->wsize = control->nprocs; - system->global_offset = (int*) - scalloc( system->wsize + 1, sizeof(int), "global_offset" ); + system->global_offset = (int*) scalloc( system->wsize + 1, + sizeof(int), "global_offset" ); - /* read system description files */ + /* read system config files */ Read_System( argv[1], argv[2], argv[3], system, control, - data, workspace, out_control, mpi_data ); + data, workspace, out_control, mpi_data ); + #if defined(DEBUG) fprintf( stderr, "p%d: read simulation info\n", system->my_rank ); MPI_Barrier( MPI_COMM_WORLD ); @@ -482,7 +502,10 @@ int main( int argc, char* argv[] ) /* measure total simulation time after input is read */ if ( system->my_rank == MASTER_NODE ) + { t_start = Get_Time( ); + } + /* initialize datastructures */ Initialize( system, control, data, workspace, lists, out_control, mpi_data ); @@ -494,12 +517,16 @@ int main( int argc, char* argv[] ) /* compute f_0 */ Comm_Atoms( system, control, data, workspace, lists, mpi_data, 1 ); Reset( system, control, data, workspace, lists ); - //Print_List(*lists + BONDS); + +#if defined(DEBUG) + Print_List(*lists + BONDS); +#endif + Generate_Neighbor_Lists( system, data, workspace, lists ); - Compute_Forces( system, control, data, workspace, - lists, out_control, mpi_data ); + Compute_Forces( system, control, data, workspace, lists, out_control, mpi_data ); Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); Output_Results( system, control, data, lists, out_control, mpi_data ); + #if defined(DEBUG) fprintf( stderr, "p%d: computed forces at t0\n", system->my_rank ); MPI_Barrier( mpi_data->world ); @@ -509,7 +536,9 @@ int main( int argc, char* argv[] ) for ( ++data->step; data->step <= control->nsteps; data->step++ ) { if ( control->T_mode ) + { Temperature_Control( control, data ); + } Evolve( system, control, data, workspace, lists, out_control, mpi_data ); Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data); @@ -521,10 +550,15 @@ int main( int argc, char* argv[] ) (data->step - data->prev_steps) % out_control->restart_freq == 0 ) { if ( out_control->restart_format == WRITE_ASCII ) + { Write_Restart( system, control, data, out_control, mpi_data ); + } else if ( out_control->restart_format == WRITE_BINARY ) + { Write_Binary_Restart( system, control, data, out_control, mpi_data ); + } } + #if defined(DEBUG) fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step ); MPI_Barrier( mpi_data->world ); @@ -547,7 +581,6 @@ int main( int argc, char* argv[] ) MPI_Finalize(); - /* de-allocate data structures */ //for( i = 0; i < LIST_N; ++i ) { //if (lists[i]->index) free (lists[i]->index); diff --git a/PG-PuReMD/src/qEq.c b/PG-PuReMD/src/qEq.c index 76966044..b4451fc7 100644 --- a/PG-PuReMD/src/qEq.c +++ b/PG-PuReMD/src/qEq.c @@ -33,6 +33,7 @@ #include "validation.h" #endif + int compare_matrix_entry(const void *v1, const void *v2) { return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j; @@ -48,7 +49,7 @@ void Sort_Matrix_Rows( sparse_matrix *A ) si = A->start[i]; ei = A->end[i]; qsort( &(A->entries[si]), ei - si, - sizeof(sparse_matrix_entry), compare_matrix_entry ); + sizeof(sparse_matrix_entry), compare_matrix_entry ); } } @@ -60,13 +61,16 @@ void Calculate_Droptol( sparse_matrix *A, real *droptol, real dtol ) /* init droptol to 0 - not necessary for an upper-triangular A */ for ( i = 0; i < A->n; ++i ) + { droptol[i] = 0; + } /* calculate sqaure of the norm of each row */ for ( i = 0; i < A->n; ++i ) { val = A->entries[A->start[i]].val; // diagonal entry droptol[i] += val * val; + // only within my block for ( k = A->start[i] + 1; A->entries[k].j < A->n; ++k ) { @@ -97,6 +101,7 @@ int Estimate_LU_Fill( sparse_matrix *A, real *droptol ) fillin = 0; for ( i = 0; i < A->n; ++i ) + { for ( pj = A->start[i] + 1; A->entries[pj].j < A->n; ++pj ) { j = A->entries[pj].j; @@ -104,13 +109,14 @@ int Estimate_LU_Fill( sparse_matrix *A, real *droptol ) if ( fabs(val) > droptol[i] ) ++fillin; } + } return fillin + A->n; } void ICHOLT( sparse_matrix *A, real *droptol, - sparse_matrix *L, sparse_matrix *U ) + sparse_matrix *L, sparse_matrix *U ) { sparse_matrix_entry tmp[1000]; int i, j, pj, k1, k2, tmptop, Utop; @@ -123,7 +129,9 @@ void ICHOLT( sparse_matrix *A, real *droptol, Utop = 0; tmptop = 0; for ( i = 0; i < A->n; ++i ) + { U->start[i] = L->start[i] = L->end[i] = Ltop[i] = 0; + } for ( i = A->n - 1; i >= 0; --i ) { @@ -146,11 +154,17 @@ void ICHOLT( sparse_matrix *A, real *droptol, while ( k1 >= 0 && k2 < U->end[j] ) { if ( tmp[k1].j < U->entries[k2].j ) + { k1--; + } else if ( tmp[k1].j > U->entries[k2].j ) + { k2++; + } else + { val -= (tmp[k1--].val * U->entries[k2++].val); + } } // U matrix is upper triangular @@ -168,8 +182,10 @@ void ICHOLT( sparse_matrix *A, real *droptol, dval = A->entries[A->start[i]].val; //fprintf( stdout, "%d %d %24.16f\n", 6540-i, 6540-i, dval ); for ( k1 = 0; k1 < tmptop; ++k1 ) + { //if( fabs(tmp[k1].val) > droptol[i] ) dval -= SQR(tmp[k1].val); + } dval = SQRT(dval); // keep the diagonal in any case U->entries[Utop].j = i; @@ -193,14 +209,17 @@ void ICHOLT( sparse_matrix *A, real *droptol, U->end[i] = Utop; //fprintf( stderr, "i = %d - Utop = %d\n", i, Utop ); } - // print matrix U + #if defined(DEBUG) + // print matrix U fprintf( stderr, "nnz(U): %d\n", Utop ); for ( i = 0; i < U->n; ++i ) { fprintf( stderr, "row%d: ", i ); for ( pj = U->start[i]; pj < U->end[i]; ++pj ) + { fprintf( stderr, "%d ", U->entries[pj].j ); + } fprintf( stderr, "\n" ); } #endif @@ -214,6 +233,7 @@ void ICHOLT( sparse_matrix *A, real *droptol, } for ( i = 0; i < U->n; ++i ) + { for ( pj = U->start[i]; pj < U->end[i]; ++pj ) { j = U->entries[pj].j; @@ -221,25 +241,28 @@ void ICHOLT( sparse_matrix *A, real *droptol, L->entries[L->end[j]].val = U->entries[pj].val; L->end[j]++; } + } - // print matrix L #if defined(DEBUG) + // print matrix L fprintf( stderr, "nnz(L): %d\n", L->end[L->n - 1] ); for ( i = 0; i < L->n; ++i ) { fprintf( stderr, "row%d: ", i ); for ( pj = L->start[i]; pj < L->end[i]; ++pj ) + { fprintf( stderr, "%d ", L->entries[pj].j ); + } fprintf( stderr, "\n" ); } #endif + sfree( Ltop, "Ltop" ); } void Init_MatVec( reax_system *system, simulation_data *data, - control_params *control, storage *workspace, - mpi_datatypes *mpi_data ) + control_params *control, storage *workspace, mpi_datatypes *mpi_data ) { int i; //, fillin; reax_atom *atom; @@ -312,9 +335,8 @@ void Init_MatVec( reax_system *system, simulation_data *data, } - void Calculate_Charges( reax_system *system, storage *workspace, - mpi_datatypes *mpi_data ) + mpi_datatypes *mpi_data ) { int i, scale; real u;//, s_sum, t_sum; @@ -362,7 +384,9 @@ void Calculate_Charges( reax_system *system, storage *workspace, Dist( system, mpi_data, q, MPI_DOUBLE, scale, real_packer ); for ( i = system->n; i < system->N; ++i ) + { system->my_atoms[i].q = q[i]; + } free(q); } @@ -381,9 +405,9 @@ void Cuda_Calculate_Charges( reax_system *system, storage *workspace, scale = sizeof(real) / sizeof(void); q = (real *) host_scratch; - memset ( q, 0, system->N * sizeof (real)); + memset( q, 0, system->N * sizeof (real)); - cuda_charges_x ( system, my_sum ); + cuda_charges_x( system, my_sum ); //fprintf (stderr, "Device: my_sum[0]: %f and %f \n", my_sum[0], my_sum[1]); MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world ); @@ -391,11 +415,11 @@ void Cuda_Calculate_Charges( reax_system *system, storage *workspace, u = all_sum[0] / all_sum[1]; //fprintf (stderr, "Device: u: %f \n", u); - cuda_charges_st ( system, workspace, q, u ); + cuda_charges_st( system, workspace, q, u ); Dist( system, mpi_data, q, MPI_DOUBLE, scale, real_packer ); - cuda_charges_updateq ( system, q ); + cuda_charges_updateq( system, q ); } #endif @@ -412,6 +436,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data, //if( data->step == 50010 ) { // Print_Linear_System( system, control, workspace, data->step ); // } + #if defined(DEBUG) fprintf( stderr, "p%d: initialized qEq\n", system->my_rank ); //Print_Linear_System( system, control, workspace, data->step ); @@ -428,6 +453,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data, //s_matvecs = PCG( system, workspace, workspace->H, workspace->b_s, // control->q_err, workspace->L, workspace->U, workspace->s, // mpi_data, out_control->log ); + #if defined(DEBUG) fprintf( stderr, "p%d: first CG completed\n", system->my_rank ); #endif @@ -493,6 +519,7 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data *da #endif Cuda_Calculate_Charges( system, workspace, mpi_data ); + #if defined(DEBUG) fprintf( stderr, "p%d: computed charges\n", system->my_rank ); //Print_Charges( system ); -- GitLab