From ce6b6624af77182154f3297e6428026b95ae1152 Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@cse.msu.edu> Date: Tue, 22 Nov 2016 15:30:26 -0500 Subject: [PATCH] PuReMD-GPU: code clean-up. --- PuReMD-GPU/src/GMRES.cu | 485 +++++++++++++++++++----------------- PuReMD-GPU/src/QEq.cu | 121 ++++++--- PuReMD/src/linear_solvers.c | 71 +++++- 3 files changed, 403 insertions(+), 274 deletions(-) diff --git a/PuReMD-GPU/src/GMRES.cu b/PuReMD-GPU/src/GMRES.cu index d00100e9..2c1b0cf6 100644 --- a/PuReMD-GPU/src/GMRES.cu +++ b/PuReMD-GPU/src/GMRES.cu @@ -220,7 +220,6 @@ int GMRES( static_storage *workspace, sparse_matrix *H, ///////////////////////////////////////////////////////////////// //Cuda Functions for GMRES implementation ///////////////////////////////////////////////////////////////// - GLOBAL void GMRES_Diagonal_Preconditioner (real *b_proc, real *b, real *Hdia_inv, int entries) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -230,6 +229,7 @@ GLOBAL void GMRES_Diagonal_Preconditioner (real *b_proc, real *b, real *Hdia_inv b_proc [i] = b[i] * Hdia_inv[i]; } + GLOBAL void GMRES_Givens_Rotation (int j, real *h, real *hc, real *hs, real g_j, real *output) { real tmp1, tmp2, cc; @@ -256,6 +256,7 @@ GLOBAL void GMRES_Givens_Rotation (int j, real *h, real *hc, real *hs, real g_j, output[1] = tmp2; } + GLOBAL void GMRES_BackSubstitution (int j, real *g, real *h, real *y) { real temp; @@ -275,25 +276,26 @@ int Cuda_GMRES( static_storage *workspace, real *b, real tol, real *x ) real cc, tmp1, tmp2, temp, bnorm; real v_add_tmp; sparse_matrix *H = &workspace->H; - real t_start, t_elapsed; - real *spad = (real *)scratch; real *g = (real *) calloc ((RESTART+1), REAL_SIZE); N = H->n; - cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); + cuda_memset(spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); - Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (b, spad, H->n, INITIAL); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (b, spad, H->n, INITIAL); + cudaThreadSynchronize(); + cudaCheckError(); - Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL); + cudaThreadSynchronize(); + cudaCheckError(); - copy_host_device ( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, + cudaMemcpyDeviceToHost, __LINE__); #ifdef __DEBUG_CUDA__ fprintf (stderr, "Norm of the array is %e \n", bnorm ); @@ -302,138 +304,158 @@ int Cuda_GMRES( static_storage *workspace, real *b, real tol, real *x ) /* apply the diagonal pre-conditioner to rhs */ GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>> (workspace->b_prc, b, workspace->Hdia_inv, N); - cudaThreadSynchronize (); - cudaCheckError (); + cudaThreadSynchronize(); + cudaCheckError(); /* GMRES outer-loop */ for( itr = 0; itr < MAX_ITR; ++itr ) { /* calculate r0 */ //Sparse_MatVec( H, x, workspace->b_prm ); - Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> ( *H, x, workspace->b_prm, N ); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> + ( *H, x, workspace->b_prm, N ); + cudaThreadSynchronize(); + cudaCheckError(); GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>> (workspace->b_prm, workspace->b_prm, workspace->Hdia_inv, N); - cudaThreadSynchronize (); - cudaCheckError (); + cudaThreadSynchronize(); + cudaCheckError(); Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>> - (&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N); - cudaThreadSynchronize (); + (&workspace->v[ index_wkspace_sys (0,0,N) ], 1., + workspace->b_prc, -1., workspace->b_prm, N); + cudaThreadSynchronize(); cudaCheckError (); //workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N ); { - cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); + cuda_memset( spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL); - cudaThreadSynchronize (); - cudaCheckError (); + cudaThreadSynchronize(); + cudaCheckError(); - Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->g[0], BLOCKS_POW_2, FINAL); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + (spad, &workspace->g[0], BLOCKS_POW_2, FINAL); + cudaThreadSynchronize(); + cudaCheckError(); - copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyDeviceToHost, RES_STORAGE_G); + copy_host_device( g, workspace->g, REAL_SIZE, + cudaMemcpyDeviceToHost, RES_STORAGE_G); } - Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>> - ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], &workspace->v[index_wkspace_sys(0,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_Vector_Scale<<< BLOCKS, BLOCK_SIZE >>> + ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], + &workspace->v[index_wkspace_sys(0,0,N)], N ); + cudaThreadSynchronize(); + cudaCheckError(); /* GMRES inner-loop */ #ifdef __DEBUG_CUDA__ - fprintf (stderr, " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm, tol, g[0] ); + fprintf( stderr, + " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm, + tol, g[0] ); #endif + for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) { /* matvec */ //Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] ); - Cuda_Matvec_csr - <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> - ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N ); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_Matvec_csr<<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> + ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], + &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N ); + cudaThreadSynchronize(); + cudaCheckError(); - GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>> - (&workspace->v[ index_wkspace_sys (j+1,0,N) ], &workspace->v[ index_wkspace_sys (j+1,0,N) ], workspace->Hdia_inv, N); - cudaThreadSynchronize (); - cudaCheckError (); + GMRES_Diagonal_Preconditioner<<<BLOCKS, BLOCK_SIZE>>> + (&workspace->v[ index_wkspace_sys (j+1,0,N) ], + &workspace->v[ index_wkspace_sys( j+1,0,N) ], + workspace->Hdia_inv, N ); + cudaThreadSynchronize(); + cudaCheckError(); /* apply modified Gram-Schmidt to orthogonalize the new residual */ - for( i = 0; i <= j; i++ ) { + for( i = 0; i <= j; i++ ) + { Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (&workspace->v[index_wkspace_sys(i,0,N)], &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N); - cudaThreadSynchronize (); - cudaCheckError (); + (&workspace->v[index_wkspace_sys(i,0,N)], + &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N); + cudaThreadSynchronize(); + cudaCheckError(); - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2); + cudaThreadSynchronize(); + cudaCheckError(); copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - Cuda_Vector_Add <<< BLOCKS, BLOCK_SIZE >>> + Cuda_Vector_Add<<< BLOCKS, BLOCK_SIZE >>> ( &workspace->v[index_wkspace_sys(j+1,0,N)], -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); + cudaThreadSynchronize(); + cudaCheckError(); } //workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N ); - cuda_memset (spad, 0, REAL_SIZE * N * 2, RES_SCRATCH ); + cuda_memset(spad, 0, REAL_SIZE * N * 2, RES_SCRATCH ); - Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_Norm<<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL); + cudaThreadSynchronize(); + cudaCheckError(); - Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL); - cudaThreadSynchronize (); - cudaCheckError (); + Cuda_Norm<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL); + cudaThreadSynchronize(); + cudaCheckError(); - copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device(&v_add_tmp, + &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>> + Cuda_Vector_Scale<<< BLOCKS, BLOCK_SIZE >>> ( &workspace->v[index_wkspace_sys(j+1,0,N)], 1. / v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); + cudaThreadSynchronize(); + cudaCheckError(); /* Givens rotations on the upper-Hessenberg matrix to make it U */ - GMRES_Givens_Rotation <<<1, 1>>> + GMRES_Givens_Rotation<<<1, 1>>> (j, workspace->h, workspace->hc, workspace->hs, g[j], spad); - cudaThreadSynchronize (); - cudaCheckError (); - copy_host_device (&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + cudaThreadSynchronize(); + cudaCheckError(); + copy_host_device(&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); } - copy_host_device (g, workspace->g, (RESTART+1)*REAL_SIZE, cudaMemcpyHostToDevice, __LINE__); + copy_host_device(g, workspace->g, (RESTART+1)*REAL_SIZE, + cudaMemcpyHostToDevice, __LINE__); /* solve Hy = g. H is now upper-triangular, do back-substitution */ - copy_host_device (g, spad, (RESTART+1) * REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G); - GMRES_BackSubstitution <<<1, 1>>> + copy_host_device(g, spad, (RESTART+1) * REAL_SIZE, + cudaMemcpyHostToDevice, RES_STORAGE_G); + GMRES_BackSubstitution<<<1, 1>>> (j, spad, workspace->h, workspace->y); - cudaThreadSynchronize (); - cudaCheckError (); + cudaThreadSynchronize(); + cudaCheckError(); /* update x = x_0 + Vy */ for( i = 0; i < j; i++ ) { - copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device(&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>> ( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); cudaThreadSynchronize (); - cudaCheckError (); + cudaCheckError(); } /* stopping condition */ if( fabs(g[j]) / bnorm <= tol ) + { break; + } } if( itr >= MAX_ITR ) { @@ -444,6 +466,7 @@ int Cuda_GMRES( static_storage *workspace, real *b, real tol, real *x ) #ifdef __DEBUG_CUDA__ fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j); #endif + return itr * (RESTART+1) + j + 1; } @@ -679,9 +702,11 @@ int Cublas_GMRES(reax_system *system, static_storage *workspace, real *b, real t #ifdef __DEBUG_CUDA__ fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j); #endif + return itr * (RESTART+1) + j + 1; } + int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H, real *b, real tol, real *x, FILE *fout, reax_system *system) { @@ -874,7 +899,8 @@ int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol, bnorm = Norm( b, N ); /* GMRES outer-loop */ - for( itr = 0; itr < MAX_ITR; ++itr ) { + for( itr = 0; itr < MAX_ITR; ++itr ) + { /* calculate r0 */ Sparse_MatVec( H, x, workspace->b_prm ); Vector_Sum( &workspace->v[index_wkspace_sys(0,0,system)], 1., b, -1., workspace->b_prm, N ); @@ -885,14 +911,18 @@ int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol, //fprintf( stderr, "res: %.15e\n", workspace->g[0] ); /* GMRES inner-loop */ - for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) { + for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) + { /* matvec */ Sparse_MatVec( H, &workspace->v[index_wkspace_sys (j,0,system)], &workspace->v[index_wkspace_sys (j+1,0,system)] ); Forward_Subs( L, &workspace->v[index_wkspace_sys(j+1,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] ); Backward_Subs( U, &workspace->v[index_wkspace_sys(j+1,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] ); /* apply modified Gram-Schmidt to orthogonalize the new residual */ - for( i = 0; i < j-1; i++ ) workspace->h[ index_wkspace_res (i,j)] = 0; + for( i = 0; i < j-1; i++ ) + { + workspace->h[ index_wkspace_res (i,j)] = 0; + } //for( i = 0; i <= j; i++ ) { for( i = MAX(j-1,0); i <= j; i++ ) { @@ -906,8 +936,10 @@ int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol, // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j ); /* Givens rotations on the upper-Hessenberg matrix to make it U */ - for( i = MAX(j-1,0); i <= j; i++ ) { - if( i == j ) { + for( i = MAX(j-1,0); i <= j; i++ ) + { + if( i == j ) + { cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) ); workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc; workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc; @@ -937,10 +969,13 @@ int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol, /* solve Hy = g: H is now upper-triangular, do back-substitution */ - for( i = j-1; i >= 0; i-- ) { + for( i = j-1; i >= 0; i-- ) + { temp = workspace->g[i]; for( k = j-1; k > i; k-- ) + { temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k]; + } workspace->y[i] = temp / workspace->h[index_wkspace_res (i,i)]; } @@ -955,184 +990,186 @@ int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol, /* stopping condition */ if( fabs(workspace->g[j]) / bnorm <= tol ) + { break; } + } - // Sparse_MatVec( H, x, workspace->b_prm ); - // for( i = 0; i < N; ++i ) - // workspace->b_prm[i] *= workspace->Hdia_inv[i]; - // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); - // for( i = 0; i < N; ++i ) - // fprintf( fout, "%10.5f%15.12f%15.12f\n", - // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/ - - // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", - // itr, j, fabs( workspace->g[j] ) / bnorm ); - // data->timing.matvec += itr * RESTART + j; + // Sparse_MatVec( H, x, workspace->b_prm ); + // for( i = 0; i < N; ++i ) + // workspace->b_prm[i] *= workspace->Hdia_inv[i]; + // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); + // for( i = 0; i < N; ++i ) + // fprintf( fout, "%10.5f%15.12f%15.12f\n", + // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/ - if( itr >= MAX_ITR ) { - fprintf( stderr, "GMRES convergence failed\n" ); - // return -1; - return itr * (RESTART+1) + j + 1; - } + // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", + // itr, j, fabs( workspace->g[j] ) / bnorm ); + // data->timing.matvec += itr * RESTART + j; + if( itr >= MAX_ITR ) { + fprintf( stderr, "GMRES convergence failed\n" ); + // return -1; return itr * (RESTART+1) + j + 1; } + return itr * (RESTART+1) + j + 1; +} - int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol, - sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system* system ) - { - int i, N; - real tmp, alpha, beta, b_norm, r_norm; - real sig0, sig_old, sig_new; - - N = A->n; - b_norm = Norm( b, N ); - //fprintf( stderr, "b_norm: %.15e\n", b_norm ); - Sparse_MatVec( A, x, workspace->q ); - Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); +int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol, + sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system* system ) +{ + int i, N; + real tmp, alpha, beta, b_norm, r_norm; + real sig0, sig_old, sig_new; + + N = A->n; + b_norm = Norm( b, N ); + //fprintf( stderr, "b_norm: %.15e\n", b_norm ); + + Sparse_MatVec( A, x, workspace->q ); + Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); + r_norm = Norm(workspace->r, N); + //Print_Soln( workspace, x, q, b, N ); + //fprintf( stderr, "res: %.15e\n", r_norm ); + + Forward_Subs( L, workspace->r, workspace->d ); + Backward_Subs( U, workspace->d, workspace->p ); + sig_new = Dot( workspace->r, workspace->p, N ); + sig0 = sig_new; + + for( i = 0; i < 200 && r_norm/b_norm > tol; ++i ) + { + //for( i = 0; i < 200 && sig_new > SQR(tol) * sig0; ++i ) { + Sparse_MatVec( A, workspace->p, workspace->q ); + tmp = Dot( workspace->q, workspace->p, N ); + alpha = sig_new / tmp; + Vector_Add( x, alpha, workspace->p, N ); + //fprintf( stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n", + // i+1, Norm(workspace->p,N), Norm(workspace->q,N), tmp ); + + Vector_Add( workspace->r, -alpha, workspace->q, N ); r_norm = Norm(workspace->r, N); - //Print_Soln( workspace, x, q, b, N ); //fprintf( stderr, "res: %.15e\n", r_norm ); Forward_Subs( L, workspace->r, workspace->d ); - Backward_Subs( U, workspace->d, workspace->p ); - sig_new = Dot( workspace->r, workspace->p, N ); - sig0 = sig_new; - - for( i = 0; i < 200 && r_norm/b_norm > tol; ++i ) { - //for( i = 0; i < 200 && sig_new > SQR(tol) * sig0; ++i ) { - Sparse_MatVec( A, workspace->p, workspace->q ); - tmp = Dot( workspace->q, workspace->p, N ); - alpha = sig_new / tmp; - Vector_Add( x, alpha, workspace->p, N ); - //fprintf( stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n", - // i+1, Norm(workspace->p,N), Norm(workspace->q,N), tmp ); - - Vector_Add( workspace->r, -alpha, workspace->q, N ); - r_norm = Norm(workspace->r, N); - //fprintf( stderr, "res: %.15e\n", r_norm ); - - Forward_Subs( L, workspace->r, workspace->d ); - Backward_Subs( U, workspace->d, workspace->d ); - sig_old = sig_new; - sig_new = Dot( workspace->r, workspace->d, N ); - beta = sig_new / sig_old; - Vector_Sum( workspace->p, 1., workspace->d, beta, workspace->p, N ); - } - - //fprintf( fout, "CG took %d iterations\n", i ); - if( i >= 200 ) { - fprintf( stderr, "CG convergence failed!\n" ); - return i; - } + Backward_Subs( U, workspace->d, workspace->d ); + sig_old = sig_new; + sig_new = Dot( workspace->r, workspace->d, N ); + beta = sig_new / sig_old; + Vector_Sum( workspace->p, 1., workspace->d, beta, workspace->p, N ); + } + //fprintf( fout, "CG took %d iterations\n", i ); + if( i >= 200 ) { + fprintf( stderr, "CG convergence failed!\n" ); return i; - } + } + return i; +} - int CG( static_storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, FILE *fout, reax_system *system) - { - int i, j, N; - real tmp, alpha, beta, b_norm; - real sig_old, sig_new, sig0; - - N = H->n; - b_norm = Norm( b, N ); - //fprintf( stderr, "b_norm: %10.6f\n", b_norm ); - - Sparse_MatVec( H, x, workspace->q ); - Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); - for( j = 0; j < N; ++j ) - workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; - - sig_new = Dot( workspace->r, workspace->d, N ); - sig0 = sig_new; - //Print_Soln( workspace, x, q, b, N ); - //fprintf( stderr, "sig_new: %24.15e, d_norm:%24.15e, q_norm:%24.15e\n", - // sqrt(sig_new), Norm(workspace->d,N), Norm(workspace->q,N) ); - //fprintf( stderr, "sig_new: %f\n", sig_new ); - - for( i = 0; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) { - //for( i = 0; i < 300 && sig_new > SQR(tol)*sig0; ++i ) { - Sparse_MatVec( H, workspace->d, workspace->q ); - tmp = Dot( workspace->d, workspace->q, N ); - //fprintf( stderr, "tmp: %f\n", tmp ); - alpha = sig_new / tmp; - Vector_Add( x, alpha, workspace->d, N ); - //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n", - // Norm(workspace->d,N), Norm(workspace->q,N), tmp ); - - Vector_Add( workspace->r, -alpha, workspace->q, N ); - for( j = 0; j < N; ++j ) - workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; - - sig_old = sig_new; - sig_new = Dot( workspace->r, workspace->p, N ); - beta = sig_new / sig_old; - Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, N ); - //fprintf( stderr, "sig_new: %f\n", sig_new ); - } - fprintf( stderr, "CG took %d iterations\n", i ); +int CG( static_storage *workspace, sparse_matrix *H, + real *b, real tol, real *x, FILE *fout, reax_system *system) +{ + int i, j, N; + real tmp, alpha, beta, b_norm; + real sig_old, sig_new, sig0; - if( i >= 300 ) { - fprintf( stderr, "CG convergence failed!\n" ); - return i; - } + N = H->n; + b_norm = Norm( b, N ); + //fprintf( stderr, "b_norm: %10.6f\n", b_norm ); + + Sparse_MatVec( H, x, workspace->q ); + Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); + for( j = 0; j < N; ++j ) + workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; + + sig_new = Dot( workspace->r, workspace->d, N ); + sig0 = sig_new; + //Print_Soln( workspace, x, q, b, N ); + //fprintf( stderr, "sig_new: %24.15e, d_norm:%24.15e, q_norm:%24.15e\n", + // sqrt(sig_new), Norm(workspace->d,N), Norm(workspace->q,N) ); + //fprintf( stderr, "sig_new: %f\n", sig_new ); + + for( i = 0; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) { + //for( i = 0; i < 300 && sig_new > SQR(tol)*sig0; ++i ) { + Sparse_MatVec( H, workspace->d, workspace->q ); + tmp = Dot( workspace->d, workspace->q, N ); + //fprintf( stderr, "tmp: %f\n", tmp ); + alpha = sig_new / tmp; + Vector_Add( x, alpha, workspace->d, N ); + //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n", + // Norm(workspace->d,N), Norm(workspace->q,N), tmp ); + + Vector_Add( workspace->r, -alpha, workspace->q, N ); + for( j = 0; j < N; ++j ) + workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; + + sig_old = sig_new; + sig_new = Dot( workspace->r, workspace->p, N ); + beta = sig_new / sig_old; + Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, N ); + //fprintf( stderr, "sig_new: %f\n", sig_new ); + } - return i; - } + fprintf( stderr, "CG took %d iterations\n", i ); + + if( i >= 300 ) { + fprintf( stderr, "CG convergence failed!\n" ); + return i; + } + return i; +} - /* Steepest Descent */ - int SDM( static_storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, FILE *fout ) - { - int i, j, N; - real tmp, alpha, beta, b_norm; - real sig0, sig; - N = H->n; - b_norm = Norm( b, N ); - //fprintf( stderr, "b_norm: %10.6f\n", b_norm ); +/* Steepest Descent */ +int SDM( static_storage *workspace, sparse_matrix *H, + real *b, real tol, real *x, FILE *fout ) +{ + int i, j, N; + real tmp, alpha, beta, b_norm; + real sig0, sig; - Sparse_MatVec( H, x, workspace->q ); - Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); - for( j = 0; j < N; ++j ) - workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; + N = H->n; + b_norm = Norm( b, N ); + //fprintf( stderr, "b_norm: %10.6f\n", b_norm ); - sig = Dot( workspace->r, workspace->d, N ); - sig0 = sig; + Sparse_MatVec( H, x, workspace->q ); + Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); + for( j = 0; j < N; ++j ) + workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; - for( i = 0; i < 300 && SQRT(sig) / b_norm > tol; ++i ) { - Sparse_MatVec( H, workspace->d, workspace->q ); + sig = Dot( workspace->r, workspace->d, N ); + sig0 = sig; - sig = Dot( workspace->r, workspace->d, N ); - tmp = Dot( workspace->d, workspace->q, N ); - alpha = sig / tmp; + for( i = 0; i < 300 && SQRT(sig) / b_norm > tol; ++i ) { + Sparse_MatVec( H, workspace->d, workspace->q ); - Vector_Add( x, alpha, workspace->d, N ); - Vector_Add( workspace->r, -alpha, workspace->q, N ); - for( j = 0; j < N; ++j ) - workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; + sig = Dot( workspace->r, workspace->d, N ); + tmp = Dot( workspace->d, workspace->q, N ); + alpha = sig / tmp; - //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n", - // Norm(workspace->d,N), Norm(workspace->q,N), tmp ); - } + Vector_Add( x, alpha, workspace->d, N ); + Vector_Add( workspace->r, -alpha, workspace->q, N ); + for( j = 0; j < N; ++j ) + workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; - fprintf( stderr, "SDM took %d iterations\n", i ); + //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n", + // Norm(workspace->d,N), Norm(workspace->q,N), tmp ); + } - if( i >= 300 ) { - fprintf( stderr, "SDM convergence failed!\n" ); - return i; - } + fprintf( stderr, "SDM took %d iterations\n", i ); - return i; - } + if( i >= 300 ) { + fprintf( stderr, "SDM convergence failed!\n" ); + return i; + } + return i; +} diff --git a/PuReMD-GPU/src/QEq.cu b/PuReMD-GPU/src/QEq.cu index 5d849b26..06f708cc 100644 --- a/PuReMD-GPU/src/QEq.cu +++ b/PuReMD-GPU/src/QEq.cu @@ -34,6 +34,7 @@ #include "system_props.h" + HOST_DEVICE void swap(sparse_matrix_entry *array, int index1, int index2) { sparse_matrix_entry temp = array[index1]; @@ -41,6 +42,7 @@ HOST_DEVICE void swap(sparse_matrix_entry *array, int index1, int index2) array[index2] = temp; } + HOST_DEVICE void quick_sort(sparse_matrix_entry *array, int start, int end) { int i = start; @@ -62,6 +64,7 @@ HOST_DEVICE void quick_sort(sparse_matrix_entry *array, int start, int end) } } + int compare_matrix_entry(const void *v1, const void *v2) { return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j; @@ -80,6 +83,7 @@ void Sort_Matrix_Rows( sparse_matrix *A ) } } + GLOBAL void Cuda_Sort_Matrix_Rows ( sparse_matrix A ) { int i; @@ -129,10 +133,11 @@ void Calculate_Droptol( sparse_matrix *A, real *droptol, real dtol ) //fprintf( stderr, "\n" ); } + GLOBAL void Cuda_Calculate_Droptol ( sparse_matrix p_A, real *droptol, real dtol ) { int i = blockIdx.x * blockDim.x + threadIdx.x; - int k, j, offset, x, diagnol; + int k, j, offset, x, diagonal; real val; sparse_matrix *A = &p_A; @@ -152,10 +157,11 @@ GLOBAL void Cuda_Calculate_Droptol ( sparse_matrix p_A, real *droptol, real dtol } + GLOBAL void Cuda_Calculate_Droptol_js ( sparse_matrix p_A, real *droptol, real dtol ) { int i = blockIdx.x * blockDim.x + threadIdx.x; - int k, j, offset, x, diagnol; + int k, j, offset, x, diagonal; real val; sparse_matrix *A = &p_A; @@ -171,17 +177,18 @@ GLOBAL void Cuda_Calculate_Droptol_js ( sparse_matrix p_A, real *droptol, real d } } -GLOBAL void Cuda_Calculate_Droptol_diagnol ( sparse_matrix p_A, real *droptol, real dtol ) + +GLOBAL void Cuda_Calculate_Droptol_diagonal ( sparse_matrix p_A, real *droptol, real dtol ) { int i = blockIdx.x * blockDim.x + threadIdx.x; - int k, j, offset, x, diagnol; + int k, j, offset, x, diagonal; real val; sparse_matrix *A = &p_A; if ( i < A->n ) { - //diagnol element - diagnol = A->end[i]-1; - val = A->entries [diagnol].val; + //diagonal element + diagonal = A->end[i]-1; + val = A->entries [diagonal].val; droptol [i] += val*val; } @@ -213,6 +220,7 @@ int Estimate_LU_Fill( sparse_matrix *A, real *droptol ) return fillin + A->n; } + GLOBAL void Cuda_Estimate_LU_Fill ( sparse_matrix p_A, real *droptol, int *fillin) { int i, j, pj; @@ -233,6 +241,7 @@ GLOBAL void Cuda_Estimate_LU_Fill ( sparse_matrix p_A, real *droptol, int *filli } } + void ICHOLT( sparse_matrix *A, real *droptol, sparse_matrix *L, sparse_matrix *U ) { @@ -338,7 +347,6 @@ void ICHOLT( sparse_matrix *A, real *droptol, } - void Cuda_ICHOLT( sparse_matrix *A, real *droptol, sparse_matrix *L, sparse_matrix *U ) { @@ -489,7 +497,7 @@ end = A->end[i]-1; //inclusive count = end - start; //inclusive tmp_val [kid] = 0; -if (kid < count) //diagnol not included +if (kid < count) //diagonal not included { pj = start + kid; @@ -540,7 +548,7 @@ if (kid == 0) } } -//diagnol element +//diagonal element //val = A->entries[pj].val; //for( k1 = 0; k1 < tmptop; ++k1 ) if (kid < count) @@ -612,7 +620,8 @@ void Init_MatVec( reax_system *system, control_params *control, //char fname[100]; if(control->refactor > 0 && - ((data->step-data->prev_steps)%control->refactor==0 || workspace->L.entries==NULL)){ + ((data->step-data->prev_steps)%control->refactor==0 || workspace->L.entries==NULL)) + { //Print_Linear_System( system, control, workspace, data->step ); Sort_Matrix_Rows( &workspace->H ); @@ -621,17 +630,21 @@ void Init_MatVec( reax_system *system, control_params *control, Calculate_Droptol( &workspace->H, workspace->droptol, control->droptol ); //fprintf( stderr, "drop tolerances calculated\n" ); - - if( workspace->L.entries == NULL ) { + if( workspace->L.entries == NULL ) + { fillin = Estimate_LU_Fill( &workspace->H, workspace->droptol ); + #ifdef __DEBUG_CUDA__ fprintf( stderr, "fillin = %d\n", fillin ); #endif + if( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 || - Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 ){ + Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 ) + { fprintf( stderr, "not enough memory for LU matrices. terminating.\n" ); exit(INSUFFICIENT_SPACE); } + #if defined(DEBUG_FOCUS) fprintf( stderr, "fillin = %d\n", fillin ); fprintf( stderr, "allocated memory: L = U = %ldMB\n", @@ -689,43 +702,44 @@ void Init_MatVec( reax_system *system, control_params *control, } } -void Cuda_Init_MatVec( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list *far_nbrs ) + +void Cuda_Init_MatVec(reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, list *far_nbrs ) { int i, fillin; real s_tmp, t_tmp; int *spad = (int *)scratch; real start = 0, end = 0; - if(control->refactor > 0 && - ((data->step-data->prev_steps)%control->refactor==0 || dev_workspace->L.entries==NULL)){ - - Cuda_Sort_Matrix_Rows <<< BLOCKS, BLOCK_SIZE >>> + if( control->refactor > 0 && + ((data->step-data->prev_steps)%control->refactor==0 || + dev_workspace->L.entries==NULL) ) + { + Cuda_Sort_Matrix_Rows<<< BLOCKS, BLOCK_SIZE >>> ( dev_workspace->H ); - cudaThreadSynchronize (); - cudaCheckError (); + cudaThreadSynchronize(); + cudaCheckError(); #ifdef __DEBUG_CUDA__ fprintf (stderr, "Sorting done... \n"); #endif - Cuda_Calculate_Droptol <<<BLOCKS, BLOCK_SIZE >>> + Cuda_Calculate_Droptol<<<BLOCKS, BLOCK_SIZE >>> ( dev_workspace->H, dev_workspace->droptol, control->droptol ); - cudaThreadSynchronize (); - cudaCheckError (); + cudaThreadSynchronize(); + cudaCheckError(); #ifdef __DEBUG_CUDA__ fprintf (stderr, "Droptol done... \n"); #endif - if( dev_workspace->L.entries == NULL ) { - + if( dev_workspace->L.entries == NULL ) + { cuda_memset ( spad, 0, 2 * INT_SIZE * system->N, RES_SCRATCH ); Cuda_Estimate_LU_Fill <<< BLOCKS, BLOCK_SIZE >>> ( dev_workspace->H, dev_workspace->droptol, spad ); - cudaThreadSynchronize (); - cudaCheckError (); + cudaThreadSynchronize(); + cudaCheckError(); //Reduction for fill in Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, INT_SIZE * BLOCK_SIZE >>> @@ -789,16 +803,36 @@ void Cuda_Init_MatVec( reax_system *system, control_params *control, #ifdef __DEBUG_CUDA__ fprintf (stderr, " Allocation temp matrices count %d entries %d \n", dev_workspace->H.n, dev_workspace->H.m ); #endif + start = Get_Time (); - if (!Allocate_Matrix (&t_H, dev_workspace->H.n, dev_workspace->H.m)) { fprintf (stderr, "No space for H matrix \n"); exit (0);} - if (!Allocate_Matrix (&t_L, far_nbrs->n, dev_workspace->L.m)) { fprintf (stderr, "No space for L matrix \n"); exit (0); } - if (!Allocate_Matrix (&t_U, far_nbrs->n, dev_workspace->U.m)) { fprintf (stderr, "No space for U matrix \n"); exit (0); } + if(!Allocate_Matrix(&t_H, dev_workspace->H.n, dev_workspace->H.m)) + { + fprintf(stderr, "No space for H matrix \n"); + exit(0); + } + if(!Allocate_Matrix(&t_L, far_nbrs->n, dev_workspace->L.m)) + { + fprintf(stderr, "No space for L matrix \n"); + exit(0); + } + if(!Allocate_Matrix(&t_U, far_nbrs->n, dev_workspace->U.m)) + { + fprintf(stderr, "No space for U matrix \n"); + exit(0); + } - copy_host_device ( t_H.start, dev_workspace->H.start, INT_SIZE * (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX ); - copy_host_device ( t_H.end, dev_workspace->H.end, INT_SIZE * (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX ); - copy_host_device ( t_H.entries, dev_workspace->H.entries, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->H.m, cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_ENTRY ); + copy_host_device ( t_H.start, dev_workspace->H.start, INT_SIZE * + (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, + RES_SPARSE_MATRIX_INDEX ); + copy_host_device ( t_H.end, dev_workspace->H.end, INT_SIZE * + (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, + RES_SPARSE_MATRIX_INDEX ); + copy_host_device ( t_H.entries, dev_workspace->H.entries, + SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->H.m, + cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_ENTRY ); - copy_host_device ( t_droptol, dev_workspace->droptol, REAL_SIZE * system->N, cudaMemcpyDeviceToHost, RES_STORAGE_DROPTOL ); + copy_host_device ( t_droptol, dev_workspace->droptol, REAL_SIZE * + system->N, cudaMemcpyDeviceToHost, RES_STORAGE_DROPTOL ); //fprintf (stderr, " Done copying LUH .. \n"); Cuda_ICHOLT (&t_H, t_droptol, &t_L, &t_U); @@ -828,6 +862,7 @@ void Cuda_Init_MatVec( reax_system *system, control_params *control, } } + GLOBAL void Init_MatVec_Postprocess (static_storage p_workspace, int N ) { @@ -873,6 +908,7 @@ GLOBAL void Init_MatVec_Postprocess (static_storage p_workspace, int N ) workspace->t[index_wkspace_sys(0,i,N)] = t_tmp; } + void Calculate_Charges( reax_system *system, static_storage *workspace ) { int i; @@ -891,17 +927,25 @@ void Calculate_Charges( reax_system *system, static_storage *workspace ) #endif for( i = 0; i < system->N; ++i ) + { system->atoms[i].q = workspace->s[index_wkspace_sys(0,i,system)] - u * workspace->t[index_wkspace_sys(0,i,system)]; + } } + GLOBAL void Cuda_Update_Atoms_q ( reax_atom *atoms, real *s, real u, real *t, int N) { int i = blockIdx.x*blockDim.x + threadIdx.x; - if (i >= N) return; + + if (i >= N) + { + return; + } atoms[i].q = s[index_wkspace_sys(0,i,N)] - u * t[index_wkspace_sys(0,i,N)]; } + void Cuda_Calculate_Charges (reax_system *system, static_storage *workspace) { real *spad = (real *) scratch; @@ -1018,6 +1062,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data, //Print_Charges( system, control, workspace, data->step ); } + void Cuda_QEq( reax_system *system, control_params *control, simulation_data *data, static_storage *workspace, list *far_nbrs, output_controls *out_control ) diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c index 835ffa35..d08dfe1e 100644 --- a/PuReMD/src/linear_solvers.c +++ b/PuReMD/src/linear_solvers.c @@ -36,7 +36,9 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) real H; for ( i = 0; i < N; ++i ) + { b[i][0] = b[i][1] = 0; + } /* perform multiplication */ for ( i = 0; i < A->n; ++i ) @@ -64,7 +66,7 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N ) int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, - rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout ) + rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout ) { int i, j, n, N, matvecs, scale; rvec2 tmp, alpha, beta; @@ -86,13 +88,17 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, t_start = Get_Time( ); } #endif + Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer ); dual_Sparse_MatVec( H, x, workspace->q2, N ); // tryQEq Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &matvec_time ); + } #endif for ( j = 0; j < system->n; ++j ) @@ -126,6 +132,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, } MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm ); //fprintf( stderr, "sig_new: %f %f\n", sig_new[0], sig_new[1] ); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) Update_Timing_Info( &t_start, &dot_time ); @@ -137,9 +144,12 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, dual_Sparse_MatVec( H, workspace->d2, workspace->q2, N ); // tryQEq Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &matvec_time ); + } #endif /* dot product: d.q */ @@ -174,12 +184,18 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, sig_old[1] = sig_new[1]; MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm ); //fprintf( stderr, "sig_new: %f %f\n", sig_new[0], sig_new[1] ); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &dot_time ); + } #endif + if ( sqrt(sig_new[0]) / b_norm[0] <= tol || sqrt(sig_new[1]) / b_norm[1] <= tol ) + { break; + } beta[0] = sig_new[0] / sig_old[0]; beta[1] = sig_new[1] / sig_old[1]; @@ -194,30 +210,41 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, if ( sqrt(sig_new[0]) / b_norm[0] <= tol ) { for ( j = 0; j < n; ++j ) + { workspace->t[j] = workspace->x[j][1]; - matvecs = CG( system, workspace, H, workspace->b_t, tol, workspace->t, - mpi_data, fout ); + } + matvecs = CG( system, workspace, H, workspace->b_t, tol, + workspace->t,mpi_data, fout ); for ( j = 0; j < n; ++j ) + { workspace->x[j][1] = workspace->t[j]; + } } else if ( sqrt(sig_new[1]) / b_norm[1] <= tol ) { for ( j = 0; j < n; ++j ) + { workspace->s[j] = workspace->x[j][0]; + } matvecs = CG( system, workspace, H, workspace->b_s, tol, workspace->s, - mpi_data, fout ); + mpi_data, fout ); for ( j = 0; j < system->n; ++j ) + { workspace->x[j][0] = workspace->s[j]; + } } - if ( i >= 300 ) + { fprintf( stderr, "CG convergence failed!\n" ); + } #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) - fprintf( fout, "QEq %d + %d iters. matvecs: %f dot: %f\n", - i + 1, matvecs, matvec_time, dot_time ); + { + fprintf( fout, "QEq %d + %d iters. matvecs: %f dot: %f\n", i + 1, + matvecs, matvec_time, dot_time ); + } #endif return (i + 1) + matvecs; @@ -230,7 +257,9 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N ) real H; for ( i = 0; i < N; ++i ) + { b[i] = 0; + } /* perform multiplication */ for ( i = 0; i < A->n; ++i ) @@ -249,8 +278,8 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N ) } -int CG( reax_system *system, storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout ) +int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b, + real tol, real *x, mpi_datatypes* mpi_data, FILE *fout ) { int i, j, scale; real tmp, alpha, beta, b_norm; @@ -269,21 +298,29 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H, Sparse_MatVec( H, x, workspace->q, system->N ); // tryQEq Coll( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker ); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &matvec_time ); + } #endif Vector_Sum( workspace->r , 1., b, -1., workspace->q, system->n ); for ( j = 0; j < system->n; ++j ) + { workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition + } b_norm = Parallel_Norm( b, system->n, mpi_data->world ); sig_new = Parallel_Dot(workspace->r, workspace->d, system->n, mpi_data->world); sig0 = sig_new; + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &dot_time ); + } #endif for ( i = 1; i < 300 && sqrt(sig_new) / b_norm > tol; ++i ) @@ -292,9 +329,12 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H, Sparse_MatVec( H, workspace->d, workspace->q, system->N ); //tryQEq Coll(system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &matvec_time ); + } #endif tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world); @@ -303,15 +343,20 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H, Vector_Add( workspace->r, -alpha, workspace->q, system->n ); /* pre-conditioning */ for ( j = 0; j < system->n; ++j ) + { workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; + } sig_old = sig_new; sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world); beta = sig_new / sig_old; Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n ); + #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) + { Update_Timing_Info( &t_start, &dot_time ); + } #endif } @@ -323,8 +368,10 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H, #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) - fprintf( fout, "QEq %d iters. matvecs: %f dot: %f\n", - i, matvec_time, dot_time ); + { + fprintf( fout, "QEq %d iters. matvecs: %f dot: %f\n", i, matvec_time, + dot_time ); + } #endif return i; @@ -332,7 +379,7 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H, int CG_test( reax_system *system, storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout ) + real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout ) { int i, j, scale; real tmp, alpha, beta, b_norm; -- GitLab