/*----------------------------------------------------------------------
  PuReMD - Purdue ReaxFF Molecular Dynamics Program

  Copyright (2010) Purdue University
  Hasan Metin Aktulga, haktulga@cs.purdue.edu
  Joseph Fogarty, jcfogart@mail.usf.edu
  Sagar Pandit, pandit@usf.edu
  Ananth Y Grama, ayg@cs.purdue.edu

  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of
  the License, or (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details:
  <http://www.gnu.org/licenses/>.
  ----------------------------------------------------------------------*/

#include "cuda_linear_solvers.h"

#include "reax_types.h"
#include "cuda_utils.h"
#include "reduction.h"
#include "dual_matvec.h"
#include "matvec.h"


void get_from_device( real *host, real *device, unsigned int bytes,
        const char *msg )
{
    copy_host_device( host, device, bytes, cudaMemcpyDeviceToHost, msg );
}


void put_on_device( real *host, real *device, unsigned int bytes,
        const char *msg )
{
    copy_host_device( host, device, bytes, cudaMemcpyHostToDevice, msg );
}


/* res = a * x + b * y (elementwise); TODO: consider a cuBLAS call here */
void Cuda_Vector_Sum( real *res, real a, real *x, real b, real *y, int count )
{
    int blocks;

    blocks = (count / DEF_BLOCK_SIZE)
        + ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);

    k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>>
        ( res, a, x, b, y, count );
    cudaDeviceSynchronize( );
    cudaCheckError( );
}


/* res = a * b (elementwise vector multiplication); TODO: consider cuBLAS here */
void Cuda_CG_Preconditioner( real *res, real *a, real *b, int count )
{
    int blocks;

    blocks = (count / DEF_BLOCK_SIZE)
        + ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);

    k_vector_mul <<< blocks, DEF_BLOCK_SIZE >>>
        ( res, a, b, count );
    cudaDeviceSynchronize( );
    cudaCheckError( );
}


CUDA_GLOBAL void k_diagonal_preconditioner( storage p_workspace, rvec2 *b, int n )
{
    storage *workspace = &p_workspace;
    int j = blockIdx.x * blockDim.x + threadIdx.x;

    if (j >= n)
    {
        return;
    }

    /* compute residual */
    workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
    workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];

    /* apply diagonal preconditioner */
    workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
    workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
}


void Cuda_CG_Diagonal_Preconditioner( storage *workspace, rvec2 *b, int n )
{
    int blocks;

    blocks = (n / DEF_BLOCK_SIZE)
        + ((n % DEF_BLOCK_SIZE == 0) ? 0 : 1);

    k_diagonal_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
        ( *workspace, b, n );
    cudaDeviceSynchronize( );
    cudaCheckError( );
}
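
/* The elementwise kernels launched above (k_vector_sum, k_vector_mul) are
 * defined elsewhere in the code base.  For reference, assuming the usual
 * one-thread-per-element mapping, they are equivalent to the sketches
 * below.  The _sketch names are hypothetical, and the block is excluded
 * from compilation so that it cannot clash with the real definitions. */
#if 0
CUDA_GLOBAL void k_vector_sum_sketch( real *res, real a, real *x,
        real b, real *y, int count )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i >= count)
    {
        return;
    }

    /* res = a * x + b * y, one element per thread */
    res[i] = a * x[i] + b * y[i];
}


CUDA_GLOBAL void k_vector_mul_sketch( real *res, real *a, real *b, int count )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i >= count)
    {
        return;
    }

    /* res = a * b, elementwise */
    res[i] = a[i] * b[i];
}
#endif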
CUDA_GLOBAL void k_dual_cg_preconditioner( storage p_workspace, rvec2 *x,
        real alpha_0, real alpha_1, int n, rvec2 *my_dot )
{
    storage *workspace = &p_workspace;
    rvec2 alpha;
    int j = blockIdx.x * blockDim.x + threadIdx.x;

    alpha[0] = alpha_0;
    alpha[1] = alpha_1;

    if (j >= n)
    {
        return;
    }

    my_dot[j][0] = 0.0;
    my_dot[j][1] = 0.0;

    /* update x */
    x[j][0] += alpha[0] * workspace->d2[j][0];
    x[j][1] += alpha[1] * workspace->d2[j][1];

    /* update residual */
    workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
    workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];

    /* apply diagonal preconditioner */
    workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
    workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];

    /* dot product: r . p */
    my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0];
    my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1];
}


void Cuda_DualCG_Preconditioner( storage *workspace, rvec2 *x, rvec2 alpha,
        int n, rvec2 result )
{
    int blocks;
    rvec2 *tmp = (rvec2 *) scratch;

    cuda_memset( tmp, 0, sizeof(rvec2) * (2 * n + 1),
            "cuda_dualcg_preconditioner" );

    blocks = (n / DEF_BLOCK_SIZE)
        + ((n % DEF_BLOCK_SIZE == 0) ? 0 : 1);

    k_dual_cg_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
        ( *workspace, x, alpha[0], alpha[1], n, tmp );
    cudaDeviceSynchronize( );
    cudaCheckError( );

    /* two-stage reduction to compute my_dot */
    k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
        ( tmp, tmp + n, n );
    cudaDeviceSynchronize( );
    cudaCheckError( );

    k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
        ( tmp + n, tmp + 2 * n, blocks );
    cudaDeviceSynchronize( );
    cudaCheckError( );

    copy_host_device( result, tmp + 2 * n, sizeof(rvec2),
            cudaMemcpyDeviceToHost, "my_dot" );
}


void Cuda_Norm( rvec2 *arr, int n, rvec2 result )
{
    int blocks;
    rvec2 *tmp = (rvec2 *) scratch;

    blocks = (n / DEF_BLOCK_SIZE)
        + ((n % DEF_BLOCK_SIZE == 0) ? 0 : 1);

    k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
        ( arr, tmp, n, INITIAL );
    cudaDeviceSynchronize( );
    cudaCheckError( );

    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
        ( tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
    cudaDeviceSynchronize( );
    cudaCheckError( );

    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof(rvec2),
            cudaMemcpyDeviceToHost, "cuda_norm_rvec2" );
}


void Cuda_Dot( rvec2 *a, rvec2 *b, rvec2 result, int n )
{
    int blocks;
    rvec2 *tmp = (rvec2 *) scratch;

    blocks = (n / DEF_BLOCK_SIZE)
        + ((n % DEF_BLOCK_SIZE == 0) ? 0 : 1);

    k_dot_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
        ( a, b, tmp, n );
    cudaDeviceSynchronize( );
    cudaCheckError( );

    /* final pass: sum the per-block partial dot products */
    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
        ( tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
    cudaDeviceSynchronize( );
    cudaCheckError( );

    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof(rvec2),
            cudaMemcpyDeviceToHost, "cuda_dot" );
}


void Cuda_Vector_Sum_Rvec2( rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n )
{
    int blocks;

    blocks = (n / DEF_BLOCK_SIZE)
        + ((n % DEF_BLOCK_SIZE == 0) ? 0 : 1);

    /* the two components of b are passed separately to the kernel */
    k_rvec2_pbetad <<< blocks, DEF_BLOCK_SIZE >>>
        ( x, a, b[0], b[1], c, n );
    cudaDeviceSynchronize( );
    cudaCheckError( );
}
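
/* The two-stage reductions above (k_reduction_rvec2, k_norm_rvec2,
 * k_dot_rvec2) are defined elsewhere; they follow the standard
 * shared-memory tree reduction, which is what the
 * sizeof(rvec2) * blockDim.x dynamic shared memory in the launches pays
 * for.  A sketch of the per-block rvec2 sum, assuming a power-of-two
 * block size (the _sketch name is hypothetical; block not compiled): */
#if 0
CUDA_GLOBAL void k_reduction_rvec2_sketch( rvec2 *input, rvec2 *results, int n )
{
    extern __shared__ rvec2 svec2[];
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int s;

    /* load one element per thread, zero-padding past n */
    svec2[threadIdx.x][0] = (i < n) ? input[i][0] : 0.0;
    svec2[threadIdx.x][1] = (i < n) ? input[i][1] : 0.0;
    __syncthreads( );

    /* tree reduction within the block */
    for ( s = blockDim.x / 2; s > 0; s >>= 1 )
    {
        if (threadIdx.x < s)
        {
            svec2[threadIdx.x][0] += svec2[threadIdx.x + s][0];
            svec2[threadIdx.x][1] += svec2[threadIdx.x + s][1];
        }
        __syncthreads( );
    }

    /* one partial sum per block; a second single-block launch
     * (as in the callers above) reduces the partials to a scalar */
    if (threadIdx.x == 0)
    {
        results[blockIdx.x][0] = svec2[0][0];
        results[blockIdx.x][1] = svec2[0][1];
    }
}
#endif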
CUDA_GLOBAL void k_rvec2_to_real_copy( real *dst, rvec2 *src, int index, int n )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i >= n)
    {
        return;
    }

    dst[i] = src[i][index];
}


void Cuda_RvecCopy_From( real *dst, rvec2 *src, int index, int n )
{
    int blocks;

    blocks = (n / DEF_BLOCK_SIZE)
        + ((n % DEF_BLOCK_SIZE == 0) ? 0 : 1);

    k_rvec2_to_real_copy <<< blocks, DEF_BLOCK_SIZE >>>
        ( dst, src, index, n );
    cudaDeviceSynchronize( );
    cudaCheckError( );
}


CUDA_GLOBAL void k_real_to_rvec2_copy( rvec2 *dst, real *src, int index, int n )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i >= n)
    {
        return;
    }

    dst[i][index] = src[i];
}


void Cuda_RvecCopy_To( rvec2 *dst, real *src, int index, int n )
{
    int blocks;

    blocks = (n / DEF_BLOCK_SIZE)
        + ((n % DEF_BLOCK_SIZE == 0) ? 0 : 1);

    k_real_to_rvec2_copy <<< blocks, DEF_BLOCK_SIZE >>>
        ( dst, src, index, n );
    cudaDeviceSynchronize( );
    cudaCheckError( );
}


void Cuda_Dual_Matvec( sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size )
{
    cuda_memset( b, 0, sizeof(rvec2) * size, "dual_matvec:result" );

    /* one-thread-per-row implementation:
    int blocks = (n / DEF_BLOCK_SIZE)
        + ((n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
    k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>>
        ( *H, a, b, n );
    cudaDeviceSynchronize( );
    cudaCheckError( );
    */

    /* one-warp-per-row implementation */
#if defined(__SM_35__)
    k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
#else
    k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
                      sizeof(rvec2) * MATVEC_BLOCK_SIZE >>>
#endif
        ( *H, a, b, n );
    cudaDeviceSynchronize( );
    cudaCheckError( );
}


void Cuda_Matvec( sparse_matrix *H, real *a, real *b, int n, int size )
{
    cuda_memset( b, 0, sizeof(real) * size, "matvec:result" );

    /* one-thread-per-row implementation:
    int blocks = (n / DEF_BLOCK_SIZE)
        + ((n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
    k_matvec <<< blocks, DEF_BLOCK_SIZE >>>
        ( *H, a, b, n );
    cudaDeviceSynchronize( );
    cudaCheckError( );
    */

    /* one-warp-per-row implementation */
#if defined(__SM_35__)
    k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
#else
    k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
                 sizeof(real) * MATVEC_BLOCK_SIZE >>>
#endif
        ( *H, a, b, n );
    cudaDeviceSynchronize( );
    cudaCheckError( );
}
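
/* The commented-out one-thread-per-row path above corresponds to a CSR
 * sparse matrix-vector product like the sketch below.  This assumes the
 * sparse_matrix type stores per-row extents in start[]/end[] and entries
 * with .j (column index) and .val members; verify against reax_types.h
 * before relying on it.  The _sketch name is hypothetical and the block
 * is not compiled. */
#if 0
CUDA_GLOBAL void k_matvec_sketch( sparse_matrix H, real *x, real *b, int n )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int pj;
    real sum;

    if (i >= n)
    {
        return;
    }

    /* accumulate H(i,:) * x over the stored entries of row i */
    sum = 0.0;
    for ( pj = H.start[i]; pj < H.end[i]; ++pj )
    {
        sum += H.entries[pj].val * x[ H.entries[pj].j ];
    }
    b[i] = sum;
}
#endif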