Skip to content
Snippets Groups Projects
cuda_linear_solvers.cu 9.09 KiB
Newer Older
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
/*----------------------------------------------------------------------
  PuReMD - Purdue ReaxFF Molecular Dynamics Program
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  Copyright (2010) Purdue University
  Hasan Metin Aktulga, haktulga@cs.purdue.edu
  Joseph Fogarty, jcfogart@mail.usf.edu
  Sagar Pandit, pandit@usf.edu
  Ananth Y Grama, ayg@cs.purdue.edu

  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of 
  the License, or (at your option) any later version.
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
  See the GNU General Public License for more details:
  <http://www.gnu.org/licenses/>.
  ----------------------------------------------------------------------*/

#include "cuda_linear_solvers.h"

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
#include "cuda_utils.h"
#include "reduction.h"
#include "dual_matvec.h"
#include "matvec.h"


void get_from_device(real *host, real *device, unsigned int bytes, const char *msg)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
    copy_host_device( host, device, bytes, cudaMemcpyDeviceToHost, msg );
void put_on_device(real *host, real *device, unsigned int bytes, const char *msg)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
    copy_host_device( host, device, bytes, cudaMemcpyHostToDevice, msg );

void Cuda_Vector_Sum(real *res, real a, real *x, real b, real *y, int count)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    //res = ax + by
    //use the cublas here
    int blocks;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    blocks = (count / DEF_BLOCK_SIZE) + 
        ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>>
        ( res, a, x, b, y, count );

    cudaThreadSynchronize();
    cudaCheckError();

void Cuda_CG_Preconditioner(real *res, real *a, real *b, int count)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    //res = a*b - vector multiplication
    //use the cublas here.
    int blocks;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    blocks = (count / DEF_BLOCK_SIZE) + 
        ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    k_vector_mul <<< blocks, DEF_BLOCK_SIZE >>>
        ( res, a, b, count );

    cudaThreadSynchronize();
CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    storage *workspace = &( p_workspace );
    int j = blockIdx.x * blockDim.x + threadIdx.x;

    if (j >= n)
    {
        return;
    }
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    //for( j = 0; j < system->n; ++j ) {
    // residual 
    workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
    workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    // apply diagonal pre-conditioner
    workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; 
    workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; 
    //}
void Cuda_CG_Diagonal_Preconditioner(storage *workspace, rvec2 *b, int n)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int blocks;

    blocks = (n / DEF_BLOCK_SIZE) + 
        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
    k_diagonal_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        (*workspace, b, n);

    cudaThreadSynchronize();
    cudaCheckError();

CUDA_GLOBAL void k_dual_cg_preconditioner(storage p_workspace, rvec2 *x, 
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        real alpha_0, real alpha_1, int n, rvec2 *my_dot)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    storage *workspace = &( p_workspace );
    rvec2 alpha;
    int j = blockIdx.x * blockDim.x + threadIdx.x;

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    alpha[0] = alpha_0;
    alpha[1] = alpha_1;

    if (j >= n)
    {
        return;
    }

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    my_dot[j][0] = my_dot[j][1] = 0.0;

    //for( j = 0; j < system->n; ++j ) {
    // update x 
    x[j][0] += alpha[0] * workspace->d2[j][0];
    x[j][1] += alpha[1] * workspace->d2[j][1];      
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    // update residual 
    workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; 
    workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; 
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    // apply diagonal pre-conditioner 
    workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
    workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    // dot product: r.p 
    my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0];
    my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1];
    //}
void Cuda_DualCG_Preconditioner(storage *workspace, rvec2 *x, rvec2 alpha,
        int n, rvec2 result)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int blocks;
    rvec2 *tmp = (rvec2 *) scratch;

    cuda_memset( tmp, 0, sizeof (rvec2) * ( 2 * n + 1),
            "cuda_dualcg_preconditioner" );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    blocks = (n / DEF_BLOCK_SIZE) + 
        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    k_dual_cg_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
        (*workspace, x, alpha[0], alpha[1], n, tmp);

    cudaThreadSynchronize();
    cudaCheckError();
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    //Reduction to calculate my_dot
    k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        ( tmp, tmp + n, n);

    cudaThreadSynchronize();
    cudaCheckError();
    k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        ( tmp + n, tmp + 2*n, blocks);

    cudaThreadSynchronize();
    cudaCheckError();

    copy_host_device( result, (tmp + 2*n), sizeof(rvec2),
            cudaMemcpyDeviceToHost, "my_dot" );

void Cuda_Norm(rvec2 *arr, int n, rvec2 result)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int blocks;
    rvec2 *tmp = (rvec2 *) scratch;

    blocks = (n / DEF_BLOCK_SIZE) + 
        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
        (arr, tmp, n, INITIAL);

    cudaThreadSynchronize();
    cudaCheckError();
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>>
        (tmp, tmp + BLOCKS_POW_2, blocks, FINAL );

    cudaThreadSynchronize();
    cudaCheckError();

    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof (rvec2), 
            cudaMemcpyDeviceToHost, "cuda_norm_rvec2" );

void Cuda_Dot(rvec2 *a, rvec2 *b, rvec2 result, int n)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int blocks;
    rvec2 *tmp = (rvec2 *) scratch;

    blocks = (n / DEF_BLOCK_SIZE) + 
        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    k_dot_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
        ( a, b, tmp, n );

    cudaThreadSynchronize();
    cudaCheckError();
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>> 
        //k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * BLOCKS_POW_2 >>> 
        ( tmp, tmp + BLOCKS_POW_2, blocks, FINAL );

    cudaThreadSynchronize();
    cudaCheckError();

    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof (rvec2), 
            cudaMemcpyDeviceToHost, "cuda_dot" );

void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int blocks;

    blocks = (n / DEF_BLOCK_SIZE) + 
        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    k_rvec2_pbetad <<< blocks, DEF_BLOCK_SIZE >>> 
        ( x, a, b[0], b[1], c, n);

    cudaThreadSynchronize();
    cudaCheckError();

CUDA_GLOBAL void k_rvec2_to_real_copy( real *dst, rvec2 *src, int index, int n)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i >= n)
    {
        return;
    }
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    dst[i] = src[i][index];

void Cuda_RvecCopy_From(real *dst, rvec2 *src, int index, int n)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int blocks;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    blocks = (n / DEF_BLOCK_SIZE) + 
        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    k_rvec2_to_real_copy <<< blocks, DEF_BLOCK_SIZE >>>
        ( dst, src, index, n);

    cudaThreadSynchronize();
    cudaCheckError();

CUDA_GLOBAL void k_real_to_rvec2_copy( rvec2 *dst, real *src, int index, int n)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i >= n)
    {
        return;
    }
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    dst[i][index] = src[i];

void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int blocks;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    blocks = (n / DEF_BLOCK_SIZE) + 
        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    k_real_to_rvec2_copy <<< blocks, DEF_BLOCK_SIZE >>>
        ( dst, src, index, n);

    cudaThreadSynchronize();
    cudaCheckError();

void Cuda_Dual_Matvec(sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int blocks;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    blocks = (n / DEF_BLOCK_SIZE) + 
        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    cuda_memset( b, 0, sizeof (rvec2) * size, "dual_matvec:result" );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    //One thread per row implementation
    //k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>>
    //        (*H, a, b, n);
    //cudaThreadSynchronize ();
    //cudaCheckError ();
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    //One warp per row implementation
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
#if defined(__SM_35__)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
#else
    k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
                      sizeof (rvec2) * MATVEC_BLOCK_SIZE >>>
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
#endif
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
                      (*H, a, b, n);

    cudaThreadSynchronize();
    cudaCheckError();

void Cuda_Matvec(sparse_matrix *H, real *a, real *b, int n, int size)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    int blocks;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    blocks = (n / DEF_BLOCK_SIZE) + 
        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    cuda_memset( b, 0, sizeof (real) * size, "dual_matvec:result" );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    //one thread per row implementation
    //k_matvec <<< blocks, DEF_BLOCK_SIZE >>>
    //        (*H, a, b, n);
    //cudaThreadSynchronize ();
    //cudaCheckError ();
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
#if defined(__SM_35__)
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
#else
    k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
                 sizeof (real) * MATVEC_BLOCK_SIZE>>>
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
#endif
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
                     (*H, a, b, n);
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    cudaThreadSynchronize();
    cudaCheckError();
}