Commit 9fca672f authored by Kurt A. O'Hearn

PG-PuReMD: code clean-up.

parent a7c6e0e5
@@ -36,7 +36,9 @@ void real_packer( void *dummy, mpi_out_data *out_buf )
real *out = (real*) out_buf->out_atoms;
for ( i = 0; i < out_buf->cnt; ++i )
{
out[i] = buf[ out_buf->index[i] ];
}
}
@@ -47,7 +49,9 @@ void rvec_packer( void *dummy, mpi_out_data *out_buf )
rvec *out = (rvec*)out_buf->out_atoms;
for ( i = 0; i < out_buf->cnt; ++i )
{
memcpy( out[i], buf[ out_buf->index[i] ], sizeof(rvec) );
}
}
@@ -58,12 +62,14 @@ void rvec2_packer( void *dummy, mpi_out_data *out_buf )
rvec2 *out = (rvec2*) out_buf->out_atoms;
for ( i = 0; i < out_buf->cnt; ++i )
{
memcpy( out[i], buf[ out_buf->index[i] ], sizeof(rvec2) );
}
}
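
A note for orientation, not part of the commit: the packers above gather boundary entries into a contiguous send buffer, and the unpackers further down scatter received values back through the same index list. Roughly, the two halves are:

    pack:     out[i]                    = buf[ out_buf->index[i] ];   /* gather */
    unpack:   buf[ out_buf->index[i] ] += in[i];                      /* scatter-add */

Dist applies a packer before each send; Coll applies an unpacker after each receive.
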
void Dist( reax_system* system, mpi_datatypes *mpi_data,
void *buf, MPI_Datatype type, int scale, dist_packer pack )
void Dist( reax_system* system, mpi_datatypes *mpi_data, void *buf,
MPI_Datatype type, int scale, dist_packer pack )
{
int d;
mpi_out_data *out_bufs;
@@ -83,31 +89,41 @@ void Dist( reax_system* system, mpi_datatypes *mpi_data,
/* initiate recvs */
nbr1 = &(system->my_nbrs[2 * d]);
if ( nbr1->atoms_cnt )
{
MPI_Irecv( buf + nbr1->atoms_str * scale, nbr1->atoms_cnt, type,
nbr1->rank, 2 * d + 1, comm, &req1 );
nbr1->rank, 2 * d + 1, comm, &req1 );
}
nbr2 = &(system->my_nbrs[2 * d + 1]);
if ( nbr2->atoms_cnt )
{
MPI_Irecv( buf + nbr2->atoms_str * scale, nbr2->atoms_cnt, type,
nbr2->rank, 2 * d, comm, &req2 );
nbr2->rank, 2 * d, comm, &req2 );
}
/* send both messages in dimension d */
if ( out_bufs[2 * d].cnt )
{
pack( buf, out_bufs + (2 * d) );
MPI_Send( out_bufs[2 * d].out_atoms, out_bufs[2 * d].cnt, type,
nbr1->rank, 2 * d, comm );
nbr1->rank, 2 * d, comm );
}
if ( out_bufs[2 * d + 1].cnt )
{
pack( buf, out_bufs + (2 * d + 1) );
MPI_Send( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt, type,
nbr2->rank, 2 * d + 1, comm );
MPI_Send( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt,
type, nbr2->rank, 2 * d + 1, comm );
}
if ( nbr1->atoms_cnt ) MPI_Wait( &req1, &stat1 );
if ( nbr2->atoms_cnt ) MPI_Wait( &req2, &stat2 );
if( nbr1->atoms_cnt )
{
MPI_Wait( &req1, &stat1 );
}
if( nbr2->atoms_cnt )
{
MPI_Wait( &req2, &stat2 );
}
}
#if defined(DEBUG)
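
For context, Dist implements the usual post-receives-first exchange with the two neighbors along each mesh dimension. A minimal self-contained sketch of that pattern follows; the helper name, the separated send/receive buffers, and plain double payloads are assumptions for illustration, not the project's actual code (which packs into out_bufs first):

#include <mpi.h>

/* sketch only: exchange with the two neighbors along dimension d;
 * both receives are posted before the blocking sends so every send
 * finds a matching receive and the pair of ranks cannot deadlock */
static void exchange_dim( int d, MPI_Comm comm,
        int rank1, double *recv1, int rcnt1, double *send1, int scnt1,
        int rank2, double *recv2, int rcnt2, double *send2, int scnt2 )
{
    MPI_Request req1, req2;

    MPI_Irecv( recv1, rcnt1, MPI_DOUBLE, rank1, 2 * d + 1, comm, &req1 );
    MPI_Irecv( recv2, rcnt2, MPI_DOUBLE, rank2, 2 * d, comm, &req2 );

    /* tags mirror the tags of the receives posted on the neighbor ranks */
    MPI_Send( send1, scnt1, MPI_DOUBLE, rank1, 2 * d, comm );
    MPI_Send( send2, scnt2, MPI_DOUBLE, rank2, 2 * d + 1, comm );

    MPI_Wait( &req1, MPI_STATUS_IGNORE );
    MPI_Wait( &req2, MPI_STATUS_IGNORE );
}
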
@@ -123,7 +139,9 @@ void real_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
real *buf = (real*) dummy_buf;
for ( i = 0; i < out_buf->cnt; ++i )
{
buf[ out_buf->index[i] ] += in[i];
}
}
@@ -138,7 +156,7 @@ void rvec_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
rvec_Add( buf[ out_buf->index[i] ], in[i] );
#if defined(DEBUG)
fprintf( stderr, "rvec_unpacker: cnt=%d i =%d index[i]=%d\n",
out_buf->cnt, i, out_buf->index[i] );
out_buf->cnt, i, out_buf->index[i] );
#endif
}
}
@@ -158,8 +176,8 @@ void rvec2_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
}
void Coll( reax_system* system, mpi_datatypes *mpi_data,
void *buf, MPI_Datatype type, int scale, coll_unpacker unpack )
void Coll( reax_system* system, mpi_datatypes *mpi_data, void *buf,
MPI_Datatype type, int scale, coll_unpacker unpack )
{
int d;
void *in1, *in2;
@@ -172,6 +190,7 @@ void Coll( reax_system* system, mpi_datatypes *mpi_data,
#if defined(DEBUG)
fprintf( stderr, "p%d coll: entered\n", system->my_rank );
#endif
comm = mpi_data->comm_mesh3D;
in1 = mpi_data->in1_buffer;
in2 = mpi_data->in2_buffer;
@@ -182,28 +201,36 @@ void Coll( reax_system* system, mpi_datatypes *mpi_data,
/* initiate recvs */
nbr1 = &(system->my_nbrs[2 * d]);
if ( out_bufs[2 * d].cnt )
{
MPI_Irecv(in1, out_bufs[2 * d].cnt, type, nbr1->rank, 2 * d + 1, comm, &req1);
}
nbr2 = &(system->my_nbrs[2 * d + 1]);
if ( out_bufs[2 * d + 1].cnt )
{
MPI_Irecv(in2, out_bufs[2 * d + 1].cnt, type, nbr2->rank, 2 * d, comm, &req2);
}
/* send both messages in dimension d */
if ( nbr1->atoms_cnt )
{
MPI_Send( buf + nbr1->atoms_str * scale, nbr1->atoms_cnt, type,
nbr1->rank, 2 * d, comm );
nbr1->rank, 2 * d, comm );
}
if ( nbr2->atoms_cnt )
{
MPI_Send( buf + nbr2->atoms_str * scale, nbr2->atoms_cnt, type,
nbr2->rank, 2 * d + 1, comm );
nbr2->rank, 2 * d + 1, comm );
}
#if defined(DEBUG)
fprintf( stderr, "p%d coll[%d] nbr1: str=%d cnt=%d recv=%d\n",
system->my_rank, d, nbr1->atoms_str, nbr1->atoms_cnt,
out_bufs[2 * d].cnt );
system->my_rank, d, nbr1->atoms_str, nbr1->atoms_cnt,
out_bufs[2 * d].cnt );
fprintf( stderr, "p%d coll[%d] nbr2: str=%d cnt=%d recv=%d\n",
system->my_rank, d, nbr2->atoms_str, nbr2->atoms_cnt,
out_bufs[2 * d + 1].cnt );
system->my_rank, d, nbr2->atoms_str, nbr2->atoms_cnt,
out_bufs[2 * d + 1].cnt );
#endif
if ( out_bufs[2 * d].cnt )
@@ -233,7 +260,9 @@ real Parallel_Norm( real *v, int n, MPI_Comm comm )
my_sum = 0;
for ( i = 0; i < n; ++i )
{
my_sum += SQR( v[i] );
}
MPI_Allreduce( &my_sum, &norm_sqr, 1, MPI_DOUBLE, MPI_SUM, comm );
@@ -249,7 +278,9 @@ real Parallel_Dot( real *v1, real *v2, int n, MPI_Comm comm )
my_dot = 0;
for ( i = 0; i < n; ++i )
{
my_dot += v1[i] * v2[i];
}
MPI_Allreduce( &my_dot, &res, 1, MPI_DOUBLE, MPI_SUM, comm );
@@ -265,7 +296,9 @@ real Parallel_Vector_Acc( real *v, int n, MPI_Comm comm )
my_acc = 0;
for ( i = 0; i < n; ++i )
{
my_acc += v[i];
}
MPI_Allreduce( &my_acc, &res, 1, MPI_DOUBLE, MPI_SUM, comm );
@@ -275,29 +308,33 @@ real Parallel_Vector_Acc( real *v, int n, MPI_Comm comm )
/*****************************************************************************/
#if defined(TEST_FORCES)
void Coll_ids_at_Master( reax_system *system, storage *workspace,
mpi_datatypes *mpi_data )
void Coll_ids_at_Master( reax_system *system, storage *workspace, mpi_datatypes
*mpi_data )
{
int i;
int *id_list;
MPI_Gather( &system->n, 1, MPI_INT, workspace->rcounts, 1, MPI_INT,
MASTER_NODE, MPI_COMM_WORLD );
MASTER_NODE, MPI_COMM_WORLD );
if ( system->my_rank == MASTER_NODE )
{
workspace->displs[0] = 0;
for ( i = 1; i < system->wsize; ++i )
{
workspace->displs[i] = workspace->displs[i - 1] + workspace->rcounts[i - 1];
}
}
id_list = (int*) malloc( system->n * sizeof(int) );
for ( i = 0; i < system->n; ++i )
{
id_list[i] = system->my_atoms[i].orig_id;
}
MPI_Gatherv( id_list, system->n, MPI_INT,
workspace->id_all, workspace->rcounts, workspace->displs,
MPI_INT, MASTER_NODE, MPI_COMM_WORLD );
MPI_Gatherv( id_list, system->n, MPI_INT, workspace->id_all,
workspace->rcounts, workspace->displs, MPI_INT, MASTER_NODE,
MPI_COMM_WORLD );
free( id_list );
@@ -305,18 +342,20 @@ void Coll_ids_at_Master( reax_system *system, storage *workspace,
if ( system->my_rank == MASTER_NODE )
{
for ( i = 0 ; i < system->bigN; ++i )
{
fprintf( stderr, "id_all[%d]: %d\n", i, workspace->id_all[i] );
}
}
#endif
}
void Coll_rvecs_at_Master( reax_system *system, storage *workspace,
mpi_datatypes *mpi_data, rvec* v )
mpi_datatypes *mpi_data, rvec* v )
{
MPI_Gatherv( v, system->n, mpi_data->mpi_rvec,
workspace->f_all, workspace->rcounts, workspace->displs,
mpi_data->mpi_rvec, MASTER_NODE, MPI_COMM_WORLD );
MPI_Gatherv( v, system->n, mpi_data->mpi_rvec, workspace->f_all,
workspace->rcounts, workspace->displs, mpi_data->mpi_rvec,
MASTER_NODE, MPI_COMM_WORLD );
}
#endif
@@ -28,13 +28,13 @@
#include "matvec.h"
void get_from_device(real *host, real *device, unsigned int bytes, char *msg)
void get_from_device(real *host, real *device, unsigned int bytes, const char *msg)
{
copy_host_device( host, device, bytes, cudaMemcpyDeviceToHost, msg );
}
void put_on_device(real *host, real *device, unsigned int bytes, char *msg)
void put_on_device(real *host, real *device, unsigned int bytes, const char *msg)
{
copy_host_device( host, device, bytes, cudaMemcpyHostToDevice, msg );
}
@@ -73,7 +73,7 @@ void Cuda_CG_Preconditioner(real *res, real *a, real *b, int count)
}
CUDA_GLOBAL void k_diagnol_preconditioner(storage p_workspace, rvec2 *b, int n)
CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n)
{
storage *workspace = &( p_workspace );
int j = blockIdx.x * blockDim.x + threadIdx.x;
@@ -95,14 +95,14 @@ CUDA_GLOBAL void k_diagnol_preconditioner(storage p_workspace, rvec2 *b, int n)
}
void Cuda_CG_Diagnol_Preconditioner(storage *workspace, rvec2 *b, int n)
void Cuda_CG_Diagonal_Preconditioner(storage *workspace, rvec2 *b, int n)
{
int blocks;
blocks = (n / DEF_BLOCK_SIZE) +
(( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
k_diagnol_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
k_diagonal_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
(*workspace, b, n);
cudaThreadSynchronize();
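
As an aside, the block-count arithmetic above is ceiling division; for positive n it is equivalent to the usual one-liner (an observation only, not a change made by this commit):

    blocks = (n + DEF_BLOCK_SIZE - 1) / DEF_BLOCK_SIZE;
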
@@ -147,12 +147,14 @@ CUDA_GLOBAL void k_dual_cg_preconditioner(storage p_workspace, rvec2 *x,
}
void Cuda_DualCG_Preconditioer(storage *workspace, rvec2 *x, rvec2 alpha, int n, rvec2 result)
void Cuda_DualCG_Preconditioner(storage *workspace, rvec2 *x, rvec2 alpha,
int n, rvec2 result)
{
int blocks;
rvec2 *tmp = (rvec2 *) scratch;
cuda_memset( tmp, 0, sizeof (rvec2) * ( 2 * n + 1), "cuda_dualcg_preconditioner" );
cuda_memset( tmp, 0, sizeof (rvec2) * ( 2 * n + 1),
"cuda_dualcg_preconditioner" );
blocks = (n / DEF_BLOCK_SIZE) +
(( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
@@ -163,19 +165,20 @@ void Cuda_DualCG_Preconditioer(storage *workspace, rvec2 *x, rvec2 alpha, int n,
cudaCheckError();
//Reduction to calculate my_dot
k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
( tmp, tmp + n, n);
cudaThreadSynchronize();
cudaCheckError();
k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>>
k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
( tmp + n, tmp + 2*n, blocks);
cudaThreadSynchronize();
cudaCheckError();
copy_host_device( result, (tmp + 2*n), sizeof (rvec2), cudaMemcpyDeviceToHost, "my_dot" );
copy_host_device( result, (tmp + 2*n), sizeof(rvec2),
cudaMemcpyDeviceToHost, "my_dot" );
}
@@ -24,23 +24,25 @@
#include "reax_types.h"
#ifdef __cplusplus
extern "C" {
#endif
void get_from_device (real *host, real *device, unsigned int bytes, char *);
void put_on_device (real *host, real *device, unsigned int bytes, char *);
void Cuda_Vector_Sum (real *res, real a, real *x, real b, real *y, int count);
void Cuda_CG_Preconditioner (real *res, real *a, real *b, int count);
void Cuda_CG_Diagnol_Preconditioner (storage *workspace, rvec2 *b, int n);
void Cuda_DualCG_Preconditioer (storage *workspace, rvec2 *, rvec2 alpha, int n, rvec2 result);
void Cuda_Norm (rvec2 *arr, int n, rvec2 result);
void Cuda_Dot (rvec2 *a, rvec2 *b, rvec2 result, int n);
void Cuda_Vector_Sum_Rvec2 (rvec2 *x, rvec2 *, rvec2 , rvec2 *c, int n);
void Cuda_RvecCopy_From (real *dst, rvec2 *src, int index, int n);
void Cuda_RvecCopy_To (rvec2 *dst, real *src, int index, int n);
void Cuda_Dual_Matvec (sparse_matrix *, rvec2 *, rvec2 *, int , int);
void Cuda_Matvec (sparse_matrix *, real *, real *, int , int);
void get_from_device(real *host, real *device, unsigned int bytes, const char *);
void put_on_device(real *host, real *device, unsigned int bytes, const char *);
void Cuda_Vector_Sum(real *res, real a, real *x, real b, real *y, int count);
void Cuda_CG_Preconditioner(real *res, real *a, real *b, int count);
void Cuda_CG_Diagonal_Preconditioner(storage *workspace, rvec2 *b, int n);
void Cuda_DualCG_Preconditioner(storage *workspace, rvec2 *, rvec2 alpha, int n, rvec2 result);
void Cuda_Norm(rvec2 *arr, int n, rvec2 result);
void Cuda_Dot(rvec2 *a, rvec2 *b, rvec2 result, int n);
void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *, rvec2 , rvec2 *c, int n);
void Cuda_RvecCopy_From(real *dst, rvec2 *src, int index, int n);
void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n);
void Cuda_Dual_Matvec(sparse_matrix *, rvec2 *, rvec2 *, int , int);
void Cuda_Matvec(sparse_matrix *, real *, real *, int , int);
#ifdef __cplusplus
}
@@ -28,9 +28,8 @@
#include "validation.h"
CUDA_GLOBAL void ker_init_matvec( reax_atom *my_atoms,
single_body_parameters *sbp,
storage p_workspace, int n )
CUDA_GLOBAL void ker_init_matvec( reax_atom *my_atoms, single_body_parameters
*sbp, storage p_workspace, int n )
{
storage *workspace = &( p_workspace );
reax_atom *atom;
@@ -25,40 +25,35 @@
#include "reax_types.h"
#include "reax_types.h"
#ifdef __cplusplus
extern "C" {
#endif
/*
* Part of the code is taken from this site.
* And the other is taken from the download in the PGPuReMD folder on CUPID
http://wenda.baba.io/questions/4481817/overloading-the-cuda-shuffle-function-makes-the-original-ones-invisible.html
*/
#if defined(__SM_35__)
/* Part of the code is taken from this site.
* And the other is taken from the download in the PGPuReMD folder on CUPID
* http://wenda.baba.io/questions/4481817/overloading-the-cuda-shuffle-function-makes-the-original-ones-invisible.html
*/
CUDA_DEVICE inline real shfl(real x, int lane)
{
// Split the double number into 2 32b registers.
//
int lo, hi;
asm volatile( "mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(x));
asm volatile( "mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(x) );
// Shuffle the two 32b registers.
//
lo = __shfl_xor(lo, lane);
hi = __shfl_xor(hi, lane);
lo = __shfl_xor( lo, lane );
hi = __shfl_xor( hi, lane );
// Recreate the 64b number.
// //asm volatile( "mov.b64 %0, {%1,%2};" : "=d(x)" : "r"(lo), "r"(hi));
// //return x;
//
return __hiloint2double( hi, lo);
//asm volatile( "mov.b64 %0, {%1,%2};" : "=d(x)" : "r"(lo), "r"(hi) );
//return x;
return __hiloint2double( hi, lo );
}
#endif
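
For context, the shfl() helper above is the building block for warp-level sums of doubles on SM 3.5; the butterfly loop that the reduction kernels elsewhere in this commit use looks roughly like this. A sketch reusing the names defined in this header, not code from the commit:

/* sketch only: would live under the same __SM_35__ guard as shfl();
 * after the loop every lane of the warp holds the full warp sum */
CUDA_DEVICE inline real warp_sum( real x )
{
    for ( int z = 16; z >= 1; z /= 2 )
    {
        x += shfl( x, z );
    }
    return x;
}
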
#ifdef __cplusplus
}
#endif
This diff is collapsed.
@@ -69,8 +69,9 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
}
int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout, simulation_data *data )
int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
*b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout,
simulation_data *data )
{
int i, j, n, N, matvecs, scale;
rvec2 tmp, alpha, beta;
@@ -341,13 +342,17 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
// Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
//#endif
get_from_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "CG:q2:get" );
get_from_device (spad, dev_workspace->q2, sizeof (rvec2) *
system->total_cap, "CG:q2:get" );
Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
put_on_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "CG:q2:put" );
put_on_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap,
"CG:q2:put" );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &matvec_time );
}
#endif
//#ifdef __CUDA_DEBUG__
@@ -419,9 +424,11 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
// Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer);
//#endif
get_from_device( spad, dev_workspace->d2, sizeof (rvec2) * system->total_cap, "cg:d2:get" );
get_from_device( spad, dev_workspace->d2, sizeof (rvec2) *
system->total_cap, "cg:d2:get" );
Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
put_on_device( spad, dev_workspace->d2, sizeof (rvec2) * system->total_cap, "cg:d2:put" );
put_on_device( spad, dev_workspace->d2, sizeof (rvec2) *
system->total_cap, "cg:d2:put" );
//print_device_rvec2 (dev_workspace->d2, N);
@@ -429,7 +436,8 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
// dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N );
//#endif
Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N, system->total_cap );
Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N,
system->total_cap );
/*
fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n );
@@ -450,9 +458,12 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
// Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
//#endif
get_from_device( spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "cg:q2:get" );
Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker );
put_on_device( spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "cg:q2:put" );
get_from_device( spad, dev_workspace->q2, sizeof (rvec2) *
system->total_cap, "cg:q2:get" );
Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale,
rvec2_unpacker );
put_on_device( spad, dev_workspace->q2, sizeof (rvec2) *
system->total_cap, "cg:q2:put" );
// compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
@@ -503,7 +514,7 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
//#endif
my_dot[0] = my_dot[1] = 0;
Cuda_DualCG_Preconditioer( dev_workspace, x, alpha, system->n, my_dot );
Cuda_DualCG_Preconditioner( dev_workspace, x, alpha, system->n, my_dot );
//fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
@@ -634,8 +645,8 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
}
int CG( reax_system *system, storage *workspace, sparse_matrix *H,
real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout)
int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
real tol, real *x, mpi_datatypes* mpi_data, FILE *fout)
{
int i, j, scale;
real tmp, alpha, beta, b_norm;
@@ -722,8 +733,8 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
#ifdef HAVE_CUDA
int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H,
real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
*b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
{
int i, j, scale;
real tmp, alpha, beta, b_norm;
@@ -734,40 +745,49 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H,
/* x is on the device */
//MVAPICH2
memset (spad, 0, sizeof (real) * system->total_cap);
get_from_device (spad, x, sizeof (real) * system->total_cap, "cuda_cg:x:get");
memset( spad, 0, sizeof (real) * system->total_cap );
get_from_device( spad, x, sizeof (real) * system->total_cap, "cuda_cg:x:get" );
Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
//MVAPICH2
put_on_device (spad, x, sizeof (real) * system->total_cap , "cuda_cg:x:put");
put_on_device( spad, x, sizeof (real) * system->total_cap , "cuda_cg:x:put" );
Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap );
// tryQEq
// MVAPICH2
get_from_device (spad, dev_workspace->q, sizeof (real) * system->total_cap, "cuda_cg:q:get" );
get_from_device( spad, dev_workspace->q, sizeof (real) * system->total_cap,
"cuda_cg:q:get" );
Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
//MVAPICH2
put_on_device (spad, dev_workspace->q, sizeof (real) * system->total_cap, "cuda_cg:q:put" );
put_on_device( spad, dev_workspace->q, sizeof (real) * system->total_cap,
"cuda_cg:q:put" );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &matvec_time );
}
#endif
Cuda_Vector_Sum( dev_workspace->r , 1., b, -1., dev_workspace->q, system->n );
Cuda_Vector_Sum( dev_workspace->r , 1., b, -1., dev_workspace->q,
system->n );
//for( j = 0; j < system->n; ++j )
// workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
Cuda_CG_Preconditioner (dev_workspace->d, dev_workspace->r, dev_workspace->Hdia_inv, system->n);
Cuda_CG_Preconditioner( dev_workspace->d, dev_workspace->r,
dev_workspace->Hdia_inv, system->n );
//TODO do the parallel_norm on the device for the local sum
get_from_device (spad, b, sizeof (real) * system->n, "cuda_cg:b:get");
get_from_device( spad, b, sizeof (real) * system->n, "cuda_cg:b:get" );
b_norm = Parallel_Norm( spad, system->n, mpi_data->world );
//TODO do the parallel dot on the device for the local sum
get_from_device (spad, dev_workspace->r, sizeof (real) * system->total_cap, "cuda_cg:r:get");
get_from_device (spad + system->total_cap, dev_workspace->d, sizeof (real) * system->total_cap, "cuda_cg:d:get");
sig_new = Parallel_Dot(spad, spad + system->total_cap, system->n, mpi_data->world);
get_from_device( spad, dev_workspace->r, sizeof (real) * system->total_cap,
"cuda_cg:r:get" );
get_from_device( spad + system->total_cap, dev_workspace->d, sizeof (real)
* system->total_cap, "cuda_cg:d:get" );
sig_new = Parallel_Dot( spad, spad + system->total_cap, system->n,
mpi_data->world );
sig0 = sig_new;
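
For readers following the solver, the loop below is plain diagonally preconditioned CG; summarizing its device calls as recurrences in the code's own variable names (a reading aid, not part of the commit):

    q       = H * d                     (Cuda_Matvec)
    alpha   = sig_new / (d . q)
    x       = x + alpha * d
    r       = r - alpha * q
    p       = Hdia_inv .* r             (Cuda_CG_Preconditioner)
    sig_old = sig_new
    sig_new = r . p
    beta    = sig_new / sig_old
    d       = p + beta * d

with the iteration stopping once SQRT(sig_new) / b_norm drops to tol.
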
@@ -781,16 +801,20 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H,
for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
{
//MVAPICH2
get_from_device (spad, dev_workspace->d, sizeof (real) * system->total_cap, "cuda_cg:d:get");
get_from_device( spad, dev_workspace->d, sizeof (real) *
system->total_cap, "cuda_cg:d:get" );
Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
put_on_device (spad, dev_workspace->d, sizeof (real) * system->total_cap, "cuda_cg:d:put");
put_on_device( spad, dev_workspace->d, sizeof (real) *
system->total_cap, "cuda_cg:d:put" );
Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap );
//tryQEq
get_from_device (spad, dev_workspace->q, sizeof (real) * system->total_cap, "cuda_cg:q:get" );
Coll(system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker);
put_on_device (spad, dev_workspace->q, sizeof (real) * system->total_cap , "cuda_cg:q:get");
get_from_device( spad, dev_workspace->q, sizeof (real) *
system->total_cap, "cuda_cg:q:get" );
Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
put_on_device( spad, dev_workspace->q, sizeof (real) *
system->total_cap , "cuda_cg:q:get" );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
@@ -800,31 +824,38 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H,
#endif
//TODO do the parallel dot on the device for the local sum
get_from_device (spad, dev_workspace->d, sizeof (real) * system->n, "cuda_cg:d:get");
get_from_device (spad + system->n, dev_workspace->q, sizeof (real) * system->n, "cuda_cg:q:get");
tmp = Parallel_Dot(spad, spad + system->n, system->n, mpi_data->world);
get_from_device( spad, dev_workspace->d, sizeof (real) * system->n,
"cuda_cg:d:get" );
get_from_device( spad + system->n, dev_workspace->q, sizeof (real) *
system->n, "cuda_cg:q:get" );
tmp = Parallel_Dot( spad, spad + system->n, system->n, mpi_data->world );
alpha = sig_new / tmp;
//Cuda_Vector_Add( x, alpha, dev_workspace->d, system->n );
Cuda_Vector_Sum( x, alpha, dev_workspace->d, 1.0, x, system->n );
//Cuda_Vector_Add( workspace->r, -alpha, workspace->q, system->n );
Cuda_Vector_Sum( dev_workspace->r, -alpha, dev_workspace->q, 1.0, dev_workspace->r, system->n );
Cuda_Vector_Sum( dev_workspace->r, -alpha, dev_workspace->q, 1.0,
dev_workspace->r, system->n );
/* pre-conditioning */
//for( j = 0; j < system->n; ++j )
// workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
Cuda_CG_Preconditioner (dev_workspace->p, dev_workspace->r, dev_workspace->Hdia_inv, system->n);
Cuda_CG_Preconditioner( dev_workspace->p, dev_workspace->r,
dev_workspace->Hdia_inv, system->n );
sig_old = sig_new;
//TODO do the parallel dot on the device for the local sum
get_from_device (spad, dev_workspace->r, sizeof (real) * system->n, "cuda_cg:r:get");
get_from_device (spad + system->n, dev_workspace->p, sizeof (real) * system->n, "cuda_cg:p:get");
sig_new = Parallel_Dot(spad , spad + system->n, system->n, mpi_data->world);
get_from_device( spad, dev_workspace->r, sizeof (real) * system->n,
"cuda_cg:r:get" );
get_from_device( spad + system->n, dev_workspace->p, sizeof (real) *
system->n, "cuda_cg:p:get" );
sig_new = Parallel_Dot( spad , spad + system->n, system->n, mpi_data->world );
//fprintf (stderr, "Device: sig_new: %f \n", sig_new );
beta = sig_new / sig_old;
Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta, dev_workspace->d, system->n );
Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta,
dev_workspace->d, system->n );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
@@ -845,8 +876,8 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H,
#endif
int CG_test( reax_system *system, storage *workspace, sparse_matrix *H,
real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
int CG_test( reax_system *system, storage *workspace, sparse_matrix *H, real
*b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
{
int i, j, scale;
real tmp, alpha, beta, b_norm;
@@ -1013,10 +1044,9 @@ void Backward_Subs( sparse_matrix *U, real *y, real *x )
}
int PCG( reax_system *system, storage *workspace,
sparse_matrix *H, real *b, real tol,
sparse_matrix *L, sparse_matrix *U, real *x,
mpi_datatypes* mpi_data, FILE *fout )
int PCG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
real tol, sparse_matrix *L, sparse_matrix *U, real *x, mpi_datatypes*
mpi_data, FILE *fout )
{
int i, me, n, N, scale;
real tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new;
@@ -1028,6 +1058,7 @@ int PCG( reax_system *system, storage *workspace,
world = mpi_data->world;
scale = sizeof(real) / sizeof(void);
b_norm = Parallel_Norm( b, n, world );
#if defined(DEBUG_FOCUS)
if ( me == MASTER_NODE )
{
@@ -1045,6 +1076,7 @@ int PCG( reax_system *system, storage *workspace,
Forward_Subs( L, workspace->r, workspace->d );
Backward_Subs( U, workspace->d, workspace->p );
sig_new = Parallel_Dot( workspace->r, workspace->p, n, world );
#if defined(DEBUG_FOCUS)
if ( me == MASTER_NODE )
{
@@ -1065,19 +1097,25 @@ int PCG( reax_system *system, storage *workspace,
tmp = Parallel_Dot( workspace->q, workspace->p, n, world );
alpha = sig_new / tmp;
Vector_Add( x, alpha, workspace->p, n );
#if defined(DEBUG_FOCUS)
if ( me == MASTER_NODE )
fprintf(stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n",
i, Parallel_Norm(workspace->p, n, world),
{
fprintf(stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n", i,
Parallel_Norm(workspace->p, n, world),
Parallel_Norm(workspace->q, n, world), tmp );
}
MPI_Barrier( world );
#endif
Vector_Add( workspace->r, -alpha, workspace->q, n );
r_norm = Parallel_Norm( workspace->r, n, world );
#if defined(DEBUG_FOCUS)
if ( me == MASTER_NODE )
{
fprintf( stderr, "iter%d: res=%.15e\n", i, r_norm );
}
MPI_Barrier( world );
#endif
@@ -1091,10 +1129,15 @@ int PCG( reax_system *system, storage *workspace,
#if defined(DEBUG_FOCUS)
if ( me == MASTER_NODE )
{
fprintf( stderr, "PCG took %d iterations\n", i );
}
#endif
if ( i >= 100 )
{
fprintf( stderr, "PCG convergence failed!\n" );
}
return i;
}
@@ -1109,6 +1152,7 @@ int sCG( reax_system *system, storage *workspace, sparse_matrix *H,
real sig_old, sig_new, sig0;
b_norm = Norm( b, system->n );
#if defined(DEBUG)
if ( system->my_rank == MASTER_NODE )
{
@@ -484,9 +484,9 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
#ifdef HAVE_CUDA
void Cuda_QEq( reax_system *system, control_params *control, simulation_data *data,
storage *workspace, output_controls *out_control,
mpi_datatypes *mpi_data )
void Cuda_QEq( reax_system *system, control_params *control, simulation_data
*data, storage *workspace, output_controls *out_control, mpi_datatypes
*mpi_data )
{
int s_matvecs, t_matvecs;
......@@ -509,8 +509,9 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data *da
#endif
//MATRIX CHANGES
s_matvecs = Cuda_dual_CG(system, workspace, &dev_workspace->H, dev_workspace->b,
control->q_err, dev_workspace->x, mpi_data, out_control->log, data);
s_matvecs = Cuda_dual_CG(system, workspace, &dev_workspace->H,
dev_workspace->b, control->q_err, dev_workspace->x, mpi_data,
out_control->log, data);
t_matvecs = 0;
//fprintf (stderr, "Device: First CG complated with iterations: %d \n", s_matvecs);
@@ -25,9 +25,9 @@
#include "reax_types.h"
void QEq( reax_system*, control_params*, simulation_data*,
storage*, output_controls*, mpi_datatypes* );
storage*, output_controls*, mpi_datatypes* );
void Cuda_QEq( reax_system*, control_params*, simulation_data*,
storage*, output_controls*, mpi_datatypes* );
storage*, output_controls*, mpi_datatypes* );
#endif
@@ -4,7 +4,9 @@
#include "cuda_shuffle.h"
CUDA_GLOBAL void k_reduction(const real *input, real *per_block_results, const size_t n)
CUDA_GLOBAL void k_reduction(const real *input, real *per_block_results,
const size_t n)
{
#if defined(__SM_35__)
extern __shared__ real my_results[];
@@ -12,46 +14,56 @@ CUDA_GLOBAL void k_reduction(const real *input, real *per_block_results, const s
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
real x = 0;
if(i < n)
if( i < n )
{
x = input[i];
}
sdata = x;
__syncthreads();
for(int z = 16; z >=1; z/=2)
sdata+= shfl ( sdata, z);
for( int z = 16; z >=1; z/=2 )
{
sdata += shfl( sdata, z );
}
if (threadIdx.x % 32 == 0)
if ( threadIdx.x % 32 == 0 )
{
my_results[threadIdx.x >> 5] = sdata;
}
__syncthreads ();
__syncthreads();
for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
if(threadIdx.x < offset)
for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 )
{
if( threadIdx.x < offset )
{
my_results[threadIdx.x] += my_results[threadIdx.x + offset];
}
__syncthreads();
}
if(threadIdx.x == 0)
if( threadIdx.x == 0 )
{
per_block_results[blockIdx.x] = my_results[0];
}
#else
extern __shared__ real sdata[];
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
real x = 0;
if(i < n)
if( i < n )
{
x = input[i];
}
sdata[threadIdx.x] = x;
__syncthreads();
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 )
{
if(threadIdx.x < offset)
if( threadIdx.x < offset )
{
sdata[threadIdx.x] += sdata[threadIdx.x + offset];
}
@@ -59,65 +71,72 @@ CUDA_GLOBAL void k_reduction(const real *input, real *per_block_results, const s
__syncthreads();
}
if(threadIdx.x == 0)
if( threadIdx.x == 0 )
{
per_block_results[blockIdx.x] = sdata[0];
}
#endif
}
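
For context, the host drives this kernel in two passes, the same pattern visible in Cuda_DualCG_Preconditioner earlier in this commit. A launch sketch, with d_in, d_part, and d_out as hypothetical device buffers and assuming the block count fits within one final block (blocks <= BLOCKS_POW_2):

    blocks = (n + DEF_BLOCK_SIZE - 1) / DEF_BLOCK_SIZE;

    /* pass 1: one partial sum per block */
    k_reduction <<< blocks, DEF_BLOCK_SIZE, sizeof(real) * DEF_BLOCK_SIZE >>>
        ( d_in, d_part, n );

    /* pass 2: a single block folds the per-block partials into the final value */
    k_reduction <<< 1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>>
        ( d_part, d_out, blocks );
    cudaThreadSynchronize();
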
CUDA_GLOBAL void k_reduction_rvec (rvec *input, rvec *results, size_t n)
CUDA_GLOBAL void k_reduction_rvec(rvec *input, rvec *results, size_t n)
{
#if defined(__SM_35__)
extern __shared__ rvec my_rvec[];
rvec sdata;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
rvec_MakeZero( sdata );
if(i < n)
rvec_Copy (sdata, input[i]);
if( i < n )
{
rvec_Copy( sdata, input[i] );
}
__syncthreads();
for(int z = 16; z >=1; z/=2){
sdata[0] += shfl ( sdata[0], z);
sdata[1] += shfl ( sdata[1], z);
sdata[2] += shfl ( sdata[2], z);
for( int z = 16; z >=1; z/=2 )
{
sdata[0] += shfl( sdata[0], z);
sdata[1] += shfl( sdata[1], z);
sdata[2] += shfl( sdata[2], z);
}
if (threadIdx.x % 32 == 0)
if ( threadIdx.x % 32 == 0 )
{
rvec_Copy( my_rvec[threadIdx.x >> 5] , sdata );
}
__syncthreads ();
for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
if(threadIdx.x < offset)
for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 )
{
if( threadIdx.x < offset )
{
rvec_Add( my_rvec[threadIdx.x], my_rvec[threadIdx.x + offset] );
}
__syncthreads();
}
if(threadIdx.x == 0)
rvec_Add (results[blockIdx.x], my_rvec[0]);
if( threadIdx.x == 0 )
{
rvec_Add( results[blockIdx.x], my_rvec[0] );
}
#else
extern __shared__ rvec svec_data[];
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
rvec x;
rvec_MakeZero (x);
rvec_MakeZero( x );
if(i < n)
{
rvec_Copy (x, input[i]);
rvec_Copy( x, input[i] );
}
rvec_Copy (svec_data[threadIdx.x], x);
rvec_Copy(svec_data[threadIdx.x], x);
__syncthreads();
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
@@ -136,14 +155,12 @@ CUDA_GLOBAL void k_reduction_rvec (rvec *input, rvec *results, size_t n)
rvec_Add (results[blockIdx.x], svec_data[0]);
}
#endif
}
CUDA_GLOBAL void k_reduction_rvec2 (rvec2 *input, rvec2 *results, size_t n)
{
#if defined(__SM_35__)
extern __shared__ rvec2 my_rvec2[];
rvec2 sdata;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -222,40 +239,51 @@ CUDA_GLOBAL void k_reduction_rvec2 (rvec2 *input, rvec2 *results, size_t n)
#endif
}
CUDA_GLOBAL void k_dot (const real *a, const real *b, real *per_block_results, const size_t n )
CUDA_GLOBAL void k_dot (const real *a, const real *b, real *per_block_results,
const size_t n )
{
#if defined(__SM_35__)
extern __shared__ real my_dot[];
real sdot;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdot = 0.0;
if(i < n)
if( i < n )
{
sdot = a[i] * b[i];
}
__syncthreads();
for(int z = 16; z >=1; z/=2)
for( int z = 16; z >=1; z/=2 )
{
sdot += shfl ( sdot, z);
}
if (threadIdx.x % 32 == 0)
if( threadIdx.x % 32 == 0 )
{
my_dot[threadIdx.x >> 5] = sdot;
}
__syncthreads ();
for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
if(threadIdx.x < offset)
for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 )
{
if( threadIdx.x < offset )
{
my_dot[threadIdx.x] += my_dot[threadIdx.x + offset];
}
__syncthreads();
}
if(threadIdx.x == 0)
if( threadIdx.x == 0 )
{
per_block_results[blockIdx.x] = my_dot[0];
}
#else
extern __shared__ real sdot[];
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
real x = 0;
@@ -283,39 +311,49 @@ CUDA_GLOBAL void k_dot (const real *a, const real *b, real *per_block_results, c
}
#endif
}
CUDA_GLOBAL void k_norm (const real *input, real *per_block_results, const size_t n, int pass)
{
#if defined(__SM_35__)
extern __shared__ real my_norm[];
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
real snorm = 0.0;
if(i < n)
if( i < n )
{
snorm = SQR (input[i]);
}
__syncthreads();
for(int z = 16; z >=1; z/=2)
{
snorm += shfl ( snorm, z);
}
if (threadIdx.x % 32 == 0)
{
my_norm[threadIdx.x >> 5] = snorm;
}
__syncthreads ();
for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1)
{
if(threadIdx.x < offset)
{
my_norm[threadIdx.x] += my_norm[threadIdx.x + offset];
}
__syncthreads();
}
if(threadIdx.x == 0)
{
per_block_results[blockIdx.x] = my_norm[0];
}
#else
extern __shared__ real snorm[];
@@ -323,7 +361,9 @@ CUDA_GLOBAL void k_norm (const real *input, real *per_block_results, const size_
real x = 0;
if(i < n)
{
x = SQR (input[i]);
}
snorm[threadIdx.x] = x;
__syncthreads();
@@ -339,24 +379,24 @@ CUDA_GLOBAL void k_norm (const real *input, real *per_block_results, const size_
}
if(threadIdx.x == 0)
{
per_block_results[blockIdx.x] = snorm[0];
}
#endif
}
CUDA_GLOBAL void k_norm_rvec2 (const rvec2 *input, rvec2 *per_block_results, const size_t n, int pass)
CUDA_GLOBAL void k_norm_rvec2 (const rvec2 *input, rvec2 *per_block_results,
const size_t n, int pass)
{
#if defined(__SM_35__)
extern __shared__ rvec2 my_norm2[];
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
rvec2 snorm2;
snorm2[0] = snorm2[1] = 0;
if(i < n) {
if(i < n)
{
if (pass == INITIAL) {
snorm2[0] = SQR (input[i][0]);
snorm2[1] = SQR (input[i][1]);
@@ -367,7 +407,8 @@ CUDA_GLOBAL void k_norm_rvec2 (const rvec2 *input, rvec2 *per_block_results, con
}
__syncthreads();
for(int z = 16; z >=1; z/=2){
for(int z = 16; z >=1; z/=2)
{
snorm2[0] += shfl ( snorm2[0], z);
snorm2[1] += shfl ( snorm2[1], z);
}
@@ -394,17 +435,20 @@ CUDA_GLOBAL void k_norm_rvec2 (const rvec2 *input, rvec2 *per_block_results, con
}
#else
extern __shared__ rvec2 snorm2[];
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
rvec2 x;
x[0] = x[1] = 0;
if(i < n) {
if (pass == INITIAL) {
if( i < n )
{
if( pass == INITIAL )
{
x[0] = SQR (input[i][0]);
x[1] = SQR (input[i][1]);
} else {
}
else
{
x[0] = input[i][0];
x[1] = input[i][1];
}
@@ -425,17 +469,19 @@ CUDA_GLOBAL void k_norm_rvec2 (const rvec2 *input, rvec2 *per_block_results, con
__syncthreads();
}
if(threadIdx.x == 0) {
if(threadIdx.x == 0)
{
per_block_results[blockIdx.x][0] = snorm2[0][0];
per_block_results[blockIdx.x][1] = snorm2[0][1];
}
#endif
}
CUDA_GLOBAL void k_dot_rvec2 (const rvec2 *a, rvec2 *b, rvec2 *res, const size_t n)
CUDA_GLOBAL void k_dot_rvec2(const rvec2 *a, rvec2 *b, rvec2 *res,
const size_t n)
{
#if defined(__SM_35__)
extern __shared__ rvec2 my_dot2[];
rvec2 sdot2;
@@ -509,14 +555,19 @@ CUDA_GLOBAL void k_dot_rvec2 (const rvec2 *a, rvec2 *b, rvec2 *res, const size_t
#endif
}
//////////////////////////////////////////////////
//vector functions
//////////////////////////////////////////////////
CUDA_GLOBAL void k_vector_sum( real* dest, real c, real* v, real d, real* y, int k )
CUDA_GLOBAL void k_vector_sum( real* dest, real c, real* v, real d, real* y,
int k )
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if ( i >= k) return;
if( i >= k )
{
return;
}
dest[i] = c * v[i] + d * y[i];
}
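
These element-wise kernels assign one thread per entry and return early past the bound, so the ragged last block is handled by the guard rather than the launch geometry. A host-side launch sketch, with hypothetical device pointers, not part of the commit:

    blocks = (k + DEF_BLOCK_SIZE - 1) / DEF_BLOCK_SIZE;
    k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>> ( dest, c, v, d, y, k );
    cudaThreadSynchronize();
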
@@ -525,26 +576,40 @@
CUDA_GLOBAL void k_vector_mul( real* dest, real* v, real* y, int k )
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if ( i >= k) return;
if( i >= k )
{
return;
}
dest[i] = v[i] * y[i];
}
CUDA_GLOBAL void k_rvec2_mul( rvec2* dest, rvec2* v, rvec2* y, int k )
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if ( i >= k) return;
if( i >= k )
{
return;
}
dest[i][0] = v[i][0] * y[i][0];
dest[i][1] = v[i][1] * y[i][1];
}
CUDA_GLOBAL void k_rvec2_pbetad (rvec2 *dest, rvec2 *a,
real beta0, real beta1,
rvec2 *b, int n)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if ( i >= n) return;
if( i >= n )
{
return;
}
dest[i][0] = a[i][0] + beta0 * b[i][0];
dest[i][1] = a[i][1] + beta1 * b[i][1];