/*----------------------------------------------------------------------
PuReMD - Purdue ReaxFF Molecular Dynamics Program

Copyright (2010) Purdue University
Hasan Metin Aktulga, haktulga@cs.purdue.edu
Joseph Fogarty, jcfogart@mail.usf.edu
Sagar Pandit, pandit@usf.edu
Ananth Y Grama, ayg@cs.purdue.edu

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details:
<http://www.gnu.org/licenses/>.
----------------------------------------------------------------------*/
#include "linear_solvers.h"
#include "basic_comm.h"
#include "io_tools.h"
#include "tool_box.h"
#include "vector.h"
#ifdef HAVE_CUDA
#endif
#if defined(CG_PERFORMANCE)
real t_start, t_elapsed, matvec_time, dot_time;
#endif
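/* Dual sparse matrix-vector product: computes b = A * x for both components
 * of each rvec2 right-hand side at once. A is stored in half (symmetric)
 * format, so every off-diagonal entry (i, j) is applied to row i and to
 * row j; rows with i >= A->n receive only these transpose contributions. */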
void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
{
int i, j, k, si;
real H;
for ( i = 0; i < N; ++i )
b[i][0] = b[i][1] = 0;

/* perform multiplication */
for ( i = 0; i < A->n; ++i )
{
si = A->start[i];
b[i][0] += A->entries[si].val * x[i][0];
b[i][1] += A->entries[si].val * x[i][1];
for ( k = si + 1; k < A->end[i]; ++k )
{
j = A->entries[k].j;
H = A->entries[k].val;
b[i][0] += H * x[j][0];
b[i][1] += H * x[j][1];
// comment out for tryQEq
//if( j < A->n ) {
b[j][0] += H * x[i][0];
b[j][1] += H * x[i][1];
//}
}
}
}
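/* Dual preconditioned conjugate gradient solver for the two QEq linear
 * systems: component 0 of each rvec2 tracks the s system (b_s) and
 * component 1 the t system (b_t). Both are iterated together with a
 * diagonal (Jacobi) preconditioner; once one of them converges, the
 * remaining system is finished off with a single-vector CG call. */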
int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
*b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout,
simulation_data *data )
{
int i, j, n, N, matvecs, scale;
rvec2 tmp, alpha, beta;
rvec2 my_sum, norm_sqr, b_norm, my_dot;
rvec2 sig_old, sig_new;
MPI_Comm comm;
n = system->n;
N = system->N;
comm = mpi_data->world;
matvecs = 0;
scale = sizeof(rvec2) / sizeof(void);
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
matvecs = 0;
t_start = matvec_time = dot_time = 0;
t_start = Get_Time( );
}
#endif
#ifdef HAVE_CUDA
#endif

Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer );

#ifdef HAVE_CUDA
#endif

dual_Sparse_MatVec( H, x, workspace->q2, N );

// tryQEq
Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &matvec_time );
#endif
for ( j = 0; j < system->n; ++j )
{
/* residual */
workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
/* apply diagonal pre-conditioner */
workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
}
/* norm of b */
my_sum[0] = my_sum[1] = 0;
for ( j = 0; j < n; ++j )
{
my_sum[0] += SQR( b[j][0] );
my_sum[1] += SQR( b[j][1] );
//fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
}

MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
b_norm[0] = SQRT( norm_sqr[0] );
b_norm[1] = SQRT( norm_sqr[1] );
//fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] );
/* dot product: r.d */
my_dot[0] = my_dot[1] = 0;
for ( j = 0; j < n; ++j )
{
my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0];
my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1];
//fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
}

MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "HOST:sig_new: %f %f\n", sig_new[0], sig_new[1] );

#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &dot_time );
#endif
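/* main dual CG loop: at most 300 iterations, each performing one
 * distributed matvec and two global reductions (d.q and r.p), then
 * updating x, r, p, and d for both right-hand sides */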
for ( i = 1; i < 300; ++i )
{
Dist(system, mpi_data, workspace->d2, mpi_data->mpi_rvec2, scale, rvec2_packer);
//print_host_rvec2 (workspace->d2, N);
dual_Sparse_MatVec( H, workspace->d2, workspace->q2, N );
// tryQEq
Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &matvec_time );
#endif
/* dot product: d.q */
my_dot[0] = my_dot[1] = 0;
for ( j = 0; j < n; ++j )
{
my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0];
my_dot[1] += workspace->d2[j][1] * workspace->q2[j][1];
}
MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] );
alpha[0] = sig_new[0] / tmp[0];
alpha[1] = sig_new[1] / tmp[1];
my_dot[0] = my_dot[1] = 0;
for ( j = 0; j < system->n; ++j )
{
/* update x */
x[j][0] += alpha[0] * workspace->d2[j][0];
x[j][1] += alpha[1] * workspace->d2[j][1];
/* update residual */
workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
/* apply diagonal pre-conditioner */
workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
/* dot product: r.p */
my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
}
sig_old[0] = sig_new[0];
sig_old[1] = sig_new[1];
MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "HOST:sig_new: %f %f\n", sig_new[0], sig_new[1] );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &dot_time );
#endif

if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol )
break;

beta[0] = sig_new[0] / sig_old[0];
beta[1] = sig_new[1] / sig_old[1];
for ( j = 0; j < system->n; ++j )
{
/* d = p + beta * d */
workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
}
}
if ( SQRT(sig_new[0]) / b_norm[0] <= tol )
{
for ( j = 0; j < n; ++j )
workspace->t[j] = x[j][1];

matvecs = CG( system, workspace, H, workspace->b_t, tol, workspace->t,
mpi_data, fout );
#if defined(DEBUG)
fprintf (stderr, " CG1: iterations --> %d \n", matvecs );
#endif

for ( j = 0; j < n; ++j )
x[j][1] = workspace->t[j];
}
else if ( SQRT(sig_new[1]) / b_norm[1] <= tol )
{
for ( j = 0; j < n; ++j )
workspace->s[j] = x[j][0];

matvecs = CG( system, workspace, H, workspace->b_s, tol, workspace->s,
mpi_data, fout );
#if defined(DEBUG)
fprintf (stderr, " CG2: iterations --> %d \n", matvecs );
#endif

for ( j = 0; j < n; ++j )
x[j][0] = workspace->s[j];
}
else
{
fprintf( stderr, "Dual CG convergence failed! -> %d\n", i );
}

#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
fprintf( fout, "QEq %d + %d iters. matvecs: %f dot: %f\n",
i + 1, matvecs, matvec_time, dot_time );
#endif

return (i + 1) + matvecs;
}

#ifdef HAVE_CUDA
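/* GPU version of dual_CG: the CG vectors live in dev_workspace on the
 * device, and host_scratch (spad) stages data for the MPI Dist/Coll
 * exchanges. The commented-out blocks preserve the host-side reference
 * code used for debugging comparisons. */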
int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout,
simulation_data *data )
{
int i, j, n, N, matvecs, scale;
rvec2 tmp, alpha, beta;
rvec2 my_sum, norm_sqr, b_norm, my_dot;
rvec2 sig_old, sig_new;
MPI_Comm comm;
rvec2 *spad = (rvec2 *) host_scratch;
int a;
n = system->n;
N = system->N;
comm = mpi_data->world;
matvecs = 0;
scale = sizeof(rvec2) / sizeof(void);
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
matvecs = 0;
t_start = matvec_time = dot_time = 0;
t_start = Get_Time( );
}
#endif
//#ifdef __CUDA_DEBUG__
// Dist( system, mpi_data, workspace->x, mpi_data->mpi_rvec2, scale, rvec2_packer );
//#endif
get_from_device( spad, x, sizeof (rvec2) * system->total_cap, "CG:x:get" );
Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
put_on_device( spad, x, sizeof (rvec2) * system->total_cap, "CG:x:put" );
// compare_rvec2 (workspace->x, x, N, "x");
// if (data->step > 0) {
// compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b");
// compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x");
//#ifdef __CUDA_DEBUG__
// dual_Sparse_MatVec( &workspace->H, workspace->x, workspace->q2, N );
//#endif
//originally we were using only H->n which was system->n (init_md.c)
//Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap);
Cuda_Dual_Matvec ( H, x, dev_workspace->q2, system->N, system->total_cap);
// compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
//#ifdef __CUDA_DEBUG__
// Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
//#endif
get_from_device (spad, dev_workspace->q2, sizeof (rvec2) *
system->total_cap, "CG:q2:get" );
Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
put_on_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap,
"CG:q2:put" );
//#ifdef __CUDA_DEBUG__
// for( j = 0; j < system->n; ++j ) {
// workspace->r2[j][0] = workspace->b[j][0] - workspace->q2[j][0];
// workspace->r2[j][1] = workspace->b[j][1] - workspace->q2[j][1];
// // apply diagonal pre-conditioner
// workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
// workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
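/* device-side counterpart of the commented-out host loop above: forms the
 * residual r2 = b - q2 and the preconditioned direction d2 = r2 * Hdia_inv
 * in dev_workspace */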
Cuda_CG_Diagonal_Preconditioner( dev_workspace, b, system->n );
// compare_rvec2 (workspace->r2, dev_workspace->r2, n, "r2");
// compare_rvec2 (workspace->d2, dev_workspace->d2, n, "d2");
//#ifdef __CUDA_DEBUG__
// my_sum[0] = my_sum[1] = 0;
// for( j = 0; j < n; ++j ) {
// my_sum[0] += SQR( workspace->b[j][0] );
// my_sum[1] += SQR( workspace->b[j][1] );
// }
// fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
//#endif
my_sum[0] = my_sum[1] = 0;
Cuda_Norm (b, n, my_sum);
// fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
b_norm[0] = SQRT( norm_sqr[0] );
b_norm[1] = SQRT( norm_sqr[1] );
//fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] );
//#ifdef __CUDA_DEBUG__
// my_dot[0] = my_dot[1] = 0;
// for( j = 0; j < n; ++j ) {
// my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0];
// my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1];
// }
// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
//#endif
my_dot[0] = my_dot[1] = 0;
Cuda_Dot (dev_workspace->r2, dev_workspace->d2, my_dot, n);
// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
//#ifdef __CUDA_DEBUG__
// Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer);
//#endif
get_from_device( spad, dev_workspace->d2, sizeof (rvec2) *
system->total_cap, "cg:d2:get" );
Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
put_on_device( spad, dev_workspace->d2, sizeof (rvec2) *
system->total_cap, "cg:d2:put" );
//#ifdef __CUDA_DEBUG__
// dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N );
//#endif
Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N,
system->total_cap );
/*
fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n );
fprintf (stderr, " ******* HOST SPARSE MATRIX ******** \n");
print_sparse_matrix_host (&workspace->H);
fprintf (stderr, " ******* HOST Vector ***************\n");
print_host_rvec2 (workspace->d2, system->N);
fprintf (stderr, " ******* Device SPARSE MATRIX ******** \n");
print_sparse_matrix (&dev_workspace->H);
fprintf (stderr, " ******* Device Vector ***************\n");
print_device_rvec2 (dev_workspace->d2, system->N);
*/
//compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
// tryQEq
// MVAPICH2
//#ifdef __CUDA_DEBUG__
// Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
//#endif
get_from_device( spad, dev_workspace->q2, sizeof (rvec2) *
system->total_cap, "cg:q2:get" );
Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale,
rvec2_unpacker );
put_on_device( spad, dev_workspace->q2, sizeof (rvec2) *
system->total_cap, "cg:q2:put" );
// compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
//#ifdef __CUDA_DEBUG__
// my_dot[0] = my_dot[1] = 0;
// for( j = 0; j < n; ++j ) {
// my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0];
// my_dot[1] += workspace->d2[j][1] * workspace->q2[j][1];
// }
// fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
my_dot[0] = my_dot[1] = 0;
Cuda_Dot (dev_workspace->d2, dev_workspace->q2, my_dot, n);
//fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] );
alpha[0] = sig_new[0] / tmp[0];
alpha[1] = sig_new[1] / tmp[1];
my_dot[0] = my_dot[1] = 0;
//#ifdef __CUDA_DEBUG__
// for( j = 0; j < system->n; ++j ) {
// workspace->x[j][0] += alpha[0] * workspace->d2[j][0];
// workspace->x[j][1] += alpha[1] * workspace->d2[j][1];
// // update residual
// workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
// workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
// // apply diagonal pre-conditioner
// workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
// workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
// my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
// my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
// }
// fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
//#endif
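/* device-side counterpart of the commented-out host loop above: updates x
 * and the residual r2 using alpha, applies the diagonal preconditioner to
 * obtain p2, and accumulates the local r2.p2 dot products into my_dot */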
Cuda_DualCG_Preconditioner( dev_workspace, x, alpha, system->n, my_dot );
//fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
// compare_rvec2 (workspace->x, dev_workspace->x, N, "x");
// compare_rvec2 (workspace->r2, dev_workspace->r2, N, "r2");
// compare_rvec2 (workspace->p2, dev_workspace->p2, N, "p2");
sig_old[0] = sig_new[0];
sig_old[1] = sig_new[1];
MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol )
break;

beta[0] = sig_new[0] / sig_old[0];
beta[1] = sig_new[1] / sig_old[1];
//#ifdef __CUDA_DEBUG__
// for( j = 0; j < system->n; ++j ) {
// workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
// workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
// }
//#endif
Cuda_Vector_Sum_Rvec2( dev_workspace->d2, dev_workspace->p2, beta,
dev_workspace->d2, system->n );
// compare_rvec2 (workspace->d2, dev_workspace->d2, N, "q2");
}
if ( SQRT(sig_new[0]) / b_norm[0] <= tol )
{
//for( j = 0; j < n; ++j )
// workspace->t[j] = workspace->x[j][1];
//fprintf (stderr, "Getting started with Cuda_CG1 \n");
Cuda_RvecCopy_From( dev_workspace->t, dev_workspace->x, 1, system->n );
//compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
//compare_array (workspace->t, dev_workspace->t, system->n, "t");
matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_t, tol, dev_workspace->t,
mpi_data, fout );
//fprintf (stderr, " Cuda_CG1: iterations --> %d \n", matvecs );
//for( j = 0; j < n; ++j )
// workspace->x[j][1] = workspace->t[j];
Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->t, 1, system->n );
}
else if ( SQRT(sig_new[1]) / b_norm[1] <= tol )
{
//for( j = 0; j < n; ++j )
// workspace->s[j] = workspace->x[j][0];
Cuda_RvecCopy_From( dev_workspace->s, dev_workspace->x, 0, system->n );
//compare_array (workspace->s, dev_workspace->s, system->n, "s");
//compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
//fprintf (stderr, "Getting started with Cuda_CG2 \n");
matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_s, tol, dev_workspace->s,
mpi_data, fout );
//fprintf (stderr, " Cuda_CG2: iterations --> %d \n", matvecs );
//for( j = 0; j < system->n; ++j )
// workspace->x[j][0] = workspace->s[j];
Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->s, 0, system->n );
}
fprintf( stderr, "Dual CG convergence failed! -> %d\n", i );
fprintf( fout, "QEq %d + %d iters. matvecs: %f dot: %f\n",
i + 1, matvecs, matvec_time, dot_time );
}
Kurt A. O'Hearn
committed
#endif
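/* Sparse matrix-vector product b = A * x for a single right-hand side;
 * A is stored in half (symmetric) format, so each off-diagonal entry
 * contributes to both row i and row j, and rows with i >= A->n receive
 * only the transpose contributions. */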
void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
{
int i, j, k, si;
real H;
for ( i = 0; i < N; ++i )
b[i] = 0;

/* perform multiplication */
for ( i = 0; i < A->n; ++i )
{
si = A->start[i];
b[i] += A->entries[si].val * x[i];
for ( k = si + 1; k < A->end[i]; ++k )
{
j = A->entries[k].j;
H = A->entries[k].val;
b[i] += H * x[j];
//if( j < A->n ) // comment out for tryQEq
b[j] += H * x[i];
}
}
}
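/* Preconditioned conjugate gradient solver for a single system H x = b,
 * using the diagonal (Jacobi) preconditioner Hdia_inv and MPI-parallel
 * norms and dot products; returns the number of iterations taken. */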
int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
real tol, real *x, mpi_datatypes* mpi_data, FILE *fout)
{
int i, j, scale;
real tmp, alpha, beta, b_norm;
real sig_old, sig_new, sig0;
scale = sizeof(real) / sizeof(void);
Dist( system, mpi_data, x, MPI_DOUBLE, scale, real_packer );
Sparse_MatVec( H, x, workspace->q, system->N );
// tryQEq
Coll( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker );
Vector_Sum( workspace->r , 1., b, -1., workspace->q, system->n );
for ( j = 0; j < system->n; ++j )
workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
b_norm = Parallel_Norm( b, system->n, mpi_data->world );
sig_new = Parallel_Dot(workspace->r, workspace->d, system->n, mpi_data->world);
sig0 = sig_new;
for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
{
Dist( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer );
Sparse_MatVec( H, workspace->d, workspace->q, system->N );
//tryQEq
Coll(system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker);
tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world);
alpha = sig_new / tmp;
Vector_Add( x, alpha, workspace->d, system->n );
Vector_Add( workspace->r, -alpha, workspace->q, system->n );
/* pre-conditioning */
for ( j = 0; j < system->n; ++j )
workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
sig_old = sig_new;
sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world);
//fprintf (stderr, "Host : sig_new: %f \n", sig_new );
beta = sig_new / sig_old;
Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
}
if ( i >= 300 )
{
fprintf( stderr, "CG convergence failed!\n" );
return i;
}

return i;
}

#ifdef HAVE_CUDA
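/* GPU version of CG: the vectors live in dev_workspace, while host_scratch
 * (spad) stages data for the MPI Dist/Coll exchanges and for the
 * Parallel_Norm / Parallel_Dot reductions, which are still evaluated on
 * the host (see the TODO notes below). */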
int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
*b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
{
int i, j, scale;
real tmp, alpha, beta, b_norm;
real sig_old, sig_new, sig0;
real *spad = (real *) host_scratch;
scale = sizeof(real) / sizeof(void);
/* x is on the device */
//MVAPICH2
memset( spad, 0, sizeof (real) * system->total_cap );
get_from_device( spad, x, sizeof (real) * system->total_cap, "cuda_cg:x:get" );
Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
//MVAPICH2
put_on_device( spad, x, sizeof (real) * system->total_cap , "cuda_cg:x:put" );
Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap );
get_from_device( spad, dev_workspace->q, sizeof (real) * system->total_cap,
"cuda_cg:q:get" );
Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
put_on_device( spad, dev_workspace->q, sizeof (real) * system->total_cap,
"cuda_cg:q:put" );
Cuda_Vector_Sum( dev_workspace->r , 1., b, -1., dev_workspace->q,
system->n );
//for( j = 0; j < system->n; ++j )
// workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
Cuda_CG_Preconditioner( dev_workspace->d, dev_workspace->r,
dev_workspace->Hdia_inv, system->n );
//TODO do the parallel_norm on the device for the local sum
get_from_device( spad, b, sizeof (real) * system->n, "cuda_cg:b:get" );
b_norm = Parallel_Norm( spad, system->n, mpi_data->world );
//TODO do the parallel dot on the device for the local sum
get_from_device( spad, dev_workspace->r, sizeof (real) * system->total_cap,
"cuda_cg:r:get" );
get_from_device( spad + system->total_cap, dev_workspace->d, sizeof (real)
* system->total_cap, "cuda_cg:d:get" );
sig_new = Parallel_Dot( spad, spad + system->total_cap, system->n,
mpi_data->world );
for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
{
//MVAPICH2
get_from_device( spad, dev_workspace->d, sizeof (real) *
system->total_cap, "cuda_cg:d:get" );
Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
put_on_device( spad, dev_workspace->d, sizeof (real) *
system->total_cap, "cuda_cg:d:put" );
Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap );
get_from_device( spad, dev_workspace->q, sizeof (real) *
system->total_cap, "cuda_cg:q:get" );
Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
put_on_device( spad, dev_workspace->q, sizeof (real) *
system->total_cap , "cuda_cg:q:get" );
//TODO do the parallel dot on the device for the local sum
get_from_device( spad, dev_workspace->d, sizeof (real) * system->n,
"cuda_cg:d:get" );
get_from_device( spad + system->n, dev_workspace->q, sizeof (real) *
system->n, "cuda_cg:q:get" );
tmp = Parallel_Dot( spad, spad + system->n, system->n, mpi_data->world );
alpha = sig_new / tmp;
//Cuda_Vector_Add( x, alpha, dev_workspace->d, system->n );
Cuda_Vector_Sum( x, alpha, dev_workspace->d, 1.0, x, system->n );
//Cuda_Vector_Add( workspace->r, -alpha, workspace->q, system->n );
Cuda_Vector_Sum( dev_workspace->r, -alpha, dev_workspace->q, 1.0,
dev_workspace->r, system->n );
/* pre-conditioning */
//for( j = 0; j < system->n; ++j )
// workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
Cuda_CG_Preconditioner( dev_workspace->p, dev_workspace->r,
dev_workspace->Hdia_inv, system->n );
sig_old = sig_new;
//TODO do the parallel dot on the device for the local sum
get_from_device( spad, dev_workspace->r, sizeof (real) * system->n,
"cuda_cg:r:get" );
get_from_device( spad + system->n, dev_workspace->p, sizeof (real) *
system->n, "cuda_cg:p:get" );
sig_new = Parallel_Dot( spad , spad + system->n, system->n, mpi_data->world );
//fprintf (stderr, "Device: sig_new: %f \n", sig_new );
beta = sig_new / sig_old;
Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta,
dev_workspace->d, system->n );
}
if ( i >= 300 )
{
fprintf( stderr, "CG convergence failed!\n" );
return i;
}

return i;
}

#endif
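/* Instrumented variant of CG used for debugging and benchmarking: prints
 * per-iteration norms and dot products and, on the master rank, accumulates
 * matvec and dot-product timings. */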
int CG_test( reax_system *system, storage *workspace, sparse_matrix *H, real
*b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
{
int i, j, scale;
real tmp, alpha, beta, b_norm;
real sig_old, sig_new, sig0;
scale = sizeof(real) / sizeof(void);
b_norm = Parallel_Norm( b, system->n, mpi_data->world );
if ( system->my_rank == MASTER_NODE )
{
fprintf( stderr, "n=%d, N=%d\n", system->n, system->N );
fprintf( stderr, "p%d CGinit: b_norm=%24.15e\n", system->my_rank, b_norm );
//Vector_Print( stderr, "d", workspace->d, system->N );
//Vector_Print( stderr, "q", workspace->q, system->N );
}
MPI_Barrier( mpi_data->world );
Sparse_MatVec( H, x, workspace->q, system->N );
//Coll( system, mpi_data, workspace->q, MPI_DOUBLE, real_unpacker );
Vector_Sum( workspace->r , 1., b, -1., workspace->q, system->n );
for ( j = 0; j < system->n; ++j )
workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
sig_new = Parallel_Dot( workspace->r, workspace->d, system->n,
mpi_data->world );
sig0 = sig_new;
//if( system->my_rank == MASTER_NODE ) {
fprintf( stderr, "p%d CG:sig_new=%24.15e,d_norm=%24.15e,q_norm=%24.15e\n",
system->my_rank, sqrt(sig_new),
Parallel_Norm(workspace->d, system->n, mpi_data->world),
Parallel_Norm(workspace->q, system->n, mpi_data->world) );
//Vector_Print( stderr, "d", workspace->d, system->N );
//Vector_Print( stderr, "q", workspace->q, system->N );
//}
for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
{
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
t_start = Get_Time( );
#endif

Dist( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer );
Sparse_MatVec( H, workspace->d, workspace->q, system->N );
//tryQEq
//Coll(system, mpi_data, workspace->q, MPI_DOUBLE, real_unpacker);
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
t_elapsed = Get_Timing_Info( t_start );
matvec_time += t_elapsed;
}

if ( system->my_rank == MASTER_NODE )
t_start = Get_Time( );
#endif
tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world);
alpha = sig_new / tmp;
//if( system->my_rank == MASTER_NODE ){
fprintf(stderr,
"p%d CG iter%d:d_norm=%24.15e,q_norm=%24.15e,tmp = %24.15e\n",
system->my_rank, i,
//Parallel_Norm(workspace->d, system->n, mpi_data->world),
//Parallel_Norm(workspace->q, system->n, mpi_data->world),
Norm(workspace->d, system->n), Norm(workspace->q, system->n), tmp);
//Vector_Print( stderr, "d", workspace->d, system->N );
//for( j = 0; j < system->N; ++j )
// fprintf( stdout, "%d %24.15e\n",
// system->my_atoms[j].orig_id, workspace->q[j] );
//fprintf( stdout, "\n" );
//}
MPI_Barrier( mpi_data->world );
Vector_Add( x, alpha, workspace->d, system->n );
Vector_Add( workspace->r, -alpha, workspace->q, system->n );
/* pre-conditioning */
for ( j = 0; j < system->n; ++j )
workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
sig_old = sig_new;
sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world);
beta = sig_new / sig_old;
Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
if ( system->my_rank == MASTER_NODE )
fprintf(stderr, "p%d CG iter%d: sig_new = %24.15e\n",
system->my_rank, i, sqrt(sig_new) );
MPI_Barrier( mpi_data->world );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
t_elapsed = Get_Timing_Info( t_start );
dot_time += t_elapsed;
}
#endif
}

if ( system->my_rank == MASTER_NODE )
fprintf( stderr, "CG took %d iterations\n", i );

return i;
}