/*----------------------------------------------------------------------
PuReMD - Purdue ReaxFF Molecular Dynamics Program
Copyright (2010) Purdue University
Hasan Metin Aktulga, haktulga@cs.purdue.edu
Joseph Fogarty, jcfogart@mail.usf.edu
Sagar Pandit, pandit@usf.edu
Ananth Y Grama, ayg@cs.purdue.edu
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details:
<http://www.gnu.org/licenses/>.
----------------------------------------------------------------------*/
#include "linear_solvers.h"
#include "basic_comm.h"
#include "io_tools.h"
#include "tool_box.h"
#include "vector.h"
#if defined(CG_PERFORMANCE)
real t_start, t_elapsed, matvec_time, dot_time;
#endif
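/* Computes b = A * x for two right-hand sides packed into rvec2 entries.
A is stored row-wise (start[i]..end[i]) with the diagonal entry first;
judging from the symmetric update of b[j] below, only half of the symmetric
charge matrix is stored, so each off-diagonal entry contributes to both
rows i and j. Entries of b beyond A->n (up to N) accumulate contributions
for ghost atoms and are combined across processors later via Coll. */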
void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
{
int i, j, k, si;
real H;
for ( i = 0; i < N; ++i )
b[i][0] = b[i][1] = 0;
/* perform multiplication */
for ( i = 0; i < A->n; ++i )
{
si = A->start[i];
b[i][0] += A->entries[si].val * x[i][0];
b[i][1] += A->entries[si].val * x[i][1];
for ( k = si + 1; k < A->end[i]; ++k )
{
j = A->entries[k].j;
H = A->entries[k].val;
b[i][0] += H * x[j][0];
b[i][1] += H * x[j][1];
// comment out for tryQEq
//if( j < A->n ) {
b[j][0] += H * x[i][0];
b[j][1] += H * x[i][1];
//}
}
}
}

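/* Dual conjugate gradient: solves the two QEq linear systems (the "s" and
"t" equations) at once, preconditioned with the inverse diagonal of H.
It iterates until either system reaches the relative tolerance or 300
iterations; whichever system has not yet converged is then finished off
with the scalar CG solver defined below. */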
int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
rvec2 *b, real tol, rvec2 *x, mpi_datatypes *mpi_data, FILE *fout, simulation_data *data )
{
int i, j, n, N, matvecs, scale;
rvec2 tmp, alpha, beta;
rvec2 my_sum, norm_sqr, b_norm, my_dot;
rvec2 sig_old, sig_new;
MPI_Comm comm;
n = system->n;
N = system->N;
comm = mpi_data->world;
matvecs = 0;
scale = sizeof(rvec2) / sizeof(void);
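/* note: sizeof(void) above evaluates to 1 only as a GCC extension, so scale
is presumably intended to be the element size in bytes expected by Dist/Coll */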
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
matvecs = 0;
t_start = matvec_time = dot_time = 0;
t_start = Get_Time( );
}
#endif
Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer );
dual_Sparse_MatVec( H, x, workspace->q2, N );
// tryQEq
Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &matvec_time );
#endif
for ( j = 0; j < system->n; ++j )
{
/* residual */
workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
/* apply diagonal pre-conditioner */
workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
}
/* norm of b */
my_sum[0] = my_sum[1] = 0;
for ( j = 0; j < n; ++j )
{
my_sum[0] += SQR( b[j][0] );
my_sum[1] += SQR( b[j][1] );
//fprintf( stderr, "cg: my_sum[ %f, %f ]\n", my_sum[0], my_sum[1] );
}
MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
b_norm[0] = SQRT( norm_sqr[0] );
b_norm[1] = SQRT( norm_sqr[1] );
//fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] );
/* dot product: r.d */
my_dot[0] = my_dot[1] = 0;
for ( j = 0; j < n; ++j )
{
my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0];
my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1];
//fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
}
MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "HOST:sig_new: %f %f\n", sig_new[0], sig_new[1] );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &dot_time );
#endif
for ( i = 1; i < 300; ++i )
{
Dist(system, mpi_data, workspace->d2, mpi_data->mpi_rvec2, scale, rvec2_packer);
//print_host_rvec2 (workspace->d2, N);
dual_Sparse_MatVec( H, workspace->d2, workspace->q2, N );
// tryQEq
Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &matvec_time );
#endif
/* dot product: d.q */
my_dot[0] = my_dot[1] = 0;
for ( j = 0; j < n; ++j )
{
my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0];
my_dot[1] += workspace->d2[j][1] * workspace->q2[j][1];
}
MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] );
alpha[0] = sig_new[0] / tmp[0];
alpha[1] = sig_new[1] / tmp[1];
my_dot[0] = my_dot[1] = 0;
for ( j = 0; j < system->n; ++j )
{
/* update x */
x[j][0] += alpha[0] * workspace->d2[j][0];
x[j][1] += alpha[1] * workspace->d2[j][1];
/* update residual */
workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
/* apply diagonal pre-conditioner */
workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
/* dot product: r.p */
my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
}
sig_old[0] = sig_new[0];
sig_old[1] = sig_new[1];
MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "HOST:sig_new: %f %f\n", sig_new[0], sig_new[1] );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &dot_time );
#endif
if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol )
break;
beta[0] = sig_new[0] / sig_old[0];
beta[1] = sig_new[1] / sig_old[1];
for ( j = 0; j < system->n; ++j )
{
/* d = p + beta * d */
workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
}
}
if ( SQRT(sig_new[0]) / b_norm[0] <= tol )
{
for ( j = 0; j < n; ++j )
workspace->t[j] = workspace->x[j][1];
matvecs = CG( system, workspace, H, workspace->b_t, tol, workspace->t,
mpi_data, fout );
fprintf (stderr, " CG1: iterations --> %d \n", matvecs );
for ( j = 0; j < n; ++j )
workspace->x[j][1] = workspace->t[j];
}
else if ( SQRT(sig_new[1]) / b_norm[1] <= tol )
{
for ( j = 0; j < n; ++j )
workspace->s[j] = workspace->x[j][0];
matvecs = CG( system, workspace, H, workspace->b_s, tol, workspace->s,
mpi_data, fout );
fprintf (stderr, " CG2: iterations --> %d \n", matvecs );
for ( j = 0; j < system->n; ++j )
workspace->x[j][0] = workspace->s[j];
}
if ( i >= 300 )
fprintf( stderr, "CG convergence failed!\n" );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
fprintf( fout, "QEq %d + %d iters. matvecs: %f dot: %f\n",
i + 1, matvecs, matvec_time, dot_time );
#endif
return (i + 1) + matvecs;
}

#ifdef HAVE_CUDA
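/* GPU variant of dual_CG: the vectors live in device memory, so each MPI
Dist/Coll exchange stages data through the host_scratch buffer (spad), while
the vector updates and dot products run in Cuda_* kernels. The commented-out
blocks appear to be the host-side reference code kept for debugging. */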
int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
rvec2 *b, real tol, rvec2 *x, mpi_datatypes *mpi_data, FILE *fout, simulation_data *data )
{
int i, j, n, N, matvecs, scale;
rvec2 tmp, alpha, beta;
rvec2 my_sum, norm_sqr, b_norm, my_dot;
rvec2 sig_old, sig_new;
MPI_Comm comm;
rvec2 *spad = (rvec2 *) host_scratch;
int a;
n = system->n;
N = system->N;
comm = mpi_data->world;
matvecs = 0;
scale = sizeof(rvec2) / sizeof(void);
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
matvecs = 0;
t_start = matvec_time = dot_time = 0;
t_start = Get_Time( );
}
#endif
//#ifdef __CUDA_DEBUG__
// Dist( system, mpi_data, workspace->x, mpi_data->mpi_rvec2, scale, rvec2_packer );
//#endif
get_from_device (spad, x, sizeof (rvec2) * system->total_cap, "CG:x:get");
Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
put_on_device (spad, x, sizeof (rvec2) * system->total_cap, "CG:x:put");
// compare_rvec2 (workspace->x, x, N, "x");
// if (data->step > 0) {
// compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b");
// compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x");
//#ifdef __CUDA_DEBUG__
// dual_Sparse_MatVec( &workspace->H, workspace->x, workspace->q2, N );
//#endif
//originally we were using only H->n which was system->n (init_md.c)
//Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap);
Cuda_Dual_Matvec ( H, x, dev_workspace->q2, system->N, system->total_cap);
// compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
//#ifdef __CUDA_DEBUG__
// Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
//#endif
get_from_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "CG:q2:get" );
Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
put_on_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "CG:q2:put" );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &matvec_time );
#endif
//#ifdef __CUDA_DEBUG__
// for( j = 0; j < system->n; ++j ) {
// workspace->r2[j][0] = workspace->b[j][0] - workspace->q2[j][0];
// workspace->r2[j][1] = workspace->b[j][1] - workspace->q2[j][1];
// // apply diagonal pre-conditioner
// workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
// workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
Cuda_CG_Diagnol_Preconditioner (dev_workspace, b, system->n);
// compare_rvec2 (workspace->r2, dev_workspace->r2, n, "r2");
// compare_rvec2 (workspace->d2, dev_workspace->d2, n, "d2");
//#ifdef __CUDA_DEBUG__
// my_sum[0] = my_sum[1] = 0;
// for( j = 0; j < n; ++j ) {
// my_sum[0] += SQR( workspace->b[j][0] );
// my_sum[1] += SQR( workspace->b[j][1] );
// }
// fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
//#endif
my_sum[0] = my_sum[1] = 0;
Cuda_Norm (b, n, my_sum);
// fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
b_norm[0] = SQRT( norm_sqr[0] );
b_norm[1] = SQRT( norm_sqr[1] );
//fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] );
//#ifdef __CUDA_DEBUG__
// my_dot[0] = my_dot[1] = 0;
// for( j = 0; j < n; ++j ) {
// my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0];
// my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1];
// }
// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
//#endif
my_dot[0] = my_dot[1] = 0;
Cuda_Dot (dev_workspace->r2, dev_workspace->d2, my_dot, n);
// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &dot_time );
#endif
for ( i = 1; i < 300; ++i )
{
//#ifdef __CUDA_DEBUG__
// Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer);
//#endif
get_from_device (spad, dev_workspace->d2, sizeof (rvec2) * system->total_cap, "cg:d2:get");
Dist(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer);
put_on_device (spad, dev_workspace->d2, sizeof (rvec2) * system->total_cap, "cg:d2:put");
//print_device_rvec2 (dev_workspace->d2, N);
//#ifdef __CUDA_DEBUG__
// dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N );
//#endif
Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N, system->total_cap );
/*
fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n );
fprintf (stderr, " ******* HOST SPARSE MATRIX ******** \n");
print_sparse_matrix_host (&workspace->H);
fprintf (stderr, " ******* HOST Vector ***************\n");
print_host_rvec2 (workspace->d2, system->N);
fprintf (stderr, " ******* Device SPARSE MATRIX ******** \n");
print_sparse_matrix (&dev_workspace->H);
fprintf (stderr, " ******* Device Vector ***************\n");
print_device_rvec2 (dev_workspace->d2, system->N);
*/
//compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
// tryQEq
// MVAPICH2
//#ifdef __CUDA_DEBUG__
// Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
//#endif
get_from_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "cg:q2:get");
Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
put_on_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "cg:q2:put");
// compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &matvec_time );
#endif
//#ifdef __CUDA_DEBUG__
// my_dot[0] = my_dot[1] = 0;
// for( j = 0; j < n; ++j ) {
// my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0];
// my_dot[1] += workspace->d2[j][1] * workspace->q2[j][1];
// }
// fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
my_dot[0] = my_dot[1] = 0;
Cuda_Dot (dev_workspace->d2, dev_workspace->q2, my_dot, n);
//fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] );
alpha[0] = sig_new[0] / tmp[0];
alpha[1] = sig_new[1] / tmp[1];
my_dot[0] = my_dot[1] = 0;
//#ifdef __CUDA_DEBUG__
// for( j = 0; j < system->n; ++j ) {
// workspace->x[j][0] += alpha[0] * workspace->d2[j][0];
// workspace->x[j][1] += alpha[1] * workspace->d2[j][1];
// // update residual
// workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
// workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
// // apply diagonal pre-conditioner
// workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
// workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
// my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
// my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
// }
// fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
//#endif
my_dot[0] = my_dot[1] = 0;
Cuda_DualCG_Preconditioer (dev_workspace, x, alpha, system->n, my_dot);
//fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
// compare_rvec2 (workspace->x, dev_workspace->x, N, "x");
// compare_rvec2 (workspace->r2, dev_workspace->r2, N, "r2");
// compare_rvec2 (workspace->p2, dev_workspace->p2, N, "p2");
sig_old[0] = sig_new[0];
sig_old[1] = sig_new[1];
MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &dot_time );
#endif
if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol )
break;
beta[0] = sig_new[0] / sig_old[0];
beta[1] = sig_new[1] / sig_old[1];
//#ifdef __CUDA_DEBUG__
// for( j = 0; j < system->n; ++j ) {
// workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
// workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
// }
//#endif
Cuda_Vector_Sum_Rvec2 (dev_workspace->d2, dev_workspace->p2, beta,
dev_workspace->d2, system->n);
// compare_rvec2 (workspace->d2, dev_workspace->d2, N, "q2");
}
if ( SQRT(sig_new[0]) / b_norm[0] <= tol )
{
//for( j = 0; j < n; ++j )
// workspace->t[j] = workspace->x[j][1];
//fprintf (stderr, "Getting started with Cuda_CG1 \n");
Cuda_RvecCopy_From (dev_workspace->t, dev_workspace->x, 1, system->n);
//compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
//compare_array (workspace->t, dev_workspace->t, system->n, "t");
matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_t, tol, dev_workspace->t,
mpi_data, fout );
//fprintf (stderr, " Cuda_CG1: iterations --> %d \n", matvecs );
//for( j = 0; j < n; ++j )
// workspace->x[j][1] = workspace->t[j];
Cuda_RvecCopy_To (dev_workspace->x, dev_workspace->t, 1, system->n);
}
else if ( SQRT(sig_new[1]) / b_norm[1] <= tol )
{
//for( j = 0; j < n; ++j )
// workspace->s[j] = workspace->x[j][0];
Cuda_RvecCopy_From (dev_workspace->s, dev_workspace->x, 0, system->n);
//compare_array (workspace->s, dev_workspace->s, system->n, "s");
//compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
//fprintf (stderr, "Getting started with Cuda_CG2 \n");
matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_s, tol, dev_workspace->s,
mpi_data, fout );
//fprintf (stderr, " Cuda_CG2: iterations --> %d \n", matvecs );
//for( j = 0; j < system->n; ++j )
// workspace->x[j][0] = workspace->s[j];
Cuda_RvecCopy_To (dev_workspace->x, dev_workspace->s, 0, system->n);
}
if ( i >= 300 )
fprintf( stderr, "Dual CG convergence failed! -> %d\n", i );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
fprintf( fout, "QEq %d + %d iters. matvecs: %f dot: %f\n",
i + 1, matvecs, matvec_time, dot_time );
#endif
return (i + 1) + matvecs;
}

#endif
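/* Single right-hand-side version of the dual matrix-vector product above:
b = A * x with the same half-stored symmetric format. */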
void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
{
int i, j, k, si;
real H;
for ( i = 0; i < N; ++i )
b[i] = 0;
/* perform multiplication */
for ( i = 0; i < A->n; ++i )
{
si = A->start[i];
b[i] += A->entries[si].val * x[i];
for ( k = si + 1; k < A->end[i]; ++k )
{
j = A->entries[k].j;
H = A->entries[k].val;
b[i] += H * x[j];
//if( j < A->n ) // comment out for tryQEq
b[j] += H * x[i];
}
}
}

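/* Standard conjugate gradient for one right-hand side, preconditioned with
the inverse diagonal of H (Hdia_inv). Returns the number of iterations. */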
int CG( reax_system *system, storage *workspace, sparse_matrix *H,
real *b, real tol, real *x, mpi_datatypes *mpi_data, FILE *fout )
{
int i, j, scale;
real tmp, alpha, beta, b_norm;
real sig_old, sig_new, sig0;
scale = sizeof(real) / sizeof(void);
Dist( system, mpi_data, x, MPI_DOUBLE, scale, real_packer );
Sparse_MatVec( H, x, workspace->q, system->N );
// tryQEq
Coll( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &matvec_time );
#endif
Vector_Sum( workspace->r , 1., b, -1., workspace->q, system->n );
for ( j = 0; j < system->n; ++j )
workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
b_norm = Parallel_Norm( b, system->n, mpi_data->world );
sig_new = Parallel_Dot(workspace->r, workspace->d, system->n, mpi_data->world);
sig0 = sig_new;
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &dot_time );
#endif
for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
{
Dist( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer );
Sparse_MatVec( H, workspace->d, workspace->q, system->N );
//tryQEq
Coll(system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker);
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &matvec_time );
#endif
tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world);
alpha = sig_new / tmp;
Vector_Add( x, alpha, workspace->d, system->n );
Vector_Add( workspace->r, -alpha, workspace->q, system->n );
/* pre-conditioning */
for ( j = 0; j < system->n; ++j )
workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
sig_old = sig_new;
sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world);
//fprintf (stderr, "Host : sig_new: %f \n", sig_new );
beta = sig_new / sig_old;
Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &dot_time );
#endif
}
if ( i >= 300 )
{
fprintf( stderr, "CG convergence failed!\n" );
return i;
}
return i;
}

#ifdef HAVE_CUDA
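/* GPU variant of CG: device vectors are staged through host_scratch (spad)
for the MPI exchanges and for Parallel_Norm/Parallel_Dot, which still run
on the host. */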
int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H,
real *b, real tol, real *x, mpi_datatypes *mpi_data, FILE *fout )
{
int i, j, scale;
real tmp, alpha, beta, b_norm;
real sig_old, sig_new, sig0;
real *spad = (real *) host_scratch;
scale = sizeof(real) / sizeof(void);
/* x is on the device */
//MVAPICH2
memset (spad, 0, sizeof (real) * system->total_cap);
get_from_device (spad, x, sizeof (real) * system->total_cap, "cuda_cg:x:get");
Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
//MVAPICH2
put_on_device (spad, x, sizeof (real) * system->total_cap , "cuda_cg:x:put");
Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap );
// tryQEq
// MVAPICH2
get_from_device (spad, dev_workspace->q, sizeof (real) * system->total_cap, "cuda_cg:q:get" );
Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
//MVAPICH2
put_on_device (spad, dev_workspace->q, sizeof (real) * system->total_cap, "cuda_cg:q:put" );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &matvec_time );
#endif
Cuda_Vector_Sum( dev_workspace->r , 1., b, -1., dev_workspace->q, system->n );
//for( j = 0; j < system->n; ++j )
// workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
Cuda_CG_Preconditioner (dev_workspace->d, dev_workspace->r, dev_workspace->Hdia_inv, system->n);
//TODO do the parallel_norm on the device for the local sum
get_from_device (spad, b, sizeof (real) * system->n, "cuda_cg:b:get");
b_norm = Parallel_Norm( spad, system->n, mpi_data->world );
//TODO do the parallel dot on the device for the local sum
get_from_device (spad, dev_workspace->r, sizeof (real) * system->total_cap, "cuda_cg:r:get");
get_from_device (spad + system->total_cap, dev_workspace->d, sizeof (real) * system->total_cap, "cuda_cg:d:get");
sig_new = Parallel_Dot(spad, spad + system->total_cap, system->n, mpi_data->world);
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &dot_time );
#endif
for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
{
//MVAPICH2
get_from_device (spad, dev_workspace->d, sizeof (real) * system->total_cap, "cuda_cg:d:get");
Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
put_on_device (spad, dev_workspace->d, sizeof (real) * system->total_cap, "cuda_cg:d:put");
Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap );
//tryQEq
get_from_device (spad, dev_workspace->q, sizeof (real) * system->total_cap, "cuda_cg:q:get" );
Coll(system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker);
put_on_device (spad, dev_workspace->q, sizeof (real) * system->total_cap, "cuda_cg:q:put");
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &matvec_time );
#endif
//TODO do the parallel dot on the device for the local sum
get_from_device (spad, dev_workspace->d, sizeof (real) * system->n, "cuda_cg:d:get");
get_from_device (spad + system->n, dev_workspace->q, sizeof (real) * system->n, "cuda_cg:q:get");
tmp = Parallel_Dot(spad, spad + system->n, system->n, mpi_data->world);
alpha = sig_new / tmp;
//Cuda_Vector_Add( x, alpha, dev_workspace->d, system->n );
Cuda_Vector_Sum( x, alpha, dev_workspace->d, 1.0, x, system->n );
//Cuda_Vector_Add( workspace->r, -alpha, workspace->q, system->n );
Cuda_Vector_Sum( dev_workspace->r, -alpha, dev_workspace->q, 1.0, dev_workspace->r, system->n );
/* pre-conditioning */
//for( j = 0; j < system->n; ++j )
// workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
Cuda_CG_Preconditioner (dev_workspace->p, dev_workspace->r, dev_workspace->Hdia_inv, system->n);
sig_old = sig_new;
//TODO do the parallel dot on the device for the local sum
get_from_device (spad, dev_workspace->r, sizeof (real) * system->n, "cuda_cg:r:get");
get_from_device (spad + system->n, dev_workspace->p, sizeof (real) * system->n, "cuda_cg:p:get");
sig_new = Parallel_Dot(spad , spad + system->n, system->n, mpi_data->world);
//fprintf (stderr, "Device: sig_new: %f \n", sig_new );
beta = sig_new / sig_old;
Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta, dev_workspace->d, system->n );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &dot_time );
#endif
}
if ( i >= 300 )
{
fprintf( stderr, "CG convergence failed!\n" );
return i;
}
return i;
}

#endif
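/* Diagnostic variant of CG that prints per-iteration norms and dot products,
presumably for debugging/validation rather than production runs. */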
int CG_test( reax_system *system, storage *workspace, sparse_matrix *H,
real *b, real tol, real *x, mpi_datatypes *mpi_data, FILE *fout )
{
int i, j, scale;
real tmp, alpha, beta, b_norm;
real sig_old, sig_new, sig0;
scale = sizeof(real) / sizeof(void);
b_norm = Parallel_Norm( b, system->n, mpi_data->world );
if ( system->my_rank == MASTER_NODE )
{
fprintf( stderr, "n=%d, N=%d\n", system->n, system->N );
fprintf( stderr, "p%d CGinit: b_norm=%24.15e\n", system->my_rank, b_norm );
//Vector_Print( stderr, "d", workspace->d, system->N );
//Vector_Print( stderr, "q", workspace->q, system->N );
}
MPI_Barrier( mpi_data->world );
Sparse_MatVec( H, x, workspace->q, system->N );
//Coll( system, mpi_data, workspace->q, MPI_DOUBLE, real_unpacker );
Vector_Sum( workspace->r , 1., b, -1., workspace->q, system->n );
for ( j = 0; j < system->n; ++j )
workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
sig_new = Parallel_Dot( workspace->r, workspace->d, system->n,
mpi_data->world );
sig0 = sig_new;
//if( system->my_rank == MASTER_NODE ) {
fprintf( stderr, "p%d CG:sig_new=%24.15e,d_norm=%24.15e,q_norm=%24.15e\n",
system->my_rank, sqrt(sig_new),
Parallel_Norm(workspace->d, system->n, mpi_data->world),
Parallel_Norm(workspace->q, system->n, mpi_data->world) );
//Vector_Print( stderr, "d", workspace->d, system->N );
//Vector_Print( stderr, "q", workspace->q, system->N );
//}
for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
{
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
t_start = Get_Time( );
#endif
Dist( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer );
Sparse_MatVec( H, workspace->d, workspace->q, system->N );
//tryQEq
//Coll(system, mpi_data, workspace->q, MPI_DOUBLE, real_unpacker);
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
t_elapsed = Get_Timing_Info( t_start );
matvec_time += t_elapsed;
}
if ( system->my_rank == MASTER_NODE )
t_start = Get_Time( );
#endif
tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world);
alpha = sig_new / tmp;
//if( system->my_rank == MASTER_NODE ){
fprintf(stderr,
"p%d CG iter%d:d_norm=%24.15e,q_norm=%24.15e,tmp = %24.15e\n",
system->my_rank, i,
//Parallel_Norm(workspace->d, system->n, mpi_data->world),
//Parallel_Norm(workspace->q, system->n, mpi_data->world),
Norm(workspace->d, system->n), Norm(workspace->q, system->n), tmp);
//Vector_Print( stderr, "d", workspace->d, system->N );
//for( j = 0; j < system->N; ++j )
// fprintf( stdout, "%d %24.15e\n",
// system->my_atoms[j].orig_id, workspace->q[j] );
//fprintf( stdout, "\n" );
//}
MPI_Barrier( mpi_data->world );
Vector_Add( x, alpha, workspace->d, system->n );
Vector_Add( workspace->r, -alpha, workspace->q, system->n );
/* pre-conditioning */
for ( j = 0; j < system->n; ++j )
workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
sig_old = sig_new;
sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world);
beta = sig_new / sig_old;
Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
if ( system->my_rank == MASTER_NODE )
fprintf(stderr, "p%d CG iter%d: sig_new = %24.15e\n",
system->my_rank, i, sqrt(sig_new) );
MPI_Barrier( mpi_data->world );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
t_elapsed = Get_Timing_Info( t_start );
dot_time += t_elapsed;
}
#endif
}
if ( system->my_rank == MASTER_NODE )
fprintf( stderr, "CG took %d iterations\n", i );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
fprintf( stderr, "%f %f\n", matvec_time, dot_time );
#endif
if ( i >= 300 )
{
fprintf( stderr, "CG convergence failed!\n" );
return i;
}
return i;
}
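/* Forward substitution L y = b for a lower-triangular factor whose diagonal
entry is stored last in each row (hence the division by entries[ei - 1]
after the inner loop). */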
void Forward_Subs( sparse_matrix *L, real *b, real *y )
{
int i, pj, j, si, ei;
real val;
for ( i = 0; i < L->n; ++i )
{
y[i] = b[i];
si = L->start[i];
ei = L->end[i];
for ( pj = si; pj < ei - 1; ++pj )
{
j = L->entries[pj].j;
val = L->entries[pj].val;
y[i] -= val * y[j];
}
y[i] /= L->entries[pj].val;
}
}
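/* Backward substitution U x = y for an upper-triangular factor whose
diagonal entry is stored first in each row (entries[si]). */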
void Backward_Subs( sparse_matrix *U, real *y, real *x )
{
int i, pj, j, si, ei;
real val;
for ( i = U->n - 1; i >= 0; --i )
{
x[i] = y[i];
si = U->start[i];
ei = U->end[i];
for ( pj = si + 1; pj < ei; ++pj )
{
j = U->entries[pj].j;
val = U->entries[pj].val;
x[i] -= val * x[j];
}
x[i] /= U->entries[si].val;
}
}

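/* Conjugate gradient preconditioned with a triangular factorization (L, U)
applied through Forward_Subs/Backward_Subs, presumably an incomplete
LU/Cholesky-type factor of H computed elsewhere. */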
int PCG( reax_system *system, storage *workspace,
sparse_matrix *H, real *b, real tol,
sparse_matrix *L, sparse_matrix *U, real *x,
mpi_datatypes *mpi_data, FILE *fout )
{
int i, me, n, N, scale;
real tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new;
MPI_Comm world;
me = system->my_rank;
n = system->n;
N = system->N;
world = mpi_data->world;
scale = sizeof(real) / sizeof(void);
b_norm = Parallel_Norm( b, n, world );
if ( me == MASTER_NODE )
{
fprintf( stderr, "init_PCG: n=%d, N=%d\n", n, N );
fprintf( stderr, "init_PCG: |b|=%24.15e\n", b_norm );
}
MPI_Barrier( world );
Sparse_MatVec( H, x, workspace->q, N );
//Coll( system, workspace, mpi_data, workspace->q );
Vector_Sum( workspace->r , 1., b, -1., workspace->q, n );
r_norm = Parallel_Norm( workspace->r, n, world );
Forward_Subs( L, workspace->r, workspace->d );
Backward_Subs( U, workspace->d, workspace->p );
sig_new = Parallel_Dot( workspace->r, workspace->p, n, world );
if ( me == MASTER_NODE )
{
fprintf( stderr, "init_PCG: sig_new=%.15e\n", r_norm );
fprintf( stderr, "init_PCG: |d|=%.15e |q|=%.15e\n",
Parallel_Norm(workspace->d, n, world),
Parallel_Norm(workspace->q, n, world) );
}
MPI_Barrier( world );
for ( i = 1; i < 100 && r_norm / b_norm > tol; ++i )
{
Dist( system, mpi_data, workspace->p, MPI_DOUBLE, scale, real_packer );
Sparse_MatVec( H, workspace->p, workspace->q, N );
// tryQEq
//Coll(system,mpi_data,workspace->q, MPI_DOUBLE, real_unpacker);
tmp = Parallel_Dot( workspace->q, workspace->p, n, world );
alpha = sig_new / tmp;
Vector_Add( x, alpha, workspace->p, n );
if ( me == MASTER_NODE )
fprintf(stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n",
i, Parallel_Norm(workspace->p, n, world),
Parallel_Norm(workspace->q, n, world), tmp );
MPI_Barrier( world );
Vector_Add( workspace->r, -alpha, workspace->q, n );
r_norm = Parallel_Norm( workspace->r, n, world );
if ( me == MASTER_NODE )
fprintf( stderr, "iter%d: res=%.15e\n", i, r_norm );
MPI_Barrier( world );
Forward_Subs( L, workspace->r, workspace->d );
Backward_Subs( U, workspace->d, workspace->d );
sig_old = sig_new;
sig_new = Parallel_Dot( workspace->r, workspace->d, n, world );
beta = sig_new / sig_old;
Vector_Sum( workspace->p, 1., workspace->d, beta, workspace->p, n );
}