/*----------------------------------------------------------------------
  PuReMD - Purdue ReaxFF Molecular Dynamics Program

  Copyright (2010) Purdue University
  Hasan Metin Aktulga, haktulga@cs.purdue.edu
  Joseph Fogarty, jcfogart@mail.usf.edu
  Sagar Pandit, pandit@usf.edu
  Ananth Y Grama, ayg@cs.purdue.edu

  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of
  the License, or (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  See the GNU General Public License for more details:
  <http://www.gnu.org/licenses/>.
  ----------------------------------------------------------------------*/
#include "linear_solvers.h"
#include "basic_comm.h"
#include "io_tools.h"
#include "tool_box.h"
#include "vector.h"
#ifdef HAVE_CUDA
#include "validation.h"
#endif
#if defined(CG_PERFORMANCE)
real t_start, t_elapsed, matvec_time, dot_time;
#endif
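/* Dual sparse matrix-vector product for the QEq solver: computes b = A * x
 * for two right-hand sides at once (each element of x and b is an rvec2).
 * A is symmetric and stores the diagonal entry first in each row followed by
 * one copy of each off-diagonal pair, so the mirrored contribution
 * b[j] += A(i,j) * x[i] is accumulated explicitly. */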
void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
{
int i, j, k, si;
real H;
for ( i = 0; i < N; ++i )
{
b[i][0] = b[i][1] = 0;
}
/* perform multiplication */
for ( i = 0; i < A->n; ++i )
{
si = A->start[i];
b[i][0] += A->entries[si].val * x[i][0];
b[i][1] += A->entries[si].val * x[i][1];
for ( k = si + 1; k < A->end[i]; ++k )
{
j = A->entries[k].j;
H = A->entries[k].val;
b[i][0] += H * x[j][0];
b[i][1] += H * x[j][1];
            /* mirrored (symmetric) contribution; the guard below is
             * commented out for the tryQEq configuration */
            //if ( j < A->n ) {
            b[j][0] += H * x[i][0];
            b[j][1] += H * x[i][1];
            //}
}
}
}
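/* Dual preconditioned conjugate gradient solver for the two QEq systems
 * (H s = b_s and H t = b_t), iterated together with the Jacobi (diagonal)
 * preconditioner Hdia_inv. The main loop runs until either system reaches
 * the relative-residual tolerance (hard cap of 300 iterations); the system
 * that has not yet converged is then finished with the scalar CG() below.
 * Returns the combined iteration count of both phases.
 *
 * Illustrative call, assuming the QEq driver passes workspace->x as x
 * (a sketch, not taken from this file):
 *     iters = dual_CG( system, workspace, Hptr, workspace->b, tol,
 *                      workspace->x, mpi_data, out, data );
 */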
int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
*b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout,
simulation_data *data )
{
int i, j, n, N, matvecs, scale;
rvec2 tmp, alpha, beta;
rvec2 my_sum, norm_sqr, b_norm, my_dot;
rvec2 sig_old, sig_new;
MPI_Comm comm;
n = system->n;
N = system->N;
comm = mpi_data->world;
matvecs = 0;
    /* element "scale" passed to Dist/Coll; note that sizeof(void) == 1 only
     * as a GCC extension */
    scale = sizeof(rvec2) / sizeof(void);
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
matvecs = 0;
t_start = matvec_time = dot_time = 0;
t_start = Get_Time( );
}
#endif
#ifdef HAVE_CUDA
check_zeros_host (x, system->N, "x");
#endif
Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer );
#ifdef HAVE_CUDA
check_zeros_host (x, system->N, "x");
#endif
dual_Sparse_MatVec( H, x, workspace->q2, N );
// if (data->step > 0) return;
// tryQEq
Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &matvec_time );
#endif
for ( j = 0; j < system->n; ++j )
{
/* residual */
workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
/* apply diagonal pre-conditioner */
workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
}
//print_host_rvec2 (workspace->r2, n);
/* norm of b */
my_sum[0] = my_sum[1] = 0;
for ( j = 0; j < n; ++j )
{
my_sum[0] += SQR( b[j][0] );
my_sum[1] += SQR( b[j][1] );
}
//fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
b_norm[0] = SQRT( norm_sqr[0] );
b_norm[1] = SQRT( norm_sqr[1] );
//fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] );
/* dot product: r.d */
my_dot[0] = my_dot[1] = 0;
for ( j = 0; j < n; ++j )
{
my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0];
my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1];
}
//fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "HOST:sig_new: %f %f\n", sig_new[0], sig_new[1] );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &dot_time );
}
#endif
for ( i = 1; i < 300; ++i )
{
Dist(system, mpi_data, workspace->d2, mpi_data->mpi_rvec2, scale, rvec2_packer);
//print_host_rvec2 (workspace->d2, N);
dual_Sparse_MatVec( H, workspace->d2, workspace->q2, N );
// tryQEq
Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &matvec_time );
#endif
/* dot product: d.q */
my_dot[0] = my_dot[1] = 0;
for ( j = 0; j < n; ++j )
{
my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0];
my_dot[1] += workspace->d2[j][1] * workspace->q2[j][1];
}
MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] );
alpha[0] = sig_new[0] / tmp[0];
alpha[1] = sig_new[1] / tmp[1];
my_dot[0] = my_dot[1] = 0;
for ( j = 0; j < system->n; ++j )
{
/* update x */
x[j][0] += alpha[0] * workspace->d2[j][0];
x[j][1] += alpha[1] * workspace->d2[j][1];
/* update residual */
workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
/* apply diagonal pre-conditioner */
workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
/* dot product: r.p */
my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
}
sig_old[0] = sig_new[0];
sig_old[1] = sig_new[1];
MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "HOST:sig_new: %f %f\n", sig_new[0], sig_new[1] );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
Update_Timing_Info( &t_start, &dot_time );
#endif
if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol )
{
break;
}
beta[0] = sig_new[0] / sig_old[0];
beta[1] = sig_new[1] / sig_old[1];
for ( j = 0; j < system->n; ++j )
{
/* d = p + beta * d */
workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
}
}
if ( SQRT(sig_new[0]) / b_norm[0] <= tol )
    {
        /* note: this assumes x aliases workspace->x, as in the QEq driver */
        for ( j = 0; j < n; ++j )
        {
            workspace->t[j] = workspace->x[j][1];
        }
matvecs = CG( system, workspace, H, workspace->b_t, tol, workspace->t,
mpi_data, fout );
fprintf (stderr, " CG1: iterations --> %d \n", matvecs );
for ( j = 0; j < n; ++j )
{
workspace->x[j][1] = workspace->t[j];
}
}
else if ( SQRT(sig_new[1]) / b_norm[1] <= tol )
{
for ( j = 0; j < n; ++j )
{
workspace->s[j] = workspace->x[j][0];
}
matvecs = CG( system, workspace, H, workspace->b_s, tol, workspace->s,
mpi_data, fout );
fprintf (stderr, " CG2: iterations --> %d \n", matvecs );
for ( j = 0; j < system->n; ++j )
{
workspace->x[j][0] = workspace->s[j];
}
}
if ( i >= 300 )
{
fprintf( stderr, "CG convergence failed!\n" );
}
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
fprintf( fout, "QEq %d + %d iters. matvecs: %f dot: %f\n",
i + 1, matvecs, matvec_time, dot_time );
#endif
return (i + 1) + matvecs;
}
#ifdef HAVE_CUDA
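/* GPU variant of dual_CG(): the vectors live in device memory, so every
 * Dist/Coll halo exchange stages data through the host_scratch buffer (spad)
 * with get_from_device()/put_on_device(), while the matvec, dot products,
 * preconditioning, and vector updates run in CUDA kernels. The algorithm and
 * convergence logic mirror the host version above. */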
int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout,
simulation_data *data )
{
int i, j, n, N, matvecs, scale;
rvec2 tmp, alpha, beta;
rvec2 my_sum, norm_sqr, b_norm, my_dot;
rvec2 sig_old, sig_new;
MPI_Comm comm;
rvec2 *spad = (rvec2 *) host_scratch;
n = system->n;
N = system->N;
comm = mpi_data->world;
matvecs = 0;
scale = sizeof(rvec2) / sizeof(void);
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
matvecs = 0;
t_start = matvec_time = dot_time = 0;
t_start = Get_Time( );
}
#endif
//MVAPICH2
//#ifdef __CUDA_DEBUG__
// Dist( system, mpi_data, workspace->x, mpi_data->mpi_rvec2, scale, rvec2_packer );
//#endif
// check_zeros_device( x, system->N, "x" );
get_from_device( spad, x, sizeof (rvec2) * system->total_cap, "CG:x:get" );
Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
put_on_device( spad, x, sizeof (rvec2) * system->total_cap, "CG:x:put" );
// check_zeros_device( x, system->N, "x" );
// compare_rvec2 (workspace->x, x, N, "x");
// if (data->step > 0) {
// compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b");
// compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x");
//
// exit (0);
// }
//#ifdef __CUDA_DEBUG__
// dual_Sparse_MatVec( &workspace->H, workspace->x, workspace->q2, N );
//#endif
//originally we were using only H->n which was system->n (init_md.c)
//Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap);
Cuda_Dual_Matvec ( H, x, dev_workspace->q2, system->N, system->total_cap);
// compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
// if (data->step > 0) exit (0);
// tryQEq
//MVAPICH2
//#ifdef __CUDA_DEBUG__
// Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
//#endif
get_from_device (spad, dev_workspace->q2, sizeof (rvec2) *
system->total_cap, "CG:q2:get" );
Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
put_on_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap,
"CG:q2:put" );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &matvec_time );
}
#endif
//#ifdef __CUDA_DEBUG__
// for( j = 0; j < system->n; ++j ) {
// // residual
// workspace->r2[j][0] = workspace->b[j][0] - workspace->q2[j][0];
// workspace->r2[j][1] = workspace->b[j][1] - workspace->q2[j][1];
// // apply diagonal pre-conditioner
// workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
// workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
// }
//#endif
Cuda_CG_Diagnol_Preconditioner (dev_workspace, b, system->n);
// compare_rvec2 (workspace->r2, dev_workspace->r2, n, "r2");
// compare_rvec2 (workspace->d2, dev_workspace->d2, n, "d2");
/* norm of b */
//#ifdef __CUDA_DEBUG__
// my_sum[0] = my_sum[1] = 0;
// for( j = 0; j < n; ++j ) {
// my_sum[0] += SQR( workspace->b[j][0] );
// my_sum[1] += SQR( workspace->b[j][1] );
// }
// fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
//#endif
my_sum[0] = my_sum[1] = 0;
Cuda_Norm (b, n, my_sum);
// fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
b_norm[0] = SQRT( norm_sqr[0] );
b_norm[1] = SQRT( norm_sqr[1] );
//fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] );
/* dot product: r.d */
//#ifdef __CUDA_DEBUG__
// my_dot[0] = my_dot[1] = 0;
// for( j = 0; j < n; ++j ) {
// my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0];
// my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1];
// }
// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
//#endif
my_dot[0] = my_dot[1] = 0;
Cuda_Dot (dev_workspace->r2, dev_workspace->d2, my_dot, n);
// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &dot_time );
}
#endif
for ( i = 1; i < 300; ++i )
{
//MVAPICH2
//#ifdef __CUDA_DEBUG__
// Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer);
//#endif
get_from_device( spad, dev_workspace->d2, sizeof (rvec2) *
system->total_cap, "cg:d2:get" );
Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
put_on_device( spad, dev_workspace->d2, sizeof (rvec2) *
system->total_cap, "cg:d2:put" );
//print_device_rvec2 (dev_workspace->d2, N);
//#ifdef __CUDA_DEBUG__
// dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N );
//#endif
Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N,
system->total_cap );
/*
fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n );
fprintf (stderr, " ******* HOST SPARSE MATRIX ******** \n");
print_sparse_matrix_host (&workspace->H);
fprintf (stderr, " ******* HOST Vector ***************\n");
print_host_rvec2 (workspace->d2, system->N);
fprintf (stderr, " ******* Device SPARSE MATRIX ******** \n");
print_sparse_matrix (&dev_workspace->H);
fprintf (stderr, " ******* Device Vector ***************\n");
print_device_rvec2 (dev_workspace->d2, system->N);
*/
//compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
// tryQEq
// MVAPICH2
//#ifdef __CUDA_DEBUG__
// Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
//#endif
get_from_device( spad, dev_workspace->q2, sizeof (rvec2) *
system->total_cap, "cg:q2:get" );
Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale,
rvec2_unpacker );
put_on_device( spad, dev_workspace->q2, sizeof (rvec2) *
system->total_cap, "cg:q2:put" );
// compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &matvec_time );
}
#endif
/* dot product: d.q */
//#ifdef __CUDA_DEBUG__
// my_dot[0] = my_dot[1] = 0;
// for( j = 0; j < n; ++j ) {
// my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0];
// my_dot[1] += workspace->d2[j][1] * workspace->q2[j][1];
// }
// fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
//#endif
my_dot[0] = my_dot[1] = 0;
Cuda_Dot (dev_workspace->d2, dev_workspace->q2, my_dot, n);
//fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] );
alpha[0] = sig_new[0] / tmp[0];
alpha[1] = sig_new[1] / tmp[1];
my_dot[0] = my_dot[1] = 0;
//#ifdef __CUDA_DEBUG__
// for( j = 0; j < system->n; ++j ) {
// // update x
// workspace->x[j][0] += alpha[0] * workspace->d2[j][0];
// workspace->x[j][1] += alpha[1] * workspace->d2[j][1];
// // update residual
// workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
// workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
// // apply diagonal pre-conditioner
// workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
// workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
// // dot product: r.p
// my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
// my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
// }
// fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
//#endif
my_dot[0] = my_dot[1] = 0;
Cuda_DualCG_Preconditioner( dev_workspace, x, alpha, system->n, my_dot );
//fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
// compare_rvec2 (workspace->x, dev_workspace->x, N, "x");
// compare_rvec2 (workspace->r2, dev_workspace->r2, N, "r2");
// compare_rvec2 (workspace->p2, dev_workspace->p2, N, "p2");
sig_old[0] = sig_new[0];
sig_old[1] = sig_new[1];
MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
//fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &dot_time );
}
#endif
if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol )
{
break;
}
beta[0] = sig_new[0] / sig_old[0];
beta[1] = sig_new[1] / sig_old[1];
//#ifdef __CUDA_DEBUG__
// for( j = 0; j < system->n; ++j ) {
// // d = p + beta * d
// workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
// workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
// }
//#endif
Cuda_Vector_Sum_Rvec2( dev_workspace->d2, dev_workspace->p2, beta,
dev_workspace->d2, system->n );
// compare_rvec2 (workspace->d2, dev_workspace->d2, N, "q2");
}
if ( SQRT(sig_new[0]) / b_norm[0] <= tol )
{
//for( j = 0; j < n; ++j )
// workspace->t[j] = workspace->x[j][1];
//fprintf (stderr, "Getting started with Cuda_CG1 \n");
Cuda_RvecCopy_From( dev_workspace->t, dev_workspace->x, 1, system->n );
//compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
//compare_array (workspace->t, dev_workspace->t, system->n, "t");
matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_t, tol, dev_workspace->t,
mpi_data, fout );
//fprintf (stderr, " Cuda_CG1: iterations --> %d \n", matvecs );
//for( j = 0; j < n; ++j )
// workspace->x[j][1] = workspace->t[j];
Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->t, 1, system->n );
}
else if ( SQRT(sig_new[1]) / b_norm[1] <= tol )
{
//for( j = 0; j < n; ++j )
// workspace->s[j] = workspace->x[j][0];
Cuda_RvecCopy_From( dev_workspace->s, dev_workspace->x, 0, system->n );
//compare_array (workspace->s, dev_workspace->s, system->n, "s");
//compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
//fprintf (stderr, "Getting started with Cuda_CG2 \n");
matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_s, tol, dev_workspace->s,
mpi_data, fout );
//fprintf (stderr, " Cuda_CG2: iterations --> %d \n", matvecs );
//for( j = 0; j < system->n; ++j )
// workspace->x[j][0] = workspace->s[j];
Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->s, 0, system->n );
}
if ( i >= 300 )
{
fprintf( stderr, "Dual CG convergence failed! -> %d\n", i );
}
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
fprintf( fout, "QEq %d + %d iters. matvecs: %f dot: %f\n",
i + 1, matvecs, matvec_time, dot_time );
}
#endif
return (i + 1) + matvecs;
}
#endif
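/* Scalar sparse matrix-vector product b = A * x, using the same half-stored
 * symmetric format as dual_Sparse_MatVec() above (diagonal first in each row,
 * mirrored off-diagonal contributions added explicitly). */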
void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
{
int i, j, k, si;
real H;
for ( i = 0; i < N; ++i )
{
b[i] = 0;
}
/* perform multiplication */
for ( i = 0; i < A->n; ++i )
{
si = A->start[i];
b[i] += A->entries[si].val * x[i];
for ( k = si + 1; k < A->end[i]; ++k )
{
j = A->entries[k].j;
H = A->entries[k].val;
b[i] += H * x[j];
//if( j < A->n ) // comment out for tryQEq
b[j] += H * x[i];
}
}
}
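/* Preconditioned conjugate gradient for a single right-hand side, using the
 * Jacobi (diagonal) preconditioner Hdia_inv; Dist/Coll perform the halo
 * exchange needed by the distributed matvec. Returns the iteration count
 * (hard cap of 300). dual_CG() calls this to finish whichever of the s/t
 * systems has not yet converged, e.g.
 *     matvecs = CG( system, workspace, H, workspace->b_s, tol,
 *                   workspace->s, mpi_data, fout );
 */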
int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
real tol, real *x, mpi_datatypes* mpi_data, FILE *fout)
{
int i, j, scale;
real tmp, alpha, beta, b_norm;
real sig_old, sig_new, sig0;
scale = sizeof(real) / sizeof(void);
Dist( system, mpi_data, x, MPI_DOUBLE, scale, real_packer );
Sparse_MatVec( H, x, workspace->q, system->N );
// tryQEq
Coll( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &matvec_time );
}
#endif
Vector_Sum( workspace->r , 1., b, -1., workspace->q, system->n );
for ( j = 0; j < system->n; ++j )
{
workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
}
b_norm = Parallel_Norm( b, system->n, mpi_data->world );
sig_new = Parallel_Dot(workspace->r, workspace->d, system->n, mpi_data->world);
sig0 = sig_new;
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &dot_time );
}
#endif
for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
{
Dist( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer );
Sparse_MatVec( H, workspace->d, workspace->q, system->N );
//tryQEq
Coll(system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker);
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &matvec_time );
}
#endif
tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world);
alpha = sig_new / tmp;
Vector_Add( x, alpha, workspace->d, system->n );
Vector_Add( workspace->r, -alpha, workspace->q, system->n );
/* pre-conditioning */
for ( j = 0; j < system->n; ++j )
{
workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
}
sig_old = sig_new;
sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world);
//fprintf (stderr, "Host : sig_new: %f \n", sig_new );
beta = sig_new / sig_old;
Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &dot_time );
}
#endif
}
if ( i >= 300 )
{
fprintf( stderr, "CG convergence failed!\n" );
return i;
}
return i;
}
#ifdef HAVE_CUDA
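/* GPU variant of CG(): device-resident vectors are staged through
 * host_scratch (spad) for the Dist/Coll halo exchanges and for the
 * Parallel_Norm/Parallel_Dot reductions, which are still computed on the
 * host (see the TODO notes below); the matvec, preconditioning, and vector
 * updates run in CUDA kernels. */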
int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
*b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
{
int i, j, scale;
real tmp, alpha, beta, b_norm;
real sig_old, sig_new, sig0;
real *spad = (real *) host_scratch;
scale = sizeof(real) / sizeof(void);
/* x is on the device */
//MVAPICH2
memset( spad, 0, sizeof (real) * system->total_cap );
get_from_device( spad, x, sizeof (real) * system->total_cap, "cuda_cg:x:get" );
Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
//MVAPICH2
put_on_device( spad, x, sizeof (real) * system->total_cap , "cuda_cg:x:put" );
Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap );
// tryQEq
// MVAPICH2
get_from_device( spad, dev_workspace->q, sizeof (real) * system->total_cap,
"cuda_cg:q:get" );
Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
//MVAPICH2
put_on_device( spad, dev_workspace->q, sizeof (real) * system->total_cap,
"cuda_cg:q:put" );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &matvec_time );
}
#endif
Cuda_Vector_Sum( dev_workspace->r , 1., b, -1., dev_workspace->q,
system->n );
//for( j = 0; j < system->n; ++j )
// workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
Cuda_CG_Preconditioner( dev_workspace->d, dev_workspace->r,
dev_workspace->Hdia_inv, system->n );
//TODO do the parallel_norm on the device for the local sum
get_from_device( spad, b, sizeof (real) * system->n, "cuda_cg:b:get" );
b_norm = Parallel_Norm( spad, system->n, mpi_data->world );
//TODO do the parallel dot on the device for the local sum
get_from_device( spad, dev_workspace->r, sizeof (real) * system->total_cap,
"cuda_cg:r:get" );
get_from_device( spad + system->total_cap, dev_workspace->d, sizeof (real)
* system->total_cap, "cuda_cg:d:get" );
sig_new = Parallel_Dot( spad, spad + system->total_cap, system->n,
mpi_data->world );
sig0 = sig_new;
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &dot_time );
}
#endif
for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
{
//MVAPICH2
get_from_device( spad, dev_workspace->d, sizeof (real) *
system->total_cap, "cuda_cg:d:get" );
Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
put_on_device( spad, dev_workspace->d, sizeof (real) *
system->total_cap, "cuda_cg:d:put" );
Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap );
//tryQEq
get_from_device( spad, dev_workspace->q, sizeof (real) *
system->total_cap, "cuda_cg:q:get" );
Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
        put_on_device( spad, dev_workspace->q, sizeof (real) *
                system->total_cap, "cuda_cg:q:put" );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &matvec_time );
}
#endif
//TODO do the parallel dot on the device for the local sum
get_from_device( spad, dev_workspace->d, sizeof (real) * system->n,
"cuda_cg:d:get" );
get_from_device( spad + system->n, dev_workspace->q, sizeof (real) *
system->n, "cuda_cg:q:get" );
tmp = Parallel_Dot( spad, spad + system->n, system->n, mpi_data->world );
alpha = sig_new / tmp;
//Cuda_Vector_Add( x, alpha, dev_workspace->d, system->n );
Cuda_Vector_Sum( x, alpha, dev_workspace->d, 1.0, x, system->n );
//Cuda_Vector_Add( workspace->r, -alpha, workspace->q, system->n );
Cuda_Vector_Sum( dev_workspace->r, -alpha, dev_workspace->q, 1.0,
dev_workspace->r, system->n );
/* pre-conditioning */
//for( j = 0; j < system->n; ++j )
// workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
Cuda_CG_Preconditioner( dev_workspace->p, dev_workspace->r,
dev_workspace->Hdia_inv, system->n );
sig_old = sig_new;
//TODO do the parallel dot on the device for the local sum
get_from_device( spad, dev_workspace->r, sizeof (real) * system->n,
"cuda_cg:r:get" );
get_from_device( spad + system->n, dev_workspace->p, sizeof (real) *
system->n, "cuda_cg:p:get" );
sig_new = Parallel_Dot( spad , spad + system->n, system->n, mpi_data->world );
//fprintf (stderr, "Device: sig_new: %f \n", sig_new );
beta = sig_new / sig_old;
Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta,
dev_workspace->d, system->n );
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &dot_time );
}
#endif
}
if ( i >= 300 )
{
fprintf( stderr, "CG convergence failed!\n" );
return i;
}
return i;
}
#endif
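/* Instrumented variant of CG() kept for debugging and benchmarking: the same
 * algorithm, with DEBUG residual/norm printouts and per-phase matvec/dot
 * timing under CG_PERFORMANCE. */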
int CG_test( reax_system *system, storage *workspace, sparse_matrix *H, real
*b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
{
int i, j, scale;
real tmp, alpha, beta, b_norm;
real sig_old, sig_new, sig0;
scale = sizeof(real) / sizeof(void);
b_norm = Parallel_Norm( b, system->n, mpi_data->world );
#if defined(DEBUG)
if ( system->my_rank == MASTER_NODE )
{
fprintf( stderr, "n=%d, N=%d\n", system->n, system->N );
fprintf( stderr, "p%d CGinit: b_norm=%24.15e\n", system->my_rank, b_norm );
//Vector_Print( stderr, "d", workspace->d, system->N );
//Vector_Print( stderr, "q", workspace->q, system->N );
}
MPI_Barrier( mpi_data->world );
#endif
Sparse_MatVec( H, x, workspace->q, system->N );
//Coll( system, mpi_data, workspace->q, MPI_DOUBLE, real_unpacker );
Vector_Sum( workspace->r , 1., b, -1., workspace->q, system->n );
for ( j = 0; j < system->n; ++j )
workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
sig_new = Parallel_Dot( workspace->r, workspace->d, system->n,
mpi_data->world );
sig0 = sig_new;
#if defined(DEBUG)
//if( system->my_rank == MASTER_NODE ) {
fprintf( stderr, "p%d CG:sig_new=%24.15e,d_norm=%24.15e,q_norm=%24.15e\n",
system->my_rank, sqrt(sig_new),
Parallel_Norm(workspace->d, system->n, mpi_data->world),
Parallel_Norm(workspace->q, system->n, mpi_data->world) );
//Vector_Print( stderr, "d", workspace->d, system->N );
//Vector_Print( stderr, "q", workspace->q, system->N );
//}
MPI_Barrier( mpi_data->world );
#endif
for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
{
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
t_start = Get_Time( );
#endif
Dist( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer );
Sparse_MatVec( H, workspace->d, workspace->q, system->N );
//tryQEq
//Coll(system, mpi_data, workspace->q, MPI_DOUBLE, real_unpacker);
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
t_elapsed = Get_Timing_Info( t_start );
matvec_time += t_elapsed;
}
#endif
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
t_start = Get_Time( );
#endif
tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world);
alpha = sig_new / tmp;
#if defined(DEBUG)
//if( system->my_rank == MASTER_NODE ){
fprintf(stderr,
"p%d CG iter%d:d_norm=%24.15e,q_norm=%24.15e,tmp = %24.15e\n",
system->my_rank, i,
//Parallel_Norm(workspace->d, system->n, mpi_data->world),
//Parallel_Norm(workspace->q, system->n, mpi_data->world),
Norm(workspace->d, system->n), Norm(workspace->q, system->n), tmp);
//Vector_Print( stderr, "d", workspace->d, system->N );
//for( j = 0; j < system->N; ++j )
// fprintf( stdout, "%d %24.15e\n",
// system->my_atoms[j].orig_id, workspace->q[j] );
//fprintf( stdout, "\n" );
//}
MPI_Barrier( mpi_data->world );
#endif
Vector_Add( x, alpha, workspace->d, system->n );
Vector_Add( workspace->r, -alpha, workspace->q, system->n );
/* pre-conditioning */
for ( j = 0; j < system->n; ++j )
workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
sig_old = sig_new;
sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world);
beta = sig_new / sig_old;
Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
#if defined(DEBUG)
if ( system->my_rank == MASTER_NODE )
fprintf(stderr, "p%d CG iter%d: sig_new = %24.15e\n",
system->my_rank, i, sqrt(sig_new) );
MPI_Barrier( mpi_data->world );
#endif
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
t_elapsed = Get_Timing_Info( t_start );
dot_time += t_elapsed;
}
#endif
}
#if defined(DEBUG)
if ( system->my_rank == MASTER_NODE )
fprintf( stderr, "CG took %d iterations\n", i );
#endif
#if defined(CG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
fprintf( stderr, "%f %f\n", matvec_time, dot_time );
#endif
if ( i >= 300 )
{
fprintf( stderr, "CG convergence failed!\n" );
return i;
}
return i;
}
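/* Forward substitution: solves L y = b for y, where L is lower triangular
 * and stored row-wise with the diagonal entry last in each row. */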
void Forward_Subs( sparse_matrix *L, real *b, real *y )
{
int i, pj, j, si, ei;
real val;
for ( i = 0; i < L->n; ++i )
{
y[i] = b[i];
si = L->start[i];
ei = L->end[i];
for ( pj = si; pj < ei - 1; ++pj )
{
j = L->entries[pj].j;
val = L->entries[pj].val;
y[i] -= val * y[j];
}
y[i] /= L->entries[pj].val;
}
}
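/* Backward substitution: solves U x = y for x, where U is upper triangular
 * and stored row-wise with the diagonal entry first in each row. */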
void Backward_Subs( sparse_matrix *U, real *y, real *x )
{
int i, pj, j, si, ei;
real val;
for ( i = U->n - 1; i >= 0; --i )
{
x[i] = y[i];
si = U->start[i];
ei = U->end[i];
for ( pj = si + 1; pj < ei; ++pj )
{
j = U->entries[pj].j;
val = U->entries[pj].val;
x[i] -= val * x[j];
}
x[i] /= U->entries[si].val;
}
}
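/* Conjugate gradient preconditioned with triangular factors L and U
 * (e.g. from an incomplete LU or Cholesky factorization): each iteration
 * applies the preconditioner via Forward_Subs()/Backward_Subs().
 * Hard cap of 100 iterations; returns the iteration count. */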
int PCG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
real tol, sparse_matrix *L, sparse_matrix *U, real *x, mpi_datatypes*
mpi_data, FILE *fout )
{
int i, me, n, N, scale;
real tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new;
MPI_Comm world;
me = system->my_rank;
n = system->n;
N = system->N;
world = mpi_data->world;
scale = sizeof(real) / sizeof(void);
b_norm = Parallel_Norm( b, n, world );
#if defined(DEBUG_FOCUS)
if ( me == MASTER_NODE )
{
fprintf( stderr, "init_PCG: n=%d, N=%d\n", n, N );
fprintf( stderr, "init_PCG: |b|=%24.15e\n", b_norm );
}
MPI_Barrier( world );
#endif
Sparse_MatVec( H, x, workspace->q, N );
//Coll( system, workspace, mpi_data, workspace->q );
Vector_Sum( workspace->r , 1., b, -1., workspace->q, n );
r_norm = Parallel_Norm( workspace->r, n, world );
Forward_Subs( L, workspace->r, workspace->d );
Backward_Subs( U, workspace->d, workspace->p );
sig_new = Parallel_Dot( workspace->r, workspace->p, n, world );
#if defined(DEBUG_FOCUS)
if ( me == MASTER_NODE )
{
fprintf( stderr, "init_PCG: sig_new=%.15e\n", r_norm );
fprintf( stderr, "init_PCG: |d|=%.15e |q|=%.15e\n",
Parallel_Norm(workspace->d, n, world),
Parallel_Norm(workspace->q, n, world) );
}
MPI_Barrier( world );
#endif
for ( i = 1; i < 100 && r_norm / b_norm > tol; ++i )
{
Dist( system, mpi_data, workspace->p, MPI_DOUBLE, scale, real_packer );
Sparse_MatVec( H, workspace->p, workspace->q, N );
// tryQEq
//Coll(system,mpi_data,workspace->q, MPI_DOUBLE, real_unpacker);
tmp = Parallel_Dot( workspace->q, workspace->p, n, world );
alpha = sig_new / tmp;
Vector_Add( x, alpha, workspace->p, n );
#if defined(DEBUG_FOCUS)
if ( me == MASTER_NODE )
{
fprintf(stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n", i,
Parallel_Norm(workspace->p, n, world),
Parallel_Norm(workspace->q, n, world), tmp );
}
MPI_Barrier( world );
#endif
Vector_Add( workspace->r, -alpha, workspace->q, n );
r_norm = Parallel_Norm( workspace->r, n, world );
#if defined(DEBUG_FOCUS)
if ( me == MASTER_NODE )
{
fprintf( stderr, "iter%d: res=%.15e\n", i, r_norm );
}
MPI_Barrier( world );
#endif
Forward_Subs( L, workspace->r, workspace->d );
Backward_Subs( U, workspace->d, workspace->d );
sig_old = sig_new;
sig_new = Parallel_Dot( workspace->r, workspace->d, n, world );
beta = sig_new / sig_old;
Vector_Sum( workspace->p, 1., workspace->d, beta, workspace->p, n );
}
#if defined(DEBUG_FOCUS)
if ( me == MASTER_NODE )
{
fprintf( stderr, "PCG took %d iterations\n", i );
}
#endif
if ( i >= 100 )
{
fprintf( stderr, "PCG convergence failed!\n" );
}
return i;
}
#if defined(OLD_STUFF)
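/* Legacy serial CG with a diagonal preconditioner, kept under OLD_STUFF;
 * it uses the local Norm()/Dot() routines and performs no halo exchange,
 * so it is only meaningful for a single-process run. */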
int sCG( reax_system *system, storage *workspace, sparse_matrix *H,
real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
{
int i, j;
real tmp, alpha, beta, b_norm;
real sig_old, sig_new, sig0;
b_norm = Norm( b, system->n );
#if defined(DEBUG)
if ( system->my_rank == MASTER_NODE )
{
fprintf( stderr, "n=%d, N=%d\n", system->n, system->N );
fprintf( stderr, "p%d CGinit: b_norm=%24.15e\n", system->my_rank, b_norm );
//Vector_Print( stderr, "d", workspace->d, system->N );
//Vector_Print( stderr, "q", workspace->q, system->N );
}
MPI_Barrier( mpi_data->world );
#endif
Sparse_MatVec( H, x, workspace->q, system->N );
//Coll_Vector( system, workspace, mpi_data, workspace->q );
Vector_Sum( workspace->r , 1., b, -1., workspace->q, system->n );
for ( j = 0; j < system->n; ++j )
workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
sig_new = Dot( workspace->r, workspace->d, system->n );
sig0 = sig_new;
#if defined(DEBUG)
if ( system->my_rank == MASTER_NODE )
{
fprintf( stderr, "p%d CGinit:sig_new=%24.15e\n", system->my_rank, sig_new );
//Vector_Print( stderr, "d", workspace->d, system->N );
//Vector_Print( stderr, "q", workspace->q, system->N );
}
MPI_Barrier( mpi_data->world );
#endif
for ( i = 1; i < 100 && SQRT(sig_new) / b_norm > tol; ++i )
{
//Dist_Vector( system, mpi_data, workspace->d );
Sparse_MatVec( H, workspace->d, workspace->q, system->N );
//Coll_Vector( system, workspace, mpi_data, workspace->q );
tmp = Dot( workspace->d, workspace->q, system->n );
alpha = sig_new / tmp;
#if defined(DEBUG)
if ( system->my_rank == MASTER_NODE )
{
fprintf(stderr,
"p%d CG iter%d:d_norm=%24.15e,q_norm=%24.15e,tmp = %24.15e\n",
system->my_rank, i,
Parallel_Norm(workspace->d, system->n, mpi_data->world),
Parallel_Norm(workspace->q, system->n, mpi_data->world), tmp );
//Vector_Print( stderr, "d", workspace->d, system->N );
//Vector_Print( stderr, "q", workspace->q, system->N );
}
MPI_Barrier( mpi_data->world );
#endif
Vector_Add( x, alpha, workspace->d, system->n );
Vector_Add( workspace->r, -alpha, workspace->q, system->n );
/* pre-conditioning */
for ( j = 0; j < system->n; ++j )
workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
sig_old = sig_new;
sig_new = Dot( workspace->r, workspace->p, system->n );
beta = sig_new / sig_old;
Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
#if defined(DEBUG)
if ( system->my_rank == MASTER_NODE )
fprintf(stderr, "p%d CG iter%d: sig_new = %24.15e\n",
system->my_rank, i, sig_new );
MPI_Barrier( mpi_data->world );
#endif
}
#if defined(DEBUG)
if ( system->my_rank == MASTER_NODE )
fprintf( stderr, "CG took %d iterations\n", i );
#endif
if ( i >= 100 )
{
fprintf( stderr, "CG convergence failed!\n" );
return i;
}
return i;
}
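/* Restarted GMRES with a diagonal preconditioner (kept under OLD_STUFF):
 * each restart cycle builds a Krylov basis with modified Gram-Schmidt,
 * reduces the Hessenberg matrix with Givens rotations, solves the small
 * triangular system by back substitution, and updates x; up to RESTART
 * inner iterations per cycle and MAX_ITR restarts. */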
int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
{
int i, j, k, itr, N;
real cc, tmp1, tmp2, temp, bnorm;
N = system->N;
bnorm = Norm( b, N );
/* apply the diagonal pre-conditioner to rhs */
for ( i = 0; i < N; ++i )
workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];
/* GMRES outer-loop */
for ( itr = 0; itr < MAX_ITR; ++itr )
{
/* calculate r0 */
Sparse_MatVec( H, x, workspace->b_prm, N );
for ( i = 0; i < N; ++i )
workspace->b_prm[i] *= workspace->Hdia_inv[i]; // pre-conditioner
Vector_Sum( workspace->v[0],
1., workspace->b_prc, -1., workspace->b_prm, N );
workspace->g[0] = Norm( workspace->v[0], N );
Vector_Scale( workspace->v[0],
1. / workspace->g[0], workspace->v[0], N );
// fprintf( stderr, "%10.6f\n", workspace->g[0] );
/* GMRES inner-loop */
for ( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ )
{
/* matvec */
Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1], N );
for ( k = 0; k < N; ++k )
workspace->v[j + 1][k] *= workspace->Hdia_inv[k]; // pre-conditioner
// fprintf( stderr, "%d-%d: matvec done.\n", itr, j );
/* apply modified Gram-Schmidt to orthogonalize the new residual */
for ( i = 0; i <= j; i++ )
{
workspace->h[i][j] = Dot(workspace->v[i], workspace->v[j + 1], N);
Vector_Add( workspace->v[j + 1],
-workspace->h[i][j], workspace->v[i], N );
}
workspace->h[j + 1][j] = Norm( workspace->v[j + 1], N );
Vector_Scale( workspace->v[j + 1],
1. / workspace->h[j + 1][j], workspace->v[j + 1], N );
// fprintf(stderr, "%d-%d: orthogonalization completed.\n", itr, j);
/* Givens rotations on the H matrix to make it U */
for ( i = 0; i <= j; i++ )
{
if ( i == j )
{
cc = SQRT(SQR(workspace->h[j][j]) + SQR(workspace->h[j + 1][j]));
workspace->hc[j] = workspace->h[j][j] / cc;
workspace->hs[j] = workspace->h[j + 1][j] / cc;
}
tmp1 = workspace->hc[i] * workspace->h[i][j] +
workspace->hs[i] * workspace->h[i + 1][j];
tmp2 = -workspace->hs[i] * workspace->h[i][j] +
workspace->hc[i] * workspace->h[i + 1][j];
workspace->h[i][j] = tmp1;
workspace->h[i + 1][j] = tmp2;
}
/* apply Givens rotations to the rhs as well */
tmp1 = workspace->hc[j] * workspace->g[j];
tmp2 = -workspace->hs[j] * workspace->g[j];
workspace->g[j] = tmp1;
workspace->g[j + 1] = tmp2;
// fprintf( stderr, "%10.6f\n", fabs(workspace->g[j+1]) );
}
/* solve Hy = g.
H is now upper-triangular, do back-substitution */
for ( i = j - 1; i >= 0; i-- )
{
temp = workspace->g[i];
for ( k = j - 1; k > i; k-- )
temp -= workspace->h[i][k] * workspace->y[k];
workspace->y[i] = temp / workspace->h[i][i];
}
/* update x = x_0 + Vy */
for ( i = 0; i < j; i++ )
Vector_Add( x, workspace->y[i], workspace->v[i], N );
/* stopping condition */
if ( fabs(workspace->g[j]) / bnorm <= tol )
break;
}
/*Sparse_MatVec( system, H, x, workspace->b_prm, mpi_data );
for( i = 0; i < N; ++i )
workspace->b_prm[i] *= workspace->Hdia_inv[i];
fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
for( i = 0; i < N; ++i )
fprintf( fout, "%10.5f%15.12f%15.12f\n",
workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
fprintf( fout, "GMRES outer: %d, inner: %d - |rel residual| = %15.10f\n",
itr, j, fabs( workspace->g[j] ) / bnorm );
if ( itr >= MAX_ITR )
{
fprintf( stderr, "GMRES convergence failed\n" );
return FAILURE;
}
return SUCCESS;
}
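/* Restarted GMRES that orthogonalizes with Householder reflections instead
 * of modified Gram-Schmidt; otherwise it follows the same restart /
 * Givens-rotation / back-substitution structure as GMRES() above. */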
int GMRES_HouseHolder( reax_system *system, storage *workspace,
sparse_matrix *H, real *b, real tol, real *x,
mpi_datatypes* mpi_data, FILE *fout )
{
int i, j, k, itr, N;
real cc, tmp1, tmp2, temp, bnorm;
real v[10000], z[RESTART + 2][10000], w[RESTART + 2];
real u[RESTART + 2][10000];
N = system->N;
bnorm = Norm( b, N );
/* apply the diagonal pre-conditioner to rhs */
for ( i = 0; i < N; ++i )
workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];
/* GMRES outer-loop */
for ( itr = 0; itr < MAX_ITR; ++itr )
{
/* compute z = r0 */
Sparse_MatVec( H, x, workspace->b_prm, N );
for ( i = 0; i < N; ++i )
workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */
Vector_Sum( z[0], 1., workspace->b_prc, -1., workspace->b_prm, N );
Vector_MakeZero( w, RESTART + 1 );
w[0] = Norm( z[0], N );
Vector_Copy( u[0], z[0], N );
u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0];
Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N );
w[0] *= ( u[0][0] < 0.0 ? 1 : -1 );
// fprintf( stderr, "\n\n%12.6f\n", w[0] );
/* GMRES inner-loop */
for ( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ )
{
/* compute v_j */
Vector_Scale( z[j], -2 * u[j][j], u[j], N );
z[j][j] += 1.; /* due to e_j */
for ( i = j - 1; i >= 0; --i )
Vector_Add( z[j] + i, -2 * Dot( u[i] + i, z[j] + i, N - i ), u[i] + i, N - i );
/* matvec */
Sparse_MatVec( H, z[j], v, N );
for ( k = 0; k < N; ++k )
v[k] *= workspace->Hdia_inv[k]; /* pre-conditioner */
for ( i = 0; i <= j; ++i )
Vector_Add( v + i, -2 * Dot( u[i] + i, v + i, N - i ), u[i] + i, N - i );
if ( !Vector_isZero( v + (j + 1), N - (j + 1) ) )
{
/* compute the HouseHolder unit vector u_j+1 */
for ( i = 0; i <= j; ++i )
u[j + 1][i] = 0;
Vector_Copy( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) );
u[j + 1][j + 1] +=
( v[j + 1] < 0.0 ? -1 : 1 ) * Norm( v + (j + 1), N - (j + 1) );
Vector_Scale( u[j + 1], 1 / Norm( u[j + 1], N ), u[j + 1], N );
/* overwrite v with P_m+1 * v */
v[j + 1] -=
2 * Dot( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) ) * u[j + 1][j + 1];
Vector_MakeZero( v + (j + 2), N - (j + 2) );
}
/* previous Givens rotations on H matrix to make it U */
for ( i = 0; i < j; i++ )
{
tmp1 = workspace->hc[i] * v[i] + workspace->hs[i] * v[i + 1];
tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i + 1];
v[i] = tmp1;
v[i + 1] = tmp2;
}
/* apply the new Givens rotation to H and right-hand side */
if ( fabs(v[j + 1]) >= ALMOST_ZERO )
{
cc = SQRT( SQR( v[j] ) + SQR( v[j + 1] ) );
workspace->hc[j] = v[j] / cc;
workspace->hs[j] = v[j + 1] / cc;
tmp1 = workspace->hc[j] * v[j] + workspace->hs[j] * v[j + 1];
tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j + 1];
v[j] = tmp1;
v[j + 1] = tmp2;
/* Givens rotations to rhs */
tmp1 = workspace->hc[j] * w[j];
tmp2 = -workspace->hs[j] * w[j];
w[j] = tmp1;
w[j + 1] = tmp2;
}
/* extend R */
for ( i = 0; i <= j; ++i )
workspace->h[i][j] = v[i];
// fprintf( stderr, "h:" );
// for( i = 0; i <= j+1 ; ++i )
// fprintf( stderr, "%.6f ", h[i][j] );
// fprintf( stderr, "\n" );
// fprintf( stderr, "%12.6f\n", w[j+1] );
}
/* solve Hy = w.
H is now upper-triangular, do back-substitution */
for ( i = j - 1; i >= 0; i-- )
{
temp = w[i];
for ( k = j - 1; k > i; k-- )
temp -= workspace->h[i][k] * workspace->y[k];
workspace->y[i] = temp / workspace->h[i][i];
}
for ( i = j - 1; i >= 0; i-- )
Vector_Add( x, workspace->y[i], z[i], N );
/* stopping condition */
if ( fabs( w[j] ) / bnorm <= tol )
break;
}
// Sparse_MatVec( system, H, x, workspace->b_prm );
// for( i = 0; i < N; ++i )
// workspace->b_prm[i] *= workspace->Hdia_inv[i];
// fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
// for( i = 0; i < N; ++i )
// fprintf( fout, "%10.5f%15.12f%15.12f\n",
// workspace->b_prc[i], workspace->b_prm[i], x[i] );
fprintf( fout, "GMRES outer:%d inner:%d iters, |rel residual| = %15.10f\n",
itr, j, fabs( workspace->g[j] ) / bnorm );
if ( itr >= MAX_ITR )
{
fprintf( stderr, "GMRES convergence failed\n" );
return FAILURE;
}
return SUCCESS;
}
#endif