From 8e91b1e8798f2855315a831013add942f42d940e Mon Sep 17 00:00:00 2001
From: Abdullah Alperen <alperena@msu.edu>
Date: Mon, 18 Feb 2019 19:58:34 -0500
Subject: [PATCH] PuReMD-old: implementation of dual-PIPECG

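Add a dual-vector pipelined preconditioned conjugate gradient solver
(dual_PIPECG) for the qEq charge model, following Ghysels and Vanroose
(2014): the dot products of each iteration are fused into a single
non-blocking MPI_Iallreduce, which is overlapped with the preconditioner
application and the sparse matrix-vector product. The two qEq systems
are iterated simultaneously in rvec2 storage; if only one of them
converges, the other is finished with the single-vector PIPECG.

Other changes:
- allocate/deallocate the new dual workspace vectors (m2, n2, u2, w2, z2)
- skip the pivotlist allocation in setup_sparse_approx_inverse for
  single-process runs (nprocs == 1)
- rename the DUAL_CG switch to DUAL_SOLVER, enable it together with
  NEUTRAL_TERRITORY, and dispatch PIPECG_S to dual_PIPECG when it is set

Rough sketch of one fused iteration (scalar form; the code applies the
same recurrences to both components of every rvec2):

    beta  = gamma_new / gamma_old;
    alpha = gamma_new / (delta - beta / alpha * gamma_new);
    z = n + beta * z;   q = m + beta * q;
    p = u + beta * p;   d = w + beta * d;
    x += alpha * p;     u -= alpha * q;
    w -= alpha * z;     r -= alpha * d;
    /* local parts of delta = (w,u), gamma_new = (r,u), |u|^2 */
    MPI_Iallreduce( ... );   /* overlapped with the two steps below */
    m = M^{-1} w;            /* preconditioner application */
    n = A m;                 /* sparse matrix-vector product */
    MPI_Wait( ... );         /* then unpack delta, gamma_new, |u| */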
---
 PuReMD/src/allocate.c            |  24 +-
 PuReMD/src/linear_solvers.c      | 478 +++++++++++++++++++++++++++++--
 PuReMD/src/linear_solvers.h      |   3 +
 PuReMD/src/qEq.c                 |   7 +-
 PuReMD/src/reax_types.h          |   8 +-
 5 files changed, 486 insertions(+), 34 deletions(-)

diff --git a/PuReMD/src/allocate.c b/PuReMD/src/allocate.c
index 32b68ec1..b2c9aa5b 100644
--- a/PuReMD/src/allocate.c
+++ b/PuReMD/src/allocate.c
@@ -202,7 +202,8 @@ void DeAllocate_Workspace( control_params *control, storage *workspace )
         sfree( workspace->w, "w" );
     }
 
-    if ( control->cm_solver_type == CG_S )
+    if ( control->cm_solver_type == CG_S
+            || control->cm_solver_type == PIPECG_S )
     {
         sfree( workspace->r2, "r2" );
         sfree( workspace->d2, "d2" );
@@ -210,6 +211,15 @@ void DeAllocate_Workspace( control_params *control, storage *workspace )
         sfree( workspace->p2, "p2" );
     }
 
+    if ( control->cm_solver_type == PIPECG_S )
+    {
+        sfree( workspace->m2, "m2" );
+        sfree( workspace->n2, "n2" );
+        sfree( workspace->u2, "u2" );
+        sfree( workspace->w2, "w2" );
+        sfree( workspace->z2, "z2" );
+    }
+
     /* integrator */
     // sfree( workspace->f_old );
     sfree( workspace->v_const, "v_const" );
@@ -363,7 +373,8 @@ int Allocate_Workspace( reax_system *system, control_params *control,
         workspace->w = scalloc( total_cap, sizeof(real), "w", comm );
     }
 
-    if ( control->cm_solver_type == CG_S )
+    if ( control->cm_solver_type == CG_S
+            || control->cm_solver_type == PIPECG_S )
     {
         workspace->d2 = scalloc( total_cap, sizeof(rvec2), "d2", comm );
         workspace->r2 = scalloc( total_cap, sizeof(rvec2), "r2", comm );
@@ -371,6 +382,15 @@ int Allocate_Workspace( reax_system *system, control_params *control,
         workspace->q2 = scalloc( total_cap, sizeof(rvec2), "q2", comm );
     }
 
+    if ( control->cm_solver_type == PIPECG_S )
+    {
+        workspace->m2 = scalloc( total_cap, sizeof(rvec2), "m2", comm );
+        workspace->n2 = scalloc( total_cap, sizeof(rvec2), "n2", comm );
+        workspace->u2 = scalloc( total_cap, sizeof(rvec2), "u2", comm );
+        workspace->w2 = scalloc( total_cap, sizeof(rvec2), "w2", comm );
+        workspace->z2 = scalloc( total_cap, sizeof(rvec2), "z2", comm );
+    }
+
     /* integrator storage */
     workspace->v_const = smalloc( local_rvec, "v_const", comm );
 
diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c
index 0fcb83e6..939b702b 100644
--- a/PuReMD/src/linear_solvers.c
+++ b/PuReMD/src/linear_solvers.c
@@ -423,8 +423,11 @@ real setup_sparse_approx_inverse( reax_system *system, simulation_data *data, st
           "setup_sparse_approx_inverse::bucketlist_local", MPI_COMM_WORLD );
     dspls = smalloc( sizeof(int) * nprocs,
            "setup_sparse_approx_inverse::dspls", MPI_COMM_WORLD );
-    pivotlist = smalloc( sizeof(real) *  (nprocs - 1),
-           "setup_sparse_approx_inverse::pivotlist", MPI_COMM_WORLD );
+    if ( nprocs > 1 )
+    {
+        pivotlist = smalloc( sizeof(real) *  (nprocs - 1),
+                "setup_sparse_approx_inverse::pivotlist", MPI_COMM_WORLD );
+    }
     samplelist_local = smalloc( sizeof(real) * s_local,
            "setup_sparse_approx_inverse::samplelist_local", MPI_COMM_WORLD );
     if ( system->my_rank == MASTER_NODE )
@@ -690,7 +693,10 @@ real setup_sparse_approx_inverse( reax_system *system, simulation_data *data, st
     sfree( dspls_local, "setup_sparse_approx_inverse::displs_local" );
     sfree( bucketlist_local, "setup_sparse_approx_inverse::bucketlist_local" );
     sfree( dspls, "setup_sparse_approx_inverse::dspls" );
-    sfree( pivotlist, "setup_sparse_approx_inverse::pivotlist" );
+    if ( nprocs > 1 )
+    {
+        sfree( pivotlist, "setup_sparse_approx_inverse::pivotlist" );
+    }
     sfree( samplelist_local, "setup_sparse_approx_inverse::samplelist_local" );
     if ( system->my_rank == MASTER_NODE )
     {
@@ -1745,29 +1751,6 @@ int dual_CG( reax_system *system, control_params *control, simulation_data *data
     b_norm[0] = sqrt( redux[4] );
     b_norm[1] = sqrt( redux[5] );
 
-    /*// norm of b
-    my_sum[0] = my_sum[1] = 0;
-    for ( j = 0; j < n; ++j )
-    {
-        my_sum[0] += SQR( b[j][0] );
-        my_sum[1] += SQR( b[j][1] );
-    }
-
-    MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
-
-    b_norm[0] = sqrt( norm_sqr[0] );
-    b_norm[1] = sqrt( norm_sqr[1] );
-
-    // dot product: r.d
-    my_dot[0] = my_dot[1] = 0;
-    for ( j = 0; j < n; ++j )
-    {
-        my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0];
-        my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1];
-    }
-
-    MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );*/
-
     for ( i = 0; i < control->cm_solver_max_iters; ++i )
     {
         if ( norm[0] / b_norm[0] <= tol || norm[1] / b_norm[1] <= tol )
@@ -1915,7 +1898,7 @@ int dual_CG( reax_system *system, control_params *control, simulation_data *data
         MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
     }
 
-    // continue to solve the system that is not converged yet
+    // continue to solve the system that has not converged yet
     if ( norm[0] / b_norm[0] > tol )
     {
         for ( j = 0; j < system->n; ++j )
@@ -2157,6 +2140,446 @@ int CG( reax_system *system, control_params *control, simulation_data *data,
 }
 
 
+/* Pipelined Preconditioned Conjugate Gradient Method
+ *
+ * References:
+ * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm,
+ *  P. Ghysels and W. Vanroose, Parallel Computing, 2014.
+ * 2) Scalable Non-blocking Preconditioned Conjugate Gradient Methods,
+ *  Paul R. Eller and William Gropp, SC '16 Proceedings of the International Conference
+ *  for High Performance Computing, Networking, Storage and Analysis, 2016.
+ */
+int dual_PIPECG( reax_system *system, control_params *control, simulation_data *data,
+        storage *workspace, sparse_matrix *H, rvec2 *b,
+        real tol, rvec2 *x, mpi_datatypes* mpi_data )
+{
+    int i, j;
+    rvec2 alpha, beta, delta, gamma_old, gamma_new, norm, b_norm;
+    real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce;
+    real timings[5], redux[8];
+    MPI_Request req;
+
+    t_pa = 0.0;
+    t_spmv = 0.0;
+    t_vops = 0.0;
+    t_comm = 0.0;
+    t_allreduce = 0.0;
+
+    t_start = MPI_Wtime( );
+    Dist( system, mpi_data, x, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+    t_comm += MPI_Wtime( ) - t_start;
+
+    t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+    dual_Sparse_MatVec( H, x, workspace->u2, H->NT );
+#else
+    dual_Sparse_MatVec( H, x, workspace->u2, system->N );
+#endif
+    t_spmv += MPI_Wtime( ) - t_start;
+
+    if ( H->format == SYM_HALF_MATRIX )
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->u2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#if defined(NEUTRAL_TERRITORY)
+    else
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->u2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#endif
+
+    t_start = MPI_Wtime( );
+    //Vector_Sum( workspace->r, 1.0, b, -1.0, workspace->u, system->n );
+    for ( j = 0; j < system->n; ++j )
+    {
+        workspace->r2[j][0] = b[j][0] - workspace->u2[j][0];
+        workspace->r2[j][1] = b[j][1] - workspace->u2[j][1];
+    }
+    t_vops += MPI_Wtime( ) - t_start;
+
+    /* pre-conditioning */
+    if ( control->cm_solver_pre_comp_type == NONE_PC )
+    {
+        //Vector_Copy( workspace->u, workspace->r, system->n );
+        for ( j = 0; j < system->n ; ++j )
+        {
+            workspace->u2[j][0] = workspace->r2[j][0];
+            workspace->u2[j][1] = workspace->r2[j][1];
+        }
+    }
+    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
+    {
+        t_start = MPI_Wtime( );
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->u2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
+            workspace->u2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+        }
+        t_pa += MPI_Wtime( ) - t_start;
+    }
+    else if ( control->cm_solver_pre_comp_type == SAI_PC )
+    {
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->r2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+
+        t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+        dual_Sparse_MatVec( workspace->H_app_inv, workspace->r2, workspace->u2, H->NT );
+#else
+        dual_Sparse_MatVec( workspace->H_app_inv, workspace->r2, workspace->u2, system->n );
+#endif
+        t_pa += MPI_Wtime( ) - t_start;
+    }
+
+    t_start = MPI_Wtime( );
+    Dist( system, mpi_data, workspace->u2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+    t_comm += MPI_Wtime( ) - t_start;
+
+    t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+    dual_Sparse_MatVec( H, workspace->u2, workspace->w2, H->NT );
+#else
+    dual_Sparse_MatVec( H, workspace->u2, workspace->w2, system->N );
+#endif
+    t_spmv += MPI_Wtime( ) - t_start;
+
+    if ( H->format == SYM_HALF_MATRIX )
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#if defined(NEUTRAL_TERRITORY)
+    else
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#endif
+
+    t_start = MPI_Wtime( );
+    //redux[0] = Dot_local( workspace->w, workspace->u, system->n );
+    //redux[1] = Dot_local( workspace->r, workspace->u, system->n );
+    //redux[2] = Dot_local( workspace->u, workspace->u, system->n );
+    //redux[3] = Dot_local( b, b, system->n );
+    for ( j = 0; j < 8; ++j )
+    {
+        redux[j] = 0.0;
+    }
+    for ( j = 0; j < system->n; ++j )
+    {
+        redux[0] += workspace->w2[j][0] * workspace->u2[j][0];
+        redux[1] += workspace->w2[j][1] * workspace->u2[j][1];
+
+        redux[2] += workspace->r2[j][0] * workspace->u2[j][0];
+        redux[3] += workspace->r2[j][1] * workspace->u2[j][1];
+
+        redux[4] += workspace->u2[j][0] * workspace->u2[j][0];
+        redux[5] += workspace->u2[j][1] * workspace->u2[j][1];
+
+        redux[6] += b[j][0] * b[j][0];
+        redux[7] += b[j][1] * b[j][1];
+    }
+    t_vops += MPI_Wtime( ) - t_start;
+
+    MPI_Iallreduce( MPI_IN_PLACE, redux, 8, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
+
+    /* pre-conditioning */
+    if ( control->cm_solver_pre_comp_type == NONE_PC )
+    {
+        //Vector_Copy( workspace->m, workspace->w, system->n );
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->m2[j][0] = workspace->w2[j][0];
+            workspace->m2[j][1] = workspace->w2[j][1];
+        }
+    }
+    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
+    {
+        t_start = MPI_Wtime( );
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->m2[j][0] = workspace->w2[j][0] * workspace->Hdia_inv[j];
+            workspace->m2[j][1] = workspace->w2[j][1] * workspace->Hdia_inv[j];
+        }
+        t_pa += MPI_Wtime( ) - t_start;
+    }
+    else if ( control->cm_solver_pre_comp_type == SAI_PC )
+    {
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+
+        t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+        dual_Sparse_MatVec( workspace->H_app_inv, workspace->w2, workspace->m2, H->NT );
+#else
+        dual_Sparse_MatVec( workspace->H_app_inv, workspace->w2, workspace->m2, system->n );
+#endif
+        t_pa += MPI_Wtime( ) - t_start;
+    }
+
+    t_start = MPI_Wtime( );
+    Dist( system, mpi_data, workspace->m2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+    t_comm += MPI_Wtime( ) - t_start;
+
+    t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+    dual_Sparse_MatVec( H, workspace->m2, workspace->n2, H->NT );
+#else
+    dual_Sparse_MatVec( H, workspace->m2, workspace->n2, system->N );
+#endif
+    t_spmv += MPI_Wtime( ) - t_start;
+
+    if ( H->format == SYM_HALF_MATRIX )
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#if defined(NEUTRAL_TERRITORY)
+    else
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#endif
+
+    t_start = MPI_Wtime( );
+    MPI_Wait( &req, MPI_STATUS_IGNORE );
+    t_allreduce += MPI_Wtime( ) - t_start;
+    delta[0] = redux[0];
+    delta[1] = redux[1];
+    gamma_new[0] = redux[2];
+    gamma_new[1] = redux[3];
+    norm[0] = sqrt( redux[4] );
+    norm[1] = sqrt( redux[5] );
+    b_norm[0] = sqrt( redux[6] );
+    b_norm[1] = sqrt( redux[7] );
+
+    for ( i = 0; i < control->cm_solver_max_iters; ++i )
+    {
+        if ( norm[0] / b_norm[0] <= tol || norm[1] / b_norm[1] <= tol )
+        {
+            break;
+        }
+        if ( i > 0 )
+        {
+            beta[0] = gamma_new[0] / gamma_old[0];
+            beta[1] = gamma_new[1] / gamma_old[1];
+            alpha[0] = gamma_new[0] / (delta[0] - beta[0] / alpha[0] * gamma_new[0]);
+            alpha[1] = gamma_new[1] / (delta[1] - beta[1] / alpha[1] * gamma_new[1]);
+        }
+        else
+        {
+            beta[0] = 0.0;
+            beta[1] = 0.0;
+            alpha[0] = gamma_new[0] / delta[0];
+            alpha[1] = gamma_new[1] / delta[1];
+        }
+
+        t_start = MPI_Wtime( );
+        //Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n );
+        //Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n );
+        //Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n );
+        //Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n );
+        //Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n );
+        //Vector_Sum( workspace->u, 1.0, workspace->u, -alpha, workspace->q, system->n );
+        //Vector_Sum( workspace->w, 1.0, workspace->w, -alpha, workspace->z, system->n );
+        //Vector_Sum( workspace->r, 1.0, workspace->r, -alpha, workspace->d, system->n );
+        //redux[0] = Dot_local( workspace->w, workspace->u, system->n );
+        //redux[1] = Dot_local( workspace->r, workspace->u, system->n );
+        //redux[2] = Dot_local( workspace->u, workspace->u, system->n );
+        for ( j = 0; j < 6; ++j )
+        {
+            redux[j] = 0.0;
+        }
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->z2[j][0] = workspace->n2[j][0] + beta[0] * workspace->z2[j][0];
+            workspace->z2[j][1] = workspace->n2[j][1] + beta[1] * workspace->z2[j][1];
+
+            workspace->q2[j][0] = workspace->m2[j][0] + beta[0] * workspace->q2[j][0];
+            workspace->q2[j][1] = workspace->m2[j][1] + beta[1] * workspace->q2[j][1];
+
+            workspace->p2[j][0] = workspace->u2[j][0] + beta[0] * workspace->p2[j][0];
+            workspace->p2[j][1] = workspace->u2[j][1] + beta[1] * workspace->p2[j][1];
+
+            workspace->d2[j][0] = workspace->w2[j][0] + beta[0] * workspace->d2[j][0];
+            workspace->d2[j][1] = workspace->w2[j][1] + beta[1] * workspace->d2[j][1];
+
+            x[j][0] += alpha[0] * workspace->p2[j][0];
+            x[j][1] += alpha[1] * workspace->p2[j][1];
+
+            workspace->u2[j][0] -= alpha[0] * workspace->q2[j][0];
+            workspace->u2[j][1] -= alpha[1] * workspace->q2[j][1];
+
+            workspace->w2[j][0] -= alpha[0] * workspace->z2[j][0];
+            workspace->w2[j][1] -= alpha[1] * workspace->z2[j][1];
+
+            workspace->r2[j][0] -= alpha[0] * workspace->d2[j][0];
+            workspace->r2[j][1] -= alpha[1] * workspace->d2[j][1];
+
+            redux[0] += workspace->w2[j][0] * workspace->u2[j][0];
+            redux[1] += workspace->w2[j][1] * workspace->u2[j][1];
+
+            redux[2] += workspace->r2[j][0] * workspace->u2[j][0];
+            redux[3] += workspace->r2[j][1] * workspace->u2[j][1];
+
+            redux[4] += workspace->u2[j][0] * workspace->u2[j][0];
+            redux[5] += workspace->u2[j][1] * workspace->u2[j][1];
+        }
+        t_vops += MPI_Wtime( ) - t_start;
+
+        MPI_Iallreduce( MPI_IN_PLACE, redux, 6, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
+
+        /* pre-conditioning */
+        if ( control->cm_solver_pre_comp_type == NONE_PC )
+        {
+            //Vector_Copy( workspace->m, workspace->w, system->n );
+            for ( j = 0; j < system->n; ++j )
+            {
+                workspace->m2[j][0] = workspace->w2[j][0];
+                workspace->m2[j][1] = workspace->w2[j][1];
+            }
+        }
+        else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
+        {
+            t_start = MPI_Wtime( );
+            for ( j = 0; j < system->n; ++j )
+            {
+                workspace->m2[j][0] = workspace->w2[j][0] * workspace->Hdia_inv[j];
+                workspace->m2[j][1] = workspace->w2[j][1] * workspace->Hdia_inv[j];
+            }
+            t_pa += MPI_Wtime( ) - t_start;
+        }
+        else if ( control->cm_solver_pre_comp_type == SAI_PC )
+        {
+            t_start = MPI_Wtime( );
+            Dist( system, mpi_data, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+            t_comm += MPI_Wtime( ) - t_start;
+
+            t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+            dual_Sparse_MatVec( workspace->H_app_inv, workspace->w2, workspace->m2, H->NT );
+#else
+            dual_Sparse_MatVec( workspace->H_app_inv, workspace->w2, workspace->m2, system->n );
+#endif
+            t_pa += MPI_Wtime( ) - t_start;
+        }
+
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->m2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+
+        t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+        dual_Sparse_MatVec( H, workspace->m2, workspace->n2, H->NT );
+#else
+        dual_Sparse_MatVec( H, workspace->m2, workspace->n2, system->N );
+#endif
+        t_spmv += MPI_Wtime( ) - t_start;
+
+        if ( H->format == SYM_HALF_MATRIX )
+        {
+            t_start = MPI_Wtime( );
+            Coll( system, mpi_data, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+            t_comm += MPI_Wtime( ) - t_start;
+        }
+#if defined(NEUTRAL_TERRITORY)
+        else
+        {
+            t_start = MPI_Wtime( );
+            Coll( system, mpi_data, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+            t_comm += MPI_Wtime( ) - t_start;
+        }
+#endif
+
+        gamma_old[0] = gamma_new[0];
+        gamma_old[1] = gamma_new[1];
+
+        t_start = MPI_Wtime( );
+        MPI_Wait( &req, MPI_STATUS_IGNORE );
+        t_allreduce += MPI_Wtime( ) - t_start;
+        delta[0] = redux[0];
+        delta[1] = redux[1];
+        gamma_new[0] = redux[2];
+        gamma_new[1] = redux[3];
+        norm[0] = sqrt( redux[4] );
+        norm[1] = sqrt( redux[5] );
+    }
+
+    timings[0] = t_pa;
+    timings[1] = t_spmv;
+    timings[2] = t_vops;
+    timings[3] = t_comm;
+    timings[4] = t_allreduce;
+
+    if ( system->my_rank == MASTER_NODE )
+    {
+        MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
+
+        data->timing.cm_solver_pre_app += timings[0] / control->nprocs;
+        data->timing.cm_solver_spmv += timings[1] / control->nprocs;
+        data->timing.cm_solver_vector_ops += timings[2] / control->nprocs;
+        data->timing.cm_solver_comm += timings[3] / control->nprocs;
+        data->timing.cm_solver_allreduce += timings[4] / control->nprocs;
+    }
+    else
+    {
+        MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
+    }
+
+    // continue to solve the system that has not converged yet
+    if ( norm[0] / b_norm[0] > tol )
+    {
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->s[j] = workspace->x[j][0];
+        }
+
+        i += PIPECG( system, control, data, workspace,
+                H, workspace->b_s, tol, workspace->s, mpi_data );
+
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->x[j][0] = workspace->s[j];
+        }
+    }
+    else if ( norm[1] / b_norm[1] > tol )
+    {
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->t[j] = workspace->x[j][1];
+        }
+
+        i += PIPECG( system, control, data, workspace,
+                H, workspace->b_t, tol, workspace->t, mpi_data );
+
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->x[j][1] = workspace->t[j];
+        }
+    }
+
+    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
+    {
+        fprintf( stderr, "[WARNING] dual_PIPECG convergence failed!\n" );
+    }
+
+    return i;
+}
+
+
 /* Pipelined Preconditioned Conjugate Gradient Method
  *
  * References:
@@ -2647,7 +3070,6 @@ int PIPECR( reax_system *system, control_params *control, simulation_data *data,
             t_comm += MPI_Wtime( ) - t_start;
         }
 #endif
-
         t_start = MPI_Wtime( );
         MPI_Wait( &req, MPI_STATUS_IGNORE );
         t_allreduce += MPI_Wtime( ) - t_start;
diff --git a/PuReMD/src/linear_solvers.h b/PuReMD/src/linear_solvers.h
index 42892a8f..701ee82d 100644
--- a/PuReMD/src/linear_solvers.h
+++ b/PuReMD/src/linear_solvers.h
@@ -37,6 +37,9 @@ int dual_CG( reax_system*, control_params*, simulation_data*, storage*, sparse_m
 int CG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*,
         real*, real, real*, mpi_datatypes* );
 
+int dual_PIPECG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*,
+        rvec2*, real, rvec2*, mpi_datatypes* );
+
 int PIPECG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*,
         real*, real, real*, mpi_datatypes* );
 
diff --git a/PuReMD/src/qEq.c b/PuReMD/src/qEq.c
index 96336c66..3ba18eeb 100644
--- a/PuReMD/src/qEq.c
+++ b/PuReMD/src/qEq.c
@@ -445,7 +445,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
     switch ( control->cm_solver_type )
     {
     case CG_S:
-#if defined(DUAL_CG)
+#if defined(DUAL_SOLVER)
         iters = dual_CG( system, control, data, workspace, workspace->H, workspace->b,
                 control->cm_solver_q_err, workspace->x, mpi_data );
 #else
@@ -478,6 +478,10 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
         break;
 
     case PIPECG_S:
+#if defined(DUAL_SOLVER)
+        iters = dual_PIPECG( system, control, data, workspace, workspace->H, workspace->b,
+                control->cm_solver_q_err, workspace->x, mpi_data );
+#else
         for ( j = 0; j < system->n; ++j )
         {
             workspace->s[j] = workspace->x[j][0];
@@ -503,6 +507,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
         {
             workspace->x[j][1] = workspace->t[j];
         }
+#endif
         break;
 
     case PIPECR_S:
diff --git a/PuReMD/src/reax_types.h b/PuReMD/src/reax_types.h
index 224b78c7..c55d413f 100644
--- a/PuReMD/src/reax_types.h
+++ b/PuReMD/src/reax_types.h
@@ -40,8 +40,8 @@
 /************* SOME DEFS - crucial for reax_types.h *********/
 
 #define PURE_REAX
-//#define DUAL_CG
-//#define NEUTRAL_TERRITORY
+#define DUAL_SOLVER
+#define NEUTRAL_TERRITORY
 //#define LAMMPS_REAX
 //#define DEBUG
 //#define DEBUG_FOCUS
@@ -1059,7 +1059,9 @@ typedef struct
     /* PIPECG, PIPECR storage */
     real *m, *n, *u, *w;
     /* dual-CG storage */
-    rvec2 *r2, *d2, *q2, *p2;
+    rvec2 *d2, *p2, *q2, *r2;
+    /* dual-PIPECG storage */
+    rvec2 *m2, *n2, *u2, *w2, *z2;
     /* Taper */
     real Tap[8];
 
-- 
GitLab