From 8e91b1e8798f2855315a831013add942f42d940e Mon Sep 17 00:00:00 2001 From: Abdullah Alperen <alperena@msu.edu> Date: Mon, 18 Feb 2019 19:58:34 -0500 Subject: [PATCH] PuReMD-old: implementation of dual-PIPECG --- PuReMD/src/.linear_solvers.c.swo | Bin 0 -> 16384 bytes PuReMD/src/allocate.c | 24 +- PuReMD/src/linear_solvers.c | 478 +++++++++++++++++++++++++++++-- PuReMD/src/linear_solvers.h | 3 + PuReMD/src/qEq.c | 7 +- PuReMD/src/reax_types.h | 8 +- 6 files changed, 486 insertions(+), 34 deletions(-) create mode 100644 PuReMD/src/.linear_solvers.c.swo diff --git a/PuReMD/src/.linear_solvers.c.swo b/PuReMD/src/.linear_solvers.c.swo new file mode 100644 index 0000000000000000000000000000000000000000..7c35c4cfa5c0de52efcb099600e66d1da9c475bb GIT binary patch literal 16384 zcmeI3YmgjO6~`NbL|zJ5QhWg~n~KiP&SPhJS=nsNu<R?@oh-Ani6PlEJu`P^I@{CT zbocD;5P}7$h?W7Wz_PSTu#_T}A|=S9w1867Qlto<5WW!9vPvE*CcFjpe{SEN>FM1p zP{kLm>B?`r`<{EwJ@?#q@21$lePW7ToVq~4b-JR=jIG~YyZd-$PfbxY!{!b*H4?3M zx${P{zrTOAEJAioU@$%1$n&X7gPZG=(uQtw%~4&;Sm2JED!r$%Zia3Ix)Jz*5jagb ze;^|U>bjou*eA}M{eb%2?A-`-BhZaNHv-)VbR*D>KsN&22y`RxzZ?P2K3n+$Dsa54 z#O?BaT;%>Ac~xZk9TEAzrTki%exfuG*m+mVEh)btB7aB9$rrj_kI4Te<-v!6hIBZv z|F)E?zPj>EME;hPFG%^tYy5{m|4k{krTp%Q{0%ABr2L!{I`v<d@++kLnuz>0DX&WT za}oKgQocvZM?T^|1pXbB@-IpG_agEm5&N+dJN5r5<p-qxO%eGkQtnClIcq!h{~_f; z{cnlL|1RabrT&qK{BKfzt(50J+G+n~DG%EBP(=Qcln4E9;G|Cd7o|MtU*C+#WpgTl z|7%a~)R%5`Z`}xVBhZaNHv-)VbR*D>KsN&22y`RRjllnH1hlfEoPpZ?fX>VK{eN)& zfAHgqay_^X%z-pm3;y~sMfnwY3|tQ`0mp&gpQb2}gRg)wa5{MQR7H6Z7+@Z30~dqy zz$xI3QxxTo;Cb*AxEFjEd=;2rC&+`#!1-VecpK-ox4@g=XW)Kt5ZnfC1x>IA%z-T+ z3(}w$oC!_>Z{lot1iS#A2ZzA#z;D4r-~n(y_$k;A_JI-53qAur4c3E~a7KLrJOlm! z9s>`9AA&o<_rVR|T2Kd7PyxHZ7BB%W1Zi*<_#}86=jK<yv)~Z;C3pzj3BC<(1Gj=N zgD-;X!F7Ovr*NkKG58U<8C1Y!U=&;c&H^WZHQ>+qe((gK?+6C~2W7AY#K6hmD85k~ z0fzy7uXq?71b2Z0;A*fB=wKFn7Mur80Y`D*ei}Rq?gY1hCa8iOm;xDaCU^&Z@7LfS zK>hF_Saz-BhRYR(t5Pu<ZY{=KuWU6u);s8@+Kz5|l^{9Mv%bv4zujg}f@PDNOz|^U z5Vv}uW;W`oV=cODi1OqLgUUL+!pgjYzeg#@awD^ae0EYTjO6nZg>Ctry`rp=Rj)&# zZtH4U^R(oqMawbDgJ=a@Mcq`n!Ry@g$ZBqOa?<y(?Apc(DXL+WG@~P@Te2EaZJD-0 z?qZ`f&pktTy$&Po<P~&u({6aG<~UljL&??`tmUR$t!^89Sqbgr6-sc+`m*}E4lgaV z$`jG@R2j_vRKFi4*V7!T$<*|Oy2I1!Jcbq)c^!^8d6uUcLdnlA5`|8Jd1~u{tV%U9 zG)yfPN<&v+@o-D|#zeAcC%;As78h?JX-7LiF5kmBsFo!0BM^2z9&Hq4YbS*&6mK;| zI58S<Dx}pk;f$f?4JCUuhr2b)C?}XOkAT%}=^oWQ7z@4%r5L>8C77dEYq;6S-*C*f z)ZUw5u6`A7r_8q>(TY{>skUR4+KCv0rdrj!8h4-?#Mq@&F$&r_^#lzhlt|5uvMqN| z*(Z7dh8Ppxl4+=l4p7kzk2}mCo|GdPn_koAE{g}RDAm>+m#cNnbMz$^&n`=eXI09h z+M4Gj6u*0MuVIT+XlgbJUQ%_F{!op_sQe|>ZMq(>qdCN#!cuP-nx|W)m?6-OFi#-a zvmC9;SsW9_JW46?1SY!u3VH0H5XWSwr+D=fNzi_xH;$o!o=uN*wPVtnA1S$0hi08R z>tmUf^f2AU7>(^+^*OeYX$Yc1Cg`GBF6Kg$_p&q_@YVey7i+q~q3J@02Rh`O!A+J7 zilUM+t`#!Y_DNP1Il~r(BO2S}J0s1^1*Vs2%jv@R#;*iQ9zuD63iT3n5T2lb?Ncx} zczQ|n9n9`!)gO2<hE=49s>CS<2_Z(=i6B|TLpBlBf-X!X=pkG!7crsB_2)|0a=aK5 zQ)&W1!?;krUjJPc1}Vv^#QfrV6K#(cEWONxzOdN#NE)Mhh)1$Rm6PtsFygjSgb{h! z-uvLPLVmZpNJlXJCL(!9Mn*B!G89p5z#mvqwPdsazX>zcjRt~Nz|#<m%WAVYg`_Hb z5?POmU{WDpWUFYI1Z?@;gyJA7EBp*KXSh9LXdM$vAq6!~Wx+rsMYyJ{<7Qc}#MAx| z4=o+s&}^4ulEfUbz)LEvLDU^Q@ieHU<KZpB@R&M1k{{icpUUQjN6<LFU2G%FlF=x0 z)&p}IR_Ii%2QmX83U(Lc+3h3hWOh0`ycLS+I4W16Ke#P%=BF}v*S1riH-wF2SWu_7 zP6{g%rpFD2H&PGjYX;4+p6^J@JJb-9l5aTh*t0A{Em}*Rsk)_-!0k*eYOY=q>v3m# z<PUmLc{FSCL%C2b#x~YG&)$$uFD@>os%9f)IaT}-afzEQchj3v$T7n?!%D-(a<goV zo7}<fVH!gnbHJpZkyR{*)h&mkNjyzA+zn8h8p#ijXLE(@B@>eqg`EspMkflnk(n8W zR%Tf?oy`{}hG!?Uc{V+ppWZezLZub472PpW7~eKqVA<SGwj-O*!^+M<wy1kG+Nkmc zj9zj`$8JjJWwuCz+Vq;ts=z>@rn?MFOV!cp43F47V2OfR<kWq|nq^a!G>_>X%o+wO za@KHprD0$lVZykqT7vwJA!pNSILxwXDZ<b*n+(emR!UmDkVKx_6ce|GVv0>+V$?z8 zW~_<_2RR$XUKFbrG7mD2s<f5afbV(T!a!=0Psj-|)e&XXU5LfYDlMS7D^$Hg4!F{R zL8<~PX(on8J0?_`W|q^ILlw`Oh+~cIWMc>()>*At-CS~0cEc$*c#4-B$TOo=HHS?@ zNvF+>xNmN_6)E0gx!kUyw#el*6Krp(A|5x_(J@l(eB+u6SEjg!Hp$L=4Wo+kYMTGF ztk7Z0Zi?*_iw*Q=`g_p<$t5;x>Y@d_Cb{qbIXqTPDa~U`F*?eoEQ6P@jd0kdO;fAu zC6`V6{R5N5hcc3q3Z4Hyh4U7j)9Cyk{C@O4ob~SoH-SO00lbKF{tMve;C8SXtO2j# ztp6%F3Vs5<2KIv#IE-`tbKp_%9dI+a2~2>q!O7rBoa^rZbmp&vbHO2;)1L(QfCJ#` zpacfMao|rlfByp92kr&m0xlQ_V_+1V3y$DS{RsF5xB_ekv!DP5!Miv&KMkG$_kkP1 zR<H$}0oH;O!C{=44}!bEjbIPh3P!-C;8gH3&dd*h`+)=Upa(pU^Dv#E>CC(XYzG^` zMc`e;{V(7!cn&-cegmjI9s%8}8-Z0KAU7NP#gf=ML(I$$4`aPrX;(-`jM41G<ZOP# z-&8SNmC&~n?%-(FGd7hUnW23XW~r*!kyJ1Z@-jZA1RLJa##{Fswh1Tp?B~^qLO$5| zk!89d#dMlr6|LyTSnK4I+*IAQ7u$OrEQ+S(w7$r+oT*^ZY`KDG?rgvZ9jt{|%IQf* z3byxGJJ{{uV5^rKnsGHYl<PAWrZW8%`tC!MD`V+2{q(O;@kJa?OS|;BMA#=vtc%1Y z{0!;}3sJY$XZlkYRzz6?!A%rN<xqahi7{R5t#RAPa)Yd|Pe*J5c@T41&D|y}y09k@ zAFX<XhxmFW9_sY4B*Guqh*XCBX7QnLb;irOMn_dh{N{cip3V3t<SEVD&PzDp$s_G} zf-Nl%6-7w6(>@ze1&cTp%Z~x{D$WJ6{W}|d;}DMch71d3g@ZUH+qQ02ssCZI#?Hep zg2Z&eEo@oUY`GkZxYQIaca`{=3mT79Xrt#h$6j$bMU$Yz_75^WnGE;W&RT@xOP0}? zION;tupvPW0{oLvpt6EX<V|)ohsZpSMhaVK?wXg46KMM0&Z9i(?L1D!`XHMZ&IY;Y zF3La=!h=BIk4$=dk7@@Bh&f~9wCOahMb@zK-~N)=dunEtb{};1VMQEal5V5udb-!3 zCSgiQTl}{@HcsCX7gLz70#Ro;sqY279v^T0nPz=vAa#)#tf<A_zD#S(1vXaH3e7{S z=35Wj3<F05o@Sf&q(^#WyK|f(<b4?y9;>U`uWj%Lq!Q!`?$I8CvWHlE7zWwe4McsB zVKnSLM%jGH!-19M+rcFpodKp41d+b<h|XHK<;j@@!G#QSct&5Y+o2Caff%WDU{8iC z;F1dZEN-3gBPB8eg>a9ql~z+ovoUObO_w?Nwntsb#T3Atn%h}5vc2$Ll|hDZWfW2h zU#rwcsP?Voc2MdEs_P2oWZ2r~l(1{`ye!DnM`Zh@SxJ654QqupM3jZb9Ok5za+gYI z8TU&1Fg0^oK1RlA#qDdo`CX+qIxgBX2$kKQy#wcZY4*%cPHmehsF#oAw}s|!Qoyjn zS5m`2O`3c$+ThgQ<8*xUKP@L~^oRwVclLngof`va_0CNr3=RaHTOn`4*UJ5YmQ67@ zVZ1ZIfkc%0e)~h9G#4@Fl5?V<70h*pJUTDXw4nF(EswM9@CDs!xX}>f>}?{(@-sK$ zUDxLE<-ue0Up~VTSsrA$yiP&ka_(5@HE=gZy>g&8!E#tX<x8~J1ZD?N$xAKtxkk{J zrG5tbEo1c3GL}IXgjS#+^_B-b=zuIa)KLp68okcKc6HIg#t_BOXBRfD)-jzU=AD(M z?6R6O_~jeE;s+%sSO#M$hy7?c4SR`1D2v!yC}bphi6JIh7h693N{fAtB{OUTj8gS{ ev&W9W^7;mDwE{VG%!<&bhPKZ^ldT@|LFK;)5>(~@ literal 0 HcmV?d00001 diff --git a/PuReMD/src/allocate.c b/PuReMD/src/allocate.c index 32b68ec1..b2c9aa5b 100644 --- a/PuReMD/src/allocate.c +++ b/PuReMD/src/allocate.c @@ -202,7 +202,8 @@ void DeAllocate_Workspace( control_params *control, storage *workspace ) sfree( workspace->w, "w" ); } - if ( control->cm_solver_type == CG_S ) + if ( control->cm_solver_type == CG_S + || control->cm_solver_type == PIPECG_S ) { sfree( workspace->r2, "r2" ); sfree( workspace->d2, "d2" ); @@ -210,6 +211,15 @@ void DeAllocate_Workspace( control_params *control, storage *workspace ) sfree( workspace->p2, "p2" ); } + if ( control->cm_solver_type == PIPECG_S ) + { + sfree( workspace->m2, "m2" ); + sfree( workspace->n2, "n2" ); + sfree( workspace->u2, "u2" ); + sfree( workspace->w2, "w2" ); + sfree( workspace->w2, "z2" ); + } + /* integrator */ // sfree( workspace->f_old ); sfree( workspace->v_const, "v_const" ); @@ -363,7 +373,8 @@ int Allocate_Workspace( reax_system *system, control_params *control, workspace->w = scalloc( total_cap, sizeof(real), "w", comm ); } - if ( control->cm_solver_type == CG_S ) + if ( control->cm_solver_type == CG_S + || control->cm_solver_type == PIPECG_S ) { workspace->d2 = scalloc( total_cap, sizeof(rvec2), "d2", comm ); workspace->r2 = scalloc( total_cap, sizeof(rvec2), "r2", comm ); @@ -371,6 +382,15 @@ int Allocate_Workspace( reax_system *system, control_params *control, workspace->q2 = scalloc( total_cap, sizeof(rvec2), "q2", comm ); } + if ( control->cm_solver_type == PIPECG_S ) + { + workspace->m2 = scalloc( total_cap, sizeof(rvec2), "m2", comm ); + workspace->n2 = scalloc( total_cap, sizeof(rvec2), "n2", comm ); + workspace->u2 = scalloc( total_cap, sizeof(rvec2), "u2", comm ); + workspace->w2 = scalloc( total_cap, sizeof(rvec2), "w2", comm ); + workspace->z2 = scalloc( total_cap, sizeof(rvec2), "z2", comm ); + } + /* integrator storage */ workspace->v_const = smalloc( local_rvec, "v_const", comm ); diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c index 0fcb83e6..939b702b 100644 --- a/PuReMD/src/linear_solvers.c +++ b/PuReMD/src/linear_solvers.c @@ -423,8 +423,11 @@ real setup_sparse_approx_inverse( reax_system *system, simulation_data *data, st "setup_sparse_approx_inverse::bucketlist_local", MPI_COMM_WORLD ); dspls = smalloc( sizeof(int) * nprocs, "setup_sparse_approx_inverse::dspls", MPI_COMM_WORLD ); - pivotlist = smalloc( sizeof(real) * (nprocs - 1), - "setup_sparse_approx_inverse::pivotlist", MPI_COMM_WORLD ); + if ( nprocs > 1 ) + { + pivotlist = smalloc( sizeof(real) * (nprocs - 1), + "setup_sparse_approx_inverse::pivotlist", MPI_COMM_WORLD ); + } samplelist_local = smalloc( sizeof(real) * s_local, "setup_sparse_approx_inverse::samplelist_local", MPI_COMM_WORLD ); if ( system->my_rank == MASTER_NODE ) @@ -690,7 +693,10 @@ real setup_sparse_approx_inverse( reax_system *system, simulation_data *data, st sfree( dspls_local, "setup_sparse_approx_inverse::displs_local" ); sfree( bucketlist_local, "setup_sparse_approx_inverse::bucketlist_local" ); sfree( dspls, "setup_sparse_approx_inverse::dspls" ); - sfree( pivotlist, "setup_sparse_approx_inverse::pivotlist" ); + if ( nprocs > 1) + { + sfree( pivotlist, "setup_sparse_approx_inverse::pivotlist" ); + } sfree( samplelist_local, "setup_sparse_approx_inverse::samplelist_local" ); if ( system->my_rank == MASTER_NODE ) { @@ -1745,29 +1751,6 @@ int dual_CG( reax_system *system, control_params *control, simulation_data *data b_norm[0] = sqrt( redux[4] ); b_norm[1] = sqrt( redux[5] ); - /*// norm of b - my_sum[0] = my_sum[1] = 0; - for ( j = 0; j < n; ++j ) - { - my_sum[0] += SQR( b[j][0] ); - my_sum[1] += SQR( b[j][1] ); - } - - MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm ); - - b_norm[0] = sqrt( norm_sqr[0] ); - b_norm[1] = sqrt( norm_sqr[1] ); - - // dot product: r.d - my_dot[0] = my_dot[1] = 0; - for ( j = 0; j < n; ++j ) - { - my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0]; - my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1]; - } - - MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );*/ - for ( i = 0; i < control->cm_solver_max_iters; ++i ) { if ( norm[0] / b_norm[0] <= tol || norm[1] / b_norm[1] <= tol ) @@ -1915,7 +1898,7 @@ int dual_CG( reax_system *system, control_params *control, simulation_data *data MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world ); } - // continue to solve the system that is not converged yet + // continue to solve the system that has not converged yet if ( norm[0] / b_norm[0] > tol ) { for ( j = 0; j < system->n; ++j ) @@ -2157,6 +2140,446 @@ int CG( reax_system *system, control_params *control, simulation_data *data, } +/* Pipelined Preconditioned Conjugate Gradient Method + * + * References: + * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm, + * P. Ghysels and W. Vanroose, Parallel Computing, 2014. + * 2) Scalable Non-blocking Preconditioned Conjugate Gradient Methods, + * Paul R. Eller and William Gropp, SC '16 Proceedings of the International Conference + * for High Performance Computing, Networking, Storage and Analysis, 2016. + * */ +int dual_PIPECG( reax_system *system, control_params *control, simulation_data *data, + storage *workspace, sparse_matrix *H, rvec2 *b, + real tol, rvec2 *x, mpi_datatypes* mpi_data ) +{ + int i, j; + rvec2 alpha, beta, delta, gamma_old, gamma_new, norm, b_norm; + real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce; + real timings[5], redux[8]; + MPI_Request req; + + t_pa = 0.0; + t_spmv = 0.0; + t_vops = 0.0; + t_comm = 0.0; + t_allreduce = 0.0; + + t_start = MPI_Wtime( ); + Dist( system, mpi_data, x, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); + t_comm += MPI_Wtime( ) - t_start; + + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + dual_Sparse_MatVec( H, x, workspace->u2, H->NT ); +#else + dual_Sparse_MatVec( H, x, workspace->u2, system->N ); +#endif + t_spmv += MPI_Wtime( ) - t_start; + + if ( H->format == SYM_HALF_MATRIX ) + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->u2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); + t_comm += MPI_Wtime( ) - t_start; + } +#if defined(NEUTRAL_TERRITORY) + else + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->u2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); + t_comm += MPI_Wtime( ) - t_start; + } +#endif + + t_start = MPI_Wtime( ); + //Vector_Sum( workspace->r , 1.0, b, -1.0, workspace->u, system->n ); + for ( j = 0; j < system->n; ++j ) + { + workspace->r2[j][0] = b[j][0] - workspace->u2[j][0]; + workspace->r2[j][1] = b[j][1] - workspace->u2[j][1]; + } + t_vops += MPI_Wtime( ) - t_start; + + /* pre-conditioning */ + if ( control->cm_solver_pre_comp_type == NONE_PC ) + { + //Vector_Copy( workspace->u, workspace->r, system->n ); + for ( j = 0; j < system->n ; ++j ) + { + workspace->u2[j][0] = workspace->r2[j][0]; + workspace->u2[j][1] = workspace->r2[j][1]; + } + } + else if ( control->cm_solver_pre_comp_type == JACOBI_PC ) + { + t_start = MPI_Wtime( ); + for ( j = 0; j < system->n; ++j ) + { + workspace->u2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; + workspace->u2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; + } + t_pa += MPI_Wtime( ) - t_start; + } + else if ( control->cm_solver_pre_comp_type == SAI_PC ) + { + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->r2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); + t_comm += MPI_Wtime( ) - t_start; + + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + dual_Sparse_MatVec( workspace->H_app_inv, workspace->r2, workspace->u2, H->NT ); +#else + dual_Sparse_MatVec( workspace->H_app_inv, workspace->r2, workspace->u2, system->n ); +#endif + t_pa += MPI_Wtime( ) - t_start; + } + + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->u2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); + t_comm += MPI_Wtime( ) - t_start; + + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + dual_Sparse_MatVec( H, workspace->u2, workspace->w2, H->NT ); +#else + dual_Sparse_MatVec( H, workspace->u2, workspace->w2, system->N ); +#endif + t_spmv += MPI_Wtime( ) - t_start; + + if ( H->format == SYM_HALF_MATRIX ) + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); + t_comm += MPI_Wtime( ) - t_start; + } +#if defined(NEUTRAL_TERRITORY) + else + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); + t_comm += MPI_Wtime( ) - t_start; + } +#endif + + t_start = MPI_Wtime( ); + //redux[0] = Dot_local( workspace->w, workspace->u, system->n ); + //redux[1] = Dot_local( workspace->r, workspace->u, system->n ); + //redux[2] = Dot_local( workspace->u, workspace->u, system->n ); + //redux[3] = Dot_local( b, b, system->n ); + for ( j = 0; j < 8; ++j ) + { + redux[j] = 0.0; + } + for( j = 0; j < system->n; ++j ) + { + redux[0] += workspace->w2[j][0] * workspace->u2[j][0]; + redux[1] += workspace->w2[j][1] * workspace->u2[j][1]; + + redux[2] += workspace->r2[j][0] * workspace->u2[j][0]; + redux[3] += workspace->r2[j][1] * workspace->u2[j][1]; + + redux[4] += workspace->u2[j][0] * workspace->u2[j][0]; + redux[5] += workspace->u2[j][1] * workspace->u2[j][1]; + + redux[6] += b[j][0] * b[j][0]; + redux[7] += b[j][1] * b[j][1]; + } + t_vops += MPI_Wtime( ) - t_start; + + MPI_Iallreduce( MPI_IN_PLACE, redux, 8, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req ); + + /* pre-conditioning */ + if ( control->cm_solver_pre_comp_type == NONE_PC ) + { + //Vector_Copy( workspace->m, workspace->w, system->n ); + for ( j = 0; j < system->n; ++j ) + { + workspace->m2[j][0] = workspace->w2[j][0]; + workspace->m2[j][1] = workspace->w2[j][1]; + } + } + else if ( control->cm_solver_pre_comp_type == JACOBI_PC ) + { + t_start = MPI_Wtime( ); + for ( j = 0; j < system->n; ++j ) + { + workspace->m2[j][0] = workspace->w2[j][0] * workspace->Hdia_inv[j]; + workspace->m2[j][1] = workspace->w2[j][1] * workspace->Hdia_inv[j]; + } + t_pa += MPI_Wtime( ) - t_start; + } + else if ( control->cm_solver_pre_comp_type == SAI_PC ) + { + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); + t_comm += MPI_Wtime( ) - t_start; + + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + dual_Sparse_MatVec( workspace->H_app_inv, workspace->w2, workspace->m2, H->NT ); +#else + dual_Sparse_MatVec( workspace->H_app_inv, workspace->w2, workspace->m2, system->n ); +#endif + t_pa += MPI_Wtime( ) - t_start; + } + + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->m2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); + t_comm += MPI_Wtime( ) - t_start; + + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + dual_Sparse_MatVec( H, workspace->m2, workspace->n2, H->NT ); +#else + dual_Sparse_MatVec( H, workspace->m2, workspace->n2, system->N ); +#endif + t_spmv += MPI_Wtime( ) - t_start; + + if ( H->format == SYM_HALF_MATRIX ) + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); + t_comm += MPI_Wtime( ) - t_start; + } +#if defined(NEUTRAL_TERRITORY) + else + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); + t_comm += MPI_Wtime( ) - t_start; + } +#endif + + t_start = MPI_Wtime( ); + MPI_Wait( &req, MPI_STATUS_IGNORE ); + t_allreduce += MPI_Wtime( ) - t_start; + delta[0] = redux[0]; + delta[1] = redux[1]; + gamma_new[0] = redux[2]; + gamma_new[1] = redux[3]; + norm[0] = sqrt( redux[4] ); + norm[1] = sqrt( redux[5] ); + b_norm[0] = sqrt( redux[6] ); + b_norm[1] = sqrt( redux[7] ); + + for ( i = 0; i < control->cm_solver_max_iters; ++i ) + { + if ( norm[0] / b_norm[0] <= tol || norm[1] / b_norm[1] <= tol ) + { + break; + } + if ( i > 0 ) + { + beta[0] = gamma_new[0] / gamma_old[0]; + beta[1] = gamma_new[1] / gamma_old[1]; + alpha[0] = gamma_new[0] / (delta[0] - beta[0] / alpha[0] * gamma_new[0]); + alpha[1] = gamma_new[1] / (delta[1] - beta[1] / alpha[1] * gamma_new[1]); + } + else + { + beta[0] = 0.0; + beta[1] = 0.0; + alpha[0] = gamma_new[0] / delta[0]; + alpha[1] = gamma_new[1] / delta[1]; + } + + t_start = MPI_Wtime( ); + //Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n ); + //Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n ); + //Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n ); + //Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n ); + //Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n ); + //Vector_Sum( workspace->u, 1.0, workspace->u, -alpha, workspace->q, system->n ); + //Vector_Sum( workspace->w, 1.0, workspace->w, -alpha, workspace->z, system->n ); + //Vector_Sum( workspace->r, 1.0, workspace->r, -alpha, workspace->d, system->n ); + //redux[0] = Dot_local( workspace->w, workspace->u, system->n ); + //redux[1] = Dot_local( workspace->r, workspace->u, system->n ); + //redux[2] = Dot_local( workspace->u, workspace->u, system->n ); + for ( j = 0; j < 6; ++j ) + { + redux[j] = 0.0; + } + for ( j = 0; j < system->n; ++j ) + { + workspace->z2[j][0] = workspace->n2[j][0] + beta[0] * workspace->z2[j][0]; + workspace->z2[j][1] = workspace->n2[j][1] + beta[1] * workspace->z2[j][1]; + + workspace->q2[j][0] = workspace->m2[j][0] + beta[0] * workspace->q2[j][0]; + workspace->q2[j][1] = workspace->m2[j][1] + beta[1] * workspace->q2[j][1]; + + workspace->p2[j][0] = workspace->u2[j][0] + beta[0] * workspace->p2[j][0]; + workspace->p2[j][1] = workspace->u2[j][1] + beta[1] * workspace->p2[j][1]; + + workspace->d2[j][0] = workspace->w2[j][0] + beta[0] * workspace->d2[j][0]; + workspace->d2[j][1] = workspace->w2[j][1] + beta[1] * workspace->d2[j][1]; + + x[j][0] += alpha[0] * workspace->p2[j][0]; + x[j][1] += alpha[1] * workspace->p2[j][1]; + + workspace->u2[j][0] -= alpha[0] * workspace->q2[j][0]; + workspace->u2[j][1] -= alpha[1] * workspace->q2[j][1]; + + workspace->w2[j][0] -= alpha[0] * workspace->z2[j][0]; + workspace->w2[j][1] -= alpha[1] * workspace->z2[j][1]; + + workspace->r2[j][0] -= alpha[0] * workspace->d2[j][0]; + workspace->r2[j][1] -= alpha[1] * workspace->d2[j][1]; + + redux[0] += workspace->w2[j][0] * workspace->u2[j][0]; + redux[1] += workspace->w2[j][1] * workspace->u2[j][1]; + + redux[2] += workspace->r2[j][0] * workspace->u2[j][0]; + redux[3] += workspace->r2[j][1] * workspace->u2[j][1]; + + redux[4] += workspace->u2[j][0] * workspace->u2[j][0]; + redux[5] += workspace->u2[j][1] * workspace->u2[j][1]; + + } + t_vops += MPI_Wtime( ) - t_start; + + MPI_Iallreduce( MPI_IN_PLACE, redux, 6, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req ); + + /* pre-conditioning */ + if ( control->cm_solver_pre_comp_type == NONE_PC ) + { + //Vector_Copy( workspace->m, workspace->w, system->n ); + for ( j = 0; j < system->n; ++j ) + { + workspace->m2[j][0] = workspace->w2[j][0]; + workspace->m2[j][1] = workspace->w2[j][1]; + } + } + else if ( control->cm_solver_pre_comp_type == JACOBI_PC ) + { + t_start = MPI_Wtime( ); + for ( j = 0; j < system->n; ++j ) + { + workspace->m2[j][0] = workspace->w2[j][0] * workspace->Hdia_inv[j]; + workspace->m2[j][1] = workspace->w2[j][1] * workspace->Hdia_inv[j]; + } + t_pa += MPI_Wtime( ) - t_start; + } + else if ( control->cm_solver_pre_comp_type == SAI_PC ) + { + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); + t_comm += MPI_Wtime( ) - t_start; + + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + dual_Sparse_MatVec( workspace->H_app_inv, workspace->w2, workspace->m2, H->NT ); +#else + dual_Sparse_MatVec( workspace->H_app_inv, workspace->w2, workspace->m2, system->n ); +#endif + t_pa += MPI_Wtime( ) - t_start; + } + + t_start = MPI_Wtime( ); + Dist( system, mpi_data, workspace->m2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2); + t_comm += MPI_Wtime( ) - t_start; + + t_start = MPI_Wtime( ); +#if defined(NEUTRAL_TERRITORY) + dual_Sparse_MatVec( H, workspace->m2, workspace->n2, H->NT ); +#else + dual_Sparse_MatVec( H, workspace->m2, workspace->n2, system->N ); +#endif + t_spmv += MPI_Wtime( ) - t_start; + + if ( H->format == SYM_HALF_MATRIX ) + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); + t_comm += MPI_Wtime( ) - t_start; + } +#if defined(NEUTRAL_TERRITORY) + else + { + t_start = MPI_Wtime( ); + Coll( system, mpi_data, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2); + t_comm += MPI_Wtime( ) - t_start; + } +#endif + + gamma_old[0] = gamma_new[0]; + gamma_old[1] = gamma_new[1]; + + t_start = MPI_Wtime( ); + MPI_Wait( &req, MPI_STATUS_IGNORE ); + t_allreduce += MPI_Wtime( ) - t_start; + delta[0] = redux[0]; + delta[1] = redux[1]; + gamma_new[0] = redux[2]; + gamma_new[1] = redux[3]; + norm[0] = sqrt( redux[4] ); + norm[1] = sqrt( redux[5] ); + } + + timings[0] = t_pa; + timings[1] = t_spmv; + timings[2] = t_vops; + timings[3] = t_comm; + timings[4] = t_allreduce; + + if ( system->my_rank == MASTER_NODE ) + { + MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world ); + + data->timing.cm_solver_pre_app += timings[0] / control->nprocs; + data->timing.cm_solver_spmv += timings[1] / control->nprocs; + data->timing.cm_solver_vector_ops += timings[2] / control->nprocs; + data->timing.cm_solver_comm += timings[3] / control->nprocs; + data->timing.cm_solver_allreduce += timings[4] / control->nprocs; + } + else + { + MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world ); + } + + // continue to solve the system that has not converged yet + if ( norm[0] / b_norm[0] > tol ) + { + for ( j = 0; j < system->n; ++j ) + { + workspace->s[j] = workspace->x[j][0]; + } + + i += PIPECG( system, control, data, workspace, + H, workspace->b_s, tol, workspace->s, mpi_data ); + + for ( j = 0; j < system->n; ++j ) + { + workspace->x[j][0] = workspace->s[j]; + } + } + else if ( norm[1] / b_norm[1] > tol ) + { + for ( j = 0; j < system->n; ++j ) + { + workspace->t[j] = workspace->x[j][1]; + } + + i += PIPECG( system, control, data, workspace, + H, workspace->b_t, tol, workspace->t, mpi_data ); + + for ( j = 0; j < system->n; ++j ) + { + workspace->x[j][1] = workspace->t[j]; + } + } + + if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) + { + fprintf( stderr, "[WARNING] PIPECG convergence failed!\n" ); + return i; + } + + return i; +} + + /* Pipelined Preconditioned Conjugate Gradient Method * * References: @@ -2647,7 +3070,6 @@ int PIPECR( reax_system *system, control_params *control, simulation_data *data, t_comm += MPI_Wtime( ) - t_start; } #endif - t_start = MPI_Wtime( ); MPI_Wait( &req, MPI_STATUS_IGNORE ); t_allreduce += MPI_Wtime( ) - t_start; diff --git a/PuReMD/src/linear_solvers.h b/PuReMD/src/linear_solvers.h index 42892a8f..701ee82d 100644 --- a/PuReMD/src/linear_solvers.h +++ b/PuReMD/src/linear_solvers.h @@ -37,6 +37,9 @@ int dual_CG( reax_system*, control_params*, simulation_data*, storage*, sparse_m int CG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*, real*, real, real*, mpi_datatypes* ); +int dual_PIPECG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*, + rvec2*, real, rvec2*, mpi_datatypes* ); + int PIPECG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*, real*, real, real*, mpi_datatypes* ); diff --git a/PuReMD/src/qEq.c b/PuReMD/src/qEq.c index 96336c66..3ba18eeb 100644 --- a/PuReMD/src/qEq.c +++ b/PuReMD/src/qEq.c @@ -445,7 +445,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data, switch ( control->cm_solver_type ) { case CG_S: -#if defined(DUAL_CG) +#if defined(DUAL_SOLVER) iters = dual_CG( system, control, data, workspace, workspace->H, workspace->b, control->cm_solver_q_err, workspace->x, mpi_data ); #else @@ -478,6 +478,10 @@ void QEq( reax_system *system, control_params *control, simulation_data *data, break; case PIPECG_S: +#if defined(DUAL_SOLVER) + iters = dual_PIPECG( system, control, data, workspace, workspace->H, workspace->b, + control->cm_solver_q_err, workspace->x, mpi_data ); +#else for ( j = 0; j < system->n; ++j ) { workspace->s[j] = workspace->x[j][0]; @@ -503,6 +507,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data, { workspace->x[j][1] = workspace->t[j]; } +#endif break; case PIPECR_S: diff --git a/PuReMD/src/reax_types.h b/PuReMD/src/reax_types.h index 224b78c7..c55d413f 100644 --- a/PuReMD/src/reax_types.h +++ b/PuReMD/src/reax_types.h @@ -40,8 +40,8 @@ /************* SOME DEFS - crucial for reax_types.h *********/ #define PURE_REAX -//#define DUAL_CG -//#define NEUTRAL_TERRITORY +#define DUAL_SOLVER +#define NEUTRAL_TERRITORY //#define LAMMPS_REAX //#define DEBUG //#define DEBUG_FOCUS @@ -1059,7 +1059,9 @@ typedef struct /* PIPECG, PIPECR storage */ real *m, *n, *u, *w; /* dual-CG storage */ - rvec2 *r2, *d2, *q2, *p2; + rvec2 *d2, *p2, *q2, *r2; + /* dual-PIPECG storage */ + rvec2 *m2, *n2, *u2, *w2, *z2; /* Taper */ real Tap[8]; -- GitLab