From 0f8f5bf2772591bf247c2d7e6d9d97e4e02493aa Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@msu.edu>
Date: Fri, 7 May 2021 19:17:42 -0400
Subject: [PATCH] PG-PuReMD: utilize multiple CUDA streams to run independent
 kernels in the ReaxFF+QEq task dependency graph.

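Kernel-level dependencies between streams are expressed with CUDA events
(created with cudaEventDisableTiming to keep record/wait cheap) rather than
full stream synchronizations: the producing stream records an event once its
kernels are queued, and each consuming stream waits on that event before
launching dependent work. See the stream and event assignment comments added
in cuda_environment.cu.

A condensed sketch of the "init dist done" hand-off wired up in
cuda_forces.cu (illustrative only; the flags argument is spelled out here,
while the code uses the two-argument form and relies on the default of 0):

    /* producer: far-neighbor distances finished on stream 0 */
    cudaEventRecord( control->stream_events[0], control->streams[0] );

    /* consumers: init bonds (stream 1), init hbonds (stream 2), and the
     * charge matrix init kernels (stream 4) wait before launching */
    cudaStreamWaitEvent( control->streams[1], control->stream_events[0], 0 );
    cudaStreamWaitEvent( control->streams[2], control->stream_events[0], 0 );
    cudaStreamWaitEvent( control->streams[4], control->stream_events[0], 0 );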
---
 PG-PuReMD/src/cuda/cuda_allocate.cu       |  12 +-
 PG-PuReMD/src/cuda/cuda_bond_orders.cu    |   8 +-
 PG-PuReMD/src/cuda/cuda_bonds.cu          |  15 +-
 PG-PuReMD/src/cuda/cuda_charges.cu        |  70 ++++----
 PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu  |  15 +-
 PG-PuReMD/src/cuda/cuda_environment.cu    |  40 +++--
 PG-PuReMD/src/cuda/cuda_forces.cu         | 159 +++++++++++------
 PG-PuReMD/src/cuda/cuda_forces.h          |  10 +-
 PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu |   5 +-
 PG-PuReMD/src/cuda/cuda_init_md.cu        |  19 +-
 PG-PuReMD/src/cuda/cuda_multi_body.cu     |   5 +-
 PG-PuReMD/src/cuda/cuda_nonbonded.cu      |  36 ++--
 PG-PuReMD/src/cuda/cuda_reset_tools.cu    |   5 +-
 PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu   | 205 +++++++++++-----------
 PG-PuReMD/src/cuda/cuda_system_props.cu   |  20 +--
 PG-PuReMD/src/cuda/cuda_torsion_angles.cu |   5 +-
 PG-PuReMD/src/cuda/cuda_valence_angles.cu |   8 +-
 PG-PuReMD/src/reax_types.h                |  13 +-
 18 files changed, 372 insertions(+), 278 deletions(-)
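
Note: the device scratch space in storage also becomes per-stream in this
change. workspace->scratch and workspace->scratch_size turn into arrays with
one slot per CUDA stream (MAX_CUDA_STREAMS entries, NULL-initialized in
Cuda_Initialize), so kernels queued concurrently on different streams no
longer share a single scratch buffer. A minimal sketch of the usage pattern,
following the calls visible in this diff (the element type, size, and slot
index i below are illustrative only):

    /* scratch slot i is only touched from work queued on control->streams[i] */
    sCudaCheckMalloc( &workspace->scratch[i], &workspace->scratch_size[i],
            sizeof(real) * system->n, __FILE__, __LINE__ );
    real *spad = (real *) workspace->scratch[i];
    sCudaMemsetAsync( spad, 0, sizeof(real) * system->n,
            control->streams[i], __FILE__, __LINE__ );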

diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
index 5c346e7c..87fd72fa 100644
--- a/PG-PuReMD/src/cuda/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -43,10 +43,10 @@ static void Cuda_Reallocate_System_Part1( reax_system *system,
 {
     int *temp;
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0],
             sizeof(int) * local_cap_old,
             __FILE__, __LINE__ );
-    temp = (int *) workspace->scratch;
+    temp = (int *) workspace->scratch[0];
 
     sCudaMemcpyAsync( temp, system->d_cm_entries, sizeof(int) * local_cap_old,
             cudaMemcpyDeviceToDevice, control->streams[0], __FILE__, __LINE__ );
@@ -76,11 +76,11 @@ static void Cuda_Reallocate_System_Part2( reax_system *system, control_params *c
     int *temp;
     reax_atom *temp_atom;
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0],
             MAX( sizeof(reax_atom), sizeof(int) ) * total_cap_old,
             __FILE__, __LINE__ );
-    temp = (int *) workspace->scratch;
-    temp_atom = (reax_atom *) workspace->scratch;
+    temp = (int *) workspace->scratch[0];
+    temp_atom = (reax_atom *) workspace->scratch[0];
 
     /* free the existing storage for atoms, leave other info allocated */
     sCudaMemcpyAsync( temp_atom, system->d_my_atoms, sizeof(reax_atom) * total_cap_old,
@@ -173,7 +173,7 @@ void Cuda_Allocate_Grid( reax_system *system, control_params *control )
 //    grid_cell local_cell;
     grid *host = &system->my_grid;
     grid *device = &system->d_my_grid;
-//    ivec *nbrs_x = (ivec *) workspace->scratch;
+//    ivec *nbrs_x = (ivec *) workspace->scratch[0];
 
     total = host->ncells[0] * host->ncells[1] * host->ncells[2];
     ivec_Copy( device->ncells, host->ncells );
diff --git a/PG-PuReMD/src/cuda/cuda_bond_orders.cu b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
index b8be616a..c8ff70e9 100644
--- a/PG-PuReMD/src/cuda/cuda_bond_orders.cu
+++ b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
@@ -999,6 +999,8 @@ void Cuda_Compute_Bond_Orders( reax_system * const system, control_params * cons
 {
     int blocks;
 
+    cudaStreamWaitEvent( control->streams[0], control->stream_events[2] );
+
     k_bond_order_part1 <<< control->blocks_n, control->block_size_n, 0,
                        control->streams[0] >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, 
@@ -1032,6 +1034,8 @@ void Cuda_Compute_Bond_Orders( reax_system * const system, control_params * cons
         ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, 
          *(workspace->d_workspace), system->N );
     cudaCheckError( );
+
+    cudaEventRecord( control->stream_events[4], control->streams[0] );
 }
 
 
@@ -1062,9 +1066,9 @@ void Cuda_Total_Forces_Part1( reax_system * const system, control_params * const
     }
     else
     {
-        sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+        sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0],
                 sizeof(rvec) * 2 * system->N, __FILE__, __LINE__ );
-        spad_rvec = (rvec *) workspace->scratch;
+        spad_rvec = (rvec *) workspace->scratch[0];
         sCudaMemsetAsync( spad_rvec, 0, sizeof(rvec) * 2 * system->N,
                 control->streams[0], __FILE__, __LINE__ );
         cudaStreamSynchronize( control->streams[0] );
diff --git a/PG-PuReMD/src/cuda/cuda_bonds.cu b/PG-PuReMD/src/cuda/cuda_bonds.cu
index 6f8e9ff9..96f45704 100644
--- a/PG-PuReMD/src/cuda/cuda_bonds.cu
+++ b/PG-PuReMD/src/cuda/cuda_bonds.cu
@@ -279,19 +279,20 @@ void Cuda_Compute_Bonds( reax_system *system, control_params *control,
     int update_energy;
     real *spad;
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0],
             sizeof(real) * system->n, __FILE__, __LINE__ );
 
-    spad = (real *) workspace->scratch;
+    spad = (real *) workspace->scratch[0];
     update_energy = (out_control->energy_update_freq > 0
             && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE;
 #else
     sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_en.e_bond,
-            0, sizeof(real), control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
+            0, sizeof(real), control->streams[1], __FILE__, __LINE__ );
 #endif
 
-//    k_bonds <<< control->blocks, control->block_size, 0, control->streams[0] >>>
+    cudaStreamWaitEvent( control->streams[1], control->stream_events[4] );
+
+//    k_bonds <<< control->blocks, control->block_size, 0, control->streams[1] >>>
 //        ( system->d_my_atoms, system->reax_param.d_gp,
 //          system->reax_param.d_sbp, system->reax_param.d_tbp,
 //          *(workspace->d_workspace), *(lists[BONDS]), 
@@ -309,7 +310,7 @@ void Cuda_Compute_Bonds( reax_system *system, control_params *control,
 
     k_bonds_opt <<< blocks, DEF_BLOCK_SIZE,
                 sizeof(cub::WarpReduce<double>::TempStorage) * (DEF_BLOCK_SIZE / 32),
-                control->streams[0] >>>
+                control->streams[1] >>>
         ( system->d_my_atoms, system->reax_param.d_gp,
           system->reax_param.d_sbp, system->reax_param.d_tbp,
           *(workspace->d_workspace), *(lists[BONDS]), 
@@ -329,4 +330,6 @@ void Cuda_Compute_Bonds( reax_system *system, control_params *control,
                 system->n );
     }
 #endif
+
+    cudaEventRecord( control->stream_events[5], control->streams[1] );
 }
diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
index deaca1fd..b85a1a88 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -61,7 +61,7 @@ static void jacobi( reax_system const * const system,
     blocks = system->n / DEF_BLOCK_SIZE
         + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    k_jacobi <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
+    k_jacobi <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[4] >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, 
           *(workspace->d_workspace), system->n );
     cudaCheckError( );
@@ -89,11 +89,9 @@ void Sort_Matrix_Rows( sparse_matrix * const A, reax_system const * const system
     start = (int *) smalloc( sizeof(int) * system->total_cap, "Sort_Matrix_Rows::start" );
     end = (int *) smalloc( sizeof(int) * system->total_cap, "Sort_Matrix_Rows::end" );
     sCudaMemcpyAsync( start, A->start, sizeof(int) * system->total_cap, 
-            cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
+            cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ );
     sCudaMemcpyAsync( end, A->end, sizeof(int) * system->total_cap, 
-            cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
+            cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ );
 
     /* make copies of column indices and non-zero values */
     sCudaMalloc( (void **) &d_j_temp, sizeof(int) * system->total_cm_entries,
@@ -101,11 +99,11 @@ void Sort_Matrix_Rows( sparse_matrix * const A, reax_system const * const system
     sCudaMalloc( (void **) &d_val_temp, sizeof(real) * system->total_cm_entries,
             __FILE__, __LINE__ );
     sCudaMemcpyAsync( d_j_temp, A->j, sizeof(int) * system->total_cm_entries,
-            cudaMemcpyDeviceToDevice, control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
+            cudaMemcpyDeviceToDevice, control->streams[4], __FILE__, __LINE__ );
     sCudaMemcpyAsync( d_val_temp, A->val, sizeof(real) * system->total_cm_entries,
-            cudaMemcpyDeviceToDevice, control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
+            cudaMemcpyDeviceToDevice, control->streams[4], __FILE__, __LINE__ );
+
+    cudaStreamSynchronize( control->streams[4] );
 
     for ( i = 0; i < system->n; ++i )
     {
@@ -246,7 +244,7 @@ static void Spline_Extrapolate_Charges_QEq( reax_system const * const system,
         + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
     k_spline_extrapolate_charges_qeq <<< blocks, DEF_BLOCK_SIZE, 0,
-                                     control->streams[0] >>>
+                                     control->streams[4] >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, 
           (control_params *)control->d_control_params,
           *(workspace->d_workspace), system->n );
@@ -418,21 +416,21 @@ static void Extrapolate_Charges_QEq_Part2( reax_system const * const system,
     blocks = system->n / DEF_BLOCK_SIZE
         + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[4], &workspace->scratch_size[4],
             sizeof(real) * system->n, __FILE__, __LINE__ );
-    spad = (real *) workspace->scratch;
+    spad = (real *) workspace->scratch[4];
     sCudaMemsetAsync( spad, 0, sizeof(real) * system->n,
-            control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
+            control->streams[4], __FILE__, __LINE__ );
 
     k_extrapolate_charges_qeq_part2 <<< blocks, DEF_BLOCK_SIZE, 0,
-                                    control->streams[0] >>>
+                                    control->streams[4] >>>
         ( system->d_my_atoms, *(workspace->d_workspace), u, spad, system->n );
     cudaCheckError( );
 
     sCudaMemcpyAsync( q, spad, sizeof(real) * system->n, 
-            cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
+            cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ );
+
+    cudaStreamSynchronize( control->streams[4] );
 }
 
 
@@ -462,15 +460,17 @@ static void Update_Ghost_Atom_Charges( reax_system const * const system,
     blocks = (system->N - system->n) / DEF_BLOCK_SIZE
         + (((system->N - system->n) % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[4], &workspace->scratch_size[4],
             sizeof(real) * (system->N - system->n), __FILE__, __LINE__ );
-    spad = (real *) workspace->scratch;
+    spad = (real *) workspace->scratch[4];
+
     sCudaMemcpyAsync( spad, &q[system->n], sizeof(real) * (system->N - system->n),
-            cudaMemcpyHostToDevice, control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
+            cudaMemcpyHostToDevice, control->streams[4], __FILE__, __LINE__ );
+
+    cudaStreamSynchronize( control->streams[4] );
 
     k_update_ghost_atom_charges <<< blocks, DEF_BLOCK_SIZE, 0,
-                                control->streams[0] >>>
+                                control->streams[4] >>>
         ( system->d_my_atoms, spad, system->n, system->N );
     cudaCheckError( );
 }
@@ -494,41 +494,43 @@ static void Calculate_Charges_QEq( reax_system const * const system,
     blocks = system->n / DEF_BLOCK_SIZE
         + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[4], &workspace->scratch_size[4],
             sizeof(rvec2) * (blocks + 1), __FILE__, __LINE__ );
-    spad = (rvec2 *) workspace->scratch;
+    spad = (rvec2 *) workspace->scratch[4];
+
     sCudaMemsetAsync( spad, 0, sizeof(rvec2) * (blocks + 1),
-            control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
+            control->streams[4], __FILE__, __LINE__ );
 
     /* compute local sums of pseudo-charges in s and t on device */
     k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE,
                       sizeof(rvec2) * (DEF_BLOCK_SIZE / 32),
-                      control->streams[0] >>>
+                      control->streams[4] >>>
         ( workspace->d_workspace->x, spad, system->n );
     cudaCheckError( );
 
     k_reduction_rvec2 <<< 1, ((blocks + 31) / 32) * 32,
                       sizeof(rvec2) * ((blocks + 31) / 32),
-                      control->streams[0] >>>
+                      control->streams[4] >>>
         ( spad, &spad[blocks], blocks );
     cudaCheckError( );
 
     sCudaMemcpyAsync( &my_sum, &spad[blocks], sizeof(rvec2),
-            cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
+            cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ );
+
+    cudaStreamSynchronize( control->streams[4] );
 #else
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[4], &workspace->scratch_size[4],
             sizeof(real) * 2, __FILE__, __LINE__ );
-    spad = (real *) workspace->scratch;
+    spad = (real *) workspace->scratch[4];
 
     /* local reductions (sums) on device */
     Cuda_Reduction_Sum( workspace->d_workspace->s, &spad[0], system->n );
     Cuda_Reduction_Sum( workspace->d_workspace->t, &spad[1], system->n );
 
     sCudaMemcpyAsync( my_sum, spad, sizeof(real) * 2,
-            cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
+            cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ );
+
+    cudaStreamSynchronize( control->streams[4] );
 #endif
 
     /* global reduction on pseudo-charges for s and t */
diff --git a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
index b4c62dc2..57652907 100644
--- a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
@@ -562,9 +562,9 @@ real Dot( storage * const workspace,
     real temp;
 //#endif
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[4], &workspace->scratch_size[4],
             sizeof(real) * (k + 1), __FILE__, __LINE__ );
-    spad = (real *) workspace->scratch;
+    spad = (real *) workspace->scratch[4];
 
     Vector_Mult( spad, v1, v2, k );
 
@@ -578,6 +578,7 @@ real Dot( storage * const workspace,
 //#else
     sCudaMemcpyAsync( &temp, &spad[k], sizeof(real),
             cudaMemcpyDeviceToHost, s, __FILE__, __LINE__ );
+
     cudaStreamSynchronize( s );
 
     ret = MPI_Allreduce( &temp, &sum, 1, MPI_DOUBLE, MPI_SUM, comm );
@@ -603,9 +604,9 @@ real Dot_local( storage * const workspace,
 {
     real sum, *spad;
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[4], &workspace->scratch_size[4],
             sizeof(real) * (k + 1), __FILE__, __LINE__ );
-    spad = (real *) workspace->scratch;
+    spad = (real *) workspace->scratch[4];
 
     Vector_Mult( spad, v1, v2, k );
 
@@ -615,6 +616,7 @@ real Dot_local( storage * const workspace,
     //TODO: keep result of reduction on devie and pass directly to CUDA-aware MPI
     sCudaMemcpyAsync( &sum, &spad[k], sizeof(real),
             cudaMemcpyDeviceToHost, s, __FILE__, __LINE__ );
+
     cudaStreamSynchronize( s );
 
     return sum;
@@ -640,9 +642,9 @@ void Dot_local_rvec2( storage * const workspace,
     blocks = (k / DEF_BLOCK_SIZE)
         + ((k % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[4], &workspace->scratch_size[4],
             sizeof(rvec2) * (k + blocks + 1), __FILE__, __LINE__ );
-    spad = (rvec2 *) workspace->scratch;
+    spad = (rvec2 *) workspace->scratch[4];
 
     Vector_Mult_rvec2( spad, v1, v2, k );
 
@@ -662,6 +664,7 @@ void Dot_local_rvec2( storage * const workspace,
     //TODO: keep result of reduction on devie and pass directly to CUDA-aware MPI
     sCudaMemcpyAsync( &sum, &spad[k + blocks], sizeof(rvec2),
             cudaMemcpyDeviceToHost, s, __FILE__, __LINE__ );
+
     cudaStreamSynchronize( s );
 
     *sum1 = sum[0];
diff --git a/PG-PuReMD/src/cuda/cuda_environment.cu b/PG-PuReMD/src/cuda/cuda_environment.cu
index 6cfecd92..4a1b26ea 100644
--- a/PG-PuReMD/src/cuda/cuda_environment.cu
+++ b/PG-PuReMD/src/cuda/cuda_environment.cu
@@ -62,17 +62,17 @@ extern "C" void Cuda_Setup_Environment( reax_system const * const system,
         exit( CANNOT_INITIALIZE );
     }
 
-    /* stream assignment:
-     * 0: init bonds, bond order (uncorrected/corrected), lone pair/over coord/under coord
-     * 1: (after bond order) bonds, valence angels, torsions
-     * 2: init hbonds, (after bonds) hbonds
-     * 3: van der Waals
-     * 4: init CM, CM, Coulomb
+    /* stream assignment (default to 0 for any kernel not listed):
+     * 0: init dist, bond order (uncorrected/corrected), lone pair/over coord/under coord
+     * 1: (after init dist) init bonds, (after bond order) bonds, valence angles, torsions
+     * 2: (after init dist) init hbonds, (after bonds) hbonds
+     * 3: (after init dist) van der Waals
+     * 4: (after init dist) init CM, CM, Coulomb
      */
-    for ( i = 0; i < CUDA_MAX_STREAMS; ++i )
+    for ( i = 0; i < MAX_CUDA_STREAMS; ++i )
     {
         /* all non-CM streams of equal priority */
-        if ( i < CUDA_MAX_STREAMS - 1 )
+        if ( i < MAX_CUDA_STREAMS - 1 )
         {
             ret = cudaStreamCreateWithPriority( &control->streams[i], cudaStreamNonBlocking, least_priority );
         }
@@ -84,7 +84,27 @@ extern "C" void Cuda_Setup_Environment( reax_system const * const system,
 
         if ( ret != cudaSuccess )
         {
-            fprintf( stderr, "[ERROR] CUDA strema creation failed (%d). Terminating...\n",
+            fprintf( stderr, "[ERROR] CUDA stream creation failed (%d). Terminating...\n",
+                    i );
+            exit( CANNOT_INITIALIZE );
+        }
+    }
+
+    /* stream event assignment:
+     * 0: init dist done (stream 0)
+     * 1: init CM done (stream 4)
+     * 2: init bonds done (stream 1)
+     * 3: init hbonds done (stream 2)
+     * 4: bond orders done (stream 0)
+     * 5: bonds done (stream 1)
+     */
+    for ( i = 0; i < MAX_CUDA_STREAM_EVENTS; ++i )
+    {
+        ret = cudaEventCreateWithFlags( &control->stream_events[i], cudaEventDisableTiming );
+
+        if ( ret != cudaSuccess )
+        {
+            fprintf( stderr, "[ERROR] CUDA event creation failed (%d). Terminating...\n",
                     i );
             exit( CANNOT_INITIALIZE );
         }
@@ -112,7 +132,7 @@ extern "C" void Cuda_Cleanup_Environment( control_params const * const control )
     int i;
     cudaError_t ret;
 
-    for ( i = 0; i < CUDA_MAX_STREAMS; ++i )
+    for ( i = 0; i < MAX_CUDA_STREAMS; ++i )
     {
         ret = cudaStreamDestroy( control->streams[i] );
 
diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
index 120f0a43..bc3bb518 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -1671,23 +1671,22 @@ void Cuda_Init_Neighbor_Indices( reax_system *system, control_params *control,
 /* Initialize indices for far hydrogen bonds list post reallocation
  *
  * system: atomic system info. */
-void Cuda_Init_HBond_Indices( reax_system *system, control_params *control,
-        storage *workspace, reax_list *hbond_list )
+void Cuda_Init_HBond_Indices( reax_system *system, storage *workspace,
+        reax_list *hbond_list, cudaStream_t s )
 {
     int blocks, *temp;
 
     blocks = system->total_cap / DEF_BLOCK_SIZE
         + (system->total_cap % DEF_BLOCK_SIZE == 0 ? 0 : 1);
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[2], &workspace->scratch_size[2],
             sizeof(int) * system->total_cap, __FILE__, __LINE__ );
-    temp = (int *) workspace->scratch;
+    temp = (int *) workspace->scratch[2];
 
     /* init indices and end_indices */
-    Cuda_Scan_Excl_Sum( system->d_max_hbonds, temp, system->total_cap,
-            control->streams[0] );
+    Cuda_Scan_Excl_Sum( system->d_max_hbonds, temp, system->total_cap, s );
 
-    k_init_hbond_indices <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
+    k_init_hbond_indices <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, system->d_hbonds, temp, 
           hbond_list->index, hbond_list->end_index, system->total_cap );
     cudaCheckError( );
@@ -1697,8 +1696,8 @@ void Cuda_Init_HBond_Indices( reax_system *system, control_params *control,
 /* Initialize indices for far bonds list post reallocation
  *
  * system: atomic system info. */
-void Cuda_Init_Bond_Indices( reax_system *system, control_params * control,
-        reax_list * bond_list )
+void Cuda_Init_Bond_Indices( reax_system *system, reax_list * bond_list,
+        cudaStream_t s )
 {
     int blocks;
 
@@ -1706,11 +1705,11 @@ void Cuda_Init_Bond_Indices( reax_system *system, control_params * control,
         (system->total_cap % DEF_BLOCK_SIZE == 0 ? 0 : 1);
 
     /* init indices */
-    Cuda_Scan_Excl_Sum( system->d_max_bonds, bond_list->index, system->total_cap,
-            control->streams[0] );
+    Cuda_Scan_Excl_Sum( system->d_max_bonds, bond_list->index,
+            system->total_cap, s );
 
     /* init end_indices */
-    k_init_end_index <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
+    k_init_end_index <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
         ( system->d_bonds, bond_list->index, bond_list->end_index, system->total_cap );
     cudaCheckError( );
 }
@@ -1720,8 +1719,8 @@ void Cuda_Init_Bond_Indices( reax_system *system, control_params * control,
  *
  * system: atomic system info.
  * H: charge matrix */
-void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, control_params *control,
-        sparse_matrix *H )
+void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, sparse_matrix *H,
+        cudaStream_t s )
 {
     int blocks;
 
@@ -1729,12 +1728,11 @@ void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, control_params *contr
         + (H->n_max % DEF_BLOCK_SIZE == 0 ? 0 : 1);
 
     /* init indices */
-    Cuda_Scan_Excl_Sum( system->d_max_cm_entries, H->start, H->n_max,
-           control->streams[0] );
+    Cuda_Scan_Excl_Sum( system->d_max_cm_entries, H->start, H->n_max, s );
 
     //TODO: not needed for full format (Init_Forces sets H->end)
     /* init end_indices */
-    k_init_end_index <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
+    k_init_end_index <<< blocks, DEF_BLOCK_SIZE, 0, s >>>
         ( system->d_cm_entries, H->start, H->end, H->n_max );
     cudaCheckError( );
 }
@@ -1848,9 +1846,9 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     static int dist_done = FALSE, cm_done = FALSE, bonds_done = FALSE, hbonds_done = FALSE;
 #if defined(LOG_PERFORMANCE)
     float time_elapsed;
-    cudaEvent_t time_event[4];
+    cudaEvent_t time_event[7];
     
-    for ( int i = 0; i < 4; ++i )
+    for ( int i = 0; i < 7; ++i )
     {
         cudaEventCreate( &time_event[i] );
     }
@@ -1859,13 +1857,21 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     renbr = (data->step - data->prev_steps) % control->reneighbor == 0 ? TRUE : FALSE;
 
     /* reset reallocation flags on device */
-    sCudaMemsetAsync( system->d_realloc_cm_entries, FALSE, sizeof(int), 
-            control->streams[0], __FILE__, __LINE__ );
-    sCudaMemsetAsync( system->d_realloc_bonds, FALSE, sizeof(int), 
-            control->streams[0], __FILE__, __LINE__ );
-    sCudaMemsetAsync( system->d_realloc_hbonds, FALSE, sizeof(int), 
-            control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
+    if ( cm_done == FALSE )
+    {
+        sCudaMemsetAsync( system->d_realloc_cm_entries, FALSE, sizeof(int), 
+                control->streams[4], __FILE__, __LINE__ );
+    }
+    if ( bonds_done == FALSE )
+    {
+        sCudaMemsetAsync( system->d_realloc_bonds, FALSE, sizeof(int), 
+                control->streams[1], __FILE__, __LINE__ );
+    }
+    if ( hbonds_done == FALSE )
+    {
+        sCudaMemsetAsync( system->d_realloc_hbonds, FALSE, sizeof(int), 
+                control->streams[2], __FILE__, __LINE__ );
+    }
 
 #if defined(LOG_PERFORMANCE)
     cudaEventRecord( time_event[0], control->streams[0] );
@@ -1883,11 +1889,13 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
             ( system->d_my_atoms, *(lists[FAR_NBRS]), system->N );
         cudaCheckError( );
 
+        cudaEventRecord( control->stream_events[0], control->streams[0] );
+
         dist_done = TRUE;
     }
 
 #if defined(LOG_PERFORMANCE)
-    cudaEventRecord( time_event[1], control->streams[0] );
+    cudaEventRecord( time_event[1], control->streams[4] );
 #endif
 
     if ( cm_done == FALSE )
@@ -1898,13 +1906,16 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
         /* update num. rows in matrix for this GPU */
         workspace->d_workspace->H.n = system->n;
 
-        Cuda_Init_Sparse_Matrix_Indices( system, control, &workspace->d_workspace->H );
+        Cuda_Init_Sparse_Matrix_Indices( system, &workspace->d_workspace->H,
+                control->streams[4] );
+
+        cudaStreamWaitEvent( control->streams[4], control->stream_events[0] );
 
         if ( workspace->d_workspace->H.format == SYM_HALF_MATRIX )
         {
             if ( control->tabulate <= 0 )
             {
-                k_init_cm_half_fs <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
+                k_init_cm_half_fs <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[4] >>>
                     ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp,
                       *(workspace->d_workspace), (control_params *) control->d_control_params,
                       *(lists[FAR_NBRS]), system->reax_param.num_atom_types,
@@ -1912,7 +1923,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
             }
             else
             {
-                k_init_cm_half_fs_tab <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
+                k_init_cm_half_fs_tab <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[4] >>>
                     ( system->d_my_atoms, system->reax_param.d_sbp,
                       *(workspace->d_workspace), (control_params *) control->d_control_params,
                       *(lists[FAR_NBRS]), workspace->d_LR, system->reax_param.num_atom_types,
@@ -1923,7 +1934,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
         {
             if ( control->tabulate <= 0 )
             {
-//                k_init_cm_full_fs <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
+//                k_init_cm_full_fs <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[4] >>>
 //                    ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp,
 //                      *(workspace->d_workspace), (control_params *) control->d_control_params,
 //                      *(lists[FAR_NBRS]), system->reax_param.num_atom_types,
@@ -1934,7 +1945,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
 
                 k_init_cm_full_fs_opt <<< blocks, DEF_BLOCK_SIZE,
                                       sizeof(cub::WarpScan<int>::TempStorage) * (DEF_BLOCK_SIZE / 32),
-                                      control->streams[0] >>>
+                                      control->streams[4] >>>
                     ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp,
                       *(workspace->d_workspace), (control_params *) control->d_control_params,
                       *(lists[FAR_NBRS]), system->reax_param.num_atom_types,
@@ -1943,7 +1954,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
             else
             {
                 k_init_cm_full_fs_tab <<< blocks, DEF_BLOCK_SIZE, 0,
-                                      control->streams[0] >>>
+                                      control->streams[4] >>>
                     ( system->d_my_atoms, system->reax_param.d_sbp,
                       *(workspace->d_workspace), (control_params *) control->d_control_params,
                       *(lists[FAR_NBRS]), workspace->d_LR, system->reax_param.num_atom_types,
@@ -1951,10 +1962,13 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
             }
         }
         cudaCheckError( );
+
+        cudaEventRecord( control->stream_events[1], control->streams[4] );
     }
 
 #if defined(LOG_PERFORMANCE)
-    cudaEventRecord( time_event[2], control->streams[0] );
+    cudaEventRecord( time_event[2], control->streams[4] );
+    cudaEventRecord( time_event[3], control->streams[1] );
 #endif
 
     if ( bonds_done == FALSE )
@@ -1962,13 +1976,15 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
         blocks = system->total_cap / DEF_BLOCK_SIZE
             + ((system->total_cap % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
 
-        k_reset_bond_orders <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[0] >>>
+        k_reset_bond_orders <<< blocks, DEF_BLOCK_SIZE, 0, control->streams[1] >>>
             ( *(workspace->d_workspace), system->total_cap );
         cudaCheckError( );
 
-        Cuda_Init_Bond_Indices( system, control, lists[BONDS] );
+        Cuda_Init_Bond_Indices( system, lists[BONDS], control->streams[1] );
 
-//        k_init_bonds <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>>
+        cudaStreamWaitEvent( control->streams[1], control->stream_events[0] );
+
+//        k_init_bonds <<< control->blocks_n, control->block_size_n, 0, control->streams[1] >>>
 //            ( system->d_my_atoms, system->reax_param.d_sbp,
 //              system->reax_param.d_tbp, *(workspace->d_workspace),
 //              (control_params *) control->d_control_params,
@@ -1982,7 +1998,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
 
         k_init_bonds_opt <<< blocks, DEF_BLOCK_SIZE,
                      sizeof(cub::WarpScan<int>::TempStorage) * (DEF_BLOCK_SIZE / 32),
-                     control->streams[0] >>>
+                     control->streams[1] >>>
             ( system->d_my_atoms, system->reax_param.d_sbp,
               system->reax_param.d_tbp, *(workspace->d_workspace),
               (control_params *) control->d_control_params,
@@ -1990,13 +2006,23 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
               system->n, system->N, system->reax_param.num_atom_types,
               system->d_max_bonds, system->d_realloc_bonds );
         cudaCheckError( );
+
+        cudaEventRecord( control->stream_events[2], control->streams[1] );
     }
 
+#if defined(LOG_PERFORMANCE)
+    cudaEventRecord( time_event[4], control->streams[1] );
+    cudaEventRecord( time_event[5], control->streams[2] );
+#endif
+
     if ( control->hbond_cut > 0.0 && system->numH > 0 && hbonds_done == FALSE )
     {
-        Cuda_Init_HBond_Indices( system, control, workspace, lists[HBONDS] );
+        Cuda_Init_HBond_Indices( system, workspace, lists[HBONDS],
+                control->streams[2] );
 
-//        k_init_hbonds <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>>
+        cudaStreamWaitEvent( control->streams[2], control->stream_events[0] );
+
+//        k_init_hbonds <<< control->blocks_n, control->block_size_n, 0, control->streams[2] >>>
 //            ( system->d_my_atoms, system->reax_param.d_sbp,
 //              (control_params *) control->d_control_params,
 //              *(lists[FAR_NBRS]), *(lists[HBONDS]),
@@ -2009,17 +2035,18 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
 
         k_init_hbonds_opt <<< blocks, DEF_BLOCK_SIZE,
                           sizeof(cub::WarpScan<int>::TempStorage) * (DEF_BLOCK_SIZE / 32),
-                          control->streams[0] >>>
+                          control->streams[2] >>>
             ( system->d_my_atoms, system->reax_param.d_sbp,
               (control_params *) control->d_control_params,
               *(lists[FAR_NBRS]), *(lists[HBONDS]),
               system->n, system->N, system->reax_param.num_atom_types,
               system->d_max_hbonds, system->d_realloc_hbonds );
         cudaCheckError( );
-    }
 
+        cudaEventRecord( control->stream_events[3], control->streams[2] );
+    }
 #if defined(LOG_PERFORMANCE)
-    cudaEventRecord( time_event[3], control->streams[0] );
+    cudaEventRecord( time_event[6], control->streams[2] );
 #endif
 
     /* check reallocation flags on device */
@@ -2035,7 +2062,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     if ( bonds_done == FALSE )
     {
         sCudaMemcpyAsync( &realloc_bonds, system->d_realloc_bonds, sizeof(int), 
-                cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ );
+                cudaMemcpyDeviceToHost, control->streams[1], __FILE__, __LINE__ );
     }
     else
     {
@@ -2044,14 +2071,16 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     if ( hbonds_done == FALSE )
     {
         sCudaMemcpyAsync( &realloc_hbonds, system->d_realloc_hbonds, sizeof(int), 
-                cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ );
+                cudaMemcpyDeviceToHost, control->streams[2], __FILE__, __LINE__ );
     }
     else
     {
         realloc_hbonds = FALSE;
     }
 
-    cudaStreamSynchronize( control->streams[0] );
+    cudaStreamSynchronize( control->streams[4] );
+    cudaStreamSynchronize( control->streams[1] );
+    cudaStreamSynchronize( control->streams[2] );
 
 #if defined(LOG_PERFORMANCE)
     if ( cudaEventQuery( time_event[0] ) != cudaSuccess ) 
@@ -2080,10 +2109,28 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
         cudaEventSynchronize( time_event[3] );
     }
 
-    cudaEventElapsedTime( &time_elapsed, time_event[2], time_event[3] ); 
+    if ( cudaEventQuery( time_event[4] ) != cudaSuccess ) 
+    {
+        cudaEventSynchronize( time_event[4] );
+    }
+
+    cudaEventElapsedTime( &time_elapsed, time_event[3], time_event[4] ); 
+    data->timing.init_bond += (real) (time_elapsed / 1000.0);
+
+    if ( cudaEventQuery( time_event[5] ) != cudaSuccess ) 
+    {
+        cudaEventSynchronize( time_event[5] );
+    }
+
+    if ( cudaEventQuery( time_event[6] ) != cudaSuccess ) 
+    {
+        cudaEventSynchronize( time_event[6] );
+    }
+
+    cudaEventElapsedTime( &time_elapsed, time_event[5], time_event[6] ); 
     data->timing.init_bond += (real) (time_elapsed / 1000.0);
     
-    for ( int i = 0; i < 4; ++i )
+    for ( int i = 0; i < 7; ++i )
     {
         cudaEventDestroy( time_event[i] );
     }
@@ -2204,7 +2251,7 @@ int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control,
             ( *(workspace->d_workspace), system->total_cap );
         cudaCheckError( );
 
-        Cuda_Init_Bond_Indices( system, control, lists[BONDS] );
+        Cuda_Init_Bond_Indices( system, lists[BONDS], control->streams[0] );
 
 //        k_init_bonds <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>>
 //            ( system->d_my_atoms, system->reax_param.d_sbp,
@@ -2231,7 +2278,8 @@ int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control,
 
     if ( control->hbond_cut > 0.0 && system->numH > 0 && hbonds_done == FALSE )
     {
-        Cuda_Init_HBond_Indices( system, control, workspace, lists[HBONDS] );
+        Cuda_Init_HBond_Indices( system, workspace, lists[HBONDS],
+                control->streams[0] );
 
 //        k_init_hbonds <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>>
 //            ( system->d_my_atoms, system->reax_param.d_sbp,
@@ -2441,13 +2489,13 @@ extern "C" int Cuda_Compute_Forces( reax_system *system, control_params *control
         simulation_data *data, storage *workspace, reax_list **lists,
         output_controls *out_control, mpi_datatypes *mpi_data )
 {
-    int charge_flag, ret;
+    int i, charge_flag, ret;
     static int init_forces_done = FALSE;
 #if defined(LOG_PERFORMANCE)
     float time_elapsed;
     cudaEvent_t time_event[6];
     
-    for ( int i = 0; i < 6; ++i )
+    for ( i = 0; i < 6; ++i )
     {
         cudaEventCreate( &time_event[i] );
     }
@@ -2521,6 +2569,11 @@ extern "C" int Cuda_Compute_Forces( reax_system *system, control_params *control
         cudaEventRecord( time_event[4], control->streams[0] );
 #endif
 
+        for ( i = 0; i < MAX_CUDA_STREAMS; ++i )
+        {
+            cudaStreamSynchronize( control->streams[i] );
+        }
+
         Cuda_Compute_Total_Force( system, control, data, workspace, lists, mpi_data );
 
 #if defined(LOG_PERFORMANCE)
@@ -2579,7 +2632,7 @@ extern "C" int Cuda_Compute_Forces( reax_system *system, control_params *control
         data->timing.bonded += (real) (time_elapsed / 1000.0);
     }
     
-    for ( int i = 0; i < 6; ++i )
+    for ( i = 0; i < 6; ++i )
     {
         cudaEventDestroy( time_event[i] );
     }
diff --git a/PG-PuReMD/src/cuda/cuda_forces.h b/PG-PuReMD/src/cuda/cuda_forces.h
index 3347db7a..fde3edf9 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.h
+++ b/PG-PuReMD/src/cuda/cuda_forces.h
@@ -7,13 +7,13 @@
 
 void Cuda_Init_Neighbor_Indices( reax_system *, control_params *, reax_list * );
 
-void Cuda_Init_HBond_Indices( reax_system *, control_params *, storage *,
-        reax_list * );
+void Cuda_Init_HBond_Indices( reax_system *, storage *, reax_list *,
+        cudaStream_t );
 
-void Cuda_Init_Bond_Indices( reax_system *, control_params *, reax_list * );
+void Cuda_Init_Bond_Indices( reax_system *, reax_list *, cudaStream_t );
 
-void Cuda_Init_Sparse_Matrix_Indices( reax_system *, control_params *,
-        sparse_matrix * );
+void Cuda_Init_Sparse_Matrix_Indices( reax_system *, sparse_matrix *,
+        cudaStream_t );
 
 void Cuda_Init_Three_Body_Indices( int *, int, reax_list ** );
 
diff --git a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
index c9db3090..ecb88b49 100644
--- a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
+++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
@@ -749,10 +749,10 @@ void Cuda_Compute_Hydrogen_Bonds( reax_system *system, control_params *control,
     real *spad;
     rvec *rvec_spad;
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0],
             (sizeof(real) * 3 + sizeof(rvec)) * system->N + sizeof(rvec) * control->blocks_n,
             __FILE__, __LINE__ );
-    spad = (real *) workspace->scratch;
+    spad = (real *) workspace->scratch[0];
     update_energy = (out_control->energy_update_freq > 0
             && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE;
 #else
@@ -763,7 +763,6 @@ void Cuda_Compute_Hydrogen_Bonds( reax_system *system, control_params *control,
         sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_ext_press,
                 0, sizeof(rvec), control->streams[0], __FILE__, __LINE__ );
     }
-    cudaStreamSynchronize( control->streams[0] );
 #endif
 
     if ( control->virial == 1 )
diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu
index ad777753..43b33bd0 100644
--- a/PG-PuReMD/src/cuda/cuda_init_md.cu
+++ b/PG-PuReMD/src/cuda/cuda_init_md.cu
@@ -210,17 +210,19 @@ void Cuda_Init_Lists( reax_system *system, control_params *control,
 
     Cuda_Allocate_Matrix( &workspace->d_workspace->H, system->n,
             system->local_cap, system->total_cm_entries, SYM_FULL_MATRIX );
-    Cuda_Init_Sparse_Matrix_Indices( system, control, &workspace->d_workspace->H );
+    Cuda_Init_Sparse_Matrix_Indices( system, &workspace->d_workspace->H,
+           control->streams[0] );
 
     Cuda_Make_List( system->total_cap, system->total_bonds,
             TYP_BOND, lists[BONDS] );
-    Cuda_Init_Bond_Indices( system, control, lists[BONDS] );
+    Cuda_Init_Bond_Indices( system, lists[BONDS], control->streams[0] );
 
     if ( control->hbond_cut > 0.0 && system->numH > 0 )
     {
         Cuda_Make_List( system->total_cap, system->total_hbonds,
                 TYP_HBOND, lists[HBONDS] );
-        Cuda_Init_HBond_Indices( system, control, workspace, lists[HBONDS] );
+        Cuda_Init_HBond_Indices( system, workspace, lists[HBONDS],
+                control->streams[0] );
     }
 
     /* 3bodies list: since a more accurate estimate of the num.
@@ -234,6 +236,8 @@ extern "C" void Cuda_Initialize( reax_system *system, control_params *control,
         reax_list **lists, output_controls *out_control,
         mpi_datatypes *mpi_data )
 {
+    int i;
+
     Init_MPI_Datatypes( system, workspace, mpi_data );
 
 #if defined(CUDA_DEVICE_PACK)
@@ -249,7 +253,7 @@ extern "C" void Cuda_Initialize( reax_system *system, control_params *control,
     mpi_data->d_in2_buffer = NULL;
     mpi_data->d_in2_buffer_size = 0;
 
-    for ( int i = 0; i < MAX_NBRS; ++i )
+    for ( i = 0; i < MAX_NBRS; ++i )
     {
         mpi_data->d_out_buffers[i].cnt = 0;
         mpi_data->d_out_buffers[i].index = NULL;
@@ -263,8 +267,11 @@ extern "C" void Cuda_Initialize( reax_system *system, control_params *control,
 
     /* scratch space - set before Cuda_Init_Workspace
      * as Cuda_Init_System utilizes these variables */
-    workspace->scratch = NULL;
-    workspace->scratch_size = 0;
+    for ( i = 0; i < MAX_CUDA_STREAMS; ++i )
+    {
+        workspace->scratch[i] = NULL;
+        workspace->scratch_size[i] = 0;
+    }
     workspace->host_scratch = NULL;
     workspace->host_scratch_size = 0;
 
diff --git a/PG-PuReMD/src/cuda/cuda_multi_body.cu b/PG-PuReMD/src/cuda/cuda_multi_body.cu
index d7d81cc7..e932dfec 100644
--- a/PG-PuReMD/src/cuda/cuda_multi_body.cu
+++ b/PG-PuReMD/src/cuda/cuda_multi_body.cu
@@ -548,10 +548,10 @@ void Cuda_Compute_Atom_Energy( reax_system *system, control_params *control,
     int update_energy;
     real *spad;
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0],
             sizeof(real) * 3 * system->n, __FILE__, __LINE__ );
 
-    spad = (real *) workspace->scratch;
+    spad = (real *) workspace->scratch[0];
     update_energy = (out_control->energy_update_freq > 0
             && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE;
 #else
@@ -561,7 +561,6 @@ void Cuda_Compute_Atom_Energy( reax_system *system, control_params *control,
             0, sizeof(real), control->streams[0], __FILE__, __LINE__ );
     sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_en.e_un,
             0, sizeof(real), control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
 #endif
 
 //    k_atom_energy_part1 <<< control->blocks, control->block_size, 0, control->streams[0] >>>
diff --git a/PG-PuReMD/src/cuda/cuda_nonbonded.cu b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
index 6f7a425a..8f2936a6 100644
--- a/PG-PuReMD/src/cuda/cuda_nonbonded.cu
+++ b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
@@ -835,20 +835,20 @@ static void Cuda_Compute_Polarization_Energy( reax_system *system,
 #if !defined(CUDA_ACCUM_ATOMIC)
     real *spad;
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0],
             sizeof(real) * system->n, __FILE__, __LINE__ );
-    spad = (real *) workspace->scratch;
+    spad = (real *) workspace->scratch[0];
 #else
     sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_en.e_pol,
-            0, sizeof(real), control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
+            0, sizeof(real), control->streams[4], __FILE__, __LINE__ );
+    cudaStreamSynchronize( control->streams[4] );
 #endif
 
     blocks = system->n / DEF_BLOCK_SIZE
         + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
     k_compute_polarization_energy <<< blocks, DEF_BLOCK_SIZE, 0,
-                                  control->streams[0] >>>
+                                  control->streams[4] >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, 
           system->n,
 #if !defined(CUDA_ACCUM_ATOMIC)
@@ -890,22 +890,22 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
     {
         s = sizeof(real) * 2 * system->n;
     }
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0],
             s, __FILE__, __LINE__ );
-    spad = (real *) workspace->scratch;
+    spad = (real *) workspace->scratch[0];
 #endif
 
 #if defined(CUDA_ACCUM_ATOMIC)
         sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_en.e_vdW,
-                0, sizeof(real), control->streams[0], __FILE__, __LINE__ );
+                0, sizeof(real), control->streams[4], __FILE__, __LINE__ );
         sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_en.e_ele,
-                0, sizeof(real), control->streams[0], __FILE__, __LINE__ );
+                0, sizeof(real), control->streams[4], __FILE__, __LINE__ );
         if ( control->virial == 1 )
         {
             sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_ext_press,
-                    0, sizeof(rvec), control->streams[0], __FILE__, __LINE__ );
+                    0, sizeof(rvec), control->streams[4], __FILE__, __LINE__ );
         }
-        cudaStreamSynchronize( control->streams[0] );
+        cudaStreamSynchronize( control->streams[4] );
 #endif
 
     blocks = system->n * 32 / DEF_BLOCK_SIZE
@@ -916,7 +916,7 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
         if ( control->virial == 1 )
         {
 //            k_vdW_coulomb_energy_virial_full <<< control->blocks, control->block_size,
-//                                             0, control->streams[0] >>>
+//                                             0, control->streams[4] >>>
 //                ( system->d_my_atoms, system->reax_param.d_tbp, 
 //                  system->reax_param.d_gp, (control_params *) control->d_control_params, 
 //                  *(workspace->d_workspace), *(lists[FAR_NBRS]), 
@@ -932,7 +932,7 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
 
         k_vdW_coulomb_energy_virial_full_opt <<< blocks, DEF_BLOCK_SIZE,
                                  sizeof(real) * (DEF_BLOCK_SIZE / 32),
-                                 control->streams[0] >>>
+                                 control->streams[4] >>>
             ( system->d_my_atoms, system->reax_param.d_tbp, 
               system->reax_param.d_gp, (control_params *) control->d_control_params, 
               *(workspace->d_workspace), *(lists[FAR_NBRS]), 
@@ -949,7 +949,7 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
         else
         {
 //            k_vdW_coulomb_energy_full <<< control->blocks, control->block_size,
-//                                      0, control->streams[0] >>>
+//                                      0, control->streams[4] >>>
 //                ( system->d_my_atoms, system->reax_param.d_tbp, 
 //                  system->reax_param.d_gp, (control_params *) control->d_control_params, 
 //                  *(workspace->d_workspace), *(lists[FAR_NBRS]), 
@@ -964,7 +964,7 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
 
         k_vdW_coulomb_energy_full_opt <<< blocks, DEF_BLOCK_SIZE,
                                  sizeof(real) * (DEF_BLOCK_SIZE / 32),
-                                 control->streams[0] >>>
+                                 control->streams[4] >>>
             ( system->d_my_atoms, system->reax_param.d_tbp, 
               system->reax_param.d_gp, (control_params *) control->d_control_params, 
               *(workspace->d_workspace), *(lists[FAR_NBRS]), 
@@ -982,7 +982,7 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
     else
     {
         k_vdW_coulomb_energy_tab_full <<< control->blocks, control->block_size,
-                                      0, control->streams[0] >>>
+                                      0, control->streams[4] >>>
             ( system->d_my_atoms, system->reax_param.d_gp, 
               (control_params *) control->d_control_params, 
               *(workspace->d_workspace), *(lists[FAR_NBRS]), 
@@ -1022,13 +1022,13 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
         /* reduction for ext_press */
         k_reduction_rvec <<< control->blocks, control->block_size,
                          sizeof(rvec) * (control->block_size / 32),
-                         control->streams[0] >>>
+                         control->streams[4] >>>
             ( spad_rvec, &spad_rvec[system->n], system->n );
         cudaCheckError( );
 
         k_reduction_rvec <<< 1, control->blocks_pow_2,
                          sizeof(rvec) * (control->blocks_pow_2 / 32),
-                         control->streams[0] >>>
+                         control->streams[4] >>>
             ( &spad_rvec[system->n],
               &((simulation_data *)data->d_simulation_data)->my_ext_press,
               control->blocks );
diff --git a/PG-PuReMD/src/cuda/cuda_reset_tools.cu b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
index e957d352..73ebf11a 100644
--- a/PG-PuReMD/src/cuda/cuda_reset_tools.cu
+++ b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
@@ -75,9 +75,9 @@ void Cuda_Reset_Atoms_HBond_Indices( reax_system* system, control_params *contro
 #if !defined(CUDA_ACCUM_ATOMIC)
     int *hindex;
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0],
             sizeof(int) * system->total_cap, __FILE__, __LINE__ );
-    hindex = (int *) workspace->scratch;
+    hindex = (int *) workspace->scratch[0];
 #endif
 
     k_reset_hindex <<< control->blocks_n, control->block_size_n, 0,
@@ -97,6 +97,7 @@ void Cuda_Reset_Atoms_HBond_Indices( reax_system* system, control_params *contro
 
     sCudaMemcpyAsync( &system->numH, system->d_numH, sizeof(int), 
             cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ );
+
     cudaStreamSynchronize( control->streams[0] );
 }
 
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
index 15210720..0f31b8c3 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
@@ -602,15 +602,15 @@ static void Dual_Sparse_MatVec_Comm_Part1( const reax_system * const system,
     spad = (rvec2 *) workspace->host_scratch;
 
     sCudaMemcpyAsync( spad, (void *) x, sizeof(rvec2) * n,
-            cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
+            cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ );
+
+    cudaStreamSynchronize( control->streams[4] );
 
     /* exploit 3D domain decomposition of simulation space with 3-stage communication pattern */
     Dist( system, mpi_data, spad, buf_type, mpi_type );
 
     sCudaMemcpyAsync( (void *) x, spad, sizeof(rvec2) * n,
-            cudaMemcpyHostToDevice, control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
+            cudaMemcpyHostToDevice, control->streams[4], __FILE__, __LINE__ );
 #endif
 }
 
@@ -633,7 +633,6 @@ static void Dual_Sparse_MatVec_local( control_params const * const control,
     {
         /* half-format requires entries of b be initialized to zero */
         sCudaMemsetAsync( b, 0, sizeof(rvec2) * n, s, __FILE__, __LINE__ );
-        cudaStreamSynchronize( s );
 
         /* 1 thread per row implementation */
 //        k_dual_sparse_matvec_half_csr <<< control->blocks, control->block_size, 0, s >>>
@@ -699,15 +698,16 @@ static void Dual_Sparse_MatVec_Comm_Part2( const reax_system * const system,
                 sizeof(rvec2) * n1, TRUE, SAFE_ZONE,
                 "Dual_Sparse_MatVec_Comm_Part2::workspace->host_scratch" );
         spad = (rvec2 *) workspace->host_scratch;
+
         sCudaMemcpyAsync( spad, b, sizeof(rvec2) * n1,
-                cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ );
-        cudaStreamSynchronize( control->streams[0] );
+                cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ );
+
+        cudaStreamSynchronize( control->streams[4] );
 
         Coll( system, mpi_data, spad, buf_type, mpi_type );
 
         sCudaMemcpyAsync( b, spad, sizeof(rvec2) * n2,
-                cudaMemcpyHostToDevice, control->streams[0], __FILE__, __LINE__ );
-        cudaStreamSynchronize( control->streams[0] );
+                cudaMemcpyHostToDevice, control->streams[4], __FILE__, __LINE__ );
 #endif
     }
 }
@@ -743,7 +743,7 @@ static void Dual_Sparse_MatVec( const reax_system * const system,
     Update_Timing_Info( &time, &data->timing.cm_solver_comm );
 #endif
 
-    Dual_Sparse_MatVec_local( control, A, x, b, n, control->streams[0] );
+    Dual_Sparse_MatVec_local( control, A, x, b, n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
@@ -783,16 +783,17 @@ static void Sparse_MatVec_Comm_Part1( const reax_system * const system,
             sizeof(real) * n, TRUE, SAFE_ZONE,
             "Sparse_MatVec_Comm_Part1::workspace->host_scratch" );
     spad = (real *) workspace->host_scratch;
+
     sCudaMemcpyAsync( spad, (void *) x, sizeof(real) * n,
-            cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
+            cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ );
+
+    cudaStreamSynchronize( control->streams[4] );
 
     /* exploit 3D domain decomposition of simulation space with 3-stage communication pattern */
     Dist( system, mpi_data, spad, buf_type, mpi_type );
 
     sCudaMemcpyAsync( (void *) x, spad, sizeof(real) * n,
-            cudaMemcpyHostToDevice, control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
+            cudaMemcpyHostToDevice, control->streams[4], __FILE__, __LINE__ );
 #endif
 }
 
@@ -815,7 +816,6 @@ static void Sparse_MatVec_local( control_params const * const control,
     {
         /* half-format requires entries of b be initialized to zero */
         sCudaMemsetAsync( b, 0, sizeof(real) * n, s, __FILE__, __LINE__ );
-        cudaStreamSynchronize( s );
 
         /* 1 thread per row implementation */
 //        k_sparse_matvec_half_csr <<< control->blocks, control->block_size, 0, s >>>
@@ -879,15 +879,16 @@ static void Sparse_MatVec_Comm_Part2( const reax_system * const system,
                 sizeof(real) * n1, TRUE, SAFE_ZONE,
                 "Sparse_MatVec_Comm_Part2::workspace->host_scratch" );
         spad = (real *) workspace->host_scratch;
+
         sCudaMemcpyAsync( spad, b, sizeof(real) * n1,
-                cudaMemcpyDeviceToHost, control->streams[0], __FILE__, __LINE__ );
-        cudaStreamSynchronize( control->streams[0] );
+                cudaMemcpyDeviceToHost, control->streams[4], __FILE__, __LINE__ );
+
+        cudaStreamSynchronize( control->streams[4] );
 
         Coll( system, mpi_data, spad, buf_type, mpi_type );
 
         sCudaMemcpyAsync( b, spad, sizeof(real) * n2,
-                cudaMemcpyHostToDevice, control->streams[0], __FILE__, __LINE__ );
-        cudaStreamSynchronize( control->streams[0] );
+                cudaMemcpyHostToDevice, control->streams[4], __FILE__, __LINE__ );
 #endif
     }
 }
@@ -923,7 +924,7 @@ static void Sparse_MatVec( reax_system const * const system,
     Update_Timing_Info( &time, &data->timing.cm_solver_comm );
 #endif
 
-    Sparse_MatVec_local( control, A, x, b, n, control->streams[0] );
+    Sparse_MatVec_local( control, A, x, b, n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
@@ -967,15 +968,15 @@ int Cuda_dual_SDM( reax_system const * const system,
 #endif
 
     dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-            workspace->d_workspace->d2, system->n, control->streams[0] );
+            workspace->d_workspace->d2, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
-    Dot_local_rvec2( workspace, b, b, system->n, &redux[0], &redux[1], control->streams[0] );
+    Dot_local_rvec2( workspace, b, b, system->n, &redux[0], &redux[1], control->streams[4] );
     Dot_local_rvec2( workspace, workspace->d_workspace->r2,
-            workspace->d_workspace->d2, system->n, &redux[2], &redux[3], control->streams[0] );
+            workspace->d_workspace->d2, system->n, &redux[2], &redux[3], control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1008,9 +1009,9 @@ int Cuda_dual_SDM( reax_system const * const system,
 #endif
 
         Dot_local_rvec2( workspace, workspace->d_workspace->r2,
-                workspace->d_workspace->d2, system->n, &redux[0], &redux[1], control->streams[0] );
+                workspace->d_workspace->d2, system->n, &redux[0], &redux[1], control->streams[4] );
         Dot_local_rvec2( workspace, workspace->d_workspace->d2,
-                workspace->d_workspace->q2, system->n, &redux[2], &redux[3], control->streams[0] );
+                workspace->d_workspace->q2, system->n, &redux[2], &redux[3], control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
@@ -1039,7 +1040,7 @@ int Cuda_dual_SDM( reax_system const * const system,
 #endif
 
         dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-                workspace->d_workspace->d2, system->n, control->streams[0] );
+                workspace->d_workspace->d2, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1118,15 +1119,15 @@ int Cuda_SDM( reax_system const * const system, control_params const * const con
 #endif
 
     jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-            workspace->d_workspace->d, system->n, control->streams[0] );
+            workspace->d_workspace->d, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
-    redux[0] = Dot_local( workspace, b, b, system->n, control->streams[0] );
+    redux[0] = Dot_local( workspace, b, b, system->n, control->streams[4] );
     redux[1] = Dot_local( workspace, workspace->d_workspace->r,
-            workspace->d_workspace->d, system->n, control->streams[0] );
+            workspace->d_workspace->d, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1152,9 +1153,9 @@ int Cuda_SDM( reax_system const * const system, control_params const * const con
 #endif
 
         redux[0] = Dot_local( workspace, workspace->d_workspace->r,
-                workspace->d_workspace->d, system->n, control->streams[0] );
+                workspace->d_workspace->d, system->n, control->streams[4] );
         redux[1] = Dot_local( workspace, workspace->d_workspace->d,
-                workspace->d_workspace->q, system->n, control->streams[0] );
+                workspace->d_workspace->q, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
@@ -1180,7 +1181,7 @@ int Cuda_SDM( reax_system const * const system, control_params const * const con
 #endif
 
         jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-                workspace->d_workspace->d, system->n, control->streams[0] );
+                workspace->d_workspace->d, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1229,17 +1230,17 @@ int Cuda_dual_CG( reax_system const * const system,
 #endif
 
     dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-            workspace->d_workspace->d2, system->n, control->streams[0] );
+            workspace->d_workspace->d2, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
     Dot_local_rvec2( workspace, workspace->d_workspace->r2,
-            workspace->d_workspace->d2, system->n, &redux[0], &redux[1], control->streams[0] );
+            workspace->d_workspace->d2, system->n, &redux[0], &redux[1], control->streams[4] );
     Dot_local_rvec2( workspace, workspace->d_workspace->d2,
-            workspace->d_workspace->d2, system->n, &redux[2], &redux[3], control->streams[0] );
-    Dot_local_rvec2( workspace, b, b, system->n, &redux[4], &redux[5], control->streams[0] );
+            workspace->d_workspace->d2, system->n, &redux[2], &redux[3], control->streams[4] );
+    Dot_local_rvec2( workspace, b, b, system->n, &redux[4], &redux[5], control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1273,7 +1274,7 @@ int Cuda_dual_CG( reax_system const * const system,
 #endif
 
         Dot_local_rvec2( workspace, workspace->d_workspace->d2,
-                workspace->d_workspace->q2, system->n, &redux[0], &redux[1], control->streams[0] );
+                workspace->d_workspace->q2, system->n, &redux[0], &redux[1], control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1300,16 +1301,16 @@ int Cuda_dual_CG( reax_system const * const system,
 #endif
 
         dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-                workspace->d_workspace->p2, system->n, control->streams[0] );
+                workspace->d_workspace->p2, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
         Dot_local_rvec2( workspace, workspace->d_workspace->r2,
-                workspace->d_workspace->p2, system->n, &redux[0], &redux[1], control->streams[0] );
+                workspace->d_workspace->p2, system->n, &redux[0], &redux[1], control->streams[4] );
         Dot_local_rvec2( workspace, workspace->d_workspace->p2,
-                workspace->d_workspace->p2, system->n, &redux[2], &redux[3], control->streams[0] );
+                workspace->d_workspace->p2, system->n, &redux[2], &redux[3], control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1413,17 +1414,17 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
 #endif
 
     jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-            workspace->d_workspace->d, system->n, control->streams[0] );
+            workspace->d_workspace->d, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
     redux[0] = Dot_local( workspace, workspace->d_workspace->r,
-            workspace->d_workspace->d, system->n, control->streams[0] );
+            workspace->d_workspace->d, system->n, control->streams[4] );
     redux[1] = Dot_local( workspace, workspace->d_workspace->d,
-            workspace->d_workspace->d, system->n, control->streams[0] );
-    redux[2] = Dot_local( workspace, b, b, system->n, control->streams[0] );
+            workspace->d_workspace->d, system->n, control->streams[4] );
+    redux[2] = Dot_local( workspace, b, b, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1450,7 +1451,7 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
 #endif
 
         tmp = Dot( workspace, workspace->d_workspace->d, workspace->d_workspace->q,
-                system->n, MPI_COMM_WORLD, control->streams[0] );
+                system->n, MPI_COMM_WORLD, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
@@ -1466,16 +1467,16 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
 #endif
 
         jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-                workspace->d_workspace->p, system->n, control->streams[0] );
+                workspace->d_workspace->p, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
         redux[0] = Dot_local( workspace, workspace->d_workspace->r,
-                workspace->d_workspace->p, system->n, control->streams[0] );
+                workspace->d_workspace->p, system->n, control->streams[4] );
         redux[1] = Dot_local( workspace, workspace->d_workspace->p,
-                workspace->d_workspace->p, system->n, control->streams[0] );
+                workspace->d_workspace->p, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1552,9 +1553,9 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
     Vector_Sum_rvec2( workspace->d_workspace->r2, 1.0, 1.0, b,
             -1.0, -1.0, workspace->d_workspace->d2, system->n );
     Dot_local_rvec2( workspace, b,
-            b, system->n, &redux[0], &redux[1], control->streams[0] );
+            b, system->n, &redux[0], &redux[1], control->streams[4] );
     Dot_local_rvec2( workspace, workspace->d_workspace->r2,
-            workspace->d_workspace->r2, system->n, &redux[2], &redux[3], control->streams[0] );
+            workspace->d_workspace->r2, system->n, &redux[2], &redux[3], control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1599,7 +1600,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
         }
 
         Dot_local_rvec2( workspace, workspace->d_workspace->r_hat2,
-                workspace->d_workspace->r2, system->n, &redux[0], &redux[1], control->streams[0] );
+                workspace->d_workspace->r2, system->n, &redux[0], &redux[1], control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1641,7 +1642,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
 #endif
 
         dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->p2,
-                workspace->d_workspace->d2, system->n, control->streams[0] );
+                workspace->d_workspace->d2, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1655,7 +1656,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
 #endif
 
         Dot_local_rvec2( workspace, workspace->d_workspace->r_hat2,
-                workspace->d_workspace->z2, system->n, &redux[0], &redux[1], control->streams[0] );
+                workspace->d_workspace->z2, system->n, &redux[0], &redux[1], control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1677,7 +1678,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
                 1.0, 1.0, workspace->d_workspace->r2,
                 -1.0 * alpha[0], -1.0 * alpha[1], workspace->d_workspace->z2, system->n );
         Dot_local_rvec2( workspace, workspace->d_workspace->q2,
-                workspace->d_workspace->q2, system->n, &redux[0], &redux[1], control->streams[0] );
+                workspace->d_workspace->q2, system->n, &redux[0], &redux[1], control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1705,7 +1706,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
 #endif
 
         dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->q2,
-                workspace->d_workspace->q_hat2, system->n, control->streams[0] );
+                workspace->d_workspace->q_hat2, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1719,9 +1720,9 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
 #endif
 
         Dot_local_rvec2( workspace, workspace->d_workspace->y2,
-                workspace->d_workspace->q2, system->n, &redux[0], &redux[1], control->streams[0] );
+                workspace->d_workspace->q2, system->n, &redux[0], &redux[1], control->streams[4] );
         Dot_local_rvec2( workspace, workspace->d_workspace->y2,
-                workspace->d_workspace->y2, system->n, &redux[2], &redux[3], control->streams[0] );
+                workspace->d_workspace->y2, system->n, &redux[2], &redux[3], control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1749,7 +1750,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const *
                 1.0, 1.0, workspace->d_workspace->q2,
                 -1.0 * omega[0], -1.0 * omega[1], workspace->d_workspace->y2, system->n );
         Dot_local_rvec2( workspace, workspace->d_workspace->r2,
-                workspace->d_workspace->r2, system->n, &redux[0], &redux[1], control->streams[0] );
+                workspace->d_workspace->r2, system->n, &redux[0], &redux[1], control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1858,9 +1859,9 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
 
     Vector_Sum( workspace->d_workspace->r, 1.0, b,
             -1.0, workspace->d_workspace->d, system->n );
-    redux[0] = Dot_local( workspace, b, b, system->n, control->streams[0] );
+    redux[0] = Dot_local( workspace, b, b, system->n, control->streams[4] );
     redux[1] = Dot_local( workspace, workspace->d_workspace->r,
-            workspace->d_workspace->r, system->n, control->streams[0] );
+            workspace->d_workspace->r, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1892,7 +1893,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
     for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
     {
         redux[0] = Dot_local( workspace, workspace->d_workspace->r_hat,
-                workspace->d_workspace->r, system->n, control->streams[0] );
+                workspace->d_workspace->r, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1932,7 +1933,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
 #endif
 
         jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->p,
-                workspace->d_workspace->d, system->n, control->streams[0] );
+                workspace->d_workspace->d, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -1946,7 +1947,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
 #endif
 
         redux[0] = Dot_local( workspace, workspace->d_workspace->r_hat,
-                workspace->d_workspace->z, system->n, control->streams[0] );
+                workspace->d_workspace->z, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1966,7 +1967,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
                 1.0, workspace->d_workspace->r,
                 -1.0 * alpha, workspace->d_workspace->z, system->n );
         redux[0] = Dot_local( workspace, workspace->d_workspace->q,
-                workspace->d_workspace->q, system->n, control->streams[0] );
+                workspace->d_workspace->q, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -1993,7 +1994,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
 #endif
 
         jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->q,
-                workspace->d_workspace->q_hat, system->n, control->streams[0] );
+                workspace->d_workspace->q_hat, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2007,9 +2008,9 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
 #endif
 
         redux[0] = Dot_local( workspace, workspace->d_workspace->y,
-                workspace->d_workspace->q, system->n, control->streams[0] );
+                workspace->d_workspace->q, system->n, control->streams[4] );
         redux[1] = Dot_local( workspace, workspace->d_workspace->y,
-                workspace->d_workspace->y, system->n, control->streams[0] );
+                workspace->d_workspace->y, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -2034,7 +2035,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons
                 1.0, workspace->d_workspace->q,
                 -1.0 * omega, workspace->d_workspace->y, system->n );
         redux[0] = Dot_local( workspace, workspace->d_workspace->r,
-                workspace->d_workspace->r, system->n, control->streams[0] );
+                workspace->d_workspace->r, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -2110,7 +2111,7 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c
 #endif
 
     dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-            workspace->d_workspace->u2, system->n, control->streams[0] );
+            workspace->d_workspace->u2, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2124,12 +2125,12 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c
 #endif
 
     Dot_local_rvec2( workspace, workspace->d_workspace->w2,
-            workspace->d_workspace->u2, system->n, &redux[0], &redux[1], control->streams[0] );
+            workspace->d_workspace->u2, system->n, &redux[0], &redux[1], control->streams[4] );
     Dot_local_rvec2( workspace, workspace->d_workspace->r2,
-            workspace->d_workspace->u2, system->n, &redux[2], &redux[3], control->streams[0] );
+            workspace->d_workspace->u2, system->n, &redux[2], &redux[3], control->streams[4] );
     Dot_local_rvec2( workspace, workspace->d_workspace->u2,
-            workspace->d_workspace->u2, system->n, &redux[4], &redux[5], control->streams[0] );
-    Dot_local_rvec2( workspace, b, b, system->n, &redux[6], &redux[7], control->streams[0] );
+            workspace->d_workspace->u2, system->n, &redux[4], &redux[5], control->streams[4] );
+    Dot_local_rvec2( workspace, b, b, system->n, &redux[6], &redux[7], control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -2140,7 +2141,7 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c
     Check_MPI_Error( ret, __FILE__, __LINE__ );
 
     dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w2,
-            workspace->d_workspace->m2, system->n, control->streams[0] );
+            workspace->d_workspace->m2, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2207,11 +2208,11 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c
         Vector_Sum_rvec2( workspace->d_workspace->r2, 1.0, 1.0, workspace->d_workspace->r2,
                 -1.0 * alpha[0], -1.0 * alpha[1], workspace->d_workspace->d2, system->n );
         Dot_local_rvec2( workspace, workspace->d_workspace->w2,
-                workspace->d_workspace->u2, system->n, &redux[0], &redux[1], control->streams[0] );
+                workspace->d_workspace->u2, system->n, &redux[0], &redux[1], control->streams[4] );
         Dot_local_rvec2( workspace, workspace->d_workspace->r2,
-                workspace->d_workspace->u2, system->n, &redux[2], &redux[3], control->streams[0] );
+                workspace->d_workspace->u2, system->n, &redux[2], &redux[3], control->streams[4] );
         Dot_local_rvec2( workspace, workspace->d_workspace->u2,
-                workspace->d_workspace->u2, system->n, &redux[4], &redux[5], control->streams[0] );
+                workspace->d_workspace->u2, system->n, &redux[4], &redux[5], control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -2222,7 +2223,7 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c
         Check_MPI_Error( ret, __FILE__, __LINE__ );
 
         dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w2,
-                workspace->d_workspace->m2, system->n, control->streams[0] );
+                workspace->d_workspace->m2, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2333,7 +2334,7 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const
 #endif
 
     jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-            workspace->d_workspace->u, system->n, control->streams[0] );
+            workspace->d_workspace->u, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2347,12 +2348,12 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const
 #endif
 
     redux[0] = Dot_local( workspace, workspace->d_workspace->w,
-            workspace->d_workspace->u, system->n, control->streams[0] );
+            workspace->d_workspace->u, system->n, control->streams[4] );
     redux[1] = Dot_local( workspace, workspace->d_workspace->r,
-            workspace->d_workspace->u, system->n, control->streams[0] );
+            workspace->d_workspace->u, system->n, control->streams[4] );
     redux[2] = Dot_local( workspace, workspace->d_workspace->u,
-            workspace->d_workspace->u, system->n, control->streams[0] );
-    redux[3] = Dot_local( workspace, b, b, system->n, control->streams[0] );
+            workspace->d_workspace->u, system->n, control->streams[4] );
+    redux[3] = Dot_local( workspace, b, b, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -2363,7 +2364,7 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const
     Check_MPI_Error( ret, __FILE__, __LINE__ );
 
     jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w,
-            workspace->d_workspace->m, system->n, control->streams[0] );
+            workspace->d_workspace->m, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2417,11 +2418,11 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const
         Vector_Sum( workspace->d_workspace->r, 1.0, workspace->d_workspace->r,
                 -1.0 * alpha, workspace->d_workspace->d, system->n );
         redux[0] = Dot_local( workspace, workspace->d_workspace->w,
-                workspace->d_workspace->u, system->n, control->streams[0] );
+                workspace->d_workspace->u, system->n, control->streams[4] );
         redux[1] = Dot_local( workspace, workspace->d_workspace->r,
-                workspace->d_workspace->u, system->n, control->streams[0] );
+                workspace->d_workspace->u, system->n, control->streams[4] );
         redux[2] = Dot_local( workspace, workspace->d_workspace->u,
-                workspace->d_workspace->u, system->n, control->streams[0] );
+                workspace->d_workspace->u, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -2432,7 +2433,7 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const
         Check_MPI_Error( ret, __FILE__, __LINE__ );
 
         jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w,
-                workspace->d_workspace->m, system->n, control->streams[0] );
+                workspace->d_workspace->m, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2505,15 +2506,15 @@ int Cuda_dual_PIPECR( reax_system const * const system, control_params const * c
 #endif
 
     dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-            workspace->d_workspace->u2, system->n, control->streams[0] );
+            workspace->d_workspace->u2, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
-    Dot_local_rvec2( workspace, b, b, system->n, &redux[0], &redux[1], control->streams[0] );
+    Dot_local_rvec2( workspace, b, b, system->n, &redux[0], &redux[1], control->streams[4] );
     Dot_local_rvec2( workspace, workspace->d_workspace->u2,
-            workspace->d_workspace->u2, system->n, &redux[2], &redux[3], control->streams[0] );
+            workspace->d_workspace->u2, system->n, &redux[2], &redux[3], control->streams[4] );
 
     ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM,
             MPI_COMM_WORLD, &req );
@@ -2549,18 +2550,18 @@ int Cuda_dual_PIPECR( reax_system const * const system, control_params const * c
         }
 
         dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w2,
-                workspace->d_workspace->m2, system->n, control->streams[0] );
+                workspace->d_workspace->m2, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
         Dot_local_rvec2( workspace, workspace->d_workspace->w2,
-                workspace->d_workspace->u2, system->n, &redux[0], &redux[1], control->streams[0] );
+                workspace->d_workspace->u2, system->n, &redux[0], &redux[1], control->streams[4] );
         Dot_local_rvec2( workspace, workspace->d_workspace->m2,
-                workspace->d_workspace->w2, system->n, &redux[2], &redux[3], control->streams[0] );
+                workspace->d_workspace->w2, system->n, &redux[2], &redux[3], control->streams[4] );
         Dot_local_rvec2( workspace, workspace->d_workspace->u2,
-                workspace->d_workspace->u2, system->n, &redux[4], &redux[5], control->streams[0] );
+                workspace->d_workspace->u2, system->n, &redux[4], &redux[5], control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
@@ -2708,15 +2709,15 @@ int Cuda_PIPECR( reax_system const * const system, control_params const * const
 #endif
 
     jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-            workspace->d_workspace->u, system->n, control->streams[0] );
+            workspace->d_workspace->u, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
-    redux[0] = Dot_local( workspace, b, b, system->n, control->streams[0] );
+    redux[0] = Dot_local( workspace, b, b, system->n, control->streams[4] );
     redux[1] = Dot_local( workspace, workspace->d_workspace->u,
-            workspace->d_workspace->u, system->n, control->streams[0] );
+            workspace->d_workspace->u, system->n, control->streams[4] );
 
     ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, MPI_SUM,
             MPI_COMM_WORLD, &req );
@@ -2745,18 +2746,18 @@ int Cuda_PIPECR( reax_system const * const system, control_params const * const
     for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
     {
         jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w,
-                workspace->d_workspace->m, system->n, control->streams[0] );
+                workspace->d_workspace->m, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
 #endif
 
         redux[0] = Dot_local( workspace, workspace->d_workspace->w,
-                workspace->d_workspace->u, system->n, control->streams[0] );
+                workspace->d_workspace->u, system->n, control->streams[4] );
         redux[1] = Dot_local( workspace, workspace->d_workspace->m,
-                workspace->d_workspace->w, system->n, control->streams[0] );
+                workspace->d_workspace->w, system->n, control->streams[4] );
         redux[2] = Dot_local( workspace, workspace->d_workspace->u,
-                workspace->d_workspace->u, system->n, control->streams[0] );
+                workspace->d_workspace->u, system->n, control->streams[4] );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
diff --git a/PG-PuReMD/src/cuda/cuda_system_props.cu b/PG-PuReMD/src/cuda/cuda_system_props.cu
index 6df030a6..fb623e44 100644
--- a/PG-PuReMD/src/cuda/cuda_system_props.cu
+++ b/PG-PuReMD/src/cuda/cuda_system_props.cu
@@ -520,9 +520,9 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control,
 {
     rvec *spad;
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0],
             sizeof(rvec) * (control->blocks + 1), __FILE__, __LINE__ );
-    spad = (rvec *) workspace->scratch;
+    spad = (rvec *) workspace->scratch[0];
 
     // xcm
     sCudaMemsetAsync( spad, 0, sizeof(rvec) * (control->blocks + 1),
@@ -594,9 +594,9 @@ static void Cuda_Compute_Inertial_Tensor( reax_system *system, control_params *c
 {
     real *spad;
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0],
             sizeof(real) * 6 * (control->blocks + 1), __FILE__, __LINE__ );
-    spad = (real *) workspace->scratch;
+    spad = (real *) workspace->scratch[0];
     sCudaMemsetAsync( spad, 0, sizeof(real) * 6 * (control->blocks + 1),
             control->streams[0], __FILE__, __LINE__ );
     cudaStreamSynchronize( control->streams[0] );
@@ -690,9 +690,9 @@ extern "C" void Cuda_Compute_Kinetic_Energy( reax_system *system,
     int ret;
     real *kinetic_energy;
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0],
             sizeof(real) * (system->n + 1), __FILE__, __LINE__ );
-    kinetic_energy = (real *) workspace->scratch;
+    kinetic_energy = (real *) workspace->scratch[0];
 
     k_compute_kinetic_energy <<< control->blocks, control->block_size, 0, control->streams[0] >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, kinetic_energy, system->n );
@@ -727,9 +727,9 @@ void Cuda_Compute_Total_Mass( reax_system *system, control_params *control,
     int ret;
     real my_M, *spad;
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0],
             sizeof(real) * (system->n + 1), __FILE__, __LINE__ );
-    spad = (real *) workspace->scratch;
+    spad = (real *) workspace->scratch[0];
 
     k_compute_total_mass <<< control->blocks, control->block_size, 0, control->streams[0]  >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n );
@@ -873,10 +873,10 @@ void Cuda_Compute_Pressure( reax_system* system, control_params *control,
     /* 0: both int and ext, 1: ext only, 2: int only */
     if ( control->press_mode == 0 || control->press_mode == 2 )
     {
-        sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+        sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0],
                 sizeof(rvec) * (system->n + control->blocks + 1),
                 __FILE__, __LINE__ );
-        rvec_spad = (rvec *) workspace->scratch;
+        rvec_spad = (rvec *) workspace->scratch[0];
 
         k_compute_pressure <<< control->blocks, control->block_size, 0,
                            control->streams[0] >>>
diff --git a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
index 42f3be11..afd5c708 100644
--- a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
@@ -1299,10 +1299,10 @@ void Cuda_Compute_Torsion_Angles( reax_system *system, control_params *control,
     {
         s = sizeof(real) * 2 * system->n;
     }
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0],
             s, __FILE__, __LINE__ );
 
-    spad = (real *) workspace->scratch;
+    spad = (real *) workspace->scratch[0];
     update_energy = (out_control->energy_update_freq > 0
             && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE;
 #else
@@ -1315,7 +1315,6 @@ void Cuda_Compute_Torsion_Angles( reax_system *system, control_params *control,
         sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_ext_press,
                 0, sizeof(rvec), control->streams[0], __FILE__, __LINE__ );
     }
-    cudaStreamSynchronize( control->streams[0] );
 #endif
 
     if ( control->virial == 1 )
diff --git a/PG-PuReMD/src/cuda/cuda_valence_angles.cu b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
index d88dbb74..fa309d91 100644
--- a/PG-PuReMD/src/cuda/cuda_valence_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
@@ -1348,7 +1348,6 @@ static int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params
 
     sCudaMemsetAsync( thbody, 0, system->total_bonds * sizeof(int),
             control->streams[0], __FILE__, __LINE__ );
-    cudaStreamSynchronize( control->streams[0] );
 
 //    k_estimate_valence_angles <<< control->blocks_n, control->block_size_n, 0, control->streams[0] >>>
 //        ( system->d_my_atoms, (control_params *)control->d_control_params, 
@@ -1437,12 +1436,12 @@ int Cuda_Compute_Valence_Angles( reax_system *system, control_params *control,
     s = sizeof(int) * system->total_bonds;
 #endif
 
-    sCudaCheckMalloc( &workspace->scratch, &workspace->scratch_size,
+    sCudaCheckMalloc( &workspace->scratch[0], &workspace->scratch_size[0],
             s, __FILE__, __LINE__ );
 
-    thbody = (int *) workspace->scratch;
+    thbody = (int *) workspace->scratch[0];
 #if !defined(CUDA_ACCUM_ATOMIC)
-    spad = (real *) workspace->scratch;
+    spad = (real *) workspace->scratch[0];
     update_energy = (out_control->energy_update_freq > 0
             && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE;
 #endif
@@ -1463,7 +1462,6 @@ int Cuda_Compute_Valence_Angles( reax_system *system, control_params *control,
                 0, sizeof(real), control->streams[0], __FILE__, __LINE__ );
         sCudaMemsetAsync( &((simulation_data *)data->d_simulation_data)->my_ext_press,
                 0, sizeof(rvec), control->streams[0], __FILE__, __LINE__ );
-        cudaStreamSynchronize( control->streams[0] );
 #endif
 
         if ( control->virial == 1 )
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index cf3a055a..109e1476 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -358,7 +358,10 @@
   #endif
 
   /* max. num. of active CUDA streams */
-  #define CUDA_MAX_STREAMS (5)
+  #define MAX_CUDA_STREAMS (5)
+
+  /* max. num. of CUDA events used for synchronizing streams */
+  #define MAX_CUDA_STREAM_EVENTS (6)
 #endif
 
 
@@ -1659,7 +1662,9 @@ struct control_params
      * for kernels with 1 thread per atom (local AND ghost) */
     int blocks_pow_2_n;
     /* CUDA stream */
-    cudaStream_t streams[CUDA_MAX_STREAMS];
+    cudaStream_t streams[MAX_CUDA_STREAMS];
+    /* CUDA events for synchronizing streams */
+    cudaEvent_t stream_events[MAX_CUDA_STREAM_EVENTS];
 #endif
 };
 
@@ -2322,9 +2327,9 @@ struct storage
     /* size of temporary host workspace, in bytes */
     size_t host_scratch_size;
     /* temporary workspace (GPU) */
-    void *scratch;
+    void *scratch[MAX_CUDA_STREAMS];
     /* size of temporary workspace (GPU), in bytes */
-    size_t scratch_size;
+    size_t scratch_size[MAX_CUDA_STREAMS];
     /* lookup table for force tabulation (GPU) */
     LR_lookup_table *d_LR;
     /* storage (GPU) */
-- 
GitLab
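
Note on the new stream/event fields (illustration only, not part of the patch): the streams[MAX_CUDA_STREAMS] and stream_events[MAX_CUDA_STREAM_EVENTS] members added to control_params suggest a fixed pool of CUDA streams and events that is created once and reused, with events ordering dependent work across streams in place of the host-side cudaStreamSynchronize calls removed above. A minimal, self-contained sketch of that pattern follows; the names N_STREAMS, N_EVENTS, and scale_kernel are illustrative and do not appear in the PG-PuReMD sources.

/* sketch: fixed pool of CUDA streams and events, created once and reused;
 * events (not host-side syncs) order dependent kernels launched on different streams */
#include <cuda_runtime.h>

#define N_STREAMS 5   /* analogous to MAX_CUDA_STREAMS */
#define N_EVENTS  6   /* analogous to MAX_CUDA_STREAM_EVENTS */

__global__ void scale_kernel( float *x, int n )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if ( i < n )
    {
        x[i] *= 2.0f;
    }
}

int main( void )
{
    const int n = 1 << 20;
    cudaStream_t streams[N_STREAMS];
    cudaEvent_t events[N_EVENTS];
    float *d_a, *d_b;

    cudaMalloc( &d_a, sizeof(float) * n );
    cudaMalloc( &d_b, sizeof(float) * n );

    /* create the stream/event pool once at startup */
    for ( int i = 0; i < N_STREAMS; ++i )
    {
        cudaStreamCreate( &streams[i] );
    }
    for ( int i = 0; i < N_EVENTS; ++i )
    {
        /* timing disabled: events are used only for ordering */
        cudaEventCreateWithFlags( &events[i], cudaEventDisableTiming );
    }

    /* independent kernels may run concurrently on different streams */
    scale_kernel <<< (n + 255) / 256, 256, 0, streams[0] >>> ( d_a, n );
    scale_kernel <<< (n + 255) / 256, 256, 0, streams[4] >>> ( d_b, n );

    /* make later work on streams[0] wait for the streams[4] kernel
     * without blocking the host (no cudaStreamSynchronize needed) */
    cudaEventRecord( events[0], streams[4] );
    cudaStreamWaitEvent( streams[0], events[0], 0 );
    scale_kernel <<< (n + 255) / 256, 256, 0, streams[0] >>> ( d_b, n );

    cudaDeviceSynchronize();

    for ( int i = 0; i < N_EVENTS; ++i )
    {
        cudaEventDestroy( events[i] );
    }
    for ( int i = 0; i < N_STREAMS; ++i )
    {
        cudaStreamDestroy( streams[i] );
    }
    cudaFree( d_a );
    cudaFree( d_b );

    return 0;
}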