diff --git a/PuReMD/Makefile.am b/PuReMD/Makefile.am
index 718c453fa84b45f83c4c3890eda1735fbb0b7f20..c657b37a7bde1349eaa877de14d82b8d8bf727cd 100644
--- a/PuReMD/Makefile.am
+++ b/PuReMD/Makefile.am
@@ -6,9 +6,8 @@ bin_puremd_SOURCES = src/allocate.c src/basic_comm.c src/ffield.c src/grid.c src
       src/vector.c src/analyze.c src/box.c src/system_props.c src/control.c src/comm_tools.c \
       src/geo_tools.c src/linear_solvers.c src/neighbors.c src/qEq.c src/bond_orders.c src/multi_body.c \
       src/bonds.c src/valence_angles.c src/hydrogen_bonds.c src/torsion_angles.c src/nonbonded.c src/forces.c \
-      src/integrate.c src/init_md.c src/parallelreax.c
-
-include_HEADERS = src/reax_defs.h src/reax_types.h \
+      src/integrate.c src/init_md.c src/parallelreax.c \
+      src/reax_defs.h src/reax_types.h \
       src/allocate.h src/basic_comm.h src/ffield.h src/grid.h src/list.h src/lookup.h \
       src/io_tools.h src/reset_tools.h src/restart.h src/random.h src/tool_box.h src/traj.h \
       src/vector.h src/analyze.h src/box.h src/system_props.h src/control.h src/comm_tools.h \
@@ -16,5 +15,5 @@ include_HEADERS = src/reax_defs.h src/reax_types.h \
       src/bonds.h src/valence_angles.h src/hydrogen_bonds.h src/torsion_angles.h src/nonbonded.h src/forces.h \
       src/integrate.h src/init_md.h
 
-bin_puremd_CFLAGS = $(CFLAGS) $(MPI_CFLAGS)
-bin_puremd_LDADD = $(LDADD) $(MPI_LIBS)
+bin_puremd_CFLAGS = $(MPI_CFLAGS)
+bin_puremd_LDADD = $(MPI_LIBS)
diff --git a/PuReMD/configure.ac b/PuReMD/configure.ac
index 8833fc248a64e49a684106feeaeb4682c54bf715..6a1bb0a187736d4baf0207f537e2d224758a3901 100644
--- a/PuReMD/configure.ac
+++ b/PuReMD/configure.ac
@@ -74,7 +74,6 @@ fi
 CONFIGURE_HEADLINE([ MPI compiler ])
 ACX_MPI([], [AC_MSG_ERROR([could not find mpi library])])
 AC_CHECK_PROG(MPIRUN, mpirun, mpirun)
-AC_SUBST(MPIRUN)
 
 # try to find if we are using OpenMPI / MPICH by looking inside mpi.h
 save_CC="${CC}"
diff --git a/PuReMD/src/basic_comm.c b/PuReMD/src/basic_comm.c
index 135abb6e49404c8305938ea510e5c0bb2aafd9c6..8d6277de6008da24a98e7ba22f1dfdcac8d8c21f 100644
--- a/PuReMD/src/basic_comm.c
+++ b/PuReMD/src/basic_comm.c
@@ -132,7 +132,7 @@ void Dist( reax_system* system, mpi_datatypes *mpi_data,
 void Dist_NT( reax_system* system, mpi_datatypes *mpi_data,
         void *buf, MPI_Datatype type, int scale, dist_packer pack )
 {
-    int d;
+    int d, count, index;
     mpi_out_data *out_bufs;
     MPI_Comm comm;
     MPI_Request req[6];
@@ -145,12 +145,14 @@ void Dist_NT( reax_system* system, mpi_datatypes *mpi_data,
     comm = mpi_data->comm_mesh3D;
     out_bufs = mpi_data->out_nt_buffers;
 
+    count = 0;
     /* initiate recvs */
     for( d = 0; d < 6; ++d )
     {
         nbr = &(system->my_nt_nbrs[d]);
         if ( nbr->atoms_cnt )
         {
+            count++;
             MPI_Irecv( buf + nbr->atoms_str * scale, nbr->atoms_cnt, type,
                     nbr->receive_rank, d, comm, &(req[d]) );
         }
@@ -167,15 +169,21 @@ void Dist_NT( reax_system* system, mpi_datatypes *mpi_data,
         }
     }
 
+    /*
     for( d = 0; d < 6; ++d )
     {
         nbr = &(system->my_nbrs[d]);
         if ( nbr->atoms_cnt )
         {
-            // TODO: Wait(MPI_WAITANY)
             MPI_Wait( &(req[d]), &(stat[d]) );
         }
     }
+    */
+    for( d = 0; d < count; ++d )
+    {
+        MPI_Waitany( MAX_NT_NBRS, req, &index, stat);
+    }
+
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d dist: done\n", system->my_rank );
@@ -295,7 +303,7 @@ void Coll( reax_system* system, mpi_datatypes *mpi_data,
 void Coll_NT( reax_system* system, mpi_datatypes *mpi_data,
         void *buf, MPI_Datatype type, int scale, coll_unpacker unpack )
 {
-    int d;
+    int d, count, index;
     void *in[6];
     mpi_out_data *out_bufs;
     MPI_Comm comm;
@@ -309,6 +317,7 @@ void Coll_NT( reax_system* system, mpi_datatypes *mpi_data,
     comm = mpi_data->comm_mesh3D;
     out_bufs = mpi_data->out_buffers;
 
+    count = 0;
     for ( d = 0; d < 6; ++d )
     {
         /* initiate recvs */
@@ -316,6 +325,7 @@ void Coll_NT( reax_system* system, mpi_datatypes *mpi_data,
         in[d] = mpi_data->in_nt_buffer[d];
         if ( out_bufs[d].cnt )
         {
+            count++;
             MPI_Irecv(in[d], out_bufs[d].cnt, type, nbr->receive_rank, d, comm, &(req[d]));
         }
     }
@@ -331,15 +341,21 @@ void Coll_NT( reax_system* system, mpi_datatypes *mpi_data,
         }
     }
 
+    /*
     for( d = 0; d < 6; ++d )
     {
         if ( out_bufs[d].cnt )
         {
-            //TODO: WAITANY
             MPI_Wait( &(req[d]), &(stat[d]));
             unpack( in[d], buf, out_bufs + d );
         }
     }
+    */
+    for( d = 0; d < count; ++d )
+    {
+        MPI_Waitany( MAX_NT_NBRS, req, &index, stat);
+        unpack( in[index], buf, out_bufs + index );
+    }
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d coll: done\n", system->my_rank );
diff --git a/PuReMD/src/comm_tools.c b/PuReMD/src/comm_tools.c
index 9c2728f86ae1885b9abc8b4157c64cd207c5686c..28a41c5e2368ee9ca68872c9a6f28bb6ae57070c 100644
--- a/PuReMD/src/comm_tools.c
+++ b/PuReMD/src/comm_tools.c
@@ -46,8 +46,8 @@ void Setup_NT_Comm( reax_system* system, control_params* control,
         {0, 0, -1}, // -z
         {0, +1, 0}, // +y
         {+1, +1, 0}, // +x+y
-        {+1, 0, 0}, // -x
-        {+1, -1, 0}  // -x+y
+        {+1, 0, 0}, // +x
+        {+1, -1, 0}  // +x-y
     };
     my_box = &(system->my_box);
     bndry_cut = system->bndry_cuts.ghost_cutoff;
diff --git a/PuReMD/src/forces.c b/PuReMD/src/forces.c
index dadb988eedf2d143d02fd26ec430aabc14b91dd5..76a38527d8702e4767d0b49bfc9bfa7a0d9b8bb4 100644
--- a/PuReMD/src/forces.c
+++ b/PuReMD/src/forces.c
@@ -346,11 +346,16 @@ void Init_Forces( reax_system *system, control_params *control,
     int jhb_top;
     int start_j, end_j;
     int btop_j;
+    int total_cnt[6];
+    int bin[6];
+    int total_sum[6];
+    int nt_flag;
 
     far_nbrs = lists[FAR_NBRS];
     bonds = lists[BONDS];
     hbonds = lists[HBONDS];
 
+
     for ( i = 0; i < system->n; ++i )
         workspace->bond_mark[i] = 0;
     for ( i = system->n; i < system->N; ++i )
@@ -360,13 +365,47 @@ void Init_Forces( reax_system *system, control_params *control,
     }
 
     H = workspace->H;
-    H->n = system->n;
+    H->n = system->N;
     Htop = 0;
     num_bonds = 0;
     num_hbonds = 0;
     btop_i = 0;
     renbr = (data->step - data->prev_steps) % control->reneighbor == 0;
 
+    if( renbr )
+    {
+        for ( i = 0; i < 6; ++i )
+        {
+            total_cnt[i] = 0;
+            bin[i] = 0;
+            total_sum[i] = 0;
+        }
+        for ( i = system->n; i < system->N; ++i )
+        {
+            atom_i = &(system->my_atoms[i]);
+            if( atom_i->nt_dir != -1 )
+            {
+                total_cnt[ atom_i->nt_dir ]++;
+            }
+        }
+        total_sum[0] = system->n;
+        for ( i = 1; i < 6; ++i )
+        {
+            total_sum[i] = total_sum[i-1] + total_cnt[i-1];
+        }
+        for ( i = system->n; i < system->N; ++i )
+        {
+            atom_i = &(system->my_atoms[i]);
+            if( atom_i->nt_dir != -1 )
+            {
+                atom_i->pos = total_sum[ atom_i->nt_dir ] + bin[ atom_i->nt_dir ];
+                bin[ atom_i->nt_dir ]++;
+            }
+        }
+        H->NT = total_sum[6] + total_cnt[6];
+
+    }
+
     for ( i = 0; i < system->N; ++i )
     {
         atom_i = &(system->my_atoms[i]);
@@ -385,12 +424,17 @@ void Init_Forces( reax_system *system, control_params *control,
         }
         sbp_i = &(system->reax_param.sbp[type_i]);
 
-        //TODO: edit this part to include NT atoms
         if ( i < system->n )
         {
             local = 1;
             cutoff = control->nonb_cut;
         }
+        else if( atom_i->nt_dir != -1 )
+        {
+            local = 2;
+            cutoff = control->nonb_cut;
+            nt_flag = 0;
+        }
         else
         {
             local = 0;
@@ -399,7 +443,7 @@ void Init_Forces( reax_system *system, control_params *control,
 
         ihb = -1;
         ihb_top = -1;
-        if ( local )
+        if ( local == 1 )
         {
             H->start[i] = Htop;
             H->entries[Htop].j = i;
@@ -428,6 +472,14 @@ void Init_Forces( reax_system *system, control_params *control,
                 }
             }
         }
+        // TODO: Diag entries for NT atoms?
+        /*else if( local == 2 )
+        {
+            H->start[atom_i->pos] = Htop;
+            H->entries[Htop].j = atom_i->pos;
+            H->entries[Htop].val = sbp_i->eta;
+            ++Htop;
+        }*/
 
         /* update i-j distance - check if j is within cutoff */
         for ( pj = start_i; pj < end_i; ++pj )
@@ -465,18 +517,34 @@ void Init_Forces( reax_system *system, control_params *control,
                 sbp_j = &(system->reax_param.sbp[type_j]);
                 twbp = &(system->reax_param.tbp[type_i][type_j]);
 
-                if ( local )
+                if ( local == 1 )
                 {
                     /* H matrix entry */
                     if ( (far_nbrs->format == HALF_LIST
                             && (j < system->n || atom_i->orig_id < atom_j->orig_id))
                       || far_nbrs->format == FULL_LIST )
                     {
-                        H->entries[Htop].j = j;
-                        if ( control->tabulate == 0 )
-                            H->entries[Htop].val = Compute_H(r_ij, twbp->gamma, workspace->Tap);
-                        else H->entries[Htop].val = Compute_tabH(r_ij, type_i, type_j);
-                        ++Htop;
+                        if( atom_j->nt_dir != -1 || j < system->n )
+                        {
+                            if( j < system->n )
+                            {
+                                H->entries[Htop].j = j;
+                            }
+                            else
+                            {
+                                H->entries[Htop].j = atom_i->pos;
+                            }
+
+                            if ( control->tabulate == 0 )
+                            {
+                                H->entries[Htop].val = Compute_H(r_ij, twbp->gamma, workspace->Tap);
+                            }
+                            else 
+                            {
+                                H->entries[Htop].val = Compute_tabH(r_ij, type_i, type_j);
+                            }
+                            ++Htop;
+                        }
                     }
 
                     /* hydrogen bond lists */
@@ -506,6 +574,42 @@ void Init_Forces( reax_system *system, control_params *control,
                         }
                     }
                 }
+                else if ( local == 2 )
+                {
+                    /* H matrix entry */
+#if defined(HALF_LIST)
+                    if ( j < system->n || atom_i->orig_id < atom_j->orig_id )
+#endif
+                    {
+                        if( atom_j->nt_dir != -1 || j < system->n )
+                        {
+                            if( !nt_flag )
+                            {
+                                nt_flag = 1;
+                                H->start[atom_i->pos] = Htop;
+                            }
+                            if( j < system->n )
+                            {
+                                H->entries[Htop].j = j;
+                            }
+                            else
+                            {
+                                H->entries[Htop].j = atom_i->pos;
+                            }
+
+                            if ( control->tabulate == 0 )
+                            {
+                                H->entries[Htop].val = Compute_H(r_ij, twbp->gamma, workspace->Tap);
+                            }
+                            else 
+                            {
+                                H->entries[Htop].val = Compute_tabH(r_ij, type_i, type_j);
+                            }
+                            ++Htop;
+                        }
+                    }
+
+                }
 
                 /* uncorrected bond orders */
                 if ( //(workspace->bond_mark[i] < 3 || workspace->bond_mark[j] < 3) &&
@@ -527,8 +631,17 @@ void Init_Forces( reax_system *system, control_params *control,
             }
         }
 
+        if ( local == 1 )
+        {
+            H->end[i] = Htop;
+        }
+        else if ( local == 2 )
+        {
+            H->end[atom_i->pos] = Htop;
+        }
+
         Set_End_Index( i, btop_i, bonds );
-        if ( local )
+        if ( local == 1 )
         {
             H->end[i] = Htop;
             if ( ihb == 1 )
@@ -829,7 +942,7 @@ void Init_Forces_noQEq( reax_system *system, control_params *control,
 
 void Estimate_Storages( reax_system *system, control_params *control,
                         reax_list **lists, int *Htop, int *hb_top,
-                        int *bond_top, int *num_3body, MPI_Comm comm )
+                        int *bond_top, int *num_3body, MPI_Comm comm, int *matrix_dim )
 {
     int i, j, pj;
     int start_i, end_i;
@@ -848,6 +961,7 @@ void Estimate_Storages( reax_system *system, control_params *control,
 
     far_nbrs = lists[FAR_NBRS];
     *Htop = 0;
+    *matrix_dim = 0;
     memset( hb_top, 0, sizeof(int) * system->local_cap );
     memset( bond_top, 0, sizeof(int) * system->total_cap );
     *num_3body = 0;
@@ -865,8 +979,18 @@ void Estimate_Storages( reax_system *system, control_params *control,
             local = 1;
             cutoff = control->nonb_cut;
             ++(*Htop);
+            ++(*matrix_dim);
             ihb = sbp_i->p_hbond;
         }
+        else if ( atom_i->nt_dir != -1 )
+        {
+            local = 2;
+            cutoff = control->nonb_cut;
+            // TODO: Diag entries for NT atoms?
+            //++(*Htop);
+            //++(*matrix_dim);
+            ihb = -1;
+        }
         else
         {
             local = 0;
@@ -890,13 +1014,16 @@ void Estimate_Storages( reax_system *system, control_params *control,
                 sbp_j = &(system->reax_param.sbp[type_j]);
                 twbp = &(system->reax_param.tbp[type_i][type_j]);
 
-                if ( local )
+                if ( local == 1 )
                 {
                     if ( (far_nbrs->format == HALF_LIST
                             && (j < system->n || atom_i->orig_id < atom_j->orig_id))
                       || far_nbrs->format == FULL_LIST )
                     {
-                        ++(*Htop);
+                        if( j < system->n || (system->my_atoms[j]).nt_dir != -1 )
+                        {
+                            ++(*Htop);
+                        }
                     }
 
                     /* hydrogen bond lists */
@@ -917,6 +1044,20 @@ void Estimate_Storages( reax_system *system, control_params *control,
                     }
                 }
 
+                else if ( local == 2 )
+                {
+
+#if defined(HALF_LIST)
+                    if ( j < system->n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1
+#endif
+                    {
+                        if( j < system->n || (system->my_atoms[j]).nt_dir != -1 )
+                        {
+                            ++(*Htop);
+                        }
+                    }
+                }
+
                 /* uncorrected bond orders */
                 if ( nbr_pj->d <= control->bond_cut )
                 {
@@ -960,6 +1101,7 @@ void Estimate_Storages( reax_system *system, control_params *control,
     }
 
     *Htop = (int)(MAX( *Htop * SAFE_ZONE, MIN_CAP * MIN_HENTRIES ));
+    *matrix_dim = (int)(MAX( *matrix_dim * SAFE_ZONE, MIN_CAP ));
     for ( i = 0; i < system->n; ++i )
         hb_top[i] = (int)(MAX( hb_top[i] * SAFER_ZONE, MIN_HBONDS ));
 
@@ -1003,13 +1145,21 @@ void Compute_Forces( reax_system *system, control_params *control,
 #elif defined(LAMMPS_REAX)
     qeq_flag = 0;
 #endif
-
+#if defined(NT_DEBUG)
+    fprintf( stdout, "forces.c before Init_Forces call\n");
+    fflush( stdout);
+#endif
     if ( qeq_flag )
         Init_Forces( system, control, data, workspace, lists, out_control, comm );
     else
         Init_Forces_noQEq( system, control, data, workspace,
                            lists, out_control, comm );
 
+#if defined(NT_DEBUG)
+    //MPI_Barrier(MPI_COMM_WORLD);  
+    fprintf( stdout, "p%d, forces.c after Init_Forces call\n", system->my_rank);
+    fflush( stdout);
+#endif
 #if defined(LOG_PERFORMANCE)
     //MPI_Barrier( mpi_data->world );
     if ( system->my_rank == MASTER_NODE )
@@ -1020,6 +1170,11 @@ void Compute_Forces( reax_system *system, control_params *control,
     /********* bonded interactions ************/
     Compute_Bonded_Forces( system, control, data, workspace,
                            lists, out_control, mpi_data->world );
+#if defined(NT_DEBUG)
+    fprintf( stdout, "forces.c after Compute_Bonded_Forces call\n");
+    fflush( stdout);
+    MPI_Barrier( mpi_data->world );
+#endif
 
 #if defined(LOG_PERFORMANCE)
     //MPI_Barrier( mpi_data->world );
@@ -1034,11 +1189,20 @@ void Compute_Forces( reax_system *system, control_params *control,
     MPI_Barrier( mpi_data->world );
 #endif
 
+#if defined(NT_DEBUG)
+    fprintf( stdout, "p%d, forces.c before QEq call\n", system->my_rank );
+    fflush( stdout);
+    MPI_Barrier( mpi_data->world );
+#endif
 
     /**************** qeq ************************/
 #if defined(PURE_REAX)
     if ( qeq_flag )
         QEq( system, control, data, workspace, out_control, mpi_data );
+#if defined(NT_DEBUG)
+    fprintf( stdout, "forces.c after QEq call\n");
+    fflush( stdout);
+#endif
 
 #if defined(LOG_PERFORMANCE)
     //MPI_Barrier( mpi_data->world );
diff --git a/PuReMD/src/forces.h b/PuReMD/src/forces.h
index 43d47cb46f66f45dc2b54d9415f8c92405a1d557..10b34395b3af6940a1779255fb8a3ebea7b45976 100644
--- a/PuReMD/src/forces.h
+++ b/PuReMD/src/forces.h
@@ -31,5 +31,5 @@ void Init_Force_Functions( control_params* );
 void Compute_Forces( reax_system*, control_params*, simulation_data*,
                      storage*, reax_list**, output_controls*, mpi_datatypes* );
 void Estimate_Storages( reax_system*, control_params*, reax_list**,
-                        int*, int*, int*, int*, MPI_Comm );
+                        int*, int*, int*, int*, MPI_Comm, int* );
 #endif
diff --git a/PuReMD/src/grid.c b/PuReMD/src/grid.c
index 0064f2201695b85625dcd15db75861fe7fcb80b9..1bd1b72a4fc98f9ee2b887e07b475d230c1e4cc4 100644
--- a/PuReMD/src/grid.c
+++ b/PuReMD/src/grid.c
@@ -30,11 +30,21 @@
 /* determines the exchange boundaries with nbrs in terms of gcells */
 void Mark_GCells( reax_system* system, grid *g, ivec procs, MPI_Comm comm )
 {
-    int x, y, z, d;
+    int i, x, y, z, d, len;
     ivec r, nbr_coord, prdc;
     ivec send_span, recv_span;
     ivec str_send, end_send;
     ivec str_recv, end_recv;
+    ivec nt_str, nt_end;
+
+    ivec dir[6] = {
+        {0, 0, +1}, // +z
+        {0, 0, -1}, // -z
+        {0, +1, 0}, // +y
+        {+1, +1, 0}, // +x+y
+        {+1, 0, 0}, // +x
+        {+1, -1, 0}  // +x-y
+    };
 
     /* clear all gcell type info */
     for ( x = 0; x < g->ncells[0]; x++ )
@@ -50,6 +60,39 @@ void Mark_GCells( reax_system* system, grid *g, ivec procs, MPI_Comm comm )
                 g->cells[x][y][z].type = NATIVE;
                 ivec_MakeZero( g->cells[x][y][z].rel_box );
             }
+    
+    /* mark NT cells */
+    for ( i = 0; i < 6; ++i )
+    {
+        for ( d = 0; d < 3; ++d )
+        {
+            if ( dir[i][d] > 0 )
+            {
+                nt_str[d] = MIN(g->native_end[d], g->ncells[d]);
+                nt_end[d] = MIN(g->native_end[d] + g->vlist_span[d], g->ncells[d]);
+            }
+            else if ( dir[i][d] < 0 )
+            {
+                nt_str[d] = MAX(0, g->native_str[d] - g->vlist_span[d] );
+                nt_end[d] = g->native_str[d];
+            }
+            else
+            {
+                nt_str[d] = g->native_str[d];
+                nt_end[d] = g->native_end[d];
+            }
+        }
+        for ( x = nt_str[0]; x < nt_end[0]; x++ )
+        {
+            for ( y = nt_str[1]; y < nt_end[1]; y++ )
+            {
+                for ( z = nt_str[2]; z < nt_end[2]; z++ )
+                {
+                    g->cells[x][y][z].type = NT_NBRS + i;
+                }
+            }
+        }
+    }
 
     /* loop over neighbors */
     for ( r[0] = -1; r[0] <= 1; ++r[0])
@@ -137,7 +180,7 @@ void Find_Neighbor_GridCells( grid *g, control_params *control )
                 top = 0;
                 //fprintf( stderr, "grid1: %d %d %d:\n", ci[0], ci[1], ci[2] );
 
-                if ( gc->type == NATIVE )
+                if ( gc->type == NATIVE || ( gc->type >= NT_NBRS && gc->type < NT_NBRS + 6 ) )
                     gc->cutoff = control->vlist_cut;
                 else gc->cutoff = control->bond_cut;
 
@@ -201,13 +244,13 @@ void Reorder_GridCells( grid *g )
     fprintf( stderr, "reordered gcells:\n" );
     for ( i = 0; i < top; ++i )
         fprintf( stderr, "order%d: %d %d %d\n",
-                 i, g->order[i][0], g->order[i][1], g->order[i][2] );
+                i, g->order[i][0], g->order[i][1], g->order[i][2] );
 #endif
 }
 
 
 void Setup_New_Grid( reax_system* system, control_params* control,
-                     MPI_Comm comm )
+        MPI_Comm comm )
 {
     int              d, i, j, k;
     grid            *g;
@@ -242,13 +285,13 @@ void Setup_New_Grid( reax_system* system, control_params* control,
         g->bond_span[d] = (int)ceil( control->bond_cut / g->cell_len[d] );
         /* span of the ghost region in terms of gcells */
         g->ghost_span[d] = (int)ceil(system->bndry_cuts.ghost_cutoff /
-                                     g->cell_len[d]);
+                g->cell_len[d]);
         g->ghost_nonb_span[d] = (int)ceil(system->bndry_cuts.ghost_nonb /
-                                          g->cell_len[d]);
+                g->cell_len[d]);
         g->ghost_hbond_span[d] = (int)ceil( system->bndry_cuts.ghost_hbond /
-                                            g->cell_len[d] );
+                g->cell_len[d] );
         g->ghost_bond_span[d] = (int)ceil( system->bndry_cuts.ghost_bond /
-                                           g->cell_len[d] );
+                g->cell_len[d] );
     }
 
     /* total number of grid cells */
@@ -262,8 +305,8 @@ void Setup_New_Grid( reax_system* system, control_params* control,
     /* upper bound on the number of gcells to be exchanged with a single nbr */
     system->gcell_cap =
         MAX3( g->native_cells[0] * g->native_cells[1] * g->ghost_span[2],
-              g->native_cells[0] * g->native_cells[2] * g->ghost_span[1],
-              g->native_cells[1] * g->native_cells[2] * g->ghost_span[0] ) + 1;
+                g->native_cells[0] * g->native_cells[2] * g->ghost_span[1],
+                g->native_cells[1] * g->native_cells[2] * g->ghost_span[0] ) + 1;
 
     /* allocate grid space */
     Allocate_Grid( system, comm );
@@ -331,9 +374,9 @@ void Update_Grid( reax_system* system, control_params* control, MPI_Comm comm )
         ghost_span[d] = (int)ceil(system->bndry_cuts.ghost_cutoff / cell_len[d]);
         ghost_nonb_span[d] = (int)ceil(system->bndry_cuts.ghost_nonb / cell_len[d]);
         ghost_hbond_span[d] = (int)ceil( system->bndry_cuts.ghost_hbond /
-                                         cell_len[d] );
+                cell_len[d] );
         ghost_bond_span[d] = (int)ceil( system->bndry_cuts.ghost_bond /
-                                        cell_len[d] );
+                cell_len[d] );
     }
 
 
@@ -418,14 +461,14 @@ void Bin_My_Atoms( reax_system *system, reallocate_data *realloc )
                 if ( atoms[l].x[d] < my_box->min[d] || atoms[l].x[d] > my_box->max[d] )
                 {
                     fprintf( stderr, "p%d: local atom%d [%f %f %f] is out of my box!\n",
-                             system->my_rank, l,
-                             atoms[l].x[0], atoms[l].x[1], atoms[l].x[2] );
+                            system->my_rank, l,
+                            atoms[l].x[0], atoms[l].x[1], atoms[l].x[2] );
                     fprintf( stderr, "p%d: orig atom id is %d!\n",
-                             system->my_rank, atoms[l].orig_id);
+                            system->my_rank, atoms[l].orig_id);
                     fprintf( stderr, "p%d: my_box=[%f-%f, %f-%f, %f-%f]\n",
-                             system->my_rank, my_box->min[0], my_box->max[0],
-                             my_box->min[1], my_box->max[1],
-                             my_box->min[2], my_box->max[2] );
+                            system->my_rank, my_box->min[0], my_box->max[0],
+                            my_box->min[1], my_box->max[1],
+                            my_box->min[2], my_box->max[2] );
                     MPI_Abort( MPI_COMM_WORLD, -1 );
                 }
 
@@ -437,10 +480,10 @@ void Bin_My_Atoms( reax_system *system, reallocate_data *realloc )
             }
 #if defined(DEBUG)
             fprintf( stderr, "p%d bin_my_atoms: l:%d - atom%d @ %.5f %.5f %.5f"\
-                     "--> cell: %d %d %d\n",
-                     system->my_rank, l, atoms[l].orig_id,
-                     atoms[l].x[0], atoms[l].x[1], atoms[l].x[2],
-                     c[0], c[1], c[2] );
+                    "--> cell: %d %d %d\n",
+                    system->my_rank, l, atoms[l].orig_id,
+                    atoms[l].x[0], atoms[l].x[1], atoms[l].x[2],
+                    c[0], c[1], c[2] );
 #endif
             gc = &( g->cells[c[0]][c[1]][c[2]] );
             gc->atoms[ gc->top++ ] = l;
@@ -460,13 +503,13 @@ void Bin_My_Atoms( reax_system *system, reallocate_data *realloc )
                     max_atoms = gc->top;
 #if defined(DEBUG)
                 fprintf( stderr, "p%d gc[%d,%d,%d]->top=%d\n",
-                         system->my_rank, i, j, k, gc->top );
+                        system->my_rank, i, j, k, gc->top );
 #endif
             }
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d max_atoms=%d, g->max_atoms=%d\n",
-             system->my_rank, max_atoms, g->max_atoms );
+            system->my_rank, max_atoms, g->max_atoms );
 #endif
     /* check if current gcell->max_atoms is safe */
     if ( max_atoms >= g->max_atoms * DANGER_ZONE )
@@ -524,7 +567,7 @@ void Reorder_My_Atoms( reax_system *system, storage *workspace )
 
 
 void Get_Boundary_GCell( grid *g, rvec base, rvec x, grid_cell **gc,
-                         rvec *cur_min, rvec *cur_max )
+        rvec *cur_min, rvec *cur_max )
 {
     int d;
     ivec c;
@@ -540,7 +583,7 @@ void Get_Boundary_GCell( grid *g, rvec base, rvec x, grid_cell **gc,
     }
 #if defined(DEBUG)
     fprintf( stderr, "get_bndry_gc: base=[%f %f %f] x=[%f %f %f] c=[%d %d %d]\n",
-             base[0], base[1], base[2], x[0], x[1], x[2], c[0], c[1], c[2] );
+            base[0], base[1], base[2], x[0], x[1], x[2], c[0], c[1], c[2] );
 #endif
 
     *gc = &( g->cells[c[0]][c[1]][c[2]] );
@@ -548,11 +591,11 @@ void Get_Boundary_GCell( grid *g, rvec base, rvec x, grid_cell **gc,
     rvec_Sum( *cur_max, (*gc)->max, loosen );
 #if defined(DEBUG)
     fprintf( stderr, "get_bndry_gc: gcmin=[%f %f %f] gcmax=[%f %f %f]\n",
-             (*gc)->min[0], (*gc)->min[1], (*gc)->min[2],
-             (*gc)->max[0], (*gc)->max[1], (*gc)->max[2] );
+            (*gc)->min[0], (*gc)->min[1], (*gc)->min[2],
+            (*gc)->max[0], (*gc)->max[1], (*gc)->max[2] );
     fprintf( stderr, "get_bndry_gc: curmin=[%f %f %f] curmax=[%f %f %f]\n",
-             (*cur_min)[0], (*cur_min)[1], (*cur_min)[2],
-             (*cur_max)[0], (*cur_max)[1], (*cur_max)[2] );
+            (*cur_min)[0], (*cur_min)[1], (*cur_min)[2],
+            (*cur_max)[0], (*cur_max)[1], (*cur_max)[2] );
 #endif
 }
 
@@ -599,8 +642,8 @@ void Bin_Boundary_Atoms( reax_system *system )
     if ( !is_Within_GCell( atoms[start].x, ext_box->min, ext_box->max ) )
     {
         fprintf( stderr, "p%d: ghost atom%d [%f %f %f] is out of my box!\n",
-                 system->my_rank, start,
-                 atoms[start].x[0], atoms[start].x[1], atoms[start].x[2] );
+                system->my_rank, start,
+                atoms[start].x[0], atoms[start].x[1], atoms[start].x[2] );
         MPI_Abort( MPI_COMM_WORLD, -1 );
     }
 
@@ -613,8 +656,8 @@ void Bin_Boundary_Atoms( reax_system *system )
         if ( !is_Within_GCell( atoms[i].x, ext_box->min, ext_box->max ) )
         {
             fprintf( stderr, "p%d: ghost atom%d [%f %f %f] is out of my box!\n",
-                     system->my_rank, i,
-                     atoms[i].x[0], atoms[i].x[1], atoms[i].x[2] );
+                    system->my_rank, i,
+                    atoms[i].x[0], atoms[i].x[1], atoms[i].x[2] );
             MPI_Abort( MPI_COMM_WORLD, -1 );
         }
 
@@ -628,11 +671,11 @@ void Bin_Boundary_Atoms( reax_system *system )
             if ( gc->top != 0 )
             {
                 fprintf( stderr, "p%d bin_boundary_atoms: atom%d map was unexpected! ",
-                         system->my_rank, i );
+                        system->my_rank, i );
                 fprintf( stderr, "[%f %f %f] --> [%f %f %f] to [%f %f %f]\n",
-                         atoms[i].x[0], atoms[i].x[1], atoms[i].x[2],
-                         gc->min[0], gc->min[1], gc->min[2],
-                         gc->max[0], gc->max[1], gc->max[2] );
+                        atoms[i].x[0], atoms[i].x[1], atoms[i].x[2],
+                        gc->min[0], gc->min[1], gc->min[2],
+                        gc->max[0], gc->max[1], gc->max[2] );
                 MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
             }
             gc->str = i;
@@ -646,4 +689,4 @@ void Bin_Boundary_Atoms( reax_system *system )
 #if defined(DEBUG)
     fprintf( stderr, "p%d bin_boundary_atoms: done\n", system->my_rank );
 #endif
-}
+    }
diff --git a/PuReMD/src/init_md.c b/PuReMD/src/init_md.c
index 4fac0f1d08a9ab918e29906bc76b13c4c9f83c3c..45cdc2e999a55fb3243058f78620a7cf5f4790f6 100644
--- a/PuReMD/src/init_md.c
+++ b/PuReMD/src/init_md.c
@@ -568,7 +568,7 @@ int  Init_Lists( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace, reax_list **lists,
         mpi_datatypes *mpi_data, char *msg )
 {
-    int i, num_nbrs, far_nbr_list_format, cm_format;
+    int i, num_nbrs, far_nbr_list_format, cm_format, matrix_dim;
     int total_hbonds, total_bonds, bond_cap, num_3body, cap_3body, Htop;
     int *hb_top, *bond_top;
     MPI_Comm comm;
@@ -631,21 +631,13 @@ int  Init_Lists( reax_system *system, control_params *control,
     hb_top = (int*) calloc( system->local_cap, sizeof(int) );
     //bond_top = (int*) malloc( system->total_cap * sizeof(int) );
     //hb_top = (int*) malloc( system->local_cap * sizeof(int) );
+    
     Estimate_Storages( system, control, lists,
-            &Htop, hb_top, bond_top, &num_3body, comm );
+            &Htop, hb_top, bond_top, &num_3body, comm, &matrix_dim );
 
-    Allocate_Matrix( &(workspace->H), system->local_cap, Htop,
-            cm_format, comm );
+    Allocate_Matrix( &(workspace->H), matrix_dim, Htop, cm_format, comm );
     workspace->L = NULL;
     workspace->U = NULL;
-
-    //TODO: uncomment for SAI
-//    Allocate_Matrix2( &(workspace->H_spar_patt), workspace->H->n, system->local_cap,
-//            workspace->H->m, SYM_HALF_MATRIX, comm );
-//    Allocate_Matrix( &(workspace->H_spar_patt_full), workspace->H->n,
-//            2 * workspace->H->m - workspace->H->n, SYM_FULL_MATRIX, comm );
-//    Allocate_Matrix2( &(workspace->H_app_inv), workspace->H->n, system->local_cap,
-//            workspace->H->m, SYM_FULL_MATRIX, comm );
     workspace->H_spar_patt = NULL;
     workspace->H_app_inv = NULL;
 
@@ -749,7 +741,7 @@ int  Init_Lists( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace, reax_list **lists,
         mpi_datatypes *mpi_data, char *msg )
 {
-    int i, num_nbrs;
+    int i, num_nbrs, matrix_dim;
     int total_hbonds, total_bonds, bond_cap, num_3body, cap_3body, Htop;
     int *hb_top, *bond_top;
     int nrecv[MAX_NBRS];
@@ -760,7 +752,7 @@ int  Init_Lists( reax_system *system, control_params *control,
     bond_top = (int*) calloc( system->total_cap, sizeof(int) );
     hb_top = (int*) calloc( system->local_cap, sizeof(int) );
     Estimate_Storages( system, control, lists,
-            &Htop, hb_top, bond_top, &num_3body, comm );
+            &Htop, hb_top, bond_top, &num_3body, comm, &matrix_dim );
 
     if ( control->hbond_cut > 0 )
     {
diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c
index 98bc9b2923baf2259db4dc3062801b2202e3bcd1..fd068c99f7023b58b37c5c62f4f5f71ee469b03c 100644
--- a/PuReMD/src/linear_solvers.c
+++ b/PuReMD/src/linear_solvers.c
@@ -717,9 +717,6 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data,
         }
     }
     
-    //fprintf(stderr, "After dist call, p%d\n", system->my_rank );
-    //fflush(stderr);
-
     X = (int *) malloc( sizeof(int) * (system->bigN + 1) );
     pos_x = (int *) malloc( sizeof(int) * (system->bigN + 1) );
 
@@ -741,7 +738,7 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data,
             ++N;
 
             /* for each of those indices
-             *              * search through the row of full A of that index */
+             * search through the row of full A of that index */
 
             /* the case where the local matrix has that index's row */
             if( j_temp < A->n )
@@ -838,7 +835,7 @@ real sparse_approx_inverse( reax_system *system, simulation_data *data,
         }
 
         /* create the right hand side of the linear equation
-         *          * that is the full column of the identity matrix */
+         * that is the full column of the identity matrix */
         e_j = (real *) malloc( sizeof(real) * M );
 
         for ( k = 0; k < M; ++k )
@@ -928,7 +925,7 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
             }
         }
     }
-    else if ( A->format == SYM_FULL_MATRIX )
+    else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX )
     {
         /* perform multiplication */
         for ( i = 0; i < A->n; ++i )
@@ -1168,7 +1165,7 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
             }
         }
     }
-    else if ( A->format == SYM_FULL_MATRIX )
+    else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX )
     {
         for ( i = 0; i < A->n; ++i )
         {
@@ -1205,16 +1202,19 @@ int CG( reax_system *system, control_params *control, simulation_data *data,
     t_start = Get_Time( );
     scale = sizeof(real) / sizeof(void);
     Dist( system, mpi_data, x, MPI_DOUBLE, scale, real_packer );
+//    Dist_NT( system, mpi_data, x, MPI_DOUBLE, scale, real_packer );
     t_comm += Get_Timing_Info( t_start );
 
     t_start = Get_Time( );
-    Sparse_MatVec( H, x, workspace->q, system->N );
+    //Sparse_MatVec( H, x, workspace->q, system->N );
+    Sparse_MatVec( H, x, workspace->q, H->NT );
     t_spmv += Get_Timing_Info( t_start );
 
     if ( H->format == SYM_HALF_MATRIX )
     {
         t_start = Get_Time( );
         Coll( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker );
+//        Coll_NT( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker );
         t_comm += Get_Timing_Info( t_start );
     }
 
@@ -1227,10 +1227,12 @@ int CG( reax_system *system, control_params *control, simulation_data *data,
     {
         t_start = Get_Time( );
         Dist( system, mpi_data, workspace->r, MPI_DOUBLE, scale, real_packer );
+//        Dist_NT( system, mpi_data, workspace->r, MPI_DOUBLE, scale, real_packer );
         t_comm += Get_Timing_Info( t_start );
         
         t_start = Get_Time( );
-        Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->d, system->n );
+        //Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->d, system->n );
+        Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->d, H->NT );
         t_pa += Get_Timing_Info( t_start );
     }
 
@@ -1253,16 +1255,19 @@ int CG( reax_system *system, control_params *control, simulation_data *data,
     {
         t_start = Get_Time( );
         Dist( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer );
+//        Dist_NT( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer );
         t_comm += Get_Timing_Info( t_start );
 
         t_start = Get_Time( );
-        Sparse_MatVec( H, workspace->d, workspace->q, system->N );
+        //Sparse_MatVec( H, workspace->d, workspace->q, system->N );
+        Sparse_MatVec( H, workspace->d, workspace->q, H->NT );
         t_spmv += Get_Timing_Info( t_start );
 
         if ( H->format == SYM_HALF_MATRIX )
         {
             t_start = Get_Time( );
             Coll(system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker);
+//            Coll_NT(system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker);
             t_comm += Get_Timing_Info( t_start );
         }
 
@@ -1281,10 +1286,12 @@ int CG( reax_system *system, control_params *control, simulation_data *data,
         {
             t_start = Get_Time( );
             Dist( system, mpi_data, workspace->r, MPI_DOUBLE, scale, real_packer );
+//            Dist_NT( system, mpi_data, workspace->r, MPI_DOUBLE, scale, real_packer );
             t_comm += Get_Timing_Info( t_start );
 
             t_start = Get_Time( );
-            Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->p, system->n );
+            //Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->p, system->n );
+            Sparse_MatVec( workspace->H_app_inv, workspace->r, workspace->p, H->NT );
             t_pa += Get_Timing_Info( t_start );
         }
 
diff --git a/PuReMD/src/neighbors.c b/PuReMD/src/neighbors.c
index 96800433bbdb63fa939448e4a4481c34d9e36804..c62af733a8147c5b8704fa4171a10883c8c94b73 100644
--- a/PuReMD/src/neighbors.c
+++ b/PuReMD/src/neighbors.c
@@ -104,6 +104,14 @@ void Generate_Neighbor_Lists( reax_system *system, simulation_data *data,
                 for (l = gci->str; l < gci->end; ++l )
                 {
                     atom1 = &(system->my_atoms[l]);
+                    if( gci->type >= NT_NBRS && gci->type < NT_NBRS + 6 )
+                    {
+                        atom1->nt_dir = gci->type - NT_NBRS;
+                    }
+                    else
+                    {
+                        atom1->nt_dir = -1;
+                    }
                     Set_Start_Index( l, num_far, far_nbrs );
                     //fprintf( stderr, "\tatom %d\n", atom1 );
 
@@ -205,6 +213,14 @@ int Estimate_NumNeighbors( reax_system *system, reax_list **lists,
                 for ( l = gci->str; l < gci->end; ++l )
                 {
                     atom1 = &(system->my_atoms[l]);
+                    if( gci->type >= NT_NBRS && gci->type < NT_NBRS + 6 )
+                    {
+                        atom1->nt_dir = gci->type - NT_NBRS;
+                    }
+                    else
+                    {
+                        atom1->nt_dir = -1;
+                    }
                     //fprintf( stderr, "\tatom %d: ", l );
                     //tmp = num_far; tested = 0;
                     itr = 0;
diff --git a/PuReMD/src/parallelreax.c b/PuReMD/src/parallelreax.c
index 894d7fde02ea631ec37a01cc5cbf8c775b5b8e13..660d254e1c00064518ea7383fd0e0bef0b010a82 100644
--- a/PuReMD/src/parallelreax.c
+++ b/PuReMD/src/parallelreax.c
@@ -115,6 +115,11 @@ static void usage(char* argv[])
 
 int main( int argc, char* argv[] )
 {
+#if defined(NT_DEBUG)
+    fprintf( stdout, "main beginning\n");
+    fflush( stdout);    
+#endif
+
     reax_system *system;
     control_params *control;
     simulation_data *data;
@@ -163,6 +168,10 @@ int main( int argc, char* argv[] )
         lists[i]->far_nbr_list = NULL;
         lists[i]->hbond_list = NULL;
     }
+#if defined(NT_DEBUG)
+    fprintf( stdout, "main reax lists allocated\n");
+    fflush( stdout);    
+#endif
     out_control = (output_controls *)
         smalloc( sizeof(output_controls), "out_control", MPI_COMM_WORLD );
     mpi_data = (mpi_datatypes *)
@@ -183,6 +192,10 @@ int main( int argc, char* argv[] )
     fprintf( stderr, "p%d: read simulation info\n", system->my_rank );
     MPI_Barrier( MPI_COMM_WORLD );
 #endif
+#if defined(NT_DEBUG)
+    fprintf( stdout, "main simulatin info read\n");
+    fflush( stdout);    
+#endif
 
     /* measure total simulation time after input is read */
     if ( system->my_rank == MASTER_NODE )
@@ -190,6 +203,10 @@ int main( int argc, char* argv[] )
 
     /* initialize datastructures */
     Initialize( system, control, data, workspace, lists, out_control, mpi_data );
+#if defined(NT_DEBUG)
+    fprintf( stdout, "main data structures initialized\n");
+    fflush( stdout);    
+#endif
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initializated data structures\n", system->my_rank );
@@ -198,12 +215,32 @@ int main( int argc, char* argv[] )
 
     /* compute f_0 */
     Comm_Atoms( system, control, data, workspace, lists, mpi_data, 1 );
+#if defined(NT_DEBUG)
+    fprintf( stdout, "main after calling Comm_Atoms\n");
+    fflush( stdout);    
+#endif
     Reset( system, control, data, workspace, lists, MPI_COMM_WORLD );
+#if defined(NT_DEBUG)
+    fprintf( stdout, "main system is reset\n");
+    fflush( stdout);    
+#endif
     Generate_Neighbor_Lists( system, data, workspace, lists );
+#if defined(NT_DEBUG)
+    fprintf( stdout, "main neighbor lists are generated\n");
+    fflush( stdout);    
+#endif
     Compute_Forces( system, control, data, workspace,
             lists, out_control, mpi_data );
+#if defined(NT_DEBUG)
+    fprintf( stdout, "main forces are computed\n");
+    fflush( stdout);    
+#endif
     Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
     Output_Results( system, control, data, lists, out_control, mpi_data );
+#if defined(NT_DEBUG)
+    fprintf( stdout, "main results are written\n");
+    fflush( stdout);    
+#endif
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d: computed forces at t0\n", system->my_rank );
diff --git a/PuReMD/src/reax_defs.h b/PuReMD/src/reax_defs.h
index 310a46f769ba98f67f6ee4ce22d9fba6b1e30944..0f3b9089aefa559dec4ae4f3c17c340633ef0e98 100644
--- a/PuReMD/src/reax_defs.h
+++ b/PuReMD/src/reax_defs.h
@@ -131,7 +131,7 @@ enum exchanges { NONE = 0, NEAR_EXCH = 1, FULL_EXCH = 2 };
 
 enum gcell_types { NO_NBRS = 0, NEAR_ONLY = 1, HBOND_ONLY = 2, FAR_ONLY = 4,
                    NEAR_HBOND = 3, NEAR_FAR = 5, HBOND_FAR = 6, FULL_NBRS = 7,
-                   NATIVE = 8
+                   NATIVE = 8, NT_NBRS = 9 // 9 through 14
                  };
 
 enum atoms { C_ATOM = 0, H_ATOM = 1, O_ATOM = 2, N_ATOM = 3,
diff --git a/PuReMD/src/reax_types.h b/PuReMD/src/reax_types.h
index 2b08d03e15726c1893cf236dbe22685b1313edea..b42dcd22c5e77ce8506d3c4b43dfa66a75223631 100644
--- a/PuReMD/src/reax_types.h
+++ b/PuReMD/src/reax_types.h
@@ -40,6 +40,7 @@
 /************* SOME DEFS - crucial for reax_types.h *********/
 
 #define PURE_REAX
+//#define NT_DEBUG
 //#define LAMMPS_REAX
 //#define DEBUG
 //#define DEBUG_FOCUS
@@ -217,14 +218,7 @@ typedef struct
     MPI_Datatype angle_line;
     MPI_Datatype angle_view;
 
-    //MPI_Request  send_req1[REAX_MAX_NBRS];
-    //MPI_Request  send_req2[REAX_MAX_NBRS];
-    //MPI_Status   send_stat1[REAX_MAX_NBRS];
-    //MPI_Status   send_stat2[REAX_MAX_NBRS];
-    //MPI_Status   recv_stat1[REAX_MAX_NBRS];
-    //MPI_Status   recv_stat2[REAX_MAX_NBRS];
-
-    mpi_out_data out_buffers[REAX_MAX_NBRS];
+    mpi_out_data out_buffers[REAX_MAX_NT_NBRS];
     mpi_out_data out_nt_buffers[REAX_MAX_NT_NBRS];
     void *in1_buffer;
     void *in2_buffer;
@@ -441,6 +435,8 @@ typedef struct
     int num_bonds;
     int num_hbonds;
     int renumber;
+    int nt_dir;
+    int pos;
 } reax_atom;
 
 
@@ -891,6 +887,15 @@ typedef struct
 } far_neighbor_data;
 
 
+typedef struct
+{
+    int nbr;
+    ivec rel_box;
+    real d;
+    rvec dvec;
+} nt_neighbor_data;
+
+
 typedef struct
 {
     int nbr;
@@ -961,7 +966,7 @@ typedef struct
 {
     /* matrix storage format */
     int format;
-    int cap, n, m;
+    int cap, n, m, NT;
     int *start, *end;
     sparse_matrix_entry *entries;
 } sparse_matrix;
@@ -1075,6 +1080,7 @@ typedef union
     //dbond_data         *dbo_list;
     //dDelta_data        *dDelta_list;
     //far_neighbor_data  *far_nbr_list;
+    //nt_neighbor_data   *nt_nbr_list;
     //hbond_data         *hbond_list;
 } list_type;
 
@@ -1101,6 +1107,7 @@ typedef struct
     dbond_data *dbo_list;
     dDelta_data *dDelta_list;
     far_neighbor_data *far_nbr_list;
+    nt_neighbor_data *nt_nbr_list;
     hbond_data *hbond_list;
 } reax_list;
 
diff --git a/configure.ac b/configure.ac
index 95a0443651267cfed80e6c99245af81a209f7925..830a2ce26a6b8bce71fccc7c059593fa2b7aafb2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -84,9 +84,7 @@ if test "x${DEBUG}" = "xyes"
 then
 #	#TODO: fix exporting to subdirs
 #	# See: http://stackoverflow.com/questions/34124337/changing-flags-in-configure-ac-vs-caching-with-subprojects
-#	CFLAGS="-g3 -O0 -D_GLIBCXX_DEBUG ${CFLAGS}"
-	export BUILD_DEBUG="true"
-	export DEBUG_FLAGS="-g2 -O0 -D_GLIBCXX_DEBUG"
+	export BUILD_DEBUG="yes"
 fi
 
 # gprof flags.