diff --git a/PG-PuReMD/src/basic_comm.c b/PG-PuReMD/src/basic_comm.c
index c47d6fdcc777e3a16c6a66379152eca507967726..d22009ff6172faf25e7013ae8c1bcf51b05c20f0 100644
--- a/PG-PuReMD/src/basic_comm.c
+++ b/PG-PuReMD/src/basic_comm.c
@@ -42,7 +42,7 @@ static void int_packer( void *dummy, mpi_out_data *out_buf )
 
     for ( i = 0; i < out_buf->cnt; ++i )
     {
-        out[i] = buf[ out_buf->index[i] ];
+        out[i] += buf[ out_buf->index[i] ];
     }
 }
 
@@ -100,7 +100,7 @@ static void int_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf
 
         for ( i = 0; i < out_buf->cnt; ++i )
         {
-            buf[ out_buf->index[i] ] += in[i];
+            buf[ out_buf->index[i] ] = in[i];
         }
 }
 
@@ -267,43 +267,21 @@ void Dist( const reax_system * const system, mpi_datatypes * const mpi_data,
 #if defined(DEBUG)
     fprintf( stderr, "p%d dist: entered\n", system->my_rank );
 #endif
-    fprintf( stdout, "p%d dist: entered\n", system->my_rank );
-    fflush(stdout);
-
 
     comm = mpi_data->comm_mesh3D;
     out_bufs = mpi_data->out_buffers;
     pack = Get_Packer( buf_type );
 
-    fprintf( stdout, "Packer Gottten\n" );
-    fflush(stdout);
-
-    /*for(d = 0; d < 4101; d++)
-        if(*(int * )(buf + 4001) != -1)
-        fprintf(stdout,"%d ", *(int * )(buf + d));
-    fprintf(stdout, "\n");*/
-
     for ( d = 0; d < 3; ++d )
     {
 
-        fprintf( stdout, "Iteration %d\n", d );
-        fflush(stdout);
-
         /* initiate recvs */
         nbr1 = &system->my_nbrs[2 * d];
         if ( nbr1->atoms_cnt )
         {
-            fprintf( stdout, "Irecv %d\n", d );
-            fflush(stdout);
-            fprintf(stdout,"%d %d\n",nbr1->atoms_str, nbr1->atoms_cnt);
-            fprintf(stdout,"%d\n", *(int *)Get_Buffer_Offset( buf, nbr1->atoms_str + nbr1->atoms_cnt - 1, buf_type ));
-            fprintf(stdout,"%d %d\n", nbr1->rank, 2*d + 1);
-            fflush(stdout);
             MPI_Irecv( Get_Buffer_Offset( buf, nbr1->atoms_str, buf_type ),
                     nbr1->atoms_cnt, type, nbr1->rank, 2 * d + 1, comm, &req1 );
         }
-        fprintf( stdout, "A %d\n", d );
-        fflush(stdout);
 
         nbr2 = &system->my_nbrs[2 * d + 1];
         if ( nbr2->atoms_cnt )
@@ -311,8 +289,6 @@ void Dist( const reax_system * const system, mpi_datatypes * const mpi_data,
             MPI_Irecv( Get_Buffer_Offset( buf, nbr2->atoms_str, buf_type ),
                     nbr2->atoms_cnt, type, nbr2->rank, 2 * d, comm, &req2 );
         }
-        fprintf( stdout, "B %d\n", d );
-        fflush(stdout);
 
         /* send both messages in dimension d */
         if ( out_bufs[2 * d].cnt )
@@ -321,8 +297,6 @@ void Dist( const reax_system * const system, mpi_datatypes * const mpi_data,
             MPI_Send( out_bufs[2 * d].out_atoms, out_bufs[2 * d].cnt,
                     type, nbr1->rank, 2 * d, comm );
         }
-        fprintf( stdout, "C %d\n", d );
-        fflush(stdout);
 
         if ( out_bufs[2 * d + 1].cnt )
         {
@@ -330,8 +304,6 @@ void Dist( const reax_system * const system, mpi_datatypes * const mpi_data,
             MPI_Send( out_bufs[2 * d + 1].out_atoms, out_bufs[2 * d + 1].cnt,
                     type, nbr2->rank, 2 * d + 1, comm );
         }
-        fprintf( stdout, "D %d\n", d );
-        fflush(stdout);
 
         if( nbr1->atoms_cnt )
         {
@@ -343,8 +315,6 @@ void Dist( const reax_system * const system, mpi_datatypes * const mpi_data,
         }
     }
 
-    fprintf( stdout, "Out of For Loop\n" );
-    fflush(stdout);
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d dist: done\n", system->my_rank );
@@ -355,9 +325,6 @@ void Dist( const reax_system * const system, mpi_datatypes * const mpi_data,
 void Coll( const reax_system * const system, mpi_datatypes * const mpi_data,
         void *buf, int buf_type, MPI_Datatype type )
 {
-    fprintf(stdout, "Coll\n");
-    fflush(stdout);
-
     int d;
     mpi_out_data *out_bufs;
     MPI_Comm comm;
@@ -376,9 +343,6 @@ void Coll( const reax_system * const system, mpi_datatypes * const mpi_data,
 
     for ( d = 2; d >= 0; --d )
     {
-        fprintf(stdout, "loop %d\n",d);
-        fflush(stdout);
-
         /* initiate recvs */
         nbr1 = &system->my_nbrs[2 * d];
 
diff --git a/PG-PuReMD/src/charges.c b/PG-PuReMD/src/charges.c
index b9c8cccbf64fb8cd99b6764f231430ac70c1bf69..bda01bb36f07778948c72c53203eb6e4d73cef97 100644
--- a/PG-PuReMD/src/charges.c
+++ b/PG-PuReMD/src/charges.c
@@ -843,6 +843,8 @@ static void QEq( reax_system * const system, control_params * const control,
             exit( INVALID_INPUT );
             break;
     }
+    
+
 
 #if defined(LOG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
diff --git a/PG-PuReMD/src/driver.c b/PG-PuReMD/src/driver.c
index 2040fa45aa79657957a24e26b6570e854ebcce2d..e820c2b370052cca70ac63c8be6d0ff162a8e641 100644
--- a/PG-PuReMD/src/driver.c
+++ b/PG-PuReMD/src/driver.c
@@ -65,6 +65,6 @@ int main( int argc, char* argv[] )
     { 
         MPI_Finalize( );
     }
-
+    
     return (ret == PUREMD_SUCCESS) ? 0 : (-1);
 }
diff --git a/PG-PuReMD/src/forces.c b/PG-PuReMD/src/forces.c
index 0e18c5330672411c4b5af24685e2aad9471ea636..67aa099550321afc9a8ff876e181a26b5cf3fe59 100644
--- a/PG-PuReMD/src/forces.c
+++ b/PG-PuReMD/src/forces.c
@@ -1140,7 +1140,7 @@ int Compute_Forces( reax_system * const system, control_params * const control,
         Print_Force_Files( system, control, data, workspace, lists, out_control, mpi_data );
 #endif
     }
-
+    
     return ret;
 }
 
diff --git a/PG-PuReMD/src/init_md.c b/PG-PuReMD/src/init_md.c
index 1364ab2f90bff6009fa59821dacd32cc05e466ab..f5b8d2bb7c2cbb42bfef34d3775572601685deda 100644
--- a/PG-PuReMD/src/init_md.c
+++ b/PG-PuReMD/src/init_md.c
@@ -942,8 +942,8 @@ static void Finalize_MPI_Datatypes( mpi_datatypes * const mpi_data )
 
     Deallocate_MPI_Buffers( mpi_data );
 
-    ret = MPI_Type_free( &mpi_data->mpi_atom_type );
-    Check_MPI_Error( ret, "Finalize_MPI_Datatypes::mpi_data->mpi_atom_type" );
+    //ret = MPI_Type_free( &mpi_data->mpi_atom_type );
+    //Check_MPI_Error( ret, "Finalize_MPI_Datatypes::mpi_data->mpi_atom_type" );
     ret = MPI_Type_free( &mpi_data->boundary_atom_type );
     Check_MPI_Error( ret, "Finalize_MPI_Datatypes::mpi_data->boundary_atom_type" );
     ret = MPI_Type_free( &mpi_data->mpi_rvec );
@@ -963,23 +963,18 @@ void Finalize( reax_system * const system, control_params * const control,
         output_controls * const out_control, mpi_datatypes * const mpi_data,
         const int output_enabled )
 {
+
     if ( control->tabulate )
     {
         Finalize_LR_Lookup_Table( system, control, workspace, mpi_data );
     }
-
     if ( output_enabled == TRUE )
     {
         Finalize_Output_Files( system, control, out_control );
     }
-
     Finalize_Lists( control, lists );
-
     Finalize_Workspace( system, control, workspace );
-
     Finalize_Simulation_Data( system, control, data, out_control );
-
     Finalize_System( system, control, data );
-
     Finalize_MPI_Datatypes( mpi_data );
 }
diff --git a/PG-PuReMD/src/lin_alg.c b/PG-PuReMD/src/lin_alg.c
index 3eb6cc58af4150517cf9a582c95ecdfa9b59a7fe..420286db0c1436e8a39872ec64d8c933e31cfbac 100644
--- a/PG-PuReMD/src/lin_alg.c
+++ b/PG-PuReMD/src/lin_alg.c
@@ -254,7 +254,9 @@ void setup_sparse_approx_inverse( const reax_system * const system, storage * co
     bucketlist = NULL;
     samplelist = NULL;
 
-    for(i = 0; i < system->n; i++)
+    fprintf(stdout,"bigN = %d\n",system->bigN);
+
+    /*for(i = 0; i < system->n; i++)
     {
         fprintf(stdout, "\n%d\n",i);
         for(pj = A->start[i]; pj < A->end[i]; pj++)
@@ -281,7 +283,7 @@ void setup_sparse_approx_inverse( const reax_system * const system, storage * co
 
     m = 0;
     for( i = 0; i < A->n; ++i )
-    {
+    {//TODO: add if
         m += A->end[i] - A->start[i];
     }
 
@@ -292,7 +294,10 @@ void setup_sparse_approx_inverse( const reax_system * const system, storage * co
     {
         for( pj = A->start[i]; pj < A->end[i]; ++pj )
         {
-            local_entries[m++] = A->entries[pj].val;
+            if(A->entries[pj].j < system->bigN)
+            {
+                local_entries[m++] = A->entries[pj].val;
+            }
         }
     }
 
@@ -586,7 +591,8 @@ void setup_sparse_approx_inverse( const reax_system * const system, storage * co
 
         for ( pj = A->start[i]; pj < A->end[i]; ++pj )
         {
-            if ( ( A->entries[pj].val >= threshold )  || ( A->entries[pj].j == i ) )
+            if ( ( ( A->entries[pj].val >= threshold )  || ( A->entries[pj].j == i ) ) 
+                    && A->entries[pj].j < system->bigN )
             {
                 A_spar_patt->entries[size].val = A->entries[pj].val;
                 A_spar_patt->entries[size].j = A->entries[pj].j;
@@ -639,66 +645,44 @@ int sparse_approx_inverse(const reax_system * const system, storage * const work
 
     start = Get_Time( );
 
-
     if ( A_app_inv == NULL )
     {
-        fprintf(stdout, "1  %d %d %d\n",A->n, A->n_max, A->m);
-        fflush(stdout);
         Allocate_Matrix(A_app_inv, A_spar_patt->n, A_spar_patt->n_max, A_spar_patt->m );
     }
 
     else if ( (A_app_inv->m) < (A_spar_patt->m) )
     {
-        fprintf(stdout, "2 %d %d %d\n",A_spar_patt->n, A_spar_patt->n_max, A_spar_patt->m);
-        fflush(stdout);
         Reallocate_Matrix( A_app_inv, A_spar_patt->n, A_spar_patt->n_max, A_spar_patt->m );
     }
 
-    /*for(i = 0; i < system->n; i++)
-    {
-        fprintf(stdout, "\n%d\n",i);
-        for(pj = A_spar_patt->start[i]; pj < A_spar_patt->end[i]; pj++)
-        {
-            fprintf(stdout, "%d %.2lf ---",A_spar_patt->entries[pj].j, A_spar_patt->entries[pj].val);
-        }
-        fprintf(stdout,"\n");
-    }*/
-
-    fprintf(stdout,"\n\n\nSAI computation started\n");
-    fprintf(stdout, "rank = %d\n", system->my_rank);
-    fprintf(stdout, "n = %d N = %d\n",system->n, system->N);
-    fprintf(stdout, "nnz of sparsity pattern = %d\n",A_spar_patt->m);
-    fflush(stdout);
-
-
     j_recv1 = NULL;
     j_recv2 = NULL;
     val_recv1 = NULL;
     val_recv2 = NULL;
 
-    mark = (int *) malloc( sizeof(int) * (system->N + 1) );
+    mark = (int *) malloc( sizeof(int) * (system->bigN + 1) );
+
+    // row_needed and row_nnz are used for Dist and Coll functions
+    // so their size should be N rather than bigN
     row_needed = (int *) malloc( sizeof(int) * (system->N + 1) );
     row_nnz = (int *) malloc( sizeof(int) * (system->N + 1) );
 
-    j_list = (int **) malloc( sizeof(int *) * (system->N + 1) );
-    val_list = (real **) malloc( sizeof(real *) * (system->N + 1) );
+    j_list = (int **) malloc( sizeof(int *) * (system->bigN + 1) );
+    val_list = (real **) malloc( sizeof(real *) * (system->bigN + 1) );
 
-    for ( i = 0; i <= system->N; ++i )
+    for ( i = 0; i <= system->bigN; ++i )
     {   
-        mark[ i ] = -1;
-        row_needed[ i ] = -1;
-        row_nnz[ i ] = 0;
+        mark[i] = -1;
+        row_needed[i] = -1;
+        row_nnz[i] = 0;
     }
 
     /* mark the atoms that already have their row stored in the local matrix */
     for ( i = 0; i < system->n; ++i )
     {   
         atom = &system->my_atoms[i];
-        fprintf(stdout, "%d ",atom->orig_id);
         mark[ atom->orig_id ] = i;
     }
-    fprintf(stdout,"\n");
-    fflush(stdout);
 
     /*find the atoms that are not marked but needed,
      *     meaning we need to communicate their row*/
@@ -707,288 +691,295 @@ int sparse_approx_inverse(const reax_system * const system, storage * const work
         for ( pj = A_spar_patt->start[i]; pj < A_spar_patt->end[i]; ++pj )
         {
             atom = &system->my_atoms[ A_spar_patt->entries[pj].j ];
-            //fprintf(stdout,"Spar_patt column: %d - orig_id: %d\n",A_spar_patt->entries[pj].j, atom->orig_id);
-            //fflush(stdout);
-            if(atom->orig_id > system->n)
-                fprintf(stdout, "adsadsads %d ",atom->orig_id);
 
             if( mark[ atom->orig_id ] == -1)
             {
-                fprintf(stdout, "asd %d ",atom->orig_id);
                 row_needed[ A_spar_patt->entries[pj].j ] = atom->orig_id;
             }
         }
     }
 
-    fflush(stdout);
-
-    Dist( system, mpi_data, row_needed, INT_PTR_TYPE, MPI_INT );
+    
+    // TODO: Dist or Coll
+    // Announce the rows that you need but you do not have
+    Coll( system, mpi_data, row_needed, INT_PTR_TYPE, MPI_INT );
 
-    fprintf(stdout, "First Dist is done\n");
-    fflush(stdout);
+    for(i = 0; i<system->bigN;i++)
+    {
+        fprintf(stdout,"%d ", row_needed[i]);
+    }
 
-    /* fill in the nnz of the lines that will be collected by other processes */
-    for( i = 0; i < system->N; ++i )
+    /* fill in the nnz of the rows of the original charge matrix
+     * that will be collected by other processes */
+    for( i = 0; i < system->bigN; ++i )
     {
-        if( row_needed[i] !=-1 && mark[ row_needed[i] ] != -1)
+        if( row_needed[i] != -1 && mark[ row_needed[i] ] != -1 )
         {
-            row_nnz[i] = A->end[  mark[ row_needed[i] ] ] - A->start[  mark[ row_needed[i] ] ];
-            
+            for( pj = A->start[i]; pj < A->end[i]; ++pj)
+            {
+                if( A->entries[pj].j < system->bigN )
+                {
+                    ++row_nnz[i];
+                }
+            }
         }
     }
-    fprintf(stdout, "Required NNZ calculation is done\n");
-    fflush(stdout);
 
-    /* announce the nnz's in each row to allocota space */
-    Coll( system, mpi_data, row_nnz, INT_PTR_TYPE, MPI_INT );
+    // TODO: Dist or Coll
+    /* Announce the nnz's in each row that will be communicated later */
+    Dist( system, mpi_data, row_nnz, INT_PTR_TYPE, MPI_INT );
     
-    fprintf(stdout, "First Call is done\n");
-    fflush(stdout);
-
     comm = mpi_data->comm_mesh3D;
     out_bufs = mpi_data->out_buffers;
-    for ( d = 2; d >= 0; --d )
+
+    fprintf(stdout,"before manual dist function\n");
+    
+    /*  Dist, not Coll */
+    for ( d = 0; d < 3; ++d)
     {
-        fprintf(stdout, "iterative loop for coll %d\n", d);
-        fflush(stdout);
+
+        fprintf(stdout,"iteration number: %d\n", d);
 
         flag1 = 0;
         flag2 = 0;
 
         /* initiate recvs */
         nbr1 = &system->my_nbrs[2 * d];
-
-        if ( out_bufs[2 * d].cnt )
+        if ( nbr1->atoms_cnt )
         {
+            /* calculate the total data that will be received */
             cnt = 0;
-            for( i = 0; i < out_bufs[2 * d].cnt; ++i )
+            for( i = nbr1->atoms_str; i < system->bigN && i < (nbr1->atoms_str + nbr1->atoms_cnt); ++i )
             {
-                cnt += row_nnz[ out_bufs[2 * d].index[i] ];
-     //           fprintf(stdout,"Outbuffs to be sent id:%d nnz:%d\n",out_bufs[2 * d+1].index[i], row_nnz[ out_bufs[2 * d+1].index[i] ]);
-       //         fflush(stdout);
+                if( row_needed[i] != -1 && mark[ row_needed[i] ] == -1)
+                {
+                    cnt += row_nnz[i];
+                }
             }
 
-            j_recv1 = (int *) malloc( sizeof(int) * cnt );
-            val_recv1 = (real *) malloc( sizeof(real) * cnt );
-            
+            /* initiate Irecv */
             if( cnt )
             {
                 flag1 = 1;
+                j_recv1 = (int *) malloc( sizeof(int) * cnt );
+                val_recv1 = (real *) malloc( sizeof(real) * cnt );
                 MPI_Irecv( j_recv1, cnt, MPI_INT, nbr1->rank, 2 * d + 1, comm, &req1 );
                 MPI_Irecv( val_recv1, cnt, MPI_DOUBLE, nbr1->rank, 2 * d + 1, comm, &req2 );
             }
         }
 
-        nbr2 = &system->my_nbrs[2 * d + 1];
-        fprintf(stdout, "Irecv is completed\n");
+        fprintf(stdout,"after first Irecv, cnt = %d\n",cnt);
         fflush(stdout);
 
-        if ( out_bufs[2 * d + 1].cnt )
+        nbr2 = &system->my_nbrs[2 * d + 1];
+        if ( nbr2->atoms_cnt )
         {
+            /* calculate the total data that will be received */
             cnt = 0;
-            for( i = 0; i < out_bufs[2 * d+1].cnt; ++i )
+            for( i = nbr2->atoms_str; i < system->bigN && i < (nbr2->atoms_str + nbr2->atoms_cnt); ++i )
             {
-                cnt += row_nnz[ out_bufs[2 * d+1].index[i] ];
+                if( row_needed[i] != -1 && mark[ row_needed[i] ] == -1)
+                {
+                    cnt += row_nnz[i];
+                }
             }
 
-            j_recv2 = (int *) malloc( sizeof(int) * cnt );
-            val_recv2 = (real *) malloc( sizeof(real) * cnt );
-
+            /* initiate Irecv */
             if( cnt )
             {
                 flag2 = 1;
+                j_recv2 = (int *) malloc( sizeof(int) * cnt );
+                val_recv2 = (real *) malloc( sizeof(real) * cnt );
                 MPI_Irecv( j_recv2, cnt, MPI_INT, nbr2->rank, 2 * d, comm, &req3 );
                 MPI_Irecv( val_recv2, cnt, MPI_DOUBLE, nbr2->rank, 2 * d, comm, &req4 );
             }
         }
 
-        fprintf(stdout, "Second Irecv is completed\n");
+        fprintf(stdout,"after second Irecv, cnt = %d\n", cnt);
         fflush(stdout);
 
         /* send both messages in dimension d */
-        if ( nbr1->atoms_cnt )
+        if( out_bufs[2 * d].cnt )
         {
             cnt = 0;
-            for( i = nbr1->atoms_str; i < nbr1->atoms_str + nbr1->atoms_cnt; ++i)
+            for( i = 0; i < out_bufs[2 * d].cnt; ++i )
             {
-                atom = &system->my_atoms[i];
-
-                if(mark[ atom->orig_id ] != -1)
-                {
-                    cnt += A->end[ mark[ atom->orig_id ] ] - A->start[ mark[ atom->orig_id ] ];
-                }
-                else
+                if( out_bufs[2 * d].index[i] < system->bigN )
                 {
-                    cnt += row_nnz[i];
+                    if( row_needed[ out_bufs[2 * d].index[i] ] != -1)
+                    {
+                        cnt += row_nnz[ out_bufs[2 * d].index[i] ];
+                    }
                 }
             }
 
-            fprintf(stdout, "Inside of First Send\n");
+            fprintf(stdout,"cnt = %d\n", cnt);
             fflush(stdout);
-
-            j_send = (int *) malloc( sizeof(int) * cnt );
-            val_send = (real *) malloc( sizeof(real) * cnt );
-
-            cnt = 0;
-            for( i = nbr1->atoms_str; i < nbr1->atoms_str + nbr1->atoms_cnt; ++i)
+            if( cnt )
             {
-                atom = &system->my_atoms[i];
-                if(row_needed[ atom->orig_id ] != -1)
+                j_send = (int *) malloc( sizeof(int) * cnt );
+                val_send = (real *) malloc( sizeof(real) * cnt );
+
+                cnt = 0;
+                for( i = 0; i < out_bufs[2 * d].cnt; ++i )
                 {
-                    if(mark[ atom->orig_id ] != -1)
+                    if( out_bufs[2 * d].index[i] < system->bigN )
                     {
-                        for( pj = A->start[ mark[ atom->orig_id ] ]; pj < A->end[ mark[ atom->orig_id ] ]; ++pj)
+                        if( row_needed[ out_bufs[2 * d].index[i] ] != -1)
                         {
-                            j_send[cnt] = A->entries[pj].j;
-                            val_send[cnt] = A->entries[pj].val;
-                            cnt++;
-                        }
-                    }
-                    else
-                    {
-                        for( pj = 0; pj < row_nnz[i]; ++pj)
-                        {
-                            j_send[cnt] = j_list[i][pj];
-                            val_send[cnt] = val_list[i][pj];
-                            cnt++;
+                            if( mark[ row_needed[ out_bufs[2 * d].index[i] ] ] != -1)
+                            {
+                                for( pj = A->start[ out_bufs[2 * d].index[i] ]; pj < A->end[ out_bufs[2 * d].index[i] ]; ++pj )
+                                {
+                                    if( A->entries[pj].j < system->bigN)
+                                    {
+                                        j_send[cnt] = A->entries[pj].j;
+                                        val_send[cnt] = A->entries[pj].val;
+                                        cnt++;
+                                    }
+                                }
+                            }
+                            else
+                            {
+                                for( pj = 0; pj < row_nnz[ out_bufs[2 * d].index[i] ]; ++pj )
+                                {
+                                    j_send[cnt] = j_list[ out_bufs[2 * d].index[i] ][pj];
+                                    val_send[cnt] = val_list[ out_bufs[2 * d].index[i] ][pj];
+                                    cnt++;
+                                }
+                            }
                         }
                     }
                 }
 
-            }
-            fprintf(stdout, "Still Inside of First Send\n");
-            fflush(stdout);
-
-            if( cnt )
-            {
                 MPI_Send( j_send, cnt, MPI_INT, nbr1->rank, 2 * d, comm );
                 MPI_Send( val_send, cnt, MPI_DOUBLE, nbr1->rank, 2 * d, comm );
             }
-
         }
-            fprintf(stdout, "Outside of First Send\n");
-            fflush(stdout);
 
-        if ( nbr2->atoms_cnt )
+        fprintf(stdout,"after first Send\n");
+        fflush(stdout);
+
+        if( out_bufs[2 * d + 1].cnt )
         {
-            fprintf(stdout, "Beginning of Second Send\n");
-            fflush(stdout);
             cnt = 0;
-            for( i = nbr2->atoms_str; i < nbr2->atoms_str + nbr2->atoms_cnt; ++i)
+            for( i = 0; i < out_bufs[2 * d + 1].cnt; ++i )
             {
-                atom = &system->my_atoms[i];
-
-                if(mark[ atom->orig_id ] != -1)
+                if( out_bufs[2 * d + 1].index[i] < system->bigN )
                 {
-                    cnt += A->end[ mark[ atom->orig_id ] ] - A->start[ mark[ atom->orig_id ] ];
-                }
-                else
-                {
-                    cnt += row_nnz[i];
+                    if( row_needed[ out_bufs[2 * d + 1].index[i] ] != -1)
+                    {
+                        cnt += row_nnz[ out_bufs[2 * d + 1].index[i] ];
+                    }
                 }
             }
-            
-            fprintf(stdout, "Inside of Second Send\n");
-            fflush(stdout);
-
-            j_send = (int *) malloc( sizeof(int) * cnt );
-            val_send = (real *) malloc( sizeof(real) * cnt );
 
-            fprintf(stdout, "Inside of Second Send after Malloc\n");
+            fprintf(stdout,"cnt = %d\n", cnt);
             fflush(stdout);
 
-            cnt = 0;
-            for( i = nbr2->atoms_str; i < nbr2->atoms_str + nbr2->atoms_cnt; ++i)
+            if( cnt )
             {
-                atom = &system->my_atoms[i];
+                j_send = (int *) malloc( sizeof(int) * cnt );
+                val_send = (real *) malloc( sizeof(real) * cnt );
 
-                if(row_needed[ atom->orig_id ] != -1)
+                cnt = 0;
+                for( i = 0; i < out_bufs[2 * d + 1].cnt; ++i )
                 {
-                    if(mark[ atom->orig_id ] != -1)
+                    if( out_bufs[2 * d + 1].index[i] < system->bigN )
                     {
-                        for( pj = A->start[ mark[ atom->orig_id ] ]; pj < A->end[ mark[ atom->orig_id ] ]; ++pj)
+                        if( row_needed[ out_bufs[2 * d + 1].index[i] ] != -1)
                         {
-                            j_send[cnt] = A->entries[pj].j;
-                            val_send[cnt] = A->entries[pj].val;
-                            cnt++;
-                        }
-                    }
-                    else
-                    {
-                        for( pj = 0; pj < row_nnz[i]; ++pj)
-                        {
-                            j_send[cnt] = j_list[i][pj];
-                            val_send[cnt] = val_list[i][pj];
-                            cnt++;
+                            if( mark[ row_needed[ out_bufs[2 * d + 1].index[i] ] ] != -1)
+                            {
+                                for( pj = A->start[ out_bufs[2 * d + 1].index[i] ]; pj < A->end[ out_bufs[2 * d + 1].index[i] ]; ++pj )
+                                {
+                                    if( A->entries[pj].j < system->bigN)
+                                    {
+                                        j_send[cnt] = A->entries[pj].j;
+                                        val_send[cnt] = A->entries[pj].val;
+                                        cnt++;
+                                    }
+                                }
+                            }
+                            else
+                            {
+                                for( pj = 0; pj < row_nnz[ out_bufs[2 * d + 1].index[i] ]; ++pj )
+                                {
+                                    j_send[cnt] = j_list[ out_bufs[2 * d + 1].index[i] ][pj];
+                                    val_send[cnt] = val_list[ out_bufs[2 * d + 1].index[i] ][pj];
+                                    cnt++;
+                                }
+                            }
                         }
                     }
                 }
-            }
 
-            fprintf(stdout, "Still Inside of Second Send\n");
-            fprintf(stdout, "cnt = %d\n",cnt);
-            fflush(stdout);
-
-            if ( cnt )
-            {
-                MPI_Send( j_send, cnt, MPI_INT, nbr2->rank, 2 * d + 1, comm );
-                MPI_Send( val_send, cnt, MPI_DOUBLE, nbr2->rank, 2 * d + 1, comm );
+                MPI_Send( j_send, cnt, MPI_INT, nbr2->rank, 2 * d + 1, comm );
+                MPI_Send( val_send, cnt, MPI_DOUBLE, nbr2->rank, 2 * d + 1, comm );
             }
         }
 
-        if ( out_bufs[2 * d].cnt && flag1 )
+        fprintf(stdout,"after the second Send\n");
+        fflush(stdout);
+
+        if( flag1 )
         {
             MPI_Wait( &req1, &stat1 );
             MPI_Wait( &req2, &stat2 );
 
+
             cnt = 0;
-            for( i = 0; i < out_bufs[2 * d].cnt; ++i )
+            for( i = nbr1->atoms_str; i < system->bigN && i < (nbr1->atoms_str + nbr1->atoms_cnt); ++i )
             {
-                j_list[ out_bufs[2 * d].index[i] ] = (int *) malloc( sizeof(int) * row_nnz[ out_bufs[2 * d].index[i] ] );
-                val_list[ out_bufs[2 * d].index[i] ] = (real *) malloc( sizeof(real) * row_nnz[ out_bufs[2 * d].index[i] ] );
-
-                for( pj = 0; pj < row_nnz[ out_bufs[2 * d].index[i] ]; ++pj)
+                if( row_needed[i] != -1 && mark[ row_needed[i] ] == -1)
                 {
-                    j_list[ out_bufs[2 * d].index[i] ][pj] = j_recv1[cnt];
-                    val_list[ out_bufs[2 * d].index[i] ][pj] = val_recv1[cnt];
-                    cnt++;
+                    j_list[i] = (int *) malloc( sizeof(int) *  row_nnz[i] );
+                    val_list[i] = (real *) malloc( sizeof(real) * row_nnz[i] );
+
+                    for( pj = 0; pj < row_nnz[i]; ++pj )
+                    {
+                        j_list[i][pj] = j_recv1[cnt];
+                        val_list[i][pj] = val_recv1[cnt];
+                        cnt++;
+                    }
                 }
             }
         }
 
-        if ( out_bufs[2 * d + 1].cnt && flag2 )
+        if( flag2 )
         {
             MPI_Wait( &req3, &stat3 );
             MPI_Wait( &req4, &stat4 );
 
+
             cnt = 0;
-            for( i = 0; i < out_bufs[2 * d + 1].cnt; ++i )
+            for( i = nbr2->atoms_str; i < system->bigN && i < (nbr2->atoms_str + nbr2->atoms_cnt); ++i )
             {
-                for( pj = 0; pj < row_nnz[ out_bufs[2 * d + 1].index[i] ]; ++pj)
+                if( row_needed[i] != -1 && mark[ row_needed[i] ] == -1)
                 {
-                    j_list[ out_bufs[2 * d + 1].index[i] ][pj] = j_recv2[cnt];
-                    val_list[ out_bufs[2 * d + 1].index[i] ][pj] = val_recv2[cnt];
-                    cnt++;
+                    j_list[i] = (int *) malloc( sizeof(int) *  row_nnz[i] );
+                    val_list[i] = (real *) malloc( sizeof(real) * row_nnz[i] );
+
+                    for( pj = 0; pj < row_nnz[i]; ++pj )
+                    {
+                        j_list[i][pj] = j_recv2[cnt];
+                        val_list[i][pj] = val_recv2[cnt];
+                        cnt++;
+                    }
                 }
             }
         }
 
-        fprintf(stdout,"Package placed into correct positions\n");
-        fflush(stdout);
-    }    
-
-    fprintf(stdout,"before initilaization of SAI computation variables\n");
-    fflush(stdout);
+    }
 
     A_app_inv->start[A_app_inv->n] = A_spar_patt->start[A_spar_patt->n];
 
+    X = (char *) malloc( sizeof(char) * system->bigN );
+    Y = (char *) malloc( sizeof(char) * system->bigN );
+    pos_x = (int *) malloc( sizeof(int) * system->bigN );
+    pos_y = (int *) malloc( sizeof(int) * system->bigN );
 
-    X = (char *) malloc( sizeof(char) * A->n );
-    Y = (char *) malloc( sizeof(char) * A->n );
-    pos_x = (int *) malloc( sizeof(int) * A->n );
-    pos_y = (int *) malloc( sizeof(int) * A->n );
-
-    for ( i = 0; i < A->n; ++i )
+    for ( i = 0; i < system->bigN; ++i )
     {
         X[i] = 0;
         Y[i] = 0;
@@ -996,7 +987,8 @@ int sparse_approx_inverse(const reax_system * const system, storage * const work
         pos_y[i] = 0;
     }
 
-    fprintf(stdout, "SAI computation\n");
+    fprintf(stdout,"right before SAI computation\n");
+    fflush(stdout);
 
     for ( i = 0; i < A_spar_patt->n; ++i )
     {
@@ -1006,25 +998,27 @@ int sparse_approx_inverse(const reax_system * const system, storage * const work
         /* find column indices of nonzeros (which will be the columns indices of the dense matrix) */
         for ( pj = A_spar_patt->start[i]; pj < A_spar_patt->end[i]; ++pj )
         {
-
             j_temp = A_spar_patt->entries[pj].j;
-
+            atom = &system->my_atoms[j_temp];
             Y[j_temp] = 1;
             pos_y[j_temp] = N;
             ++N;
 
             /* for each of those indices
              *             search through the row of full A of that index */
-
+    
             /* the case where the local matrix has that index's row */
-            if(mark[j_temp] != -1)
+            if( mark[ atom->orig_id ] != -1)
             {
-                for ( k = A->start[ mark[j_temp] ]; k < A->end[ mark[j_temp] ]; ++k )
+                for ( k = A->start[ j_temp ]; k < A->end[ j_temp ]; ++k )
                 {
-                    /* and accumulate the nonzero column indices to serve as the row indices of the dense matrix */
-                    X[A->entries[k].j] = 1;
+                    if( A->entries[k].j < system->bigN )
+                    {
+                        /* and accumulate the nonzero column indices to serve as the row indices of the dense matrix */
+                        X[A->entries[k].j] = 1;
+                    }
                 }
-            }
+            }   
 
             /* the case where we communicated that index's row */
             else
@@ -1039,7 +1033,7 @@ int sparse_approx_inverse(const reax_system * const system, storage * const work
 
         /* enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix */
         identity_pos = M;
-        for ( k = 0; k < A->n; k++)
+        for ( k = 0; k < system->bigN; k++)
         {
             if ( X[k] != 0 )
             {
@@ -1066,13 +1060,14 @@ int sparse_approx_inverse(const reax_system * const system, storage * const work
             /* change the value if any of the column indices is seen */
 
             /* it is in the original list */
-            if( mark[ pos_x[d_i] ] != -1)
+            atom = &system->my_atoms[ pos_x[d_i] ];
+            if( mark[ atom->orig_id ] != -1)
             {
-                for ( d_j = A->start[  mark[ pos_x[d_i] ] ]; d_j < A->end[ mark[ pos_x[d_i] ] ]; ++d_j )
+                for ( d_j = A->start[  pos_x[d_i] ]; d_j < A->end[ pos_x[d_i] ]; ++d_j )
                 {
-                    if ( Y[A->entries[d_j].j] == 1 )
+                    if ( A->entries[d_j].j < system->bigN && Y[ A->entries[d_j].j ] == 1 )
                     {
-                        dense_matrix[d_i * N + pos_y[A->entries[d_j].j]] = A->entries[d_j].val;
+                        dense_matrix[ d_i * N + pos_y[ A->entries[d_j].j ] ] = A->entries[d_j].val;
                     }
                 }
             }
@@ -1087,7 +1082,6 @@ int sparse_approx_inverse(const reax_system * const system, storage * const work
                     }
                 }
             }
-
         }
 
         /* create the right hand side of the linear equation
@@ -1108,9 +1102,6 @@ int sparse_approx_inverse(const reax_system * const system, storage * const work
         lda = N;
         ldb = nrhs;
 
-        fprintf(stdout,"Before LAPACKE\n");
-        fflush(stdout);
-
         info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', m, n, nrhs, dense_matrix, lda,
                 e_j, ldb );
 
@@ -1140,7 +1131,7 @@ int sparse_approx_inverse(const reax_system * const system, storage * const work
         /* empty variables that will be used next iteration */
         free( dense_matrix );
         free( e_j );
-        for ( k = 0; k < A->n; ++k )
+        for ( k = 0; k < system->bigN; ++k )
         {
             X[k] = 0;
             Y[k] = 0;
@@ -1149,10 +1140,6 @@ int sparse_approx_inverse(const reax_system * const system, storage * const work
         }
     }
 
-
-    fprintf(stdout, "SAI computation over\n");
-    fflush(stdout);
-
     free( pos_y);
     free( pos_x);
     free( Y );
@@ -1179,8 +1166,6 @@ static void apply_preconditioner( const reax_system * const system, const storag
         const control_params * const control, const real * const y, real * const x, 
         const int fresh_pre, const int side )
 {
-    //fprintf(stdout,"apply_preconditioner working\n");
-    //fflush(stdout);
     /* no preconditioning */
     if ( control->cm_solver_pre_comp_type == NONE_PC )
     {
@@ -1415,8 +1400,6 @@ int dual_CG( const reax_system * const system, const control_params * const cont
         const real tol, rvec2 * const x, const int fresh_pre )
 {
 
-    fprintf(stdout,"dual_cg working\n");
-    fflush(stdout);
     int i, j, n, N, iters;
     rvec2 tmp, alpha, beta;
     rvec2 my_sum, norm_sqr, b_norm, my_dot;
@@ -1684,7 +1667,7 @@ int dual_CG( const reax_system * const system, const control_params * const cont
     {
         fprintf( stderr, "[WARNING] Dual CG convergence failed (%d iters)\n", i + 1 + iters );
     }
-
+    
     return (i + 1) + iters;
 }
 
@@ -1722,9 +1705,6 @@ int CG( const reax_system * const system, const control_params * const control,
 
     Vector_Sum( workspace->r , 1.0,  b, -1.0, workspace->q, system->n );
 
-    apply_preconditioner( system, workspace, control, workspace->r, workspace->d, fresh_pre, LEFT );
-    apply_preconditioner( system, workspace, control, workspace->d, workspace->p, fresh_pre, RIGHT );
-
     b_norm = Parallel_Norm( b, system->n, mpi_data->world );
     sig_new = Parallel_Dot( workspace->r, workspace->d, system->n, mpi_data->world );