diff --git a/PuReMD-GPU/Makefile.am b/PuReMD-GPU/Makefile.am
index bb4ebe45fd54575213cc93ccf18ae9031ce9846f..016114db2dfaf3b9af623069b2f5f3ffc677907a 100644
--- a/PuReMD-GPU/Makefile.am
+++ b/PuReMD-GPU/Makefile.am
@@ -28,6 +28,7 @@ bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c \
 	src/reset_utils.c src/single_body_interactions.c \
 	src/system_props.c src/three_body_interactions.c \
 	src/traj.c src/two_body_interactions.c src/vector.c \
+	src/testmd.c \
 	src/cuda_utils.cu src/cuda_copy.cu src/cuda_init.cu src/cuda_reduction.cu \
 	src/cuda_center_mass.cu src/cuda_box.cu src/validation.cu \
         src/cuda_allocate.cu src/cuda_bond_orders.cu \
@@ -38,8 +39,7 @@ bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c \
 	src/cuda_reset_utils.cu src/cuda_single_body_interactions.cu \
         src/cuda_system_props.cu src/cuda_three_body_interactions.cu \
 	src/cuda_two_body_interactions.cu src/cuda_environment.cu \
-	src/cuda_post_evolve.cu \
-	src/testmd.c
+	src/cuda_post_evolve.cu
 include_HEADERS = src/mytypes.h src/analyze.h src/print_utils.h \
         src/restart.h src/param.h src/pdb_tools.h src/box.h \
 	src/lin_alg.h src/QEq.h src/allocate.h src/bond_orders.h \
diff --git a/PuReMD-GPU/configure.ac b/PuReMD-GPU/configure.ac
index 9736714d5d49615e9d6e387feed9a16c37f65348..30e7e0bff6bc3115193c75d3b2d0dcbe29a5cf4b 100644
--- a/PuReMD-GPU/configure.ac
+++ b/PuReMD-GPU/configure.ac
@@ -42,19 +42,6 @@ AC_SEARCH_LIBS([gzeof], [z])
 AC_SEARCH_LIBS([gzgets], [z])
 AC_SEARCH_LIBS([gzseek], [z])
 AC_SEARCH_LIBS([gzclose, [z]])
-AC_SEARCH_LIBS([cublasCheckError], [cublas])
-AC_SEARCH_LIBS([cublasDnrm2], [cublas])
-AC_SEARCH_LIBS([cublasDaxpy], [cublas])
-AC_SEARCH_LIBS([cublasDscal], [cublas])
-AC_SEARCH_LIBS([cublasDdot], [cublas])
-AC_SEARCH_LIBS([cudaThreadSynchronize], [cuda])
-AC_SEARCH_LIBS([cudaCheckError], [cuda])
-# FIXME: Replace `main' with a function in `-lcudart':
-AC_CHECK_LIB([cudart], [main])
-AC_SEARCH_LIBS([cusparseCheckError], [cusparse])
-AC_SEARCH_LIBS([cusparseCreateMatDescr], [cusparse])
-AC_SEARCH_LIBS([cusparseSetMatType], [cusparse])
-AC_SEARCH_LIBS([cusparseSetMatIndexBase], [cusparse])
 
 # Checks for typedefs, structures, and compiler characteristics.
 AC_CHECK_HEADER_STDBOOL
@@ -78,10 +65,26 @@ then
 fi
 AC_DEFINE([HAVE_CUDA], [1], [Define to 1 if you have CUDA support enabled.])
 
-if test "BUILD_PROF" = "true"
-then
-	NVCCFLAGS+=" --compiler-options ${gprof_flags}"
-fi
+AC_SEARCH_LIBS([cublasDnrm2], [cublas])
+AC_SEARCH_LIBS([cublasDaxpy], [cublas])
+AC_SEARCH_LIBS([cublasDscal], [cublas])
+AC_SEARCH_LIBS([cublasDdot], [cublas])
+AC_SEARCH_LIBS([cudaThreadSynchronize], [cudart])
+AC_SEARCH_LIBS([cudaGetLastError], [cudart])
+AC_CHECK_LIB([cudart], [cudaMalloc])
+AC_SEARCH_LIBS([cusparseCreateMatDescr], [cusparse])
+AC_SEARCH_LIBS([cusparseSetMatType], [cusparse])
+AC_SEARCH_LIBS([cusparseSetMatIndexBase], [cusparse])
+
+AC_SEARCH_LIBS([cublasDnrm2], [cublas],
+	[CUBLAS_FOUND_LIBS="yes"], [CUBLAS_FOUND_LIBS="no"], [-lcublas])
+AS_IF([test "x${CUBLAS_FOUND_LIBS}" != "xyes"],
+	[AC_MSG_ERROR([Unable to find CUBLAS library.])])
+
+AC_SEARCH_LIBS([cusparseSetMatType], [cusparse],
+	[CUSPARSE_FOUND_LIBS="yes"], [CUSPARSE_FOUND_LIBS="no"], [-lcusparse])
+AS_IF([test "x${CUSPARSE_FOUND_LIBS}" != "xyes"],
+	[AC_MSG_ERROR([Unable to find CUSPARSE library.])])
 
 AC_CHECK_TYPES([cublasHandle_t], [], 
 	       [AC_MSG_FAILURE([cublasHandle_t type not found in cublas.h], [1])], [#include<cublas_v2.h>])
@@ -90,6 +93,12 @@ AC_CHECK_TYPES([cusparseHandle_t], [],
 AC_CHECK_TYPES([cusparseMatDescr_t], [], 
 	       [AC_MSG_FAILURE([cusparseMatDescr_t type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
 
+if test "BUILD_PROF" = "true"
+then
+	NVCCFLAGS+=" --compiler-options ${gprof_flags}"
+fi
+
+
 AC_CONFIG_FILES([Makefile])
 
 AC_OUTPUT
diff --git a/PuReMD-GPU/src/allocate.c b/PuReMD-GPU/src/allocate.c
index e24a67b5ba2aa1ddfb17858d5977b2ec57000411..65f0eb2a872673259d508f17fc0da43530a7426f 100644
--- a/PuReMD-GPU/src/allocate.c
+++ b/PuReMD-GPU/src/allocate.c
@@ -41,7 +41,7 @@ void Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs )
 }
 
 
-int Allocate_Matrix( sparse_matrix *H, int n, int m )
+HOST int Allocate_Matrix( sparse_matrix *H, int n, int m )
 {
     H->n = n;
     H->m = m;
diff --git a/PuReMD-GPU/src/allocate.h b/PuReMD-GPU/src/allocate.h
index 4ec4b541ec2f764cbbf583c4262b2793253a4e99..b03ed80b34f153b9929ccaa80bc5c27fbf6ce540 100644
--- a/PuReMD-GPU/src/allocate.h
+++ b/PuReMD-GPU/src/allocate.h
@@ -23,6 +23,11 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Reallocate( reax_system*, static_storage*, list**, int );
 
 int Allocate_Matrix( sparse_matrix*, int, int );
@@ -32,5 +37,9 @@ int Allocate_HBond_List( int, int, int*, int*, list* );
 
 int Allocate_Bond_List( int, int*, list* );
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/box.h b/PuReMD-GPU/src/box.h
index a2b6b1a9f1c4d68d1e4e94c6718aa0ddd46d915a..418aa6208a81fb05ee56ff09afc6ff76751f75c9 100644
--- a/PuReMD-GPU/src/box.h
+++ b/PuReMD-GPU/src/box.h
@@ -63,7 +63,7 @@ real Metric_Product( rvec, rvec, simulation_box* );
 
 void Print_Box_Information( simulation_box*, FILE* );
 
-HOST_DEVICE inline real Sq_Distance_on_T3( rvec x1, rvec x2, simulation_box* box, rvec r)
+static inline HOST_DEVICE real Sq_Distance_on_T3( rvec x1, rvec x2, simulation_box* box, rvec r)
 {
 
     real norm = 0.0;
@@ -97,7 +97,7 @@ HOST_DEVICE inline real Sq_Distance_on_T3( rvec x1, rvec x2, simulation_box* box
 }
 
 
-HOST_DEVICE inline void Inc_on_T3( rvec x, rvec dx, simulation_box *box )
+static inline HOST_DEVICE void Inc_on_T3( rvec x, rvec dx, simulation_box *box )
 {
     int i;
     real tmp;
diff --git a/PuReMD-GPU/src/cuda_QEq.cu b/PuReMD-GPU/src/cuda_QEq.cu
index f11271eecd8112d33d53451c62e02e586e1dbf38..033945338aa76aa4c02909f90d2ca3eb1dacad58 100644
--- a/PuReMD-GPU/src/cuda_QEq.cu
+++ b/PuReMD-GPU/src/cuda_QEq.cu
@@ -38,7 +38,7 @@
 #include "validation.h"
 
 
-GLOBAL void Cuda_Sort_Matrix_Rows ( sparse_matrix A )
+GLOBAL void Cuda_Sort_Matrix_Rows( sparse_matrix A )
 {
     int i;
     int si, ei;
@@ -54,7 +54,7 @@ GLOBAL void Cuda_Sort_Matrix_Rows ( sparse_matrix A )
 }
 
 
-GLOBAL void Cuda_Calculate_Droptol ( sparse_matrix p_A, real *droptol, real dtol )
+GLOBAL void Cuda_Calculate_Droptol( sparse_matrix p_A, real *droptol, real dtol )
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     int k, j, offset, x, diagonal;
@@ -78,7 +78,7 @@ GLOBAL void Cuda_Calculate_Droptol ( sparse_matrix p_A, real *droptol, real dtol
 }
 
 
-GLOBAL void Cuda_Calculate_Droptol_js ( sparse_matrix p_A, real *droptol, real dtol )
+GLOBAL void Cuda_Calculate_Droptol_js( sparse_matrix p_A, real *droptol, real dtol )
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     int k, j, offset, x, diagonal;
@@ -98,7 +98,7 @@ GLOBAL void Cuda_Calculate_Droptol_js ( sparse_matrix p_A, real *droptol, real d
 }
 
 
-GLOBAL void Cuda_Calculate_Droptol_diagonal ( sparse_matrix p_A, real *droptol, real dtol )
+GLOBAL void Cuda_Calculate_Droptol_diagonal( sparse_matrix p_A, real *droptol, real dtol )
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     int k, j, offset, x, diagonal;
@@ -118,7 +118,7 @@ GLOBAL void Cuda_Calculate_Droptol_diagonal ( sparse_matrix p_A, real *droptol,
 }
 
 
-GLOBAL void Cuda_Estimate_LU_Fill ( sparse_matrix p_A, real *droptol, int *fillin)
+GLOBAL void Cuda_Estimate_LU_Fill( sparse_matrix p_A, real *droptol, int *fillin )
 {
     int i, j, pj;
     real val;
@@ -402,7 +402,7 @@ void Cuda_Fill_U    ( sparse_matrix *A, real *droptol,
 */
 
 
-void Cuda_Init_MatVec(reax_system *system, control_params *control,
+void Cuda_Init_MatVec( reax_system *system, control_params *control,
         simulation_data *data, static_storage *workspace, list *far_nbrs )
 {
     int i, fillin;
@@ -416,8 +416,8 @@ void Cuda_Init_MatVec(reax_system *system, control_params *control,
     {
         Cuda_Sort_Matrix_Rows<<< BLOCKS, BLOCK_SIZE >>>
             ( dev_workspace->H );
-        cudaThreadSynchronize();
-        cudaCheckError();
+        cudaThreadSynchronize( );
+        cudaCheckError( );
 
 #ifdef __DEBUG_CUDA__
         fprintf (stderr, "Sorting done... \n");
@@ -425,8 +425,8 @@ void Cuda_Init_MatVec(reax_system *system, control_params *control,
 
         Cuda_Calculate_Droptol<<<BLOCKS, BLOCK_SIZE >>>
             ( dev_workspace->H, dev_workspace->droptol, control->droptol );
-        cudaThreadSynchronize();
-        cudaCheckError();
+        cudaThreadSynchronize( );
+        cudaCheckError( );
 
 #ifdef __DEBUG_CUDA__
         fprintf (stderr, "Droptol done... \n");
@@ -434,24 +434,24 @@ void Cuda_Init_MatVec(reax_system *system, control_params *control,
 
         if( dev_workspace->L.entries == NULL )
         {
-            cuda_memset ( spad, 0, 2 * INT_SIZE * system->N, RES_SCRATCH );
+            cuda_memset( spad, 0, 2 * INT_SIZE * system->N, RES_SCRATCH );
             Cuda_Estimate_LU_Fill <<< BLOCKS, BLOCK_SIZE >>>
                 ( dev_workspace->H, dev_workspace->droptol, spad );
-            cudaThreadSynchronize();
-            cudaCheckError();
+            cudaThreadSynchronize( );
+            cudaCheckError( );
 
             //Reduction for fill in 
-            Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, INT_SIZE * BLOCK_SIZE >>>  
+            Cuda_reduction_int<<<BLOCKS_POW_2, BLOCK_SIZE, INT_SIZE * BLOCK_SIZE >>>  
                 (spad, spad + system->N,  system->N);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
+            cudaThreadSynchronize( );
+            cudaCheckError( );
 
-            Cuda_reduction <<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>> 
+            Cuda_reduction_int<<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>> 
                 (spad + system->N, spad + system->N + BLOCKS_POW_2, BLOCKS_POW_2); 
-            cudaThreadSynchronize ();
-            cudaCheckError ();
+            cudaThreadSynchronize( );
+            cudaCheckError( );
 
-            copy_host_device (&fillin, spad + system->N + BLOCKS_POW_2, INT_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH );
+            copy_host_device( &fillin, spad + system->N + BLOCKS_POW_2, INT_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH );
             fillin += dev_workspace->H.n;
 
 #ifdef __DEBUG_CUDA__
@@ -497,47 +497,47 @@ void Cuda_Init_MatVec(reax_system *system, control_params *control,
         sparse_matrix t_H, t_L, t_U;
         real *t_droptol;
 
-        t_droptol = (real *) malloc (REAL_SIZE * system->N);
+        t_droptol = (real *) malloc( REAL_SIZE * system->N );
 
 #ifdef __DEBUG_CUDA__
-        fprintf (stderr, " Allocation temp matrices count %d entries %d \n", dev_workspace->H.n, dev_workspace->H.m );
+        fprintf( stderr, " Allocation temp matrices count %d entries %d \n", dev_workspace->H.n, dev_workspace->H.m );
 #endif
 
-        start = Get_Time ();
-        if(!Allocate_Matrix(&t_H, dev_workspace->H.n, dev_workspace->H.m))
+        start = Get_Time( );
+        if( !Allocate_Matrix(&t_H, dev_workspace->H.n, dev_workspace->H.m) )
         {
             fprintf(stderr, "No space for H matrix \n");
-            exit(0);
+            exit( 0 );
         }
-        if(!Allocate_Matrix(&t_L, far_nbrs->n, dev_workspace->L.m))
+        if( !Allocate_Matrix(&t_L, far_nbrs->n, dev_workspace->L.m) )
         {
-            fprintf(stderr, "No space for L matrix \n");
-            exit(0);
+            fprintf( stderr, "No space for L matrix \n" );
+            exit( 0 );
         }
-        if(!Allocate_Matrix(&t_U, far_nbrs->n, dev_workspace->U.m))
+        if( !Allocate_Matrix(&t_U, far_nbrs->n, dev_workspace->U.m) )
         {
-            fprintf(stderr, "No space for U matrix \n");
-            exit(0);
+            fprintf( stderr, "No space for U matrix \n" );
+            exit( 0 );
         }
 
-        copy_host_device ( t_H.start, dev_workspace->H.start, INT_SIZE *
+        copy_host_device( t_H.start, dev_workspace->H.start, INT_SIZE *
                 (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost,
                 RES_SPARSE_MATRIX_INDEX );
-        copy_host_device ( t_H.end, dev_workspace->H.end, INT_SIZE *
+        copy_host_device( t_H.end, dev_workspace->H.end, INT_SIZE *
                 (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost,
                 RES_SPARSE_MATRIX_INDEX );
-        copy_host_device ( t_H.entries, dev_workspace->H.entries,
+        copy_host_device( t_H.entries, dev_workspace->H.entries,
                 SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->H.m,
                 cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_ENTRY );
 
-        copy_host_device ( t_droptol, dev_workspace->droptol, REAL_SIZE *
+        copy_host_device( t_droptol, dev_workspace->droptol, REAL_SIZE *
                 system->N, cudaMemcpyDeviceToHost, RES_STORAGE_DROPTOL );
 
         //fprintf (stderr, " Done copying LUH .. \n");
-        Cuda_ICHOLT (&t_H, t_droptol, &t_L, &t_U);
+        Cuda_ICHOLT( &t_H, t_droptol, &t_L, &t_U );
 
-        Sync_Host_Device (&t_L, &t_U, cudaMemcpyHostToDevice);
-        end += Get_Timing_Info (start);
+        Sync_Host_Device_Mat( &t_L, &t_U, cudaMemcpyHostToDevice );
+        end += Get_Timing_Info( start );
 
         /*
            fprintf (stderr, "Done syncing .... \n");
@@ -552,7 +552,7 @@ void Cuda_Init_MatVec(reax_system *system, control_params *control,
          */
 
         //#ifdef __DEBUG_CUDA__
-        fprintf (stderr, "Done copying the L/U matrices to the device ---> %f \n", end);
+        fprintf( stderr, "Done copying the L/U matrices to the device ---> %f \n", end );
         //#endif
 
         //#ifdef __BUILD_DEBUG__
@@ -562,7 +562,7 @@ void Cuda_Init_MatVec(reax_system *system, control_params *control,
 }
 
 
-GLOBAL void Init_MatVec_Postprocess (static_storage p_workspace, int N )
+GLOBAL void Init_MatVec_Postprocess( static_storage p_workspace, int N )
 {
 
     static_storage *workspace = &p_workspace;
@@ -608,7 +608,7 @@ GLOBAL void Init_MatVec_Postprocess (static_storage p_workspace, int N )
 }
 
 
-GLOBAL void Cuda_Update_Atoms_q ( reax_atom *atoms, real *s, real u, real *t, int N)
+GLOBAL void Cuda_Update_Atoms_q( reax_atom *atoms, real *s, real u, real *t, int N )
 {
     int i = blockIdx.x*blockDim.x + threadIdx.x;
 
@@ -621,39 +621,39 @@ GLOBAL void Cuda_Update_Atoms_q ( reax_atom *atoms, real *s, real u, real *t, in
 }
 
 
-void Cuda_Calculate_Charges (reax_system *system, static_storage *workspace)
+void Cuda_Calculate_Charges( reax_system *system, static_storage *workspace )
 {
     real *spad = (real *) scratch;
     real u, s_sum, t_sum;
 
-    cuda_memset (spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH );
+    cuda_memset( spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH );
 
     //s_sum 
-    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>  
+    Cuda_reduction<<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>  
         (&dev_workspace->s [index_wkspace_sys (0, 0,system->N)], spad,  system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
+    Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
         (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); 
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-    copy_host_device (&s_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device( &s_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
 
     //t_sum
-    cuda_memset (spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH );
+    cuda_memset( spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH );
     Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>  
         (&dev_workspace->t [index_wkspace_sys (0, 0,system->N)], spad,  system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
+    Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
         (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); 
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-    copy_host_device (&t_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device( &t_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
 
     //fraction here
     u = s_sum / t_sum;
@@ -662,10 +662,10 @@ void Cuda_Calculate_Charges (reax_system *system, static_storage *workspace)
     fprintf (stderr, "DEVICE ---> s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u );
 #endif
 
-    Cuda_Update_Atoms_q <<< BLOCKS, BLOCK_SIZE >>>
+    Cuda_Update_Atoms_q<<< BLOCKS, BLOCK_SIZE >>>
         ( (reax_atom *)system->d_atoms, dev_workspace->s, u, dev_workspace->t, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 }
 
 
@@ -677,33 +677,34 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data *da
     real t_start, t_elapsed;
 
 #ifdef __DEBUG_CUDA__
-    t_start = Get_Time ();
+    t_start = Get_Time( );
 #endif
 
     /*
     //Cuda_Init_MatVec( system, control, data, workspace, far_nbrs );
 
-    Cuda_Sort_Matrix_Rows <<< BLOCKS, BLOCK_SIZE >>>
+    Cuda_Sort_Matrix_Rows<<< BLOCKS, BLOCK_SIZE >>>
     ( dev_workspace->H );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize();
+    cudaCheckError();
 
     t_elapsed = Get_Timing_Info (t_start);
     fprintf (stderr, "Sorting done...tming --> %f \n", t_elapsed);
      */
-    Init_MatVec_Postprocess <<< BLOCKS, BLOCK_SIZE >>>
+
+    Init_MatVec_Postprocess<<< BLOCKS, BLOCK_SIZE >>>
         (*dev_workspace, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
 #ifdef __DEBUG_CUDA__
-    t_elapsed = Get_Timing_Info (t_start);
-    fprintf (stderr, "Done with post processing of init_matvec --> %d  with time ---> %f \n", cudaGetLastError (), t_elapsed);
+    t_elapsed = Get_Timing_Info( t_start );
+    fprintf( stderr, "Done with post processing of init_matvec --> %d  with time ---> %f \n", cudaGetLastError (), t_elapsed );
 #endif
 
     //Here goes the GMRES part of the program ()
     //#ifdef __DEBUG_CUDA__
-    t_start = Get_Time ();
+    t_start = Get_Time( );
     //#endif
 
     //matvecs = Cuda_GMRES( dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s );
@@ -715,10 +716,9 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data *da
     d_timing.matvecs += matvecs;
 
 #ifdef __DEBUG_CUDA__
-    t_elapsed = Get_Timing_Info ( t_start );
-    fprintf (stderr, " Cuda_GMRES done with iterations %d with timing ---> %f \n", matvecs, t_elapsed );
+    t_elapsed = Get_Timing_Info( t_start );
+    fprintf( stderr, " Cuda_GMRES done with iterations %d with timing ---> %f \n", matvecs, t_elapsed );
 #endif
 
-    //Here cuda calculate charges
-    Cuda_Calculate_Charges (system, workspace);
+    Cuda_Calculate_Charges( system, workspace );
 }
diff --git a/PuReMD-GPU/src/cuda_QEq.h b/PuReMD-GPU/src/cuda_QEq.h
index 2215510c07edc4d5b3497ec30b5be1a73aa2d411..f62ab1157e3812837cd3a8a65e07856a78b22bf2 100644
--- a/PuReMD-GPU/src/cuda_QEq.h
+++ b/PuReMD-GPU/src/cuda_QEq.h
@@ -24,8 +24,16 @@
 #include "mytypes.h"
 
 
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Cuda_QEq( reax_system*, control_params*, simulation_data*, static_storage*,
         list*, output_controls* );
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_allocate.cu b/PuReMD-GPU/src/cuda_allocate.cu
index fb9795c5f198b5de4ec3029dfd485710f8d491aa..d0e1f22b6f970038820f82c504dc3472d50d1d7a 100644
--- a/PuReMD-GPU/src/cuda_allocate.cu
+++ b/PuReMD-GPU/src/cuda_allocate.cu
@@ -302,7 +302,7 @@ cuda_memset (d_bond_top, 0, (n+BLOCKS_POW_2+1) * INT_SIZE, RES_SCRATCH );
  cudaThreadSynchronize ();
  cudaCheckError ();
 
- Cuda_reduction <<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>> 
+ Cuda_reduction_int<<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>> 
  (d_bond_top + n, d_bond_top + n + BLOCKS_POW_2, BLOCKS_POW_2); 
  cudaThreadSynchronize ();
 
diff --git a/PuReMD-GPU/src/cuda_allocate.h b/PuReMD-GPU/src/cuda_allocate.h
index 1de435a0f70a3cca0f3a39d63d3ddccf685774a1..dc672d3bd76c1afa7a468aa2fdda3dd0ca3d3ec9 100644
--- a/PuReMD-GPU/src/cuda_allocate.h
+++ b/PuReMD-GPU/src/cuda_allocate.h
@@ -23,9 +23,19 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 int Cuda_Allocate_Matrix( sparse_matrix*, int, int );
 int Cuda_Allocate_HBond_List( int, int, int*, int*, list* );
 int Cuda_Allocate_Bond_List( int, int*, list* );
 void Cuda_Reallocate( reax_system*, static_storage*, list*, int, int );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PuReMD-GPU/src/cuda_bond_orders.h b/PuReMD-GPU/src/cuda_bond_orders.h
index bee988ff209563097d0c511e36dfbe41028990c0..4015b9fa34aa1e66843efc8dc1a49f1c75da749c 100644
--- a/PuReMD-GPU/src/cuda_bond_orders.h
+++ b/PuReMD-GPU/src/cuda_bond_orders.h
@@ -23,6 +23,11 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 GLOBAL void Cuda_Calculate_Bond_Orders_Init (  reax_atom *, global_parameters , single_body_parameters *,
         static_storage , int , int );
 GLOBAL void Cuda_Calculate_Bond_Orders ( reax_atom *, global_parameters , single_body_parameters *,
@@ -35,4 +40,9 @@ GLOBAL void Cuda_Compute_Total_Force_PostProcess (reax_atom *, simulation_data *
 //HOST_DEVICE void Cuda_Add_dBond_to_Forces( int, int, reax_atom *, static_storage*, list* );
 //HOST_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int, int, reax_atom *, simulation_data*, static_storage*, list* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PuReMD-GPU/src/cuda_box.h b/PuReMD-GPU/src/cuda_box.h
index 7186506fc64dd2e5c7535af1211b489dc02522f9..913abc15867cf5ec9d8e87dad0fa56198de9b1ff 100644
--- a/PuReMD-GPU/src/cuda_box.h
+++ b/PuReMD-GPU/src/cuda_box.h
@@ -23,7 +23,9 @@
 
 #include "mytypes.h"
 
+
 GLOBAL void k_compute_Inc_on_T3 (reax_atom *atoms, unsigned int N,
     simulation_box *box, real d1, real d2, real d3);
 
+
 #endif
diff --git a/PuReMD-GPU/src/cuda_center_mass.cu b/PuReMD-GPU/src/cuda_center_mass.cu
index a82c0b0303aec6f5878df03d9493936604ae9aaa..158d3a16489f20362bd854312eb39aa0dcec57e8 100644
--- a/PuReMD-GPU/src/cuda_center_mass.cu
+++ b/PuReMD-GPU/src/cuda_center_mass.cu
@@ -23,11 +23,8 @@
 #include "vector.h"
 
 
-GLOBAL void k_center_of_mass_blocks (single_body_parameters *sbp, reax_atom *atoms,
-        rvec *res_xcm, 
-        rvec *res_vcm, 
-        rvec *res_amcm, 
-        size_t n)
+GLOBAL void k_center_of_mass_blocks( single_body_parameters *sbp, reax_atom *atoms,
+        rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n )
 {
     extern __shared__ rvec xcm[];
     extern __shared__ rvec vcm[];
@@ -76,13 +73,8 @@ GLOBAL void k_center_of_mass_blocks (single_body_parameters *sbp, reax_atom *ato
 }
 
 
-GLOBAL void k_center_of_mass (rvec *xcm, 
-        rvec *vcm, 
-        rvec *amcm, 
-        rvec *res_xcm,
-        rvec *res_vcm,
-        rvec *res_amcm,
-        size_t n)
+GLOBAL void k_center_of_mass( rvec *xcm, rvec *vcm, rvec *amcm,
+        rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n )
 {
     extern __shared__ rvec sh_xcm[];
     extern __shared__ rvec sh_vcm[];
@@ -132,11 +124,8 @@ GLOBAL void k_center_of_mass (rvec *xcm,
 }
 
 
-GLOBAL void k_compute_center_mass (single_body_parameters *sbp, 
-        reax_atom *atoms,
-        real *results, 
-        real xcm0, real xcm1, real xcm2,
-        size_t n)
+GLOBAL void k_compute_center_mass_sbp( single_body_parameters *sbp, reax_atom *atoms,
+        real *results, real xcm0, real xcm1, real xcm2, size_t n )
 {
     extern __shared__ real xx[];
     extern __shared__ real xy[];
@@ -161,11 +150,11 @@ GLOBAL void k_compute_center_mass (single_body_parameters *sbp,
     xcm[1] = xcm1;
     xcm[2] = xcm2;
 
-
     xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
         yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
 
-    if (i < n){
+    if (i < n)
+    {
         m = sbp[ atoms[i].type ].mass;
         rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
         xx[ xx_i ] = diff[0] * diff[0] * m;
@@ -177,8 +166,10 @@ GLOBAL void k_compute_center_mass (single_body_parameters *sbp,
     }
     __syncthreads ();
 
-    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset){
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if (threadIdx.x < offset) 
+        {
             index = threadIdx.x + offset;
             xx[ threadIdx.x ] += xx[ index ];
             xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ];
@@ -190,7 +181,8 @@ GLOBAL void k_compute_center_mass (single_body_parameters *sbp,
         __syncthreads ();
     }
 
-    if (threadIdx.x == 0) {
+    if (threadIdx.x == 0)
+    {
         results [ blockIdx.x*6 ] = xx [ 0 ];
         results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ];
         results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ];
@@ -201,7 +193,7 @@ GLOBAL void k_compute_center_mass (single_body_parameters *sbp,
 }
 
 
-GLOBAL void k_compute_center_mass (real *input, real *output, size_t n)
+GLOBAL void k_compute_center_mass( real *input, real *output, size_t n )
 {
     extern __shared__ real xx[];
     extern __shared__ real xy[];
diff --git a/PuReMD-GPU/src/cuda_center_mass.h b/PuReMD-GPU/src/cuda_center_mass.h
index 887377340eacca0479fffc352499a456dba5a177..0c1d76ec63defe3cb6a0738a2843b10d02104bb5 100644
--- a/PuReMD-GPU/src/cuda_center_mass.h
+++ b/PuReMD-GPU/src/cuda_center_mass.h
@@ -23,12 +23,22 @@
 
 #include "mytypes.h"
 
-GLOBAL void k_center_of_mass_blocks (single_body_parameters *, reax_atom *,
-    rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n); 
-GLOBAL void k_center_of_mass (rvec *xcm,
-    rvec *vcm, rvec *amcm, rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n);
-GLOBAL void k_compute_center_mass (single_body_parameters *sbp,
-    reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2, size_t n);
-GLOBAL void k_compute_center_mass (real *input, real *output, size_t n);
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
+GLOBAL void k_center_of_mass_blocks( single_body_parameters *, reax_atom *,
+    rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n ); 
+GLOBAL void k_center_of_mass( rvec *xcm,
+    rvec *vcm, rvec *amcm, rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n );
+GLOBAL void k_compute_center_mass_sbp( single_body_parameters *sbp,
+    reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2, size_t n );
+GLOBAL void k_compute_center_mass( real *input, real *output, size_t n );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_copy.cu b/PuReMD-GPU/src/cuda_copy.cu
index 88ad09502bcd26d2b6b6d53103513c5cd73f7f9d..1f50dbf3c74fae4d9ab73149d02c51dbec8ca10e 100644
--- a/PuReMD-GPU/src/cuda_copy.cu
+++ b/PuReMD-GPU/src/cuda_copy.cu
@@ -25,7 +25,7 @@
 #include "vector.h"
 
 
-void Sync_Host_Device( grid *host, grid *dev, enum cudaMemcpyKind dir )
+void Sync_Host_Device_Grid( grid *host, grid *dev, enum cudaMemcpyKind dir )
 {
     copy_host_device( host->top, dev->top, 
             INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_TOP );
@@ -50,7 +50,7 @@ void Sync_Host_Device( grid *host, grid *dev, enum cudaMemcpyKind dir )
 }
 
 
-void Sync_Host_Device( reax_system *sys, enum cudaMemcpyKind dir )
+void Sync_Host_Device_Sys( reax_system *sys, enum cudaMemcpyKind dir )
 {
 
     copy_host_device( sys->atoms, sys->d_atoms, 
@@ -78,13 +78,13 @@ void Sync_Host_Device( reax_system *sys, enum cudaMemcpyKind dir )
 }
 
 
-void Sync_Host_Device( simulation_data *host, simulation_data *dev, enum cudaMemcpyKind dir )
+void Sync_Host_Device_Data( simulation_data *host, simulation_data *dev, enum cudaMemcpyKind dir )
 {
     copy_host_device( host, dev, SIMULATION_DATA_SIZE, dir, RES_SIMULATION_DATA );
 }
 
 
-void Sync_Host_Device( sparse_matrix *L, sparse_matrix *U, enum cudaMemcpyKind dir )
+void Sync_Host_Device_Mat( sparse_matrix *L, sparse_matrix *U, enum cudaMemcpyKind dir )
 {
     copy_host_device( L->start, dev_workspace->L.start, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
     copy_host_device( L->end, dev_workspace->L.end, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
@@ -96,12 +96,12 @@ void Sync_Host_Device( sparse_matrix *L, sparse_matrix *U, enum cudaMemcpyKind d
 }
 
 
-void Sync_Host_Device( output_controls *, control_params *, enum cudaMemcpyKind )
+void Sync_Host_Device_Control( output_controls *, control_params *, enum cudaMemcpyKind )
 {
 }
 
 
-void Sync_Host_Device( control_params *host, control_params *device, enum cudaMemcpyKind )
+void Sync_Host_Device_Params( control_params *host, control_params *device, enum cudaMemcpyKind )
 {
     copy_host_device( host, device, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS );
 }
@@ -117,7 +117,7 @@ void Prep_Device_For_Output( reax_system *system, simulation_data *data )
     //fprintf (stderr, "size to copy --> %d \n", size );
     //copy_host_device (data, (simulation_data *)data->d_simulation_data, size, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
 
-    //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyDeviceToHost );
+    //Sync_Host_Device_Data( data, (simulation_data *)data->d_simulation_data, cudaMemcpyDeviceToHost );
     /*
        copy_host_device (&data->E_BE, &((simulation_data *)data->d_simulation_data)->E_BE, 
        REAL_SIZE * 13, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
@@ -151,16 +151,18 @@ void Prep_Device_For_Output( reax_system *system, simulation_data *data )
     data->kin_press =  local_data.kin_press;
     data->therm.T = local_data.therm.T;
 
-    //Sync_Host_Device (&system.g, &system.d_g, cudaMemcpyDeviceToHost );
-    Sync_Host_Device( system, cudaMemcpyDeviceToHost );
+    //Sync_Host_Device_Sys( &system.g, &system.d_g, cudaMemcpyDeviceToHost );
+    Sync_Host_Device_Sys( system, cudaMemcpyDeviceToHost );
 }
 
 
-void Sync_Host_Device( list *host, list *device, int type )
+void Sync_Host_Device_List( list *host, list *device, int type )
 {
     //list is already allocated -- discard it first
     if (host->n > 0)
+    {
         Cuda_Delete_List( host );
+    }
 
     //memory is allocated on the host
     Cuda_Make_List( device->n, device->num_intrs, type, host );
diff --git a/PuReMD-GPU/src/cuda_copy.h b/PuReMD-GPU/src/cuda_copy.h
index 6f2509e8318c391bd1af6c9eb5eeca8354016029..6b6f38ae3d12d7d00d8cd38eef996d61cb765bd5 100644
--- a/PuReMD-GPU/src/cuda_copy.h
+++ b/PuReMD-GPU/src/cuda_copy.h
@@ -18,8 +18,6 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-
-
 #ifndef __CUDA_COPY_H_
 #define __CUDA_COPY_H_
 
@@ -28,14 +26,24 @@
 #include "mytypes.h"
 #include "list.h"
 
-void Sync_Host_Device (grid *, grid *, enum cudaMemcpyKind);
-void Sync_Host_Device (reax_system *, enum cudaMemcpyKind);
-void Sync_Host_Device (control_params *, control_params *, enum cudaMemcpyKind);
-void Sync_Host_Device (simulation_data *, simulation_data *, enum cudaMemcpyKind);
-void Sync_Host_Device (sparse_matrix *, sparse_matrix *, enum cudaMemcpyKind);
-void Sync_Host_Device (output_controls *, enum cudaMemcpyKind);
 
-void Prep_Device_For_Output (reax_system *, simulation_data *);
-void Sync_Host_Device (list *host, list *device, int type);
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
+void Sync_Host_Device_Grid( grid *, grid *, enum cudaMemcpyKind );
+void Sync_Host_Device_Sys( reax_system *, enum cudaMemcpyKind );
+void Sync_Host_Device_Params( control_params *, control_params *, enum cudaMemcpyKind );
+void Sync_Host_Device_Data( simulation_data *, simulation_data *, enum cudaMemcpyKind );
+void Sync_Host_Device_Mat( sparse_matrix *, sparse_matrix *, enum cudaMemcpyKind );
+void Sync_Host_Device_Control( output_controls *, enum cudaMemcpyKind );
+
+void Prep_Device_For_Output( reax_system *, simulation_data * );
+void Sync_Host_Device_List( list *host, list *device, int type );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_environment.cu b/PuReMD-GPU/src/cuda_environment.cu
index 05eaf07df6faf5b9f9e561273247ab8b9db519b6..cd6ae50d82b4716aef1d16f2f957dc5f4976cca7 100644
--- a/PuReMD-GPU/src/cuda_environment.cu
+++ b/PuReMD-GPU/src/cuda_environment.cu
@@ -23,20 +23,18 @@
 #include "cuda_utils.h"
 
 
-extern "C" void Setup_Cuda_Environment( int rank, int nprocs, int gpus_per_node )
+void Setup_Cuda_Environment( int rank, int nprocs, int gpus_per_node )
 {
 
-    int deviceCount;
+    int deviceCount = 0;
     cudaError_t flag;
-    cublasStatus_t cublasStatus;
     cublasHandle_t cublasHandle;
     cusparseHandle_t cusparseHandle;
-    cusparseStatus_t cusparseStatus;
     cusparseMatDescr_t matdescriptor;
     
     flag = cudaGetDeviceCount( &deviceCount );
 
-    if ( flag != cudaSuccess )
+    if ( flag != cudaSuccess || deviceCount < 1 )
     {
         fprintf( stderr, "ERROR: no CUDA capable device(s) found. Terminating...\n" );
         exit( 1 );
@@ -45,28 +43,28 @@ extern "C" void Setup_Cuda_Environment( int rank, int nprocs, int gpus_per_node
     //Calculate the # of GPUs per processor
     //and assign the GPU for each process
     //TODO: handle condition where # CPU procs > # GPUs
-    cudaSetDevice( (rank % (deviceCount)) );
+    cudaSetDevice( rank % deviceCount );
 
 #if defined(__CUDA_DEBUG__)
-    fprintf( stderr, "p:%d is using GPU: %d \n", rank, (rank % deviceCount));
+    fprintf( stderr, "p:%d is using GPU: %d \n", rank, rank % deviceCount );
 #endif
 
     //CHANGE ORIGINAL
     //cudaDeviceSetLimit( cudaLimitStackSize, 8192 );
     //cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 );
-    //cudaCheckError();
+    //cudaCheckError( );
 
-    cublasCheckError( cublasStatus = cublasCreate(&cublasHandle) );
+    cublasCheckError( cublasCreate(&cublasHandle) );
 
-    cusparseCheckError( cusparseStatus = cusparseCreate (&cusparseHandle) );
-    cusparseCheckError( cusparseCreateMatDescr (&matdescriptor) );
+    cusparseCheckError( cusparseCreate(&cusparseHandle) );
+    cusparseCheckError( cusparseCreateMatDescr(&matdescriptor) );
     cusparseSetMatType( matdescriptor, CUSPARSE_MATRIX_TYPE_GENERAL );
     cusparseSetMatIndexBase( matdescriptor, CUSPARSE_INDEX_BASE_ZERO );
 
 }
 
 
-extern "C" void Cleanup_Cuda_Environment( )
+void Cleanup_Cuda_Environment( )
 {
     cudaDeviceReset( );
     cudaDeviceSynchronize( );
diff --git a/PuReMD-GPU/src/cuda_environment.h b/PuReMD-GPU/src/cuda_environment.h
index 3d7987193df15d92c9c1c9fab71aad36a3e5a6e5..61f811db3354628a3067a3423442f875f16e6871 100644
--- a/PuReMD-GPU/src/cuda_environment.h
+++ b/PuReMD-GPU/src/cuda_environment.h
@@ -28,8 +28,8 @@
 extern "C"  {
 #endif
 
-void Setup_Cuda_Environment(int, int, int);
-void Cleanup_Cuda_Environment();
+void Setup_Cuda_Environment( int, int, int );
+void Cleanup_Cuda_Environment( );
 
 #ifdef __cplusplus
 }
diff --git a/PuReMD-GPU/src/cuda_forces.cu b/PuReMD-GPU/src/cuda_forces.cu
index 72e8fbe2dd6559062999e82635ae8a3168a27f40..bf277b391ce0df0c5336ea0a0653b6863ca14fec 100644
--- a/PuReMD-GPU/src/cuda_forces.cu
+++ b/PuReMD-GPU/src/cuda_forces.cu
@@ -59,35 +59,35 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     fprintf (stderr, " Begin Bonded Forces ... %d x %d\n", BLOCKS, BLOCK_SIZE);
 #endif
 
-    Cuda_Calculate_Bond_Orders_Init <<< BLOCKS, BLOCK_SIZE >>>
+    Cuda_Calculate_Bond_Orders_Init<<< BLOCKS, BLOCK_SIZE >>>
         (  system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp,
            *dev_workspace, system->reaxprm.num_atom_types, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-    Cuda_Calculate_Bond_Orders <<< BLOCKS, BLOCK_SIZE >>>
+    Cuda_Calculate_Bond_Orders<<< BLOCKS, BLOCK_SIZE >>>
         ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, 
           system->reaxprm.d_tbp, *dev_workspace, 
           *(dev_lists + BONDS), *(dev_lists + DDELTA), *(dev_lists + DBO), 
           system->reaxprm.num_atom_types, system->N );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-    Cuda_Update_Uncorrected_BO <<<BLOCKS, BLOCK_SIZE>>>
+    Cuda_Update_Uncorrected_BO<<<BLOCKS, BLOCK_SIZE>>>
         (*dev_workspace, *(dev_lists + BONDS), system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-    Cuda_Update_Workspace_After_Bond_Orders <<<BLOCKS, BLOCK_SIZE>>>
+    Cuda_Update_Workspace_After_Bond_Orders<<<BLOCKS, BLOCK_SIZE>>>
         (system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, 
          *dev_workspace, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
 #ifdef __DEBUG_CUDA__
     t_elapsed = Get_Timing_Info( t_start );
-    fprintf (stderr, "Bond Orders... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
-    fprintf (stderr, "Cuda_Calculate_Bond_Orders Done... \n");
+    fprintf( stderr, "Bond Orders... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
+    fprintf( stderr, "Cuda_Calculate_Bond_Orders Done... \n" );
 #endif
 
     //Step 2.
@@ -197,7 +197,7 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
 #endif
 
     cuda_memset(spad, 0, (dev_lists + BONDS)->num_intrs * sizeof (int), RES_SCRATCH);
-    Three_Body_Estimate <<<BLOCKS, BLOCK_SIZE>>>
+    k_Three_Body_Estimate<<<BLOCKS, BLOCK_SIZE>>>
         (system->d_atoms, 
          (control_params *)control->d_control, 
          *(dev_lists + BONDS),
@@ -269,7 +269,7 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
 
     cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH );
 
-    Three_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>>
+    k_Three_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>>
         ( system->d_atoms,
           system->reaxprm.d_sbp, system->reaxprm.d_thbp, system->reaxprm.d_gp, 
           (control_params *)control->d_control,
@@ -314,41 +314,38 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     cudaThreadSynchronize ();
     cudaCheckError ();
 
-    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+    Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
         (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Coa, BLOCKS_POW_2);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
     //Reduction for ext_pres
     rvec_spad = (rvec *) (spad + 6*system->N);
-    Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> 
+    Cuda_reduction_rvec<<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> 
         (rvec_spad, rvec_spad + system->N,  system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-    Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> 
+    Cuda_reduction_rvec<<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> 
         (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
     real t_1, t_2;
-    t_1 = Get_Time ();
+    t_1 = Get_Time( );
     //Sum up the f vector for each atom and collect the CdDelta from all the bonds
-    Three_Body_Interactions_results <<< BLOCKS, BLOCK_SIZE >>>
-        (     system->d_atoms,
-            (control_params *)control->d_control,
-            *dev_workspace, 
-            *(dev_lists + BONDS), 
-            system->N );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-    t_2 = Get_Timing_Info (t_1);
+    k_Three_Body_Interactions_results <<< BLOCKS, BLOCK_SIZE >>>
+        (system->d_atoms, (control_params *)control->d_control,
+            *dev_workspace, *(dev_lists + BONDS), system->N );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    t_2 = Get_Timing_Info( t_1 );
 
 #ifdef __DEBUG_CUDA__
     t_elapsed = Get_Timing_Info( t_start );
-    fprintf (stderr, "Three_Body_Interactions post process Timing %lf \n", t_2);
-    fprintf (stderr, "Three_Body_Interactions ...  Timing %lf \n", t_elapsed );
-    fprintf (stderr, "Three_Body_Interactions Done... \n");
+    fprintf( stderr, "Three_Body_Interactions post process Timing %lf \n", t_2 );
+    fprintf( stderr, "Three_Body_Interactions ...  Timing %lf \n", t_elapsed );
+    fprintf( stderr, "Three_Body_Interactions Done... \n" );
 #endif
 
     //Step 5.
@@ -356,21 +353,16 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     t_start = Get_Time( );
 #endif
 
-    cuda_memset (spad, 0, ( 4 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH );
-    Four_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>>
-        //Four_Body_Interactions <<< system->N, 32, 32*( 2*REAL_SIZE + RVEC_SIZE)>>>
-        ( system->d_atoms,
-          system->reaxprm.d_gp,
-          system->reaxprm.d_fbp,
-          (control_params *)control->d_control,
-          *(dev_lists + BONDS), *(dev_lists + THREE_BODIES),
-          (simulation_box *)system->d_box,
-          (simulation_data *)data->d_simulation_data,
-          *dev_workspace,
-          system->N, system->reaxprm.num_atom_types, 
-          spad, spad + 2*system->N, (rvec *) (spad + 4*system->N));
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cuda_memset( spad, 0, ( 4 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH );
+    //k_Four_Body_Interactions<<< system->N, 32, 32*( 2*REAL_SIZE + RVEC_SIZE)>>>
+    k_Four_Body_Interactions<<< BLOCKS, BLOCK_SIZE >>>
+        ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_fbp,
+          (control_params *)control->d_control, *(dev_lists + BONDS), *(dev_lists + THREE_BODIES),
+          (simulation_box *)system->d_box, (simulation_data *)data->d_simulation_data,
+          *dev_workspace, system->N, system->reaxprm.num_atom_types, 
+          spad, spad + 2*system->N, (rvec *) (spad + 4*system->N) );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
     //Reduction for E_Tor
     Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
@@ -407,13 +399,11 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     cudaCheckError ();
 
     //Post process here
-    Four_Body_Postprocess     <<< BLOCKS, BLOCK_SIZE >>>
-        (     system->d_atoms,
-            *dev_workspace,
-            *(dev_lists + BONDS),
+    k_Four_Body_Postprocess<<< BLOCKS, BLOCK_SIZE >>>
+        ( system->d_atoms, *dev_workspace, *(dev_lists + BONDS),
             system->N );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
 #ifdef __DEBUG_CUDA__
     t_elapsed = Get_Timing_Info( t_start );
@@ -429,7 +419,7 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         cuda_memset (spad, 0, ( 2 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2 ), RES_SCRATCH );
 
         /*
-           Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE *( REAL_SIZE + RVEC_SIZE) >>>
+           k_Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE *( REAL_SIZE + RVEC_SIZE) >>>
            (  system->d_atoms, 
            system->reaxprm.d_sbp,
            system->reaxprm.d_hbp,
@@ -450,7 +440,7 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
 
         int hbs = (system->N * HBONDS_THREADS_PER_ATOM/ HBONDS_BLOCK_SIZE) + 
             (((system->N * HBONDS_THREADS_PER_ATOM) % HBONDS_BLOCK_SIZE) == 0 ? 0 : 1);
-        Hydrogen_Bonds_HB <<< hbs, HBONDS_BLOCK_SIZE, HBONDS_BLOCK_SIZE * ( 2 * REAL_SIZE + 2 * RVEC_SIZE )  >>>
+        k_Hydrogen_Bonds_HB <<< hbs, HBONDS_BLOCK_SIZE, HBONDS_BLOCK_SIZE * ( 2 * REAL_SIZE + 2 * RVEC_SIZE )  >>>
             (  system->d_atoms, 
                system->reaxprm.d_sbp,
                system->reaxprm.d_hbp,
@@ -498,7 +488,7 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         t_1 = Get_Time ();
 #endif
 
-        Hydrogen_Bonds_Postprocess <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE * RVEC_SIZE >>>
+        k_Hydrogen_Bonds_Postprocess <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE * RVEC_SIZE >>>
             (     system->d_atoms, 
                 system->reaxprm.d_sbp, 
                 *dev_workspace, 
@@ -516,8 +506,8 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         t_1 = Get_Time ();
 #endif
 
-        //Hydrogen_Bonds_Far_Nbrs <<< system->N, 32, 32 * RVEC_SIZE>>>
-        Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * RVEC_SIZE>>>
+        //k_Hydrogen_Bonds_Far_Nbrs <<< system->N, 32, 32 * RVEC_SIZE>>>
+        k_Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * RVEC_SIZE>>>
             (     system->d_atoms, 
                 system->reaxprm.d_sbp, 
                 *dev_workspace, 
diff --git a/PuReMD-GPU/src/cuda_forces.h b/PuReMD-GPU/src/cuda_forces.h
index 3b26116fad2c15a3dd57cc68caad62c402d95ab1..b017e63ebeee45c03d8926a79f1a9a0dd7a4771d 100644
--- a/PuReMD-GPU/src/cuda_forces.h
+++ b/PuReMD-GPU/src/cuda_forces.h
@@ -24,6 +24,10 @@
 #include "mytypes.h"
 
 
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 GLOBAL void k_Estimate_Sparse_Matrix_Entries ( reax_atom *, control_params *, 
         simulation_data *, simulation_box *, list, int, int * );
 
@@ -36,5 +40,9 @@ void Cuda_Threebody_List( reax_system *, static_storage *, list *, int );
 
 int validate_device (reax_system *, simulation_data *, static_storage *, list **);
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_four_body_interactions.cu b/PuReMD-GPU/src/cuda_four_body_interactions.cu
index 8d8100608b7c067ba0ade906d744277f2aaead32..60d9973482ddd61a34d874301f1ed360443625b9 100644
--- a/PuReMD-GPU/src/cuda_four_body_interactions.cu
+++ b/PuReMD-GPU/src/cuda_four_body_interactions.cu
@@ -160,7 +160,7 @@ DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
 }
 
 
-GLOBAL void Four_Body_Interactions ( reax_atom *atoms, 
+GLOBAL void k_Four_Body_Interactions ( reax_atom *atoms, 
         global_parameters g_params,
         four_body_header *d_fbp,
         control_params *control,
@@ -804,7 +804,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
 }
 
 
-GLOBAL void Four_Body_Postprocess ( reax_atom *atoms, 
+GLOBAL void k_Four_Body_Postprocess( reax_atom *atoms, 
         static_storage p_workspace, 
         list p_bonds, int N )
 {
diff --git a/PuReMD-GPU/src/cuda_four_body_interactions.h b/PuReMD-GPU/src/cuda_four_body_interactions.h
index 5e6cf35176e90c3e9a088162a33b7a776a7fb1f7..088e24f4f15a1e83405b83427ba56d1e5d4e67ba 100644
--- a/PuReMD-GPU/src/cuda_four_body_interactions.h
+++ b/PuReMD-GPU/src/cuda_four_body_interactions.h
@@ -23,10 +23,20 @@
 
 #include "mytypes.h"
 
-GLOBAL void Four_Body_Interactions ( reax_atom *, global_parameters ,
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
+GLOBAL void k_Four_Body_Interactions( reax_atom *, global_parameters ,
     four_body_header *, control_params *, list , list , simulation_box *,
-    simulation_data *, static_storage , int , int , real *, real *, rvec *);
+    simulation_data *, static_storage , int , int , real *, real *, rvec * );
+
+GLOBAL void k_Four_Body_Postprocess( reax_atom *, static_storage, list , int );
+
+#ifdef __cplusplus
+}
+#endif
 
-GLOBAL void Four_Body_Postprocess (reax_atom *, static_storage, list , int );
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_grid.h b/PuReMD-GPU/src/cuda_grid.h
index ca2edf1718f57625b1cce68b4ab58180f11efbd4..28a20797a56bda9682354f967634eeb6301e44ae 100644
--- a/PuReMD-GPU/src/cuda_grid.h
+++ b/PuReMD-GPU/src/cuda_grid.h
@@ -23,7 +23,17 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Cuda_Bin_Atoms( reax_system*, static_storage* );
 void Cuda_Bin_Atoms_Sync (reax_system *);
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PuReMD-GPU/src/cuda_helpers.h b/PuReMD-GPU/src/cuda_helpers.h
index 292d561311a455287f9c107e4cef61a6ea61fa45..e306c673f8aec585bbda7b92250e2f0ec91eb2e3 100644
--- a/PuReMD-GPU/src/cuda_helpers.h
+++ b/PuReMD-GPU/src/cuda_helpers.h
@@ -25,7 +25,11 @@
 #include "mytypes.h"
 
 
-DEVICE inline int cuda_strcmp(char *a, char *b, int len)
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
+static inline DEVICE int cuda_strcmp(char *a, char *b, int len)
 {
     char *src, *dst;
 
@@ -52,7 +56,7 @@ DEVICE inline int cuda_strcmp(char *a, char *b, int len)
 }
 
 
-DEVICE inline real myAtomicAdd(double* address, double val)
+static inline DEVICE double myAtomicAdd(double* address, double val)
 {
     unsigned long long int* address_as_ull =
         (unsigned long long int*)address;
@@ -61,14 +65,14 @@ DEVICE inline real myAtomicAdd(double* address, double val)
     {
         assumed = old;
         old = atomicCAS(address_as_ull, assumed,
-                        __double_as_longlong(val + __longlong_as_double(assumed)));
+                __double_as_longlong(val + __longlong_as_double(assumed)));
     }
     while (assumed != old);
     return __longlong_as_double(old);
 }
 
 
-DEVICE inline void atomic_rvecAdd( rvec ret, rvec v )
+static inline DEVICE void atomic_rvecAdd( rvec ret, rvec v )
 {
     MYATOMICADD( (double*)&ret[0], (double)v[0] );
     MYATOMICADD( (double*)&ret[1], (double)v[1] );
@@ -76,12 +80,16 @@ DEVICE inline void atomic_rvecAdd( rvec ret, rvec v )
 }
 
 
-DEVICE inline void atomic_rvecScaledAdd( rvec ret, real c, rvec v )
+static inline DEVICE void atomic_rvecScaledAdd( rvec ret, real c, rvec v )
 {
     MYATOMICADD( (double*)&ret[0], (double)(c * v[0]) );
     MYATOMICADD( (double*)&ret[1], (double)(c * v[1]) );
     MYATOMICADD( (double*)&ret[2], (double)(c * v[2]) );
 }
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_init.h b/PuReMD-GPU/src/cuda_init.h
index 233761691fe56bc8b1290c8972c4b413cc876f54..cd9c568130730d298fbaf781f4f6b18b0f02f65a 100644
--- a/PuReMD-GPU/src/cuda_init.h
+++ b/PuReMD-GPU/src/cuda_init.h
@@ -18,22 +18,31 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-
 #ifndef __CUDA_INIT_H__
 #define __CUDA_INIT_H__
 
 #include "mytypes.h"
 
-void    Cuda_Init_System ( reax_system* );
-void    Cuda_Init_Simulation_Data (simulation_data *);
-void    Cuda_Init_Workspace_System ( reax_system *, static_storage *);
-void    Cuda_Init_Workspace ( reax_system *, control_params *, static_storage *);
-void    Cuda_Init_Workspace_Device ( static_storage *);
-void  Cuda_Init_Control (control_params *);
-void  Cuda_Init_Grid (grid *, grid *);
 
-void    Cuda_Init_Sparse_Matrix (sparse_matrix *, int, int);
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
+void Cuda_Init_System( reax_system* );
+void Cuda_Init_Simulation_Data( simulation_data * );
+void Cuda_Init_Workspace_System( reax_system *, static_storage * );
+void Cuda_Init_Workspace( reax_system *, control_params *, static_storage * );
+void Cuda_Init_Workspace_Device( static_storage * );
+void Cuda_Init_Control( control_params * );
+void Cuda_Init_Grid( grid *, grid * );
+
+void Cuda_Init_Sparse_Matrix( sparse_matrix *, int, int );
+
+void Cuda_Init_Scratch( );
+
+#ifdef __cplusplus
+}
+#endif
 
-void Cuda_Init_Scratch ();
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_init_md.cu b/PuReMD-GPU/src/cuda_init_md.cu
index 576c61ddaff2ad4fbbbdda891a6c9ba5694018c4..1a205506e4c5ff767e02398a3859f838818c1e1a 100644
--- a/PuReMD-GPU/src/cuda_init_md.cu
+++ b/PuReMD-GPU/src/cuda_init_md.cu
@@ -334,7 +334,7 @@ void Cuda_Init_Lists( reax_system *system, control_params *control,
     //First Bin atoms and they sync the host and the device for the grid.
     //This will copy the atoms from host to device.
     Cuda_Bin_Atoms( system, workspace );
-    Sync_Host_Device( &system->g, &system->d_g, cudaMemcpyHostToDevice );
+    Sync_Host_Device_Grid( &system->g, &system->d_g, cudaMemcpyHostToDevice );
 
     k_Estimate_NumNeighbors<<<blockspergrid, threadsperblock >>>
         (system->d_atoms, system->d_g, system->d_box, 
@@ -544,16 +544,16 @@ void Cuda_Initialize( reax_system *system, control_params *control,
 
     //System
     Cuda_Init_System( system );
-    Sync_Host_Device( system, cudaMemcpyHostToDevice );
+    Sync_Host_Device_Sys( system, cudaMemcpyHostToDevice );
     Cuda_Init_System( system, control, data );
 
     //Simulation Data
     copy_host_device( system->atoms, system->d_atoms, REAX_ATOM_SIZE * system->N , 
             cudaMemcpyHostToDevice, RES_SYSTEM_ATOMS );
     Cuda_Init_Simulation_Data( data );
-    //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice);
+    //Sync_Host_Device_Data( data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice );
     Cuda_Init_Simulation_Data( system, control, data, out_control, Evolve );
-    Sync_Host_Device( data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice );
+    Sync_Host_Device_Data( data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice );
 
     //static storage
     Cuda_Init_Workspace_System( system, dev_workspace );
diff --git a/PuReMD-GPU/src/cuda_init_md.h b/PuReMD-GPU/src/cuda_init_md.h
index 80d5aa2fb44b56b89a99f45715e672bbd4f7a25b..0a6544b017d31ca0a82651840682c3aae49d1b11 100644
--- a/PuReMD-GPU/src/cuda_init_md.h
+++ b/PuReMD-GPU/src/cuda_init_md.h
@@ -23,8 +23,18 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Cuda_Initialize( reax_system*, control_params*, simulation_data*,
-                      static_storage*, list**, output_controls*, evolve_function* );
+       static_storage*, list**, output_controls*, evolve_function* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
 
diff --git a/PuReMD-GPU/src/cuda_integrate.cu b/PuReMD-GPU/src/cuda_integrate.cu
index 547f4f4a1089b26f3d5d7d3b2e41fedc0b05335a..cba0b79c39b4f9b66e5b506d11dcffb81adc488d 100644
--- a/PuReMD-GPU/src/cuda_integrate.cu
+++ b/PuReMD-GPU/src/cuda_integrate.cu
@@ -299,7 +299,7 @@ void Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
         cudaThreadSynchronize ();
         cudaCheckError ();
 
-        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>>
+        Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>>
             (results, results + BLOCKS_POW_2, BLOCKS_POW_2);
         cudaThreadSynchronize ();
         cudaCheckError ();
diff --git a/PuReMD-GPU/src/cuda_integrate.h b/PuReMD-GPU/src/cuda_integrate.h
index 3413f2d03593c597d48d2092f7737b380475e548..959b6684fef618b86252a5606b6feb4a6d093f15 100644
--- a/PuReMD-GPU/src/cuda_integrate.h
+++ b/PuReMD-GPU/src/cuda_integrate.h
@@ -23,6 +23,11 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Cuda_Velocity_Verlet_NVE( reax_system*, control_params*, simulation_data*,
         static_storage*, list**, output_controls* );
 void Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system*, control_params*,
@@ -32,5 +37,10 @@ void Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* , control_params* ,
         simulation_data *, static_storage *,
         list **, output_controls * );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
 
diff --git a/PuReMD-GPU/src/cuda_lin_alg.h b/PuReMD-GPU/src/cuda_lin_alg.h
index 0ad017b10c59c650a1bfdbbda0edbedcbbab3c8f..6b464152280f3590c1c70761c1b3ef206779cf5b 100644
--- a/PuReMD-GPU/src/cuda_lin_alg.h
+++ b/PuReMD-GPU/src/cuda_lin_alg.h
@@ -25,9 +25,19 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 GLOBAL void Cuda_Matvec (sparse_matrix , real *, real *, int );
 GLOBAL void Cuda_Matvec_csr (sparse_matrix , real *, real *, int );
 int Cuda_GMRES( static_storage *, real *b, real tol, real *x );
 int Cublas_GMRES( reax_system *, static_storage *, real *b, real tol, real *x );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PuReMD-GPU/src/cuda_list.h b/PuReMD-GPU/src/cuda_list.h
index 4ebc60079bcbd5c835fe49d27ef4d23b65483d8b..cafc85743b9eb6b31da5b35b063361ef97fddf54 100644
--- a/PuReMD-GPU/src/cuda_list.h
+++ b/PuReMD-GPU/src/cuda_list.h
@@ -23,7 +23,17 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 char Cuda_Make_List( int, int, int, list* );
 void Cuda_Delete_List( list* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PuReMD-GPU/src/cuda_lookup.cu b/PuReMD-GPU/src/cuda_lookup.cu
index 7605d4099a62ad1359d0783a4f8d11f26dd12dc6..9804ad8b6afee5b0c7e04ea37f46d9c206c5ba68 100644
--- a/PuReMD-GPU/src/cuda_lookup.cu
+++ b/PuReMD-GPU/src/cuda_lookup.cu
@@ -405,7 +405,7 @@ GLOBAL void calculate_LR_Values ( LR_lookup_table *d_LR, real *h, real *fh, real
     int r = blockIdx.x * blockDim.x + threadIdx.x;
     if ( r == 0 || r > count ) return;
 
-    LR_vdW_Coulomb( g_params, tbp, control, i, j, r * dr, &data[r], num_atom_types );
+    d_LR_vdW_Coulomb( g_params, tbp, control, i, j, r * dr, &data[r], num_atom_types );
 
     h[r] = d_LR[ index_lr(i, j, num_atom_types) ].dx;
     fh[r] = d_LR[ index_lr(i, j, num_atom_types) ].y[r].H;
diff --git a/PuReMD-GPU/src/cuda_lookup.h b/PuReMD-GPU/src/cuda_lookup.h
index 041df8777474005745063160abd934b4d2bd3bf1..a0e05e0c092f1634a1d21fd902f198f714ac2391 100644
--- a/PuReMD-GPU/src/cuda_lookup.h
+++ b/PuReMD-GPU/src/cuda_lookup.h
@@ -23,8 +23,18 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Cuda_Make_LR_Lookup_Table( reax_system*, control_params* );
 void copy_LR_table_to_device ( reax_system*, control_params* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
 
diff --git a/PuReMD-GPU/src/cuda_neighbors.cu b/PuReMD-GPU/src/cuda_neighbors.cu
index 3f447b48d760d8ebf9feb1b75a2ee914ffd1f9ed..876b6b9913e4d825e0cc8be5a2fc1d092c56d9f8 100644
--- a/PuReMD-GPU/src/cuda_neighbors.cu
+++ b/PuReMD-GPU/src/cuda_neighbors.cu
@@ -41,7 +41,7 @@ extern inline DEVICE int index_grid (int blocksize)
 }
 
 
-DEVICE int Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box, 
+DEVICE int d_Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box, 
         real cutoff, far_neighbor_data *data )
 {
     real norm_sqr, d, tmp;
@@ -123,20 +123,20 @@ GLOBAL void k_Estimate_NumNeighbors( reax_atom *sys_atoms,
                 //CHANGE ORIGINAL
                 /*
                    if (atom1 > atom2) {
-                   if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
+                   if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
                    control->vlist_cut, &nbr_data)){
                    ++num_far;
                    }
                    }
                  */
                 if (atom1 > atom2) {
-                    if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
+                    if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
                                 control->vlist_cut, &nbr_data)){
                         ++num_far;
                     }
                 }
                 else if (atom1 < atom2) {
-                    if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, 
+                    if (d_Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, 
                                 control->vlist_cut, &nbr_data)){
                         ++num_far;
                     }
@@ -205,13 +205,13 @@ GLOBAL void k_New_Estimate_NumNeighbors( reax_atom *sys_atoms,
             {
                 atom2 = nbr_atoms[m];
                 if (atom1 > atom2) {
-                    if (Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, 
+                    if (d_Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, 
                                 control->vlist_cut, &temp)){
                         num_far++;
                     }
                 }
                 else if (atom1 < atom2) {
-                    if (Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, 
+                    if (d_Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, 
                                 control->vlist_cut, &temp)){
                         num_far ++;
                     }
@@ -273,7 +273,7 @@ GLOBAL void k_Generate_Neighbor_Lists ( reax_atom *sys_atoms,
                 //CHANGE ORIGINAL
                 /*
                    if (atom1 > atom2) {
-                   if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
+                   if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
                    control->vlist_cut, &temp)){
 
                    nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
@@ -291,7 +291,7 @@ GLOBAL void k_Generate_Neighbor_Lists ( reax_atom *sys_atoms,
                    }
                  */
                 if (atom1 > atom2) {
-                    if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
+                    if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
                                 control->vlist_cut, &temp)){
                         nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
                         nbr_data->nbr = atom2;
@@ -307,7 +307,7 @@ GLOBAL void k_Generate_Neighbor_Lists ( reax_atom *sys_atoms,
                     }
                 }
                 else if (atom1 < atom2) {
-                    if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, 
+                    if (d_Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, 
                                 control->vlist_cut, &temp)){
                         nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
                         nbr_data->nbr = atom2;
@@ -394,7 +394,7 @@ GLOBAL void k_New_Generate_Neighbor_Lists( reax_atom *sys_atoms,
             {
                 atom2 = nbr_atoms[m];
                 if (atom1 > atom2) {
-                    if (Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, 
+                    if (d_Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, 
                                 control->vlist_cut, &temp)){
                         //nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
                         nbr_data = my_start;
@@ -412,7 +412,7 @@ GLOBAL void k_New_Generate_Neighbor_Lists( reax_atom *sys_atoms,
                     }
                 }
                 else if (atom1 < atom2) {
-                    if (Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, 
+                    if (d_Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, 
                                 control->vlist_cut, &temp)){
                         //nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
                         nbr_data = my_start;
@@ -532,7 +532,7 @@ GLOBAL void Test_Generate_Neighbor_Lists( reax_atom *sys_atoms,
                 if (m < max) {
                     atom2 = nbr_atoms[m];
                     if (atom1 > atom2) {
-                        if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
+                        if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
                                     control->vlist_cut, &temp))
                         {
                             tnbr [threadIdx.x] = 1;
@@ -540,7 +540,7 @@ GLOBAL void Test_Generate_Neighbor_Lists( reax_atom *sys_atoms,
                         }
                     }
                     else if (atom1 < atom2) {
-                        if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, 
+                        if (d_Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, 
                                     control->vlist_cut, &temp)){
                             tnbr [threadIdx.x] = 1;
                             nbrgen = TRUE;
diff --git a/PuReMD-GPU/src/cuda_neighbors.h b/PuReMD-GPU/src/cuda_neighbors.h
index 870ddae9717fe59388e89886651e3edb6ce1a915..13656f62f12b53509fad82f535e974e45cc45805 100644
--- a/PuReMD-GPU/src/cuda_neighbors.h
+++ b/PuReMD-GPU/src/cuda_neighbors.h
@@ -24,13 +24,21 @@
 #include "mytypes.h"
 
 
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 GLOBAL void k_Estimate_NumNeighbors( reax_atom *, grid, simulation_box *,
         control_params *, int * );
 
 void Cuda_Generate_Neighbor_Lists (reax_system *system,
         static_storage *workspace, control_params *control, int);
 
-DEVICE int Are_Far_Neighbors( rvec, rvec, simulation_box*, real, far_neighbor_data* );
+DEVICE int d_Are_Far_Neighbors( rvec, rvec, simulation_box*, real, far_neighbor_data* );
+
+#ifdef __cplusplus
+}
+#endif
 
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_post_evolve.cu b/PuReMD-GPU/src/cuda_post_evolve.cu
index 2190220b334604e74793e148544ea61e01d7551d..f5dbad825f2ac9f6e120e6cf7147258c89868c71 100644
--- a/PuReMD-GPU/src/cuda_post_evolve.cu
+++ b/PuReMD-GPU/src/cuda_post_evolve.cu
@@ -33,7 +33,7 @@ void Cuda_Setup_Evolve( reax_system* system, control_params* control,
 {
     //fprintf (stderr, "Begin ... \n");
     //to Sync step to the device.
-    //Sync_Host_Device (&data, (simulation_data *)data.d_simulation_data, cudaMemcpyHostToDevice );
+    //Sync_Host_Device_Data( &data, (simulation_data *)data.d_simulation_data, cudaMemcpyHostToDevice );
     copy_host_device( &data->step, &((simulation_data *)data->d_simulation_data)->step, 
             INT_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
 
@@ -49,7 +49,7 @@ void Cuda_Setup_Output( reax_system* system, simulation_data* data )
 
 void Cuda_Sync_Temp( control_params* control )
 {
-    Sync_Host_Device( control, (control_params*)control->d_control, cudaMemcpyHostToDevice );
+    Sync_Host_Device_Params( control, (control_params*)control->d_control, cudaMemcpyHostToDevice );
 }
 
 
diff --git a/PuReMD-GPU/src/cuda_post_evolve.h b/PuReMD-GPU/src/cuda_post_evolve.h
index 02c9eb63c37b69b3b1b8127e8a4df9f656150e8a..1d8fdc270edd28e0f387d53d9b9837c3a4879542 100644
--- a/PuReMD-GPU/src/cuda_post_evolve.h
+++ b/PuReMD-GPU/src/cuda_post_evolve.h
@@ -23,6 +23,11 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Cuda_Setup_Evolve( reax_system *, control_params *, 
         simulation_data *, static_storage *, 
         list **, output_controls * );
@@ -35,5 +40,9 @@ void Cuda_Post_Evolve( reax_system *, control_params *,
         simulation_data *, static_storage *, 
         list **, output_controls * );
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_reduction.cu b/PuReMD-GPU/src/cuda_reduction.cu
index 52cb198a71bcdcc637fcd6c69a8fffd8af34ed86..e22f9ad8474830ebdbe11089cdfcc986ff8dd8f3 100644
--- a/PuReMD-GPU/src/cuda_reduction.cu
+++ b/PuReMD-GPU/src/cuda_reduction.cu
@@ -149,7 +149,7 @@ GLOBAL void Cuda_matrix_col_reduction(const real *input, real *per_block_results
 }
 
 
-GLOBAL void Cuda_reduction(const int *input, int *per_block_results, const size_t n)
+GLOBAL void Cuda_reduction_int(const int *input, int *per_block_results, const size_t n)
 {
     extern __shared__ int sh_input[];
     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/PuReMD-GPU/src/cuda_reduction.h b/PuReMD-GPU/src/cuda_reduction.h
index fefbe4c2de5e7674c2d53c73c2aba698b7c93f2a..5b9baf1df69f2a4ab2bdab0d3bff26e390634951 100644
--- a/PuReMD-GPU/src/cuda_reduction.h
+++ b/PuReMD-GPU/src/cuda_reduction.h
@@ -18,22 +18,32 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __REDUCTION_H__
-#define __REDUCTION_H__
+#ifndef __CUDA_REDUCTION_H__
+#define __CUDA_REDUCTION_H__
 
 #include "mytypes.h"
 
 #define INITIAL 0
 #define FINAL       1
 
-GLOBAL void Cuda_reduction (const real *input, real *per_block_results, const size_t n);
-GLOBAL void Cuda_Norm (const real *input, real *per_block_results, const size_t n, int pass);
-GLOBAL void Cuda_Dot (const real *a, const real *b, real *per_block_results, const size_t n);
-GLOBAL void Cuda_reduction (const int *input, int *per_block_results, const size_t n);
-GLOBAL void Cuda_reduction_rvec (rvec *, rvec *, size_t n);
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
+GLOBAL void Cuda_reduction( const real *input, real *per_block_results, const size_t n );
+GLOBAL void Cuda_Norm( const real *input, real *per_block_results, const size_t n, int pass );
+GLOBAL void Cuda_Dot( const real *a, const real *b, real *per_block_results, const size_t n );
+GLOBAL void Cuda_reduction_int( const int *input, int *per_block_results, const size_t n );
+GLOBAL void Cuda_reduction_rvec( rvec *, rvec *, size_t n );
 
 GLOBAL void Cuda_Vector_Sum( real* , real , real* , real , real* , int ) ;
 GLOBAL void Cuda_Vector_Scale( real* , real , real* , int ) ;
 GLOBAL void Cuda_Vector_Add( real* , real , real* , int );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PuReMD-GPU/src/cuda_reset_utils.cu b/PuReMD-GPU/src/cuda_reset_utils.cu
index be582117d1d18a8832c2a2d793f4578d8dbcf88d..d18d9f749a9a57a269f731aecae7754bdb8925a4 100644
--- a/PuReMD-GPU/src/cuda_reset_utils.cu
+++ b/PuReMD-GPU/src/cuda_reset_utils.cu
@@ -143,7 +143,7 @@ void Cuda_Reset( reax_system *system, control_params *control,
 
     //Reset_Simulation_Data( data );
     Cuda_Sync_Simulation_Data ( data );
-    //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice);
+    //Sync_Host_Device_Data( data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice );
 
     if( control->ensemble == NPT || control->ensemble == sNPT || 
             control->ensemble == iNPT )
diff --git a/PuReMD-GPU/src/cuda_reset_utils.h b/PuReMD-GPU/src/cuda_reset_utils.h
index 1e7b3e6606e681293143cdf10bef9da332a4ea6d..bf730b935cef4857eb8e0aa97d614ef122bfca0e 100644
--- a/PuReMD-GPU/src/cuda_reset_utils.h
+++ b/PuReMD-GPU/src/cuda_reset_utils.h
@@ -24,6 +24,10 @@
 #include "mytypes.h"
 
 
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Cuda_Reset_Grid( grid* );
 
 void Cuda_Reset_Workspace (reax_system *, static_storage *);
@@ -33,6 +37,9 @@ void Cuda_Reset( reax_system*, control_params*, simulation_data*,
 
 void Cuda_Reset_Atoms (reax_system *);
 
-
+#ifdef __cplusplus
+}
 #endif
 
+
+#endif
diff --git a/PuReMD-GPU/src/cuda_single_body_interactions.h b/PuReMD-GPU/src/cuda_single_body_interactions.h
index ce181c9d7c3814a2281f000032166c6e51b26db8..3ecd4b9a68b174624ac1dcf56aae4cec360750be 100644
--- a/PuReMD-GPU/src/cuda_single_body_interactions.h
+++ b/PuReMD-GPU/src/cuda_single_body_interactions.h
@@ -24,6 +24,10 @@
 #include "mytypes.h"
 
 
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *, global_parameters ,
         single_body_parameters *, two_body_parameters *,
         static_storage , simulation_data *,
@@ -46,6 +50,10 @@ GLOBAL void test_LonePair_Postprocess ( reax_atom *, global_parameters ,
         static_storage , simulation_data *,
         list , int , int );
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
 
diff --git a/PuReMD-GPU/src/cuda_system_props.cu b/PuReMD-GPU/src/cuda_system_props.cu
index 4f19c90d37f934c50523aabadcf967170224a4c0..7ff5c11fe6d644e6878cd5e25e67b82de39ad0f5 100644
--- a/PuReMD-GPU/src/cuda_system_props.cu
+++ b/PuReMD-GPU/src/cuda_system_props.cu
@@ -37,7 +37,7 @@ GLOBAL void k_Kinetic_Energy_Reduction(simulation_data *, real *, int);
 void prep_dev_system (reax_system *system) 
 {
     //copy the system atoms to the device
-    Sync_Host_Device ( system, cudaMemcpyHostToDevice );
+    Sync_Host_Device_Sys( system, cudaMemcpyHostToDevice );
 }
 
 
@@ -188,18 +188,18 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
     cuda_memset (partial_results, 0, REAL_SIZE * 6 * (BLOCKS_POW_2 + 1), RES_SCRATCH );
     local_results = (real *) malloc (REAL_SIZE * 6 *(BLOCKS_POW_2+ 1));
 
-    k_compute_center_mass<<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (REAL_SIZE * BLOCK_SIZE) >>> 
+    k_compute_center_mass_sbp<<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (REAL_SIZE * BLOCK_SIZE) >>> 
         (system->reaxprm.d_sbp, system->d_atoms, partial_results, 
          data->xcm[0], data->xcm[1], data->xcm[2], system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
     k_compute_center_mass<<<1, BLOCKS_POW_2, 6 * (REAL_SIZE * BLOCKS_POW_2) >>> 
         (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-    copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, REAL_SIZE * 6, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device( local_results, partial_results + 6 * BLOCKS_POW_2, REAL_SIZE * 6, cudaMemcpyDeviceToHost, __LINE__ );
 
 #ifdef __BUILD_DEBUG__
     if (check_zero (local_results[0],xx) ||
@@ -253,16 +253,19 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
     inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
 
     if( fabs(det) > ALMOST_ZERO )
+    {
         rtensor_Scale( inv, 1./det, inv );
+    }
     else 
+    {
         rtensor_MakeZero( inv );
+    }
 
     /* Compute the angular velocity about the centre of mass */
     rtensor_MatVec( data->avcm, inv, data->amcm );  
     data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
 
-    //free the resources
-    free (local_results);
+    free( local_results );
 
 #if defined(DEBUG)
     fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",  
@@ -286,7 +289,7 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
 }
 
 
-void Cuda_Compute_Kinetic_Energy (reax_system *system, simulation_data *data)
+void Cuda_Compute_Kinetic_Energy( reax_system *system, simulation_data *data )
 {
     real *results = (real *) scratch;
     cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH);
@@ -458,5 +461,3 @@ data->therm.T = ALMOST_ZERO;
 }
 }
  */
-
-
diff --git a/PuReMD-GPU/src/cuda_system_props.h b/PuReMD-GPU/src/cuda_system_props.h
index bc123ddd960ebc450070651bd728b36bcb8dda38..026999e9e7bbc6ecc70b0945e7a30bd853a6cbe0 100644
--- a/PuReMD-GPU/src/cuda_system_props.h
+++ b/PuReMD-GPU/src/cuda_system_props.h
@@ -24,11 +24,19 @@
 #include "mytypes.h"
 
 
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void prep_dev_system (reax_system *system);
 
 void Cuda_Compute_Total_Mass( reax_system*, simulation_data* );
 void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*, FILE* );
 void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data* );
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_three_body_interactions.cu b/PuReMD-GPU/src/cuda_three_body_interactions.cu
index 384d952b8f4358936016b982b7d076a5bc62e2d4..038b88402b49e516ad4594de520a09464b4c7c8e 100644
--- a/PuReMD-GPU/src/cuda_three_body_interactions.cu
+++ b/PuReMD-GPU/src/cuda_three_body_interactions.cu
@@ -30,7 +30,7 @@
 
 
 /* calculates the theta angle between i-j-k */
-DEVICE void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, 
+DEVICE void d_Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, 
         real *theta, real *cos_theta )
 {
     (*cos_theta) = Dot( dvec_ji, dvec_jk, 3 ) / ( d_ji * d_jk );
@@ -42,7 +42,7 @@ DEVICE void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk,
 
 
 /* calculates the derivative of the cosine of the angle between i-j-k */
-DEVICE void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, 
+DEVICE void d_Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, 
         rvec* dcos_theta_di, rvec* dcos_theta_dj, 
         rvec* dcos_theta_dk )
 {
@@ -73,7 +73,7 @@ DEVICE void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_
 
 /* this is a 3-body interaction in which the main role is 
    played by j which sits in the middle of the other two. */
-GLOBAL void Three_Body_Interactions( reax_atom *atoms,
+GLOBAL void k_Three_Body_Interactions( reax_atom *atoms,
         single_body_parameters *sbp,
         three_body_header *d_thbp,
         global_parameters g_params,
@@ -253,11 +253,11 @@ where i < k */
                 if (BOA_jk <= 0) continue;
                 //CHANGE ORIGINAL
 
-                Calculate_Theta( pbond_ij->dvec, pbond_ij->d, 
+                d_Calculate_Theta( pbond_ij->dvec, pbond_ij->d, 
                         pbond_jk->dvec, pbond_jk->d,
                         &theta, &cos_theta );
 
-                Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, 
+                d_Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, 
                         pbond_jk->dvec, pbond_jk->d, 
                         &(p_ijk->dcos_di), &(p_ijk->dcos_dj), 
                         &(p_ijk->dcos_dk) );
@@ -642,7 +642,7 @@ where i < k */
 }
 
 
-GLOBAL void Three_Body_Interactions_results (     reax_atom *atoms, control_params *control,
+GLOBAL void k_Three_Body_Interactions_results (     reax_atom *atoms, control_params *control,
         static_storage p_workspace, 
         list p_bonds, int N )
 {
@@ -671,7 +671,7 @@ GLOBAL void Three_Body_Interactions_results (     reax_atom *atoms, control_para
 
 /* this is a 3-body interaction in which the main role is 
    played by j which sits in the middle of the other two. */
-GLOBAL void Three_Body_Estimate ( reax_atom *atoms, 
+GLOBAL void k_Three_Body_Estimate ( reax_atom *atoms, 
         control_params *control,
         list p_bonds, int N, 
         int *count)
@@ -747,7 +747,7 @@ GLOBAL void Three_Body_Estimate ( reax_atom *atoms,
 }
 
 
-GLOBAL void Hydrogen_Bonds(reax_atom *atoms,
+GLOBAL void k_Hydrogen_Bonds(reax_atom *atoms,
         single_body_parameters *sbp,
         hbond_parameters *d_hbp,
         control_params *control,
@@ -874,10 +874,10 @@ GLOBAL void Hydrogen_Bonds(reax_atom *atoms,
                     hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]);
                     ++num_hb_intrs;
 
-                    Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                    d_Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
                             &theta, &cos_theta );
                     // the derivative of cos(theta)
-                    Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                    d_Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
                             &dcos_theta_di, &dcos_theta_dj, 
                             &dcos_theta_dk );
 
@@ -1105,7 +1105,7 @@ DEVICE void warpReduce(volatile real* sdata, int tid)
 
 
 
-GLOBAL void Hydrogen_Bonds_HB(reax_atom *atoms,
+GLOBAL void k_Hydrogen_Bonds_HB(reax_atom *atoms,
         single_body_parameters *sbp,
         hbond_parameters *d_hbp,
         control_params *control,
@@ -1254,10 +1254,10 @@ GLOBAL void Hydrogen_Bonds_HB(reax_atom *atoms,
                     hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]);
                     ++num_hb_intrs;
 
-                    Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                    d_Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
                             &theta, &cos_theta );
                     // the derivative of cos(theta)
-                    Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                    d_Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
                             &dcos_theta_di, &dcos_theta_dj, 
                             &dcos_theta_dk );
 
@@ -1482,7 +1482,7 @@ GLOBAL void Hydrogen_Bonds_HB(reax_atom *atoms,
 }
 
 
-GLOBAL void Hydrogen_Bonds_Postprocess(reax_atom *atoms, 
+GLOBAL void k_Hydrogen_Bonds_Postprocess(reax_atom *atoms, 
         single_body_parameters *sbp,
         static_storage p_workspace,
         list p_bonds, list p_hbonds, list p_far_nbrs, int N, 
@@ -1547,7 +1547,7 @@ GLOBAL void Hydrogen_Bonds_Postprocess(reax_atom *atoms,
 }
 
 
-GLOBAL void Hydrogen_Bonds_Far_Nbrs(reax_atom *atoms, 
+GLOBAL void k_Hydrogen_Bonds_Far_Nbrs(reax_atom *atoms, 
         single_body_parameters *sbp,
         static_storage p_workspace,
         list p_bonds, list p_hbonds, list p_far_nbrs, int N )
@@ -1593,7 +1593,7 @@ GLOBAL void Hydrogen_Bonds_Far_Nbrs(reax_atom *atoms,
 }
 
 
-GLOBAL void Hydrogen_Bonds_HNbrs(reax_atom *atoms, 
+GLOBAL void k_Hydrogen_Bonds_HNbrs(reax_atom *atoms, 
         single_body_parameters *sbp,
         static_storage p_workspace,
         list p_bonds, list p_hbonds, list p_far_nbrs, int N )
diff --git a/PuReMD-GPU/src/cuda_three_body_interactions.h b/PuReMD-GPU/src/cuda_three_body_interactions.h
index 20e3baae482f481c9bc2cd11a0d7f8c4f2b3d0ae..4a87fcfe42852c50ac58a1ef52e8353cf24971a2 100644
--- a/PuReMD-GPU/src/cuda_three_body_interactions.h
+++ b/PuReMD-GPU/src/cuda_three_body_interactions.h
@@ -23,49 +23,49 @@
 
 #include "mytypes.h"
 
-DEVICE void Calculate_Theta( rvec, real, rvec, real, real*, real* );
 
-DEVICE void Calculate_dCos_Theta( rvec, real, rvec, real, rvec*, rvec*, rvec* );
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
+DEVICE void d_Calculate_Theta( rvec, real, rvec, real, real*, real* );
+
+DEVICE void d_Calculate_dCos_Theta( rvec, real, rvec, real, rvec*, rvec*, rvec* );
 
-GLOBAL void Three_Body_Interactions( reax_atom *, single_body_parameters *, three_body_header *,
-        global_parameters , control_params *, simulation_data *,
-        static_storage ,
+GLOBAL void k_Three_Body_Interactions( reax_atom *, single_body_parameters *, three_body_header *,
+        global_parameters , control_params *, simulation_data *, static_storage ,
         list , list , int , int , real *, real *, real *, rvec *);
 
-GLOBAL void Three_Body_Interactions_results (  reax_atom *,
-        control_params *,
-        static_storage ,
-        list , int );
+GLOBAL void k_Three_Body_Interactions_results( reax_atom *,
+        control_params *, static_storage , list , int );
 
-GLOBAL void Three_Body_Estimate ( reax_atom *atoms,
-        control_params *control,
-        list p_bonds, int N,
-        int *count);
+GLOBAL void k_Three_Body_Estimate( reax_atom *atoms,
+        control_params *control, list p_bonds, int N, int *count);
 
-GLOBAL void Hydrogen_Bonds (  reax_atom *,
+GLOBAL void k_Hydrogen_Bonds( reax_atom *,
         single_body_parameters *, hbond_parameters *,
         control_params *, simulation_data *, static_storage ,
         list , list , int , int, real *, rvec *, rvec *);
 
-GLOBAL void Hydrogen_Bonds_HB (  reax_atom *,
+GLOBAL void k_Hydrogen_Bonds_HB( reax_atom *,
         single_body_parameters *, hbond_parameters *,
         control_params *, simulation_data *, static_storage ,
         list , list , int , int, real *, rvec *, rvec *);
 
-GLOBAL void Hydrogen_Bonds_Postprocess (  reax_atom *,
+GLOBAL void k_Hydrogen_Bonds_Postprocess(  reax_atom *,
         single_body_parameters *,
         static_storage , list,
         list , list , int, real * );
 
-GLOBAL void Hydrogen_Bonds_Far_Nbrs (  reax_atom *,
-        single_body_parameters *,
-        static_storage , list,
-        list , list , int );
+GLOBAL void k_Hydrogen_Bonds_Far_Nbrs(  reax_atom *,
+        single_body_parameters *, static_storage , list, list , list , int );
 
-GLOBAL void Hydrogen_Bonds_HNbrs (  reax_atom *,
-        single_body_parameters *,
-        static_storage , list,
-        list , list , int );
+GLOBAL void k_Hydrogen_Bonds_HNbrs( reax_atom *, single_body_parameters *,
+        static_storage , list, list , list , int );
+
+#ifdef __cplusplus
+}
+#endif
 
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_two_body_interactions.cu b/PuReMD-GPU/src/cuda_two_body_interactions.cu
index 386a3651b42a5367262a0897fbf7dad535d018b9..d5f6abd64b16cc4582a404566560768eca49b866 100644
--- a/PuReMD-GPU/src/cuda_two_body_interactions.cu
+++ b/PuReMD-GPU/src/cuda_two_body_interactions.cu
@@ -21,6 +21,7 @@
 #include "cuda_two_body_interactions.h"
 
 #include "bond_orders.h"
+#include "index_utils.h"
 #include "list.h"
 #include "lookup.h"
 #include "vector.h"
@@ -191,122 +192,6 @@ GLOBAL void Cuda_Bond_Energy ( reax_atom *atoms, global_parameters g_params,
 }
 
 
-DEVICE void LR_vdW_Coulomb(global_parameters g_params, two_body_parameters *tbp,
-        control_params *control, int i, int j, real r_ij, LR_data *lr, int num_atom_types )
-{
-    real p_vdW1 = g_params.l[28];
-    real p_vdW1i = 1.0 / p_vdW1;
-    real powr_vdW1, powgi_vdW1;
-    real tmp, fn13, exp1, exp2;
-    real Tap, dTap, dfn13;
-    real dr3gamij_1, dr3gamij_3;
-    real e_core, de_core;
-    two_body_parameters *twbp;
-
-    twbp = &(tbp[ index_tbp (i, j, num_atom_types) ]);
-    e_core = 0;
-    de_core = 0;
-
-    /* calculate taper and its derivative */
-    Tap = control->Tap7 * r_ij + control->Tap6;
-    Tap = Tap * r_ij + control->Tap5;
-    Tap = Tap * r_ij + control->Tap4;
-    Tap = Tap * r_ij + control->Tap3;
-    Tap = Tap * r_ij + control->Tap2;
-    Tap = Tap * r_ij + control->Tap1;
-    Tap = Tap * r_ij + control->Tap0;
-
-    dTap = 7 * control->Tap7 * r_ij + 6 * control->Tap6;
-    dTap = dTap * r_ij + 5 * control->Tap5;
-    dTap = dTap * r_ij + 4 * control->Tap4;
-    dTap = dTap * r_ij + 3 * control->Tap3;
-    dTap = dTap * r_ij + 2 * control->Tap2;
-    dTap += control->Tap1 / r_ij;
-
-
-    /* vdWaals calculations */
-    powr_vdW1 = POW(r_ij, p_vdW1);
-    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
-
-    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
-    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-
-    lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
-    /* fprintf(stderr,"vdW: Tap:%f, r: %f, f13:%f, D:%f, Energy:%f,\
-       Gamma_w:%f, p_vdw: %f, alpha: %f, r_vdw: %f, %lf %lf\n",
-       Tap, r_ij, fn13, twbp->D, Tap * twbp->D * (exp1 - 2.0 * exp2),
-       powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */
-
-    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0);
-
-    lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) -
-               Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
-
-    /*vdWaals Calculations*/
-    if (g_params.vdw_type == 1 || g_params.vdw_type == 3)
-    {
-        // shielding
-        powr_vdW1 = POW(r_ij, p_vdW1);
-        powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
-
-        fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
-        exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-
-        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
-
-        dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
-                POW(r_ij, p_vdW1 - 2.0);
-
-        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
-                   Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
-    }
-    else  // no shielding
-    {
-        exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-
-        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
-
-        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
-                   Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
-    }
-
-    if (g_params.vdw_type == 2 || g_params.vdw_type == 3)
-    {
-        // innner wall
-        e_core = twbp->ecore * EXP(twbp->acore * (1.0 - (r_ij / twbp->rcore)));
-        lr->e_vdW += Tap * e_core;
-
-        de_core = -(twbp->acore / twbp->rcore) * e_core;
-        lr->CEvd += dTap * e_core + Tap * de_core;
-    }
-
-    /* Coulomb calculations */
-    dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
-
-    tmp = Tap / dr3gamij_3;
-    lr->H = EV_to_KCALpMOL * tmp;
-    lr->e_ele = C_ele * tmp;
-    /* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\
-       Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n",
-       i, system->atoms[i].type, j, system->atoms[j].type,
-       twbp->gamma, Tap, dr3gamij_3,
-       system->atoms[i].q, system->atoms[j].q ); */
-
-    lr->CEclmb = C_ele * ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
-    /* fprintf( stdout, "%d %d\t%g\t%g  %g\t%g  %g\t%g  %g\n",
-       i+1, j+1, r_ij, e_vdW, CEvd * r_ij,
-       system->atoms[i].q, system->atoms[j].q, e_ele, CEclmb * r_ij ); */
-
-    /* fprintf( stderr,"LR_Lookup:%3d%3d%5.3f-%8.5f,%8.5f%8.5f,%8.5f%8.5f\n",
-       i, j, r_ij, lr->H, lr->e_vdW, lr->CEvd, lr->e_ele, lr->CEclmb ); */
-}
-
-
-
 /*
 
    GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms,     
diff --git a/PuReMD-GPU/src/cuda_two_body_interactions.h b/PuReMD-GPU/src/cuda_two_body_interactions.h
index ac21c58430a1ca7879563b7687eab57f88f2811b..fe3e273775f67e17f88d742a52707f72bfbac56c 100644
--- a/PuReMD-GPU/src/cuda_two_body_interactions.h
+++ b/PuReMD-GPU/src/cuda_two_body_interactions.h
@@ -23,28 +23,149 @@
 
 #include "mytypes.h"
 
+#include "index_utils.h"
 
-GLOBAL void Cuda_Bond_Energy ( reax_atom *, global_parameters , single_body_parameters *, two_body_parameters *,
-        simulation_data *, static_storage , list , int , int, real *);
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
+GLOBAL void Cuda_Bond_Energy( reax_atom *, global_parameters , single_body_parameters *, two_body_parameters *,
+        simulation_data *, static_storage , list , int , int, real * );
 
 GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *, two_body_parameters *,
         global_parameters , control_params *, simulation_data *, list , real *, real *, rvec *,
         int , int );
 
-GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy ( reax_atom *, control_params *, simulation_data *,
+GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy( reax_atom *, control_params *, simulation_data *,
         list , real *, real *, rvec *,
         LR_lookup_table *, int , int , int ) ;
 
-GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_1 ( reax_atom *, control_params *, simulation_data *,
+GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_1( reax_atom *, control_params *, simulation_data *,
         list , real *, real *, rvec *,
         LR_lookup_table *, int , int , int ) ;
 
-GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_2 ( reax_atom *, control_params *, simulation_data *,
+GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_2( reax_atom *, control_params *, simulation_data *,
         list , real *, real *, rvec *,
         LR_lookup_table *, int , int , int ) ;
 
-DEVICE void LR_vdW_Coulomb( global_parameters, two_body_parameters *,
-        control_params *, int , int , real , LR_data * , int);
+static DEVICE void d_LR_vdW_Coulomb( global_parameters g_params, two_body_parameters *tbp,
+        control_params *control, int i, int j, real r_ij, LR_data *lr, int num_atom_types )
+{
+    real p_vdW1 = g_params.l[28];
+    real p_vdW1i = 1.0 / p_vdW1;
+    real powr_vdW1, powgi_vdW1;
+    real tmp, fn13, exp1, exp2;
+    real Tap, dTap, dfn13;
+    real dr3gamij_1, dr3gamij_3;
+    real e_core, de_core;
+    two_body_parameters *twbp;
+
+    twbp = &(tbp[ index_tbp (i, j, num_atom_types) ]);
+    e_core = 0;
+    de_core = 0;
+
+    /* calculate taper and its derivative */
+    Tap = control->Tap7 * r_ij + control->Tap6;
+    Tap = Tap * r_ij + control->Tap5;
+    Tap = Tap * r_ij + control->Tap4;
+    Tap = Tap * r_ij + control->Tap3;
+    Tap = Tap * r_ij + control->Tap2;
+    Tap = Tap * r_ij + control->Tap1;
+    Tap = Tap * r_ij + control->Tap0;
+
+    dTap = 7 * control->Tap7 * r_ij + 6 * control->Tap6;
+    dTap = dTap * r_ij + 5 * control->Tap5;
+    dTap = dTap * r_ij + 4 * control->Tap4;
+    dTap = dTap * r_ij + 3 * control->Tap3;
+    dTap = dTap * r_ij + 2 * control->Tap2;
+    dTap += control->Tap1 / r_ij;
+
+
+    /* vdWaals calculations */
+    powr_vdW1 = POW(r_ij, p_vdW1);
+    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+
+    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+
+    lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
+    /* fprintf(stderr,"vdW: Tap:%f, r: %f, f13:%f, D:%f, Energy:%f,\
+       Gamma_w:%f, p_vdw: %f, alpha: %f, r_vdw: %f, %lf %lf\n",
+       Tap, r_ij, fn13, twbp->D, Tap * twbp->D * (exp1 - 2.0 * exp2),
+       powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */
+
+    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0);
+
+    lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) -
+               Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+
+    /*vdWaals Calculations*/
+    if (g_params.vdw_type == 1 || g_params.vdw_type == 3)
+    {
+        // shielding
+        powr_vdW1 = POW(r_ij, p_vdW1);
+        powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+
+        fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+        exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+
+        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
+
+        dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
+                POW(r_ij, p_vdW1 - 2.0);
+
+        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
+                   Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+    }
+    else  // no shielding
+    {
+        exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+
+        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
+
+        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
+                   Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
+    }
+
+    if (g_params.vdw_type == 2 || g_params.vdw_type == 3)
+    {
+        // innner wall
+        e_core = twbp->ecore * EXP(twbp->acore * (1.0 - (r_ij / twbp->rcore)));
+        lr->e_vdW += Tap * e_core;
+
+        de_core = -(twbp->acore / twbp->rcore) * e_core;
+        lr->CEvd += dTap * e_core + Tap * de_core;
+    }
+
+    /* Coulomb calculations */
+    dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+
+    tmp = Tap / dr3gamij_3;
+    lr->H = EV_to_KCALpMOL * tmp;
+    lr->e_ele = C_ele * tmp;
+    /* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\
+       Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n",
+       i, system->atoms[i].type, j, system->atoms[j].type,
+       twbp->gamma, Tap, dr3gamij_3,
+       system->atoms[i].q, system->atoms[j].q ); */
+
+    lr->CEclmb = C_ele * ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+    /* fprintf( stdout, "%d %d\t%g\t%g  %g\t%g  %g\t%g  %g\n",
+       i+1, j+1, r_ij, e_vdW, CEvd * r_ij,
+       system->atoms[i].q, system->atoms[j].q, e_ele, CEclmb * r_ij ); */
+
+    /* fprintf( stderr,"LR_Lookup:%3d%3d%5.3f-%8.5f,%8.5f%8.5f,%8.5f%8.5f\n",
+       i, j, r_ij, lr->H, lr->e_vdW, lr->CEvd, lr->e_ele, lr->CEclmb ); */
+}
+
+#ifdef __cplusplus
+}
+#endif
 
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_utils.h b/PuReMD-GPU/src/cuda_utils.h
index 8150fed70a742c5419b1fef361f06d52f989329e..c8976d081bbead0048faa7dd12238c40f2eda5d7 100644
--- a/PuReMD-GPU/src/cuda_utils.h
+++ b/PuReMD-GPU/src/cuda_utils.h
@@ -29,6 +29,10 @@
 #define IDX2C(i,j,ld) (((j)*(ld))+(i))
 
 
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 static __inline__ void modify( cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta )
 {
     cublasSscal( handle, n - p, &alpha, &m[IDX2C(p, q, ldm)], ldm );
@@ -47,9 +51,9 @@ void compute_nearest_pow_2( int blocks, int *result );
 void print_device_mem_usage( );
 
 #define cusparseCheckError(cusparseStatus) __cusparseCheckError (cusparseStatus, __FILE__, __LINE__)
-inline void __cusparseCheckError( cusparseStatus_t cusparseStatus, const char *file, const int line )
+static inline void __cusparseCheckError( cusparseStatus_t cusparseStatus, const char *file, const int line )
 {
-    if (cusparseStatus != CUSPARSE_STATUS_SUCCESS)
+    if ( cusparseStatus != CUSPARSE_STATUS_SUCCESS )
     {
         fprintf (stderr, "failed .. %s:%d -- error code %d \n", __FILE__, __LINE__, cusparseStatus);
         exit (-1);
@@ -59,9 +63,9 @@ inline void __cusparseCheckError( cusparseStatus_t cusparseStatus, const char *f
 
 
 #define cublasCheckError(cublasStatus) __cublasCheckError (cublasStatus, __FILE__, __LINE__)
-inline void __cublasCheckError( cublasStatus_t cublasStatus, const char *file, const int line )
+static inline void __cublasCheckError( cublasStatus_t cublasStatus, const char *file, const int line )
 {
-    if( cublasStatus != CUBLAS_STATUS_SUCCESS )
+    if ( cublasStatus != CUBLAS_STATUS_SUCCESS )
     {
         fprintf( stderr, "failed .. %s:%d -- error code %d \n", __FILE__, __LINE__, cublasStatus );
         exit( -1 );
@@ -69,8 +73,9 @@ inline void __cublasCheckError( cublasStatus_t cublasStatus, const char *file, c
     return;
 }
 
+
 #define cudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
-inline void __cudaCheckError( const char *file, const int line )
+static inline void __cudaCheckError( const char *file, const int line )
 {
     cudaError err = cudaGetLastError( );
     if ( cudaSuccess != err )
@@ -91,5 +96,9 @@ inline void __cudaCheckError( const char *file, const int line )
     return;
 }
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/grid.h b/PuReMD-GPU/src/grid.h
index 273f56c5deedc14eb579a9156c7e2c6138635b75..8f26f018e947bc2ebe1789736d8203dc435addfd 100644
--- a/PuReMD-GPU/src/grid.h
+++ b/PuReMD-GPU/src/grid.h
@@ -23,6 +23,11 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Setup_Grid( reax_system* );
 
 void Update_Grid( reax_system* );
@@ -35,4 +40,9 @@ void Bin_Atoms( reax_system*, static_storage* );
 
 void Reset_Marks( grid*, ivec*, int );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PuReMD-GPU/src/init_md.c b/PuReMD-GPU/src/init_md.c
index 6e9ffe974a700de207c126f9948d6aca0f18621d..2a2ce1270e2c694722e489b9a3f38f8dd48177a1 100644
--- a/PuReMD-GPU/src/init_md.c
+++ b/PuReMD-GPU/src/init_md.c
@@ -351,7 +351,7 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *
     int index = 0;
     int count = 0;
     int jicount = 0;
-    int end_index, gpu_index, gpu_end, k;
+    int i, j, end_index, gpu_index, gpu_end, k;
     far_neighbor_data gpu, cpu;
 
     /*
@@ -370,12 +370,12 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *
      */
 
 
-    for (int i = 0; i < N; i++){
+    for (i = 0; i < N; i++){
         index = Start_Index (i, slist);
         //fprintf (stderr, "GPU : Neighbors of atom --> %d (start: %d , end: %d )\n", i, start[i], end[i]);
 
 
-        for (int j = start[i]; j < end[i]; j++){
+        for (j = start[i]; j < end[i]; j++){
             gpu = data[j];
 
             if (i < data[j].nbr) continue;
diff --git a/PuReMD-GPU/src/init_md.h b/PuReMD-GPU/src/init_md.h
index 5eea3d8c554e44bdce185c50891c48f67e69f9c8..8c23806594a8f2b107ddb884efbf68e7b5fe27ff 100644
--- a/PuReMD-GPU/src/init_md.h
+++ b/PuReMD-GPU/src/init_md.h
@@ -24,6 +24,10 @@
 #include "mytypes.h"
 
 
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Initialize( reax_system*, control_params*, simulation_data*,
         static_storage*, list**, output_controls*, evolve_function* );
 
@@ -32,5 +36,9 @@ void Generate_Initial_Velocities(reax_system *, real );
 void Init_Out_Controls(reax_system *, control_params *, static_storage *,
         output_controls *);
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/lin_alg.h b/PuReMD-GPU/src/lin_alg.h
index 3fe9609cf373a113a17b40e1e5e6762276277399..a515a959494a6eca40fe9f338d2a08118ff3e39a 100644
--- a/PuReMD-GPU/src/lin_alg.h
+++ b/PuReMD-GPU/src/lin_alg.h
@@ -25,6 +25,7 @@
 
 #include "mytypes.h"
 
+
 int GMRES( static_storage*, sparse_matrix*,
            real*, real, real*, FILE* , reax_system* );
 
@@ -43,4 +44,5 @@ int CG( static_storage*, sparse_matrix*,
 int uyduruk_GMRES( static_storage*, sparse_matrix*,
                    real*, real, real*, int, FILE*, reax_system* );
 
+
 #endif
diff --git a/PuReMD-GPU/src/list.h b/PuReMD-GPU/src/list.h
index a4d0aac132005967ca3c41d92264dd8d4d18e761..b90c41419271ca6b859be08ea4005fbe9107c029 100644
--- a/PuReMD-GPU/src/list.h
+++ b/PuReMD-GPU/src/list.h
@@ -28,31 +28,31 @@ char Make_List( int, int, int, list* );
 void Delete_List( list* );
 
 
-inline HOST_DEVICE int Num_Entries(int i, list* l)
+static inline HOST_DEVICE int Num_Entries(int i, list* l)
 {
     return l->end_index[i] - l->index[i];
 }
 
 
-inline HOST_DEVICE int Start_Index(int i, list *l )
+static inline HOST_DEVICE int Start_Index(int i, list *l )
 {
     return l->index[i];
 }
 
 
-inline HOST_DEVICE int End_Index( int i, list *l )
+static inline HOST_DEVICE int End_Index( int i, list *l )
 {
     return l->end_index[i];
 }
 
 
-inline HOST_DEVICE void Set_Start_Index(int i, int val, list *l)
+static inline HOST_DEVICE void Set_Start_Index(int i, int val, list *l)
 {
     l->index[i] = val;
 }
 
 
-inline HOST_DEVICE void Set_End_Index(int i, int val, list *l)
+static inline HOST_DEVICE void Set_End_Index(int i, int val, list *l)
 {
     l->end_index[i] = val;
 }
diff --git a/PuReMD-GPU/src/lookup.h b/PuReMD-GPU/src/lookup.h
index 0c175e8a6c1e96a5eb87ab29d43dda6fa7d38d7b..7dac3e4f41764caa53cf0e2313f8c566f4e0e634 100644
--- a/PuReMD-GPU/src/lookup.h
+++ b/PuReMD-GPU/src/lookup.h
@@ -23,10 +23,20 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Make_Lookup_Table( real, real, int, lookup_function, lookup_table* );
 int  Lookup_Index_Of( real, lookup_table* );
 real Lookup( real, lookup_table* );
 
 void Make_LR_Lookup_Table( reax_system*, control_params* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PuReMD-GPU/src/mytypes.h b/PuReMD-GPU/src/mytypes.h
index 25902c040df2ab8ff60dcda09298915e9db7f0a2..273f95976159c55461a84668c4a5f2494f1d9522 100644
--- a/PuReMD-GPU/src/mytypes.h
+++ b/PuReMD-GPU/src/mytypes.h
@@ -28,7 +28,10 @@
     #define GLOBAL __global__
     #define HOST_DEVICE __host__ __device__
 
+    #include <cuda_runtime.h>
     #include <cuda.h>
+    #include <cuda_runtime_api.h>
+
     #include <cublas_v2.h>
     #include <cusparse_v2.h>
     #if __CUDA_ARCH__ < 600
@@ -1145,14 +1148,5 @@ extern void *scratch;
 extern int BLOCKS, BLOCKS_POW_2, BLOCK_SIZE;
 extern int MATVEC_BLOCKS;
 
-#ifdef __CUDACC__
-extern cublasStatus_t cublasStatus;
-extern cublasHandle_t cublasHandle;
-
-extern cusparseHandle_t cusparseHandle;
-extern cusparseStatus_t cusparseStatus;
-extern cusparseMatDescr_t matdescriptor;
-#endif
-
 
 #endif
diff --git a/PuReMD-GPU/src/random.h b/PuReMD-GPU/src/random.h
index 4b698b78a9d74464c9af7ab333f642bf07a0667b..b19bc58e3dcef04a324b108be718bfbff3e5c06c 100644
--- a/PuReMD-GPU/src/random.h
+++ b/PuReMD-GPU/src/random.h
@@ -28,7 +28,7 @@
    large periodicity for generation of pseudo random number. function
    Random returns this random number appropriately scaled so that
    0 <= Random(range) < range */
-HOST_DEVICE inline double Random(double range)
+static inline HOST_DEVICE double Random(double range)
 {
     return (random() * range) / 2147483647L;
 }
@@ -37,7 +37,7 @@ HOST_DEVICE inline double Random(double range)
 /* This function seeds the system pseudo random number generator with
    current time. Use this function once in the begining to initialize
    the system */
-HOST_DEVICE inline void Randomize( )
+static inline HOST_DEVICE void Randomize( )
 {
     srandom( time(NULL) );
 }
@@ -45,7 +45,7 @@ HOST_DEVICE inline void Randomize( )
 
 /* GRandom return random number with gaussian distribution with mean
    and standard deviation "sigma" */
-HOST_DEVICE inline double GRandom(double mean, double sigma)
+static inline HOST_DEVICE double GRandom(double mean, double sigma)
 {
     double v1 = Random(2.0) - 1.0;
     double v2 = Random(2.0) - 1.0;
@@ -61,4 +61,5 @@ HOST_DEVICE inline double GRandom(double mean, double sigma)
     return mean + v1 * sigma * sqrt(-2.0 * log(rsq) / rsq);
 }
 
+
 #endif
diff --git a/PuReMD-GPU/src/reset_utils.h b/PuReMD-GPU/src/reset_utils.h
index 796a1508afdef4cf9b9297e37879426a03faeefd..190bd7f5632f3b0a2b3291edf82e03aa4922793e 100644
--- a/PuReMD-GPU/src/reset_utils.h
+++ b/PuReMD-GPU/src/reset_utils.h
@@ -24,6 +24,10 @@
 #include "mytypes.h"
 
 
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Reset_Atoms( reax_system* );
 
 void Reset_Pressures( simulation_data* );
@@ -48,5 +52,9 @@ void Reset_Grid( grid* );
 
 void Reset_Marks( grid*, ivec*, int );
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/system_props.c b/PuReMD-GPU/src/system_props.c
index 491b0d93f22aab1cdda29311643434f42694398a..0126b86b776dce8fd30aea0c228731b95104b216 100644
--- a/PuReMD-GPU/src/system_props.c
+++ b/PuReMD-GPU/src/system_props.c
@@ -24,7 +24,7 @@
 #include "vector.h"
 
 
-real Get_Time( )
+HOST real Get_Time( )
 {
     struct timeval tim;
 
@@ -33,7 +33,7 @@ real Get_Time( )
 }
 
 
-real Get_Timing_Info( real t_start )
+HOST real Get_Timing_Info( real t_start )
 {
     struct timeval tim;
     real t_end;
diff --git a/PuReMD-GPU/src/system_props.h b/PuReMD-GPU/src/system_props.h
index e2f7cb1aeb51696bf332d50c6f34fa4af84ed561..874132451d02b2d62d87c82065874f04a35b2d37 100644
--- a/PuReMD-GPU/src/system_props.h
+++ b/PuReMD-GPU/src/system_props.h
@@ -24,6 +24,10 @@
 #include "mytypes.h"
 
 
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 real Get_Time( );
 
 real Get_Timing_Info( real );
@@ -40,5 +44,9 @@ void Compute_Pressure( reax_system*, simulation_data*, static_storage* );
 
 void Compute_Pressure_Isotropic( reax_system*, control_params*, simulation_data*, output_controls* );
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/testmd.c b/PuReMD-GPU/src/testmd.c
index d33a0785a1a26322f5e999ae4dc4cf6953bc70f8..afe0cd4a5cf7e7282d7e2409f7f0edc66c34296a 100644
--- a/PuReMD-GPU/src/testmd.c
+++ b/PuReMD-GPU/src/testmd.c
@@ -213,17 +213,17 @@ int main( int argc, char* argv[] )
             &out_control, &Evolve );
 #endif
 
-    t_start = Get_Time ();
+    t_start = Get_Time( );
     Cuda_Initialize( &system, &control, &data, &workspace, &lists, 
             &out_control, &Cuda_Evolve );
     t_elapsed = Get_Timing_Info( t_start );
 
 #ifdef __DEBUG_CUDA__
-    fprintf (stderr, " Cuda Initialize timing ---> %f \n", t_elapsed );
+    fprintf( stderr, " Cuda Initialize timing ---> %f \n", t_elapsed );
 #endif
 
 #ifdef __CUDA_MEM__
-    print_device_mem_usage ();
+    print_device_mem_usage( );
 #endif
 
 #ifdef __BUILD_DEBUG__
@@ -233,11 +233,11 @@ int main( int argc, char* argv[] )
     Cuda_Reset( &system, &control, &data, &workspace, &lists );
 
 #ifdef __BUILD_DEBUG__
-    Generate_Neighbor_Lists ( &system, &control, &data, &workspace, 
+    Generate_Neighbor_Lists( &system, &control, &data, &workspace, 
             &lists, &out_control );
 #endif
 
-    Cuda_Generate_Neighbor_Lists (&system, &workspace, &control, FALSE);
+    Cuda_Generate_Neighbor_Lists( &system, &workspace, &control, FALSE );
 
 #ifdef __BUILD_DEBUG__
     Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control);
@@ -297,8 +297,8 @@ int main( int argc, char* argv[] )
         //fprintf (stderr, "Post Evolve done \n");
 
 #ifndef __BUILD_DEBUG__
-        Prep_Device_For_Output ( &system, &data );
-        Output_Results(&system, &control, &data, &workspace, &lists, &out_control);
+        Cuda_Setup_Output( &system, &data );
+        Output_Results( &system, &control, &data, &workspace, &lists, &out_control );
 
         //Analysis( &system, &control, &data, &workspace, &lists, &out_control );
         steps = data.step - data.prev_steps;
diff --git a/PuReMD-GPU/src/traj.c b/PuReMD-GPU/src/traj.c
index 1a8a150ed571a069df8ceb16928a5506eb458f61..2844c370ee79702ed0c75d090afe545149aae185 100644
--- a/PuReMD-GPU/src/traj.c
+++ b/PuReMD-GPU/src/traj.c
@@ -213,7 +213,7 @@ int Append_Custom_Frame( reax_system *system, control_params *control,
 
 #ifdef __PRINT_CPU_RESULTS__
         //fprintf (stderr, "Synching bonds from device for printing ....\n");
-        Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND );
+        Sync_Host_Device_List( bonds, (dev_lists + BONDS), TYP_BOND );
 #endif
 
         for( i = 0; i < system->N; ++i )
@@ -245,10 +245,10 @@ int Append_Custom_Frame( reax_system *system, control_params *control,
 
 #ifdef __PRINT_CPU_RESULTS__
         //fprintf (stderr, "Synching three bodies from deivce for printing ... \n");
-        Sync_Host_Device (thb_intrs, dev_lists + THREE_BODIES, TYP_THREE_BODY );
+        Sync_Host_Device_List( thb_intrs, dev_lists + THREE_BODIES, TYP_THREE_BODY );
         if ( !write_bonds) {
             //fprintf (stderr, "Synching bonds for three bodies from device for printing ... \n");
-            Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND );
+            Sync_Host_Device_List( bonds, (dev_lists + BONDS), TYP_BOND );
         }
 #endif 
 
diff --git a/PuReMD-GPU/src/two_body_interactions.c b/PuReMD-GPU/src/two_body_interactions.c
index d7a491789b05e9c1a855b31b60ba79dab573e271..2e7a6daf9039ea26c22b2fcfda5913e46255ad75 100644
--- a/PuReMD-GPU/src/two_body_interactions.c
+++ b/PuReMD-GPU/src/two_body_interactions.c
@@ -26,8 +26,6 @@
 #include "vector.h"
 #include "index_utils.h"
 
-#include "cuda_helpers.h"
-
 
 void Bond_Energy( reax_system *system, control_params *control, 
         simulation_data *data, static_storage *workspace, 
diff --git a/PuReMD-GPU/src/vector.h b/PuReMD-GPU/src/vector.h
index ee1375ee0816bcefbf1f335bfdaeccf8fcfed3f3..e1111e514928e79fc79197a0f2486d5eefb1cfa3 100644
--- a/PuReMD-GPU/src/vector.h
+++ b/PuReMD-GPU/src/vector.h
@@ -26,6 +26,10 @@
 #include "random.h"
 
 
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 int  Vector_isZero( real*, int );
 void Vector_MakeZero( real*, int );
 void Vector_Copy( real*, real*, int );
@@ -67,7 +71,7 @@ void ivec_MakeZero( ivec );
 void ivec_rScale( ivec, real, rvec );
 
 
-HOST_DEVICE inline real Dot( real* v1, real* v2, int k )
+static inline HOST_DEVICE real Dot( real* v1, real* v2, int k )
 {
     real ret = 0;
 
@@ -81,13 +85,13 @@ HOST_DEVICE inline real Dot( real* v1, real* v2, int k )
 /////////////////////////////
 //rvec functions
 /////////////////////////////
-HOST_DEVICE inline void rvec_MakeZero( rvec v )
+static inline HOST_DEVICE void rvec_MakeZero( rvec v )
 {
     v[0] = v[1] = v[2] = ZERO;
 }
 
 
-HOST_DEVICE inline void rvec_Add( rvec ret, rvec v )
+static inline HOST_DEVICE void rvec_Add( rvec ret, rvec v )
 {
     ret[0] += v[0];
     ret[1] += v[1];
@@ -95,13 +99,13 @@ HOST_DEVICE inline void rvec_Add( rvec ret, rvec v )
 }
 
 
-HOST_DEVICE inline void rvec_Copy( rvec dest, rvec src )
+static inline HOST_DEVICE void rvec_Copy( rvec dest, rvec src )
 {
     dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
 }
 
 
-HOST_DEVICE inline void rvec_Cross( rvec ret, rvec v1, rvec v2 )
+static inline HOST_DEVICE void rvec_Cross( rvec ret, rvec v1, rvec v2 )
 {
     ret[0] = v1[1] * v2[2] - v1[2] * v2[1];
     ret[1] = v1[2] * v2[0] - v1[0] * v2[2];
@@ -109,13 +113,13 @@ HOST_DEVICE inline void rvec_Cross( rvec ret, rvec v1, rvec v2 )
 }
 
 
-HOST_DEVICE inline void rvec_ScaledAdd( rvec ret, real c, rvec v )
+static inline HOST_DEVICE void rvec_ScaledAdd( rvec ret, real c, rvec v )
 {
     ret[0] += c * v[0], ret[1] += c * v[1], ret[2] += c * v[2];
 }
 
 
-HOST_DEVICE inline void rvec_ScaledSum( rvec ret, real c1, rvec v1 , real c2, rvec v2 )
+static inline HOST_DEVICE void rvec_ScaledSum( rvec ret, real c1, rvec v1 , real c2, rvec v2 )
 {
     ret[0] = c1 * v1[0] + c2 * v2[0];
     ret[1] = c1 * v1[1] + c2 * v2[1];
@@ -123,7 +127,7 @@ HOST_DEVICE inline void rvec_ScaledSum( rvec ret, real c1, rvec v1 , real c2, rv
 }
 
 
-HOST_DEVICE inline void rvec_Random( rvec v )
+static inline HOST_DEVICE void rvec_Random( rvec v )
 {
     v[0] = Random(2.0) - 1.0;
     v[1] = Random(2.0) - 1.0;
@@ -131,25 +135,25 @@ HOST_DEVICE inline void rvec_Random( rvec v )
 }
 
 
-HOST_DEVICE inline real rvec_Norm_Sqr( rvec v )
+static inline HOST_DEVICE real rvec_Norm_Sqr( rvec v )
 {
     return SQR(v[0]) + SQR(v[1]) + SQR(v[2]);
 }
 
 
-HOST_DEVICE inline void rvec_Scale( rvec ret, real c, rvec v )
+static inline HOST_DEVICE void rvec_Scale( rvec ret, real c, rvec v )
 {
     ret[0] = c * v[0], ret[1] = c * v[1], ret[2] = c * v[2];
 }
 
 
-HOST_DEVICE inline real rvec_Dot( rvec v1, rvec v2 )
+static inline HOST_DEVICE real rvec_Dot( rvec v1, rvec v2 )
 {
     return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
 }
 
 
-HOST_DEVICE inline void rvec_iMultiply( rvec r, ivec v1, rvec v2 )
+static inline HOST_DEVICE void rvec_iMultiply( rvec r, ivec v1, rvec v2 )
 {
     r[0] = v1[0] * v2[0];
     r[1] = v1[1] * v2[1];
@@ -157,7 +161,7 @@ HOST_DEVICE inline void rvec_iMultiply( rvec r, ivec v1, rvec v2 )
 }
 
 
-HOST_DEVICE inline real rvec_Norm( rvec v )
+static inline HOST_DEVICE real rvec_Norm( rvec v )
 {
     return SQRT( SQR(v[0]) + SQR(v[1]) + SQR(v[2]) );
 }
@@ -166,13 +170,13 @@ HOST_DEVICE inline real rvec_Norm( rvec v )
 /////////////////
 //ivec functions
 /////////////////
-HOST_DEVICE inline void ivec_Copy( ivec dest , ivec src )
+static inline HOST_DEVICE void ivec_Copy( ivec dest , ivec src )
 {
     dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
 }
 
 
-HOST_DEVICE inline void ivec_Scale( ivec dest, real C, ivec src )
+static inline HOST_DEVICE void ivec_Scale( ivec dest, real C, ivec src )
 {
     dest[0] = C * src[0];
     dest[1] = C * src[1];
@@ -180,7 +184,7 @@ HOST_DEVICE inline void ivec_Scale( ivec dest, real C, ivec src )
 }
 
 
-HOST_DEVICE inline void ivec_Sum( ivec dest, ivec v1, ivec v2 )
+static inline HOST_DEVICE void ivec_Sum( ivec dest, ivec v1, ivec v2 )
 {
     dest[0] = v1[0] + v2[0];
     dest[1] = v1[1] + v2[1];
@@ -191,24 +195,29 @@ HOST_DEVICE inline void ivec_Sum( ivec dest, ivec v1, ivec v2 )
 /////////////////
 //vector functions
 /////////////////
-HOST_DEVICE inline void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k )
+static inline HOST_DEVICE void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k )
 {
     for (k--; k >= 0; k--)
         dest[k] = c * v[k] + d * y[k];
 }
 
 
-HOST_DEVICE inline void Vector_Scale( real* dest, real c, real* v, int k )
+static inline HOST_DEVICE void Vector_Scale( real* dest, real c, real* v, int k )
 {
     for (k--; k >= 0; k--)
         dest[k] = c * v[k];
 }
 
 
-HOST_DEVICE inline void Vector_Add( real* dest, real c, real* v, int k )
+static inline HOST_DEVICE void Vector_Add( real* dest, real c, real* v, int k )
 {
     for (k--; k >= 0; k--)
         dest[k] += c * v[k];
 }
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif