diff --git a/docs/documentation/case.md b/docs/documentation/case.md index 5acd85a7a3..883d6619ab 100644 --- a/docs/documentation/case.md +++ b/docs/documentation/case.md @@ -312,6 +312,7 @@ This is enabled by adding ``'elliptic_smoothing': "T",`` and ``'elliptic_smoothi | Parameter | Type | Description | | ---: | :----: | :--- | | `num_ibs` | Integer | Number of immersed boundary patches | +| `ib_neighborhood_radius` | Integer | Parameter that controls the neighborhood size for IB detection. | | `geometry` | Integer | Geometry configuration of the patch.| | `x[y,z]_centroid` | Real | Centroid of the applied geometry in the [x,y,z]-direction. | | `length_x[y,z]` | Real | Length, if applicable, in the [x,y,z]-direction. | @@ -373,6 +374,8 @@ Additional details on this specification can be found in [NACA airfoil](https:// - `ib_coefficient_of_friction` is the coefficient of friction used in IB collisions. +- `ib_neighborhood_radius` controls the size of the neighborhood size. This value defaults to 1, which indicates that any given rank is aware of IB's up to 1 ranks away. This parameter is required to strong-scale a case when IB's eventually grow to be larger than one full processor domain wide. + ### 5. Fluid Material's {#sec-fluid-materials} | Parameter | Type | Description | @@ -638,7 +641,7 @@ To restart the simulation from $k$-th time step, see @ref running "Restarting Ca | `alpha_wrt(i)` | Logical | Add the volume fraction of fluid $i$ to the database | | `gamma_wrt` | Logical | Add the specific heat ratio function to the database | | `heat_ratio_wrt` | Logical | Add the specific heat ratio to the database | -| `ib_state_wrt` | Logical | Write IB state and loads to a datafile at each time step | +| `ib_state_wrt` | Logical | Parameter to handle writing IB state on saves and outputting the state as a point mesh to SILO files. 
| | `pi_inf_wrt` | Logical | Add the liquid stiffness function to the database | | `pres_inf_wrt` | Logical | Add the liquid stiffness to the formatted database | | `c_wrt` | Logical | Add the sound speed to the database | @@ -706,7 +709,7 @@ If `file_per_process` is true, then pre_process, simulation, and post_process mu - `probe_wrt` activates the output of state variables at coordinates specified by `probe(i)%[x;y,z]`. -- `ib_state_wrt` activates the output of data specified by patch_ib(i)%force(:) (and torque, vel, angular_vel, angles, [x,y,z]_centroid) into a single binary datafile for all IBs at all timesteps. During post_processing, this file is converted into separate time histories for each IB. +- `ib_state_wrt` is used to trigger post-processing of the IB state to be written out as a point mesh in the SILO files. When no IBs are moving, it also triggers force and torque calculation so that those values may be written to the output state files. - `output_partial_domain` activates the output of part of the domain specified by `[x,y,z]_output%%beg` and `[x,y,z]_output%%end`. This is useful for large domains where only a portion of the domain is of interest. 
diff --git a/examples/2D_mibm_shock_cylinder/case.py b/examples/2D_mibm_shock_cylinder/case.py index 69aca0cc9b..9fbc86829d 100644 --- a/examples/2D_mibm_shock_cylinder/case.py +++ b/examples/2D_mibm_shock_cylinder/case.py @@ -87,7 +87,7 @@ "precision": 2, "prim_vars_wrt": "T", "E_wrt": "T", - "ib_state_wrt": "T", + "ib_state_wrt": "F", "parallel_io": "T", # Patch: Constant Tube filled with air # Specify the cylindrical air tube grid geometry diff --git a/src/common/m_constants.fpp b/src/common/m_constants.fpp index cacc50f528..d66fb10614 100644 --- a/src/common/m_constants.fpp +++ b/src/common/m_constants.fpp @@ -25,6 +25,7 @@ module m_constants integer, parameter :: num_probes_max = 10 !< Maximum number of flow probes in the simulation integer, parameter :: num_patches_max = 10 !< Maximum number of IC patches integer, parameter :: num_ib_patches_max = 50000 !< Maximum number of immersed boundary patches (patch_ib) + integer, parameter :: num_local_ibs_max = 2000 !< Maximum number of immersed boundary patches (patch_ib) integer, parameter :: num_bc_patches_max = 10 !< Maximum number of boundary condition patches integer, parameter :: max_2d_fourier_modes = 10 !< Max Fourier mode index for 2D modal patch (geometry 13) integer, parameter :: max_sph_harm_degree = 5 !< Max degree L for 3D spherical harmonic patch (geometry 14) diff --git a/src/common/m_derived_types.fpp b/src/common/m_derived_types.fpp index 6ee2e836a5..c4e5873ead 100644 --- a/src/common/m_derived_types.fpp +++ b/src/common/m_derived_types.fpp @@ -202,12 +202,10 @@ module m_derived_types type :: t_model_array ! 
Original CPU-side fields (unchanged) - type(t_model), allocatable :: model !< STL/OBJ geometry model - real(wp), allocatable, dimension(:,:,:) :: boundary_v !< Boundary vertices - real(wp), allocatable, dimension(:,:) :: interpolated_boundary_v !< Interpolated boundary vertices - integer :: boundary_edge_count !< Number of boundary edges - integer :: total_vertices !< Total vertex count - integer :: interpolate !< Interpolation flag + type(t_model), allocatable :: model !< STL/OBJ geometry model + real(wp), allocatable, dimension(:,:,:) :: boundary_v !< Boundary vertices + integer :: boundary_edge_count !< Number of boundary edges + integer :: total_vertices !< Total vertex count ! GPU-friendly flattened arrays integer :: ntrs !< Copy of model%ntrs @@ -272,9 +270,10 @@ module m_derived_types end type ic_patch_parameters type ib_patch_parameters - integer :: geometry !< Type of geometry for the patch + integer :: gbl_patch_id real(wp) :: x_centroid, y_centroid, z_centroid !< Geometric center coordinates of the patch + !> Centroid locations of intermediate steps in the time_stepper module real(wp) :: step_x_centroid, step_y_centroid, step_z_centroid real(wp), dimension(1:3) :: centroid_offset !< offset of center of mass from computed cell center for odd-shaped IBs diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp index 0dab036f9b..a526eb1266 100644 --- a/src/common/m_model.fpp +++ b/src/common/m_model.fpp @@ -983,12 +983,13 @@ contains dx_local = minval(dx); dy_local = minval(dy) if (p /= 0) dz_local = minval(dz) + num_gbl_ibs = num_ibs allocate (stl_bounding_boxes(num_ibs,1:3,1:3)) do patch_id = 1, num_ibs if (patch_ib(patch_id)%geometry == 5 .or. 
patch_ib(patch_id)%geometry == 12) then allocate (models(patch_id)%model) - print *, " * Reading model: " // trim(patch_ib(patch_id)%model_filepath) + if (proc_rank == 0) print *, " * Reading model: " // trim(patch_ib(patch_id)%model_filepath) model = f_model_read(patch_ib(patch_id)%model_filepath) params%scale(:) = patch_ib(patch_id)%model_scale(:) @@ -1001,9 +1002,7 @@ contains params%scale(:) = 1._wp end if - if (proc_rank == 0) then - print *, " * Transforming model." - end if + if (proc_rank == 0) print *, " * Transforming model." ! Get the model center before transforming the model bbox_old = f_create_bbox(model) diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp index 7d4b92705c..9cfdd82e14 100644 --- a/src/common/m_mpi_common.fpp +++ b/src/common/m_mpi_common.fpp @@ -94,6 +94,8 @@ contains proc_rank = 0 #endif + $:GPU_UPDATE(device='[num_procs, proc_rank]') + end subroutine s_mpi_initialize !> Set up MPI I/O data views and variable pointers for parallel file output. 
diff --git a/src/post_process/m_data_output.fpp b/src/post_process/m_data_output.fpp index aece31ff8d..3a3a7bb10c 100644 --- a/src/post_process/m_data_output.fpp +++ b/src/post_process/m_data_output.fpp @@ -1396,48 +1396,42 @@ contains end do close (file_unit) - end if - - call MPI_BCAST(ib_data, nBodies*NFIELDS_PER_IB, mpi_p, 0, MPI_COMM_WORLD, ierr) - - do i = 1, nBodies - force_x(i) = ib_data(i, 2); force_y(i) = ib_data(i, 3); force_z(i) = ib_data(i, 4) - torque_x(i) = ib_data(i, 5); torque_y(i) = ib_data(i, 6); torque_z(i) = ib_data(i, 7) - vel_x(i) = ib_data(i, 8); vel_y(i) = ib_data(i, 9); vel_z(i) = ib_data(i, 10) - omega_x(i) = ib_data(i, 11); omega_y(i) = ib_data(i, 12); omega_z(i) = ib_data(i, 13) - angle_x(i) = ib_data(i, 14); angle_y(i) = ib_data(i, 15); angle_z(i) = ib_data(i, 16) - px(i) = ib_data(i, 17); py(i) = ib_data(i, 18); pz(i) = ib_data(i, 19) - ib_diameter(i) = ib_data(i, 20)*2.0_wp - end do - if (proc_rank == 0) then - do i = 1, num_procs - write (meshnames(i), '(A,I0,A,I0,A)') '../p', i - 1, '/', t_step, '.silo:ib_bodies' - meshtypes(i) = DB_POINTMESH + do i = 1, nBodies + force_x(i) = ib_data(i, 2); force_y(i) = ib_data(i, 3); force_z(i) = ib_data(i, 4) + torque_x(i) = ib_data(i, 5); torque_y(i) = ib_data(i, 6); torque_z(i) = ib_data(i, 7) + vel_x(i) = ib_data(i, 8); vel_y(i) = ib_data(i, 9); vel_z(i) = ib_data(i, 10) + omega_x(i) = ib_data(i, 11); omega_y(i) = ib_data(i, 12); omega_z(i) = ib_data(i, 13) + angle_x(i) = ib_data(i, 14); angle_y(i) = ib_data(i, 15); angle_z(i) = ib_data(i, 16) + px(i) = ib_data(i, 17); py(i) = ib_data(i, 18); pz(i) = ib_data(i, 19) + ib_diameter(i) = ib_data(i, 20)*2.0_wp end do + + write (meshnames(1), '(A,I0,A)') '../p0/', t_step, '.silo:ib_bodies' + meshtypes(1) = DB_POINTMESH err = DBSET2DSTRLEN(len(meshnames(1))) - err = DBPUTMMESH(dbroot, 'ib_bodies', 16, num_procs, meshnames, len_trim(meshnames), meshtypes, DB_F77NULL, ierr) + err = DBPUTMMESH(dbroot, 'ib_bodies', 16, 1, meshnames, len_trim(meshnames), 
meshtypes, DB_F77NULL, ierr) + + err = DBPUTPM(dbfile, 'ib_bodies', 9, 3, px, py, pz, nBodies, DB_DOUBLE, DB_F77NULL, ierr) + + call s_write_ib_variable('ib_force_x', t_step, force_x, nBodies) + call s_write_ib_variable('ib_force_y', t_step, force_y, nBodies) + call s_write_ib_variable('ib_force_z', t_step, force_z, nBodies) + call s_write_ib_variable('ib_torque_x', t_step, torque_x, nBodies) + call s_write_ib_variable('ib_torque_y', t_step, torque_y, nBodies) + call s_write_ib_variable('ib_torque_z', t_step, torque_z, nBodies) + call s_write_ib_variable('ib_vel_x', t_step, vel_x, nBodies) + call s_write_ib_variable('ib_vel_y', t_step, vel_y, nBodies) + call s_write_ib_variable('ib_vel_z', t_step, vel_z, nBodies) + call s_write_ib_variable('ib_omega_x', t_step, omega_x, nBodies) + call s_write_ib_variable('ib_omega_y', t_step, omega_y, nBodies) + call s_write_ib_variable('ib_omega_z', t_step, omega_z, nBodies) + call s_write_ib_variable('ib_angle_x', t_step, angle_x, nBodies) + call s_write_ib_variable('ib_angle_y', t_step, angle_y, nBodies) + call s_write_ib_variable('ib_angle_z', t_step, angle_z, nBodies) + call s_write_ib_variable('ib_diameter', t_step, ib_diameter, nBodies) end if - err = DBPUTPM(dbfile, 'ib_bodies', 9, 3, px, py, pz, nBodies, DB_DOUBLE, DB_F77NULL, ierr) - - call s_write_ib_variable('ib_force_x', t_step, force_x, nBodies) - call s_write_ib_variable('ib_force_y', t_step, force_y, nBodies) - call s_write_ib_variable('ib_force_z', t_step, force_z, nBodies) - call s_write_ib_variable('ib_torque_x', t_step, torque_x, nBodies) - call s_write_ib_variable('ib_torque_y', t_step, torque_y, nBodies) - call s_write_ib_variable('ib_torque_z', t_step, torque_z, nBodies) - call s_write_ib_variable('ib_vel_x', t_step, vel_x, nBodies) - call s_write_ib_variable('ib_vel_y', t_step, vel_y, nBodies) - call s_write_ib_variable('ib_vel_z', t_step, vel_z, nBodies) - call s_write_ib_variable('ib_omega_x', t_step, omega_x, nBodies) - call 
s_write_ib_variable('ib_omega_y', t_step, omega_y, nBodies) - call s_write_ib_variable('ib_omega_z', t_step, omega_z, nBodies) - call s_write_ib_variable('ib_angle_x', t_step, angle_x, nBodies) - call s_write_ib_variable('ib_angle_y', t_step, angle_y, nBodies) - call s_write_ib_variable('ib_angle_z', t_step, angle_z, nBodies) - call s_write_ib_variable('ib_diameter', t_step, ib_diameter, nBodies) - deallocate (ib_data, px, py, pz, force_x, force_y, force_z) deallocate (torque_x, torque_y, torque_z, vel_x, vel_y, vel_z) deallocate (omega_x, omega_y, omega_z, angle_x, angle_y, angle_z) @@ -1450,23 +1444,18 @@ contains !> Write a single IB point-variable to the Silo database slave and master files. subroutine s_write_ib_variable(varname, t_step, data, nBodies) - character(len=*), intent(in) :: varname - integer, intent(in) :: t_step - real(wp), dimension(:), intent(in) :: data - integer, intent(in) :: nBodies - character(len=4*name_len), dimension(num_procs) :: var_names - integer, dimension(num_procs) :: var_types - integer :: ierr, i - - if (proc_rank == 0) then - do i = 1, num_procs - write (var_names(i), '(A,I0,A,I0,A)') '../p', i - 1, '/', t_step, '.silo:' // trim(varname) - var_types(i) = DB_POINTVAR - end do - err = DBSET2DSTRLEN(len(var_names(1))) - err = DBPUTMVAR(dbroot, trim(varname), len_trim(varname), num_procs, var_names, len_trim(var_names), var_types, & - & DB_F77NULL, ierr) - end if + character(len=*), intent(in) :: varname + integer, intent(in) :: t_step + real(wp), dimension(:), intent(in) :: data + integer, intent(in) :: nBodies + character(len=4*name_len) :: var_name_entry + integer :: var_type_entry, ierr + + write (var_name_entry, '(A,I0,A)') '../p0/', t_step, '.silo:' // trim(varname) + var_type_entry = DB_POINTVAR + err = DBSET2DSTRLEN(len(var_name_entry)) + err = DBPUTMVAR(dbroot, trim(varname), len_trim(varname), 1, var_name_entry, len_trim(var_name_entry), var_type_entry, & + & DB_F77NULL, ierr) err = DBPUTPV1(dbfile, trim(varname), 
len_trim(varname), 'ib_bodies', 9, data, nBodies, DB_DOUBLE, DB_F77NULL, ierr) diff --git a/src/simulation/m_collisions.fpp b/src/simulation/m_collisions.fpp index fe860cb055..aec23af491 100644 --- a/src/simulation/m_collisions.fpp +++ b/src/simulation/m_collisions.fpp @@ -19,7 +19,8 @@ module m_collisions implicit none - private; public :: s_apply_collision_forces, s_initialize_collisions_module, s_finalize_collisions_module + private; public :: s_apply_collision_forces, s_initialize_collisions_module, s_finalize_collisions_module, & + & f_local_rank_owns_location, f_neighborhood_ranks_own_location ! overlap distances for computing collisions integer, allocatable, dimension(:,:) :: collision_lookup real(wp), allocatable, dimension(:,:) :: wall_overlap_distances @@ -113,7 +114,7 @@ contains overlap_distance = patch_ib(pid1)%radius + patch_ib(pid2)%radius - norm2(normal_vector) if (overlap_distance > 0._wp) then ! if the two patches are close enough to collide normal_vector = normal_vector/norm2(normal_vector) - if (f_local_rank_owns_collision(centroid_1)) then + if (f_local_rank_owns_location(centroid_1)) then ! compute constants of the collision effective_mass = 1.0_wp/((1.0_wp/patch_ib(pid1)%mass) + (1._wp/(patch_ib(pid2)%mass))) k = spring_stiffness*effective_mass @@ -192,7 +193,7 @@ contains ! 
ensure the local rank owns that collision before proceeding collision_location = [patch_ib(patch_id)%x_centroid, patch_ib(patch_id)%y_centroid, 0._wp] if (num_dims == 3) collision_location(3) = patch_ib(patch_id)%z_centroid - if (f_local_rank_owns_collision(collision_location)) then + if (f_local_rank_owns_location(collision_location)) then k = spring_stiffness*patch_ib(patch_id)%mass eta = damping_parameter*patch_ib(patch_id)%mass @@ -397,45 +398,78 @@ contains end subroutine s_detect_wall_collisions !> @brief function checks if this local MPI processor owns this specific collision - function f_local_rank_owns_collision(collision_location) result(owns_collision) + function f_local_rank_owns_location(location) result(owns_collision) $:GPU_ROUTINE(parallelism='[seq]') - real(wp), dimension(3), intent(in) :: collision_location + real(wp), dimension(3), intent(in) :: location logical :: owns_collision real(wp), dimension(3) :: projected_location - #:if defined('MFC_MPI') - if (num_procs == 1) then - owns_collision = .true. - else - projected_location(:) = collision_location(:) - - ! catch the edge case where th collision lies just outside the computational domain - #:for X, ID in [('x', 1), ('y', 2), ('z', 3)] - if (num_dims >= ${ID}$) then - if (ib_bc_${X}$%beg /= BC_PERIODIC) then - ! if it is outside the domain in one direction, project it somewhere inside so at least one rank owns it - if (collision_location(${ID}$) < ${X}$_domain%beg) then - projected_location(${ID}$) = ${X}$_domain%beg - else if (${X}$_domain%end < collision_location(${ID}$)) then - projected_location(${ID}$) = ${X}$_domain%end - 1.0e-10_wp - end if + owns_collision = .true. + +#ifdef MFC_MPI + if (num_procs > 1) then + projected_location(:) = location(:) + + ! catch the edge case where th collision lies just outside the computational domain + #:for X, ID, DIM in [('x', 1, 'm'), ('y', 2, 'n'), ('z', 3, 'p')] + if (num_dims >= ${ID}$) then + if (ib_bc_${X}$%beg /= BC_PERIODIC) then + ! 
if it is outside the domain in one direction, project it somewhere inside so at least one rank owns it + if (location(${ID}$) < ${X}$_domain%beg) then + projected_location(${ID}$) = ${X}$_domain%beg + else if (${X}$_domain%end < location(${ID}$)) then + projected_location(${ID}$) = ${X}$_domain%end - 1.0e-10_wp end if end if - #:endfor + owns_collision = owns_collision .and. ${X}$_cb(-1) <= projected_location(${ID}$) & + & .and. projected_location(${ID}$) < ${X}$_cb(${DIM}$) + end if + #:endfor + end if +#endif - ! the object that contains the collision location owns the collisions - owns_collision = x_cb(-1) <= projected_location(1) .and. projected_location(1) < x_cb(m) - owns_collision = owns_collision .and. y_cb(-1) <= projected_location(2) .and. projected_location(2) < y_cb(n) - if (num_dims == 3) owns_collision = owns_collision .and. z_cb(-1) <= projected_location(3) & - & .and. projected_location(3) < z_cb(p) - end if - #:else + end function f_local_rank_owns_location + + !> @brief function checks if this local MPI processor owns this specific collision + function f_neighborhood_ranks_own_location(location) result(owns_collision) + + $:GPU_ROUTINE(parallelism='[seq]') + + real(wp), dimension(3), intent(in) :: location + logical :: owns_collision, periodic_owner + real(wp) :: temp_neighbor_domain + integer :: i + + owns_collision = .true. + +#ifdef MFC_MPI + if (num_procs > 2) then + ! catch the edge case where th collision lies just outside the computational domain owns_collision = .true. - #:endif + #:for X, ID in [('x', 1), ('y', 2,), ('z', 3,)] + if (num_dims >= ${ID}$) then + if (ib_bc_${X}$%beg == BC_PERIODIC .and. neighbor_domain_${X}$%beg >= neighbor_domain_${X}$%end) then + ! project right side to the left + temp_neighbor_domain = neighbor_domain_${X}$%end + (${X}$_domain%end - ${X}$_domain%beg) + periodic_owner = neighbor_domain_${X}$%beg <= location(${ID}$) .and. location(${ID}$) < temp_neighbor_domain + ! 
project the left side to the right + temp_neighbor_domain = neighbor_domain_${X}$%beg - (${X}$_domain%end - ${X}$_domain%beg) + periodic_owner = periodic_owner .or. (temp_neighbor_domain <= location(${ID}$) .and. location(${ID}$) & + & < neighbor_domain_${X}$%end) + + owns_collision = owns_collision .and. periodic_owner + else + owns_collision = owns_collision .and. neighbor_domain_${X}$%beg <= location(${ID}$) .and. location(${ID}$) & + & < neighbor_domain_${X}$%end + end if + end if + #:endfor + end if +#endif - end function f_local_rank_owns_collision + end function f_neighborhood_ranks_own_location subroutine s_finalize_collisions_module() diff --git a/src/simulation/m_compute_levelset.fpp b/src/simulation/m_compute_levelset.fpp index 96fdebb357..ac0071db73 100644 --- a/src/simulation/m_compute_levelset.fpp +++ b/src/simulation/m_compute_levelset.fpp @@ -88,8 +88,8 @@ contains radius = patch_ib(ib_patch_id)%radius - dist_vec(1) = x_cc(i) - patch_ib(ib_patch_id)%x_centroid - real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - dist_vec(2) = y_cc(j) - patch_ib(ib_patch_id)%y_centroid - real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) + dist_vec(1) = x_cc(i) - (patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg)) + dist_vec(2) = y_cc(j) - (patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg)) dist_vec(3) = 0._wp dist = sqrt(sum(dist_vec**2)) diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp index a5fdb3ef81..effc020325 100644 --- a/src/simulation/m_data_output.fpp +++ b/src/simulation/m_data_output.fpp @@ -909,7 +909,7 @@ contains integer :: ifile, ierr integer, dimension(MPI_STATUS_SIZE) :: status logical :: file_exist - integer :: i + integer :: i, ib_idx integer, parameter :: NFIELDS_PER_IB = 20 real(wp) :: ib_buf(NFIELDS_PER_IB) @@ -923,19 +923,6 @@ contains end if call s_mpi_barrier() - ! 
Divide num_ibs across num_procs - nibs_per_rank = num_ibs/num_procs - remainder = mod(num_ibs, num_procs) - - ! Ranks < remainder get one extra IB - if (proc_rank < remainder) then - ib_start = proc_rank*(nibs_per_rank + 1) + 1 - ib_end = ib_start + nibs_per_rank ! nibs_per_rank + 1 total - else - ib_start = remainder*(nibs_per_rank + 1) + (proc_rank - remainder)*nibs_per_rank + 1 - ib_end = ib_start + nibs_per_rank - 1 - end if - write (file_loc, '(A,I0,A)') '/restart_data/ib_state_', t_step, '.dat' file_loc = trim(case_dir) // trim(file_loc) @@ -947,20 +934,21 @@ contains call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), mpi_info_int, ifile, ierr) - do i = ib_start, ib_end + do i = 1, num_local_ibs + ib_idx = local_ib_patch_ids(i) ib_buf(1) = mytime - ib_buf(2:4) = patch_ib(i)%force(1:3) - ib_buf(5:7) = patch_ib(i)%torque(1:3) - ib_buf(8:10) = patch_ib(i)%vel(1:3) - ib_buf(11:13) = patch_ib(i)%angular_vel(1:3) - ib_buf(14:16) = patch_ib(i)%angles(1:3) - ib_buf(17) = patch_ib(i)%x_centroid - ib_buf(18) = patch_ib(i)%y_centroid - ib_buf(19) = patch_ib(i)%z_centroid - ib_buf(20) = patch_ib(i)%radius + ib_buf(2:4) = patch_ib(ib_idx)%force(1:3) + ib_buf(5:7) = patch_ib(ib_idx)%torque(1:3) + ib_buf(8:10) = patch_ib(ib_idx)%vel(1:3) + ib_buf(11:13) = patch_ib(ib_idx)%angular_vel(1:3) + ib_buf(14:16) = patch_ib(ib_idx)%angles(1:3) + ib_buf(17) = patch_ib(ib_idx)%x_centroid + ib_buf(18) = patch_ib(ib_idx)%y_centroid + ib_buf(19) = patch_ib(ib_idx)%z_centroid + ib_buf(20) = patch_ib(ib_idx)%radius ! 
Global IB index (i) determines position in file - disp = int(i - 1, MPI_OFFSET_KIND)*int(NFIELDS_PER_IB, MPI_OFFSET_KIND)*WP_MOK + disp = int(patch_ib(ib_idx)%gbl_patch_id - 1, MPI_OFFSET_KIND)*int(NFIELDS_PER_IB, MPI_OFFSET_KIND)*WP_MOK call MPI_FILE_WRITE_AT(ifile, disp, ib_buf, NFIELDS_PER_IB, mpi_p, status, ierr) end do diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 7fb3353e2c..071fdc49be 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -28,6 +28,7 @@ module m_global_parameters integer :: t_step_old !< Existing IC/grid folder ! Computational Domain Parameters integer :: proc_rank !< Rank of the local processor + $:GPU_DECLARE(create='[num_procs, proc_rank]') !> @name Number of cells in the x-, y- and z-directions, respectively !> @{ integer :: m, n, p @@ -229,7 +230,8 @@ module m_global_parameters $:GPU_DECLARE(create='[ib_bc_x, ib_bc_y, ib_bc_z]') #endif type(bounds_info) :: x_domain, y_domain, z_domain - $:GPU_DECLARE(create='[x_domain, y_domain, z_domain]') + type(bounds_info) :: neighbor_domain_x, neighbor_domain_y, neighbor_domain_z + $:GPU_DECLARE(create='[x_domain, y_domain, z_domain, neighbor_domain_x, neighbor_domain_y, neighbor_domain_z]') real(wp) :: x_a, y_a, z_a real(wp) :: x_b, y_b, z_b logical :: parallel_io !< Format of the data files @@ -336,18 +338,23 @@ module m_global_parameters !> @name Immersed Boundaries !> @{ - logical :: ib - integer :: num_ibs - integer :: collision_model - real(wp) :: coefficient_of_restitution - real(wp) :: collision_time - real(wp) :: ib_coefficient_of_friction - logical :: ib_state_wrt - type(ib_patch_parameters), dimension(num_ib_patches_max) :: patch_ib !< Immersed boundary patch parameters - type(vec3_dt), allocatable, dimension(:) :: airfoil_grid_u, airfoil_grid_l - integer :: Np - - $:GPU_DECLARE(create='[ib, num_ibs, patch_ib, Np, airfoil_grid_u, airfoil_grid_l]') + logical :: ib + integer :: num_ibs !< 
number of IBs that the current processor is aware of + integer :: num_gbl_ibs !< number of IBs in the overall simulation + integer :: num_local_ibs !< number of IBs that lie inside the processor domain + integer :: ib_neighborhood_radius !< neighborhood radius in ranks (1 = immediate neighbors) + integer :: collision_model + real(wp) :: coefficient_of_restitution + real(wp) :: collision_time + real(wp) :: ib_coefficient_of_friction + logical :: ib_state_wrt + type(ib_patch_parameters), allocatable, dimension(:) :: patch_ib !< Immersed boundary patch parameters + integer, dimension(num_local_ibs_max) :: local_ib_patch_ids !< lookup table of IBs in the local compute domain + integer, allocatable, dimension(:,:,:) :: ib_neighbor_ranks !< MPI ranks of neighborhood domains, indexed (-N:N,-N:N,-N:N) + type(vec3_dt), allocatable, dimension(:) :: airfoil_grid_u, airfoil_grid_l + integer :: Np + + $:GPU_DECLARE(create='[ib, num_ibs, num_gbl_ibs, num_local_ibs, patch_ib, Np, airfoil_grid_u, airfoil_grid_l, local_ib_patch_ids]') $:GPU_DECLARE(create='[ib_coefficient_of_friction]') !> @} @@ -634,6 +641,7 @@ contains ! Immersed Boundaries ib = .false. num_ibs = dflt_int + ib_neighborhood_radius = 1 collision_model = 0 coefficient_of_restitution = dflt_real collision_time = dflt_real @@ -787,7 +795,9 @@ contains relativity = .false. #:endif + allocate (patch_ib(num_ib_patches_max)) do i = 1, num_ib_patches_max + patch_ib(i)%gbl_patch_id = i patch_ib(i)%geometry = dflt_int patch_ib(i)%x_centroid = 0._wp patch_ib(i)%y_centroid = 0._wp diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp index cf3ad9ecfa..9bdcc9dee7 100644 --- a/src/simulation/m_ib_patches.fpp +++ b/src/simulation/m_ib_patches.fpp @@ -109,7 +109,7 @@ contains radius = patch_ib(patch_id)%radius ! 
encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -221,7 +221,7 @@ contains end if ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -376,7 +376,7 @@ contains end if ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -466,7 +466,7 @@ contains inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -510,16 +510,12 @@ contains real(wp) :: radius real(wp), dimension(1:3) :: center - ! Variables to initialize the pressure field that corresponds to the bubble-collapse test case found in Tiwari et al. (2013) - - ! 
Transferring spherical patch's radius, centroid, smoothing patch identity and smoothing coefficient information - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(z_domain%end - z_domain%beg) radius = patch_ib(patch_id)%radius - ! completely skip particles no in the domain + ! completely skip particles not in the domain if (center(1) - radius > x_cc(m + gp_layers + 1) .or. center(1) + radius < x_cc(-gp_layers - 1) .or. center(2) & & - radius > y_cc(n + gp_layers + 1) .or. center(2) + radius < y_cc(-gp_layers - 1) .or. center(3) - radius > z_cc(p & & + gp_layers + 1) .or. center(3) + radius < z_cc(-gp_layers - 1)) then @@ -527,7 +523,7 @@ contains end if ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -542,18 +538,12 @@ contains ! Checking whether the sphere covers a particular cell in the domain and verifying whether the current patch has permission ! to write to that cell. If both queries check out, the primitive variables of the current patch are assigned to this cell. - $:GPU_PARALLEL_LOOP(private='[i, j, k, cart_y, cart_z]', copyin='[encoded_patch_id, center, radius]', collapse=3) + $:GPU_PARALLEL_LOOP(private='[i, j, k]', copyin='[encoded_patch_id, center, radius]', collapse=3) do k = kl, kr do j = jl, jr do i = il, ir - if (grid_geometry == 3) then - call s_convert_cylindrical_to_cartesian_coord(y_cc(j), z_cc(k)) - else - cart_y = y_cc(j) - cart_z = z_cc(k) - end if ! 
Updating the patch identities bookkeeping variable - if (((x_cc(i) - center(1))**2 + (cart_y - center(2))**2 + (cart_z - center(3))**2 <= radius**2)) then + if (((x_cc(i) - center(1))**2 + (y_cc(j) - center(2))**2 + (z_cc(k) - center(3))**2 <= radius**2)) then ib_markers%sf(i, j, k) = encoded_patch_id end if end do @@ -586,7 +576,7 @@ contains inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -656,7 +646,7 @@ contains inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) il = -gp_layers - 1 jl = -gp_layers - 1 @@ -724,7 +714,7 @@ contains inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) ! 
find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -762,7 +752,7 @@ contains integer, intent(in) :: xp, yp !< integers containing the periodicity projection information integer :: i, j, il, ir, jl, jr !< Generic loop iterators integer :: spc, encoded_patch_id - integer :: cx, cy + integer :: cx, cy, gbl_patch_id real(wp) :: lx(2), ly(2) real(wp), dimension(1:2) :: bbox_min, bbox_max real(wp), dimension(1:3) :: local_corner, world_corner @@ -771,6 +761,7 @@ contains real(wp), dimension(1:3) :: center, xy_local real(wp), dimension(1:3,1:3) :: inverse_rotation, rotation + gbl_patch_id = patch_ib(patch_id)%gbl_patch_id center = 0._wp center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) @@ -781,7 +772,7 @@ contains threshold = patch_ib(patch_id)%model_threshold ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) il = -gp_layers - 1 jl = -gp_layers - 1 @@ -789,10 +780,10 @@ contains jr = n + gp_layers + 1 ! 
Local-space bounding box extents (min=1, max=2 in the third index) - lx(1) = stl_bounding_boxes(patch_id, 1, 1) + offset(1) - lx(2) = stl_bounding_boxes(patch_id, 1, 3) + offset(1) - ly(1) = stl_bounding_boxes(patch_id, 2, 1) + offset(2) - ly(2) = stl_bounding_boxes(patch_id, 2, 3) + offset(2) + lx(1) = stl_bounding_boxes(gbl_patch_id, 1, 1) + offset(1) + lx(2) = stl_bounding_boxes(gbl_patch_id, 1, 3) + offset(1) + ly(1) = stl_bounding_boxes(gbl_patch_id, 2, 1) + offset(2) + ly(2) = stl_bounding_boxes(gbl_patch_id, 2, 3) + offset(2) bbox_min = 1e12 bbox_max = -1e12 @@ -819,7 +810,7 @@ contains xy_local = matmul(inverse_rotation, xy_local) xy_local = xy_local - offset - eta = f_model_is_inside_flat(gpu_ntrs(patch_id), patch_id, xy_local) + eta = f_model_is_inside_flat(gpu_ntrs(gbl_patch_id), gbl_patch_id, xy_local) ! Reading STL boundary vertices and compute the levelset and levelset_norm if (eta > threshold) then @@ -838,7 +829,7 @@ contains type(integer_field), intent(inout) :: ib_markers integer, intent(in) :: xp, yp, zp !< integers containing the periodicity projection information integer :: i, j, k, il, ir, jl, jr, kl, kr !< Generic loop iterators - integer :: spc, encoded_patch_id + integer :: spc, encoded_patch_id, gbl_patch_id real(wp) :: eta, threshold real(wp), dimension(1:3) :: offset real(wp), dimension(1:3) :: center, xyz_local @@ -847,6 +838,7 @@ contains real(wp) :: lx(2), ly(2), lz(2) real(wp), dimension(1:3) :: bbox_min, bbox_max, local_corner, world_corner + gbl_patch_id = patch_ib(patch_id)%gbl_patch_id center = 0._wp center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) @@ -858,7 +850,7 @@ contains rotation(:,:) = patch_ib(patch_id)%rotation_matrix(:,:) ! 
encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) il = -gp_layers - 1 jl = -gp_layers - 1 @@ -868,12 +860,12 @@ contains kr = p + gp_layers + 1 ! Local-space bounding box extents (min=1, max=2 in the third index) - lx(1) = stl_bounding_boxes(patch_id, 1, 1) + offset(1) - lx(2) = stl_bounding_boxes(patch_id, 1, 3) + offset(1) - ly(1) = stl_bounding_boxes(patch_id, 2, 1) + offset(2) - ly(2) = stl_bounding_boxes(patch_id, 2, 3) + offset(2) - lz(1) = stl_bounding_boxes(patch_id, 3, 1) + offset(3) - lz(2) = stl_bounding_boxes(patch_id, 3, 3) + offset(3) + lx(1) = stl_bounding_boxes(gbl_patch_id, 1, 1) + offset(1) + lx(2) = stl_bounding_boxes(gbl_patch_id, 1, 3) + offset(1) + ly(1) = stl_bounding_boxes(gbl_patch_id, 2, 1) + offset(2) + ly(2) = stl_bounding_boxes(gbl_patch_id, 2, 3) + offset(2) + lz(1) = stl_bounding_boxes(gbl_patch_id, 3, 1) + offset(3) + lz(2) = stl_bounding_boxes(gbl_patch_id, 3, 3) + offset(3) bbox_min = 1e12 bbox_max = -1e12 @@ -906,7 +898,7 @@ contains xyz_local = matmul(inverse_rotation, xyz_local) xyz_local = xyz_local - offset - eta = f_model_is_inside_flat(gpu_ntrs(patch_id), patch_id, xyz_local) + eta = f_model_is_inside_flat(gpu_ntrs(gbl_patch_id), gbl_patch_id, xyz_local) if (eta > patch_ib(patch_id)%model_threshold) then ib_markers%sf(i, j, k) = encoded_patch_id @@ -1039,7 +1031,7 @@ contains temp_y_per = y_periodicity; if (y_periodicity == -1) temp_y_per = 2 temp_z_per = z_periodicity; if (z_periodicity == -1) temp_z_per = 2 - offset = (num_ibs + 1)*temp_x_per + 3*(num_ibs + 1)*temp_y_per + 9*(num_ibs + 1)*temp_z_per + offset = (num_gbl_ibs + 1)*temp_x_per + 3*(num_gbl_ibs + 1)*temp_y_per + 9*(num_gbl_ibs + 1)*temp_z_per encoded_patch_id = patch_id + offset end subroutine s_encode_patch_periodicity @@ -1054,7 +1046,7 @@ contains integer, intent(out), optional :: 
x_periodicity, y_periodicity, z_periodicity integer :: offset, remainder, xp, yp, zp, base - base = num_ibs + 1 + base = num_gbl_ibs + 1 patch_id = mod(encoded_patch_id - 1, base) + 1 offset = (encoded_patch_id - patch_id)/base diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp index 5f865cffeb..7544277af3 100644 --- a/src/simulation/m_ibm.fpp +++ b/src/simulation/m_ibm.fpp @@ -71,25 +71,20 @@ contains call nvtxStartRange("SETUP-IBM-MODULE") ! do all set up for moving immersed boundaries - moving_immersed_boundary_flag = .false. do i = 1, num_ibs if (patch_ib(i)%moving_ibm /= 0) then call s_compute_moment_of_inertia(i, patch_ib(i)%angular_vel) - moving_immersed_boundary_flag = .true. end if call s_update_ib_rotation_matrix(i) end do $:GPU_UPDATE(device='[patch_ib(1:num_ibs)]') ! GPU routines require updated cell centers - $:GPU_UPDATE(device='[num_ibs, x_cc, y_cc, dx, dy, x_domain, y_domain, ib_bc_x%beg, ib_bc_y%beg]') + $:GPU_UPDATE(device='[num_ibs, num_gbl_ibs, x_cc, y_cc, dx, dy, x_domain, y_domain, ib_bc_x%beg, ib_bc_y%beg]') if (p /= 0) then $:GPU_UPDATE(device='[z_cc, dz, z_domain, ib_bc_z%beg]') end if - ! allocate STL models - call s_instantiate_STL_models() - ! recompute the new ib_patch locations and broadcast them. ib_markers%sf = 0._wp $:GPU_UPDATE(device='[ib_markers%sf]') @@ -123,6 +118,8 @@ contains call nvtxEndRange + ! print *, proc_rank, num_local_ibs, num_ibs, num_gbl_ibs + end subroutine s_ibm_setup !> Update the conservative variables at the ghost points @@ -247,6 +244,8 @@ contains q_prim_vf(eqn_idx%E)%sf(j, k, l) = q_prim_vf(eqn_idx%E)%sf(j, k, & & l) + pres_IP/(1._wp - 2._wp*abs(gp%levelset*alpha_rho_IP(q)/pres_IP) & & *dot_product(patch_ib(patch_id) %force/patch_ib(patch_id)%mass, gp%levelset_norm)) + ! q_prim_vf(eqn_idx%E)%sf(j, k, l) = q_prim_vf(eqn_idx%E)%sf(j, k, & & l) + pres_IP/(1._wp - + ! 2._wp*abs(gp%levelset*alpha_rho_IP(q)/pres_IP)) ! 
TODO :: REMOVE ME end do end if @@ -535,7 +534,7 @@ contains integer :: i, j, k, ii, jj, kk, gp_layers_z !< Iterator variables integer :: xp, yp, zp !< periodicities integer :: count, count_i, local_idx - integer :: patch_id, encoded_patch_id + integer :: patch_id, encoded_patch_id, neighborhood_patch_id logical :: is_gp count = 0 @@ -543,9 +542,9 @@ contains gp_layers_z = gp_layers if (p == 0) gp_layers_z = 0 - $:GPU_PARALLEL_LOOP(private='[i, j, k, ii, jj, kk, is_gp, local_idx, patch_id, encoded_patch_id, xp, yp, zp]', & - & copyin='[count, count_i, x_domain, y_domain, z_domain]', firstprivate='[gp_layers, gp_layers_z]', & - & collapse=3) + $:GPU_PARALLEL_LOOP(private='[i, j, k, ii, jj, kk, is_gp, local_idx, patch_id, encoded_patch_id, neighborhood_patch_id, & + & xp, yp, zp]', copyin='[count, count_i, x_domain, y_domain, z_domain]', firstprivate='[gp_layers, & + & gp_layers_z]', collapse=3) do i = 0, m do j = 0, n do k = 0, p @@ -572,11 +571,12 @@ contains ghost_points_in(local_idx)%loc = [i, j, k] encoded_patch_id = ib_markers%sf(i, j, k) call s_decode_patch_periodicity(encoded_patch_id, patch_id, xp, yp, zp) - ghost_points_in(local_idx)%ib_patch_id = patch_id + call s_get_neighborhood_idx(patch_id, neighborhood_patch_id) + ghost_points_in(local_idx)%ib_patch_id = neighborhood_patch_id ghost_points_in(local_idx)%x_periodicity = xp ghost_points_in(local_idx)%y_periodicity = yp ghost_points_in(local_idx)%z_periodicity = zp - ghost_points_in(local_idx)%slip = patch_ib(patch_id)%slip + ghost_points_in(local_idx)%slip = patch_ib(neighborhood_patch_id)%slip if ((x_cc(i) - dx(i)) < x_domain%beg) then ghost_points_in(local_idx)%DB(1) = -1 @@ -719,9 +719,10 @@ contains !> Interpolate primitive variables to a ghost point's image point using bilinear or trilinear interpolation subroutine s_interpolate_image_point(q_prim_vf, gp, alpha_rho_IP, alpha_IP, pres_IP, vel_IP, c_IP, r_IP, v_IP, pb_IP, mv_IP, & + & nmom_IP, pb_in, mv_in, presb_IP, massv_IP) - & nmom_IP, pb_in, 
mv_in, presb_IP, massv_IP) $:GPU_ROUTINE(parallelism='[seq]') + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf !< Primitive Variables real(stp), optional, dimension(idwbuff(1)%beg:,idwbuff(2)%beg:,idwbuff(3)%beg:,1:,1:), intent(in) :: pb_in, mv_in type(ghost_point), intent(in) :: gp @@ -1010,9 +1011,8 @@ contains call s_apply_collision_forces(ghost_points, num_gps, ib_markers, forces, torques) - ! reduce the forces across all MPI ranks - call s_mpi_allreduce_vectors_sum(forces, forces, num_ibs, 3) - call s_mpi_allreduce_vectors_sum(torques, torques, num_ibs, 3) + ! reduce the forces across local neighborhood ranks + call s_communicate_ib_forces(forces, torques) ! consider body forces after reducing to avoid double counting do i = 1, num_ibs @@ -1108,35 +1108,33 @@ contains end subroutine s_compute_centroid_offset !> Computes the moment of inertia for an immersed boundary - subroutine s_compute_moment_of_inertia(ib_marker, axis) + subroutine s_compute_moment_of_inertia(ib_idx, axis) real(wp), dimension(3), intent(in) :: axis !< the axis about which we compute the moment. Only required in 3D. - integer, intent(in) :: ib_marker + integer, intent(in) :: ib_idx real(wp) :: moment, distance_to_axis, cell_volume real(wp), dimension(3) :: position, closest_point_along_axis, vector_to_axis, normal_axis - integer :: i, j, k, count + integer :: i, j, k, count, ib_marker if (p == 0) then normal_axis = [0, 0, 1] else if (sqrt(sum(axis**2)) < sgm_eps) then ! if the object is not actually rotating at this time, return a dummy value and exit - patch_ib(ib_marker)%moment = 1._wp + patch_ib(ib_idx)%moment = 1._wp return else normal_axis = axis/sqrt(sum(axis)) end if ! if the IB is in 2D or a 3D sphere, we can compute this exactly - if (patch_ib(ib_marker)%geometry == 2) then ! circle - patch_ib(ib_marker)%moment = 0.5_wp*patch_ib(ib_marker)%mass*(patch_ib(ib_marker)%radius)**2 - else if (patch_ib(ib_marker)%geometry == 3) then ! 
rectangle - patch_ib(ib_marker)%moment = patch_ib(ib_marker)%mass*(patch_ib(ib_marker)%length_x**2 + patch_ib(ib_marker) & - & %length_y**2)/6._wp - else if (patch_ib(ib_marker)%geometry == 6) then ! ellipse - patch_ib(ib_marker)%moment = 0.0625_wp*patch_ib(ib_marker)%mass*(patch_ib(ib_marker)%length_x**2 + patch_ib(ib_marker) & - & %length_y**2) - else if (patch_ib(ib_marker)%geometry == 8) then ! sphere - patch_ib(ib_marker)%moment = 0.4*patch_ib(ib_marker)%mass*(patch_ib(ib_marker)%radius)**2 + if (patch_ib(ib_idx)%geometry == 2) then ! circle + patch_ib(ib_idx)%moment = 0.5_wp*patch_ib(ib_idx)%mass*(patch_ib(ib_idx)%radius)**2 + else if (patch_ib(ib_idx)%geometry == 3) then ! rectangle + patch_ib(ib_idx)%moment = patch_ib(ib_idx)%mass*(patch_ib(ib_idx)%length_x**2 + patch_ib(ib_idx) %length_y**2)/6._wp + else if (patch_ib(ib_idx)%geometry == 6) then ! ellipse + patch_ib(ib_idx)%moment = 0.0625_wp*patch_ib(ib_idx)%mass*(patch_ib(ib_idx)%length_x**2 + patch_ib(ib_idx) %length_y**2) + else if (patch_ib(ib_idx)%geometry == 8) then ! sphere + patch_ib(ib_idx)%moment = 0.4*patch_ib(ib_idx)%mass*(patch_ib(ib_idx)%radius)**2 else ! we do not have an analytic moment of inertia calculation and need to approximate it directly via a sum count = 0 moment = 0._wp @@ -1146,6 +1144,8 @@ contains cell_volume = cell_volume*(z_cc(1) - z_cc(0)) end if + ib_marker = patch_ib(ib_idx)%gbl_patch_id + $:GPU_PARALLEL_LOOP(private='[position, closest_point_along_axis, vector_to_axis, distance_to_axis]', copy='[moment, & & count]', copyin='[ib_marker, cell_volume, normal_axis]', collapse=3) do i = 0, m @@ -1156,12 +1156,12 @@ contains count = count + 1 ! increment the count of total cells in the boundary ! 
get the position in local coordinates so that the axis passes through 0, 0, 0 - if (p == 0) then - position = [x_cc(i), y_cc(j), 0._wp] - [patch_ib(ib_marker)%x_centroid, & - & patch_ib(ib_marker)%y_centroid, 0._wp] + if (num_dims < 3) then + position = [x_cc(i), y_cc(j), 0._wp] - [patch_ib(ib_idx)%x_centroid, patch_ib(ib_idx)%y_centroid, & + & 0._wp] else - position = [x_cc(i), y_cc(j), z_cc(k)] - [patch_ib(ib_marker)%x_centroid, & - & patch_ib(ib_marker)%y_centroid, patch_ib(ib_marker)%z_centroid] + position = [x_cc(i), y_cc(j), z_cc(k)] - [patch_ib(ib_idx)%x_centroid, & + & patch_ib(ib_idx)%y_centroid, patch_ib(ib_idx)%z_centroid] end if ! project the position along the axis to find the closest distance to the rotation axis @@ -1179,8 +1179,7 @@ contains $:END_GPU_PARALLEL_LOOP() ! write the final moment assuming the points are all uniform density - patch_ib(ib_marker)%moment = moment*patch_ib(ib_marker)%mass/(count*cell_volume) - $:GPU_UPDATE(device='[patch_ib(ib_marker)%moment]') + patch_ib(ib_idx)%moment = moment*patch_ib(ib_idx)%mass/(count*cell_volume) end if end subroutine s_compute_moment_of_inertia @@ -1225,4 +1224,269 @@ contains end subroutine s_wrap_periodic_ibs + !> @brief reassesses ownership of IBs and passes ownership of IBs to neighbor processors + !> Reduces forces and torques across the local neighborhood without a global allreduce. Accumulation phase: 2 passes per + !! dimension receiving from the low-index (-X) neighbor. Pass 1: add received values; save what was received as recv_snap. Pass + !! 2: send current (post-pass-1) values; add received; subtract recv_snap to remove double-counting of the direct contribution + !! already added in pass 1. Back-propagation phase: 2 passes per dimension receiving from the high-index (+X) neighbor, each + !! overwriting local forces with the neighbor's accumulated total. 
+ subroutine s_communicate_ib_forces(forces, torques) + + real(wp), dimension(num_ibs, 3), intent(inout) :: forces, torques + +#ifdef MFC_MPI + integer :: i, j, k, pack_pos, unpack_pos, buf_size, ierr + integer :: send_neighbor, recv_neighbor, recv_count, pid, tag + real(wp), dimension(3) :: fval, tval + real(wp), allocatable :: recv_forces_snap(:,:), recv_torques_snap(:,:) + character(len=1), allocatable :: send_buf(:), recv_buf(:) + + if (num_procs == 1) return + + buf_size = storage_size(0)/8 + (storage_size(0)/8 + 6*storage_size(0._wp)/8)*size(patch_ib) + allocate (send_buf(buf_size), recv_buf(buf_size), recv_forces_snap(num_ibs, 3), recv_torques_snap(num_ibs, 3)) + + ! Accumulation phase: propagate contributions toward the high-index corner. + #:for X, ID in [('x', 1), ('y', 2), ('z', 3)] + if (num_dims >= ${ID}$) then + send_neighbor = merge(bc_${X}$%end, MPI_PROC_NULL, bc_${X}$%end >= 0) + recv_neighbor = merge(bc_${X}$%beg, MPI_PROC_NULL, bc_${X}$%beg >= 0) + + recv_forces_snap = 0._wp + recv_torques_snap = 0._wp + tag = 300 + + do k = 1, 2*ib_neighborhood_radius + ! send forces to +${X}$ neighbor; receive from -${X}$ neighbor. 
Add received values then record what was received as recv_snap. + pack_pos = 0 + call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + do i = 1, num_ibs + call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + fval(:) = forces(i,:); tval(:) = torques(i,:) + call MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + end do + call MPI_SENDRECV(send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, recv_buf, buf_size, MPI_PACKED, & + & recv_neighbor, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + if (recv_neighbor /= MPI_PROC_NULL) then + unpack_pos = 0 + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + do i = 1, recv_count + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr) + call s_get_neighborhood_idx(pid, j) + if (j > 0) then + ! add forces and subtract recv_snap to prevent double-counting + forces(j,:) = forces(j,:) + fval(:) - recv_forces_snap(j,:) + torques(j,:) = torques(j,:) + tval(:) - recv_torques_snap(j,:) + recv_forces_snap(j,:) = fval(:) + recv_torques_snap(j,:) = tval(:) + end if + end do + end if + tag = tag + 2 + end do + end if + #:endfor + + ! 
Send final sums back to neighbors in -X direction + #:for X, ID in [('x', 1), ('y', 2), ('z', 3)] + if (num_dims >= ${ID}$) then + send_neighbor = merge(bc_${X}$%beg, MPI_PROC_NULL, bc_${X}$%beg >= 0) + recv_neighbor = merge(bc_${X}$%end, MPI_PROC_NULL, bc_${X}$%end >= 0) + + do k = 1, 2*ib_neighborhood_radius + pack_pos = 0 + call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + do i = 1, num_ibs + call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + fval(:) = forces(i,:); tval(:) = torques(i,:) + call MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + end do + call MPI_SENDRECV(send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, recv_buf, buf_size, MPI_PACKED, & + & recv_neighbor, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + if (recv_neighbor /= MPI_PROC_NULL) then + unpack_pos = 0 + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + do i = 1, recv_count + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr) + call s_get_neighborhood_idx(pid, j) + if (j > 0) then + forces(j,:) = fval(:) + torques(j,:) = tval(:) + end if + end do + end if + tag = tag + 2 + end do + end if + #:endfor + + deallocate (send_buf, recv_buf, recv_forces_snap, recv_torques_snap) +#endif + + end subroutine s_communicate_ib_forces + + subroutine s_handoff_ib_ownership() + + integer :: i, j, k, output_idx, local_output_idx + integer :: old_num_local_ibs + integer :: new_count, recv_count + integer :: pack_pos, unpack_pos, buf_size, patch_bytes + integer :: send_neighbor, recv_neighbor, ierr + integer :: dx, dy, dz, tag, 
nbr_idx, nreqs + real(wp), dimension(3) :: centroid + logical :: is_new + type(ib_patch_parameters) :: tmp_patch + integer, dimension(num_local_ibs_max) :: local_ib_idx_old + ! 26 neighbors max in 3D (8 in 2D); each gets its own recv buffer + integer, parameter :: max_nbrs = 26 + character(len=1), allocatable :: send_buf(:), recv_bufs(:,:) + integer, dimension(2*max_nbrs) :: requests + integer, dimension(max_nbrs) :: recv_neighbor_list + +#ifdef MFC_MPI + if (num_procs > 1) then + ! save a copy of the local IB's global indices to cross-reference for later. + local_ib_idx_old = 0 + old_num_local_ibs = num_local_ibs + do i = 1, num_local_ibs + local_ib_idx_old(i) = patch_ib(local_ib_patch_ids(i))%gbl_patch_id + end do + + ! delete any particles that no longer need to be tracked and coalesce the array + output_idx = 0 + local_output_idx = 0 + do i = 1, num_ibs + centroid = [patch_ib(i)%x_centroid, patch_ib(i)%y_centroid, 0._wp] + if (num_dims == 3) centroid(3) = patch_ib(i)%z_centroid + + ! delete if not in neighborhood + if (f_neighborhood_ranks_own_location(centroid)) then + output_idx = output_idx + 1 + if (i /= output_idx) patch_ib(output_idx) = patch_ib(i) + + ! check if in local domain + if (f_local_rank_owns_location(centroid)) then + local_output_idx = local_output_idx + 1 + local_ib_patch_ids(local_output_idx) = output_idx + end if + end if + end do + num_ibs = output_idx + num_local_ibs = local_output_idx + + ! Broadcast newly-owned patches to all neighborhood neighbors + patch_bytes = storage_size(tmp_patch)/8 + buf_size = storage_size(0)/8 + patch_bytes*num_local_ibs_max + allocate (send_buf(buf_size), recv_bufs(buf_size, max_nbrs)) + + ! Write placeholder count at position 0 + pack_pos = 0 + call MPI_PACK(0, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + + ! pack new patches and count them + new_count = 0 + do i = 1, num_local_ibs + k = local_ib_patch_ids(i) + is_new = .true. 
+ do j = 1, old_num_local_ibs + if (patch_ib(k)%gbl_patch_id == local_ib_idx_old(j)) then + is_new = .false. + exit + end if + end do + if (is_new) then + print *, proc_rank, " New Owner ", patch_ib(k)%gbl_patch_id ! TODO :: REMOVE THIS DEBUG PRINT + call MPI_PACK(patch_ib(k), patch_bytes, MPI_BYTE, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + new_count = new_count + 1 + end if + end do + + ! Overwrite the placeholder with the real count + pack_pos = 0 + call MPI_PACK(new_count, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + pack_pos = storage_size(0)/8 + new_count*patch_bytes + + ! Post all receives first, then sends + nreqs = 0 + nbr_idx = 0 + do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3) + do dy = -1, 1 + do dx = -1, 1 + if (dx == 0 .and. dy == 0 .and. dz == 0) cycle + nbr_idx = nbr_idx + 1 + tag = 200 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1) + recv_neighbor = ib_neighbor_ranks(-dx, -dy, -dz) + recv_neighbor_list(nbr_idx) = MPI_PROC_NULL + if (recv_neighbor < 0) cycle + recv_neighbor_list(nbr_idx) = recv_neighbor + nreqs = nreqs + 1 + call MPI_IRECV(recv_bufs(:,nbr_idx), buf_size, MPI_PACKED, recv_neighbor, tag, MPI_COMM_WORLD, & + & requests(nreqs), ierr) + end do + end do + end do + + do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3) + do dy = -1, 1 + do dx = -1, 1 + if (dx == 0 .and. dy == 0 .and. dz == 0) cycle + tag = 200 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1) + send_neighbor = ib_neighbor_ranks(dx, dy, dz) + if (send_neighbor < 0) cycle + nreqs = nreqs + 1 + call MPI_ISEND(send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, MPI_COMM_WORLD, requests(nreqs), ierr) + end do + end do + end do + + call MPI_WAITALL(nreqs, requests, MPI_STATUSES_IGNORE, ierr) + + ! 
Unpack all received buffers + do nbr_idx = 1, merge(26, 8, num_dims == 3) + if (recv_neighbor_list(nbr_idx) == MPI_PROC_NULL) cycle + unpack_pos = 0 + call MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + do i = 1, recv_count + call MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, tmp_patch, patch_bytes, MPI_BYTE, MPI_COMM_WORLD, & + & ierr) + call s_get_neighborhood_idx(tmp_patch%gbl_patch_id, j) + if (j < 0) then + num_ibs = num_ibs + 1 + @:ASSERT(num_ibs <= size(patch_ib), 'patch_ib overflow in neighborhood handoff') + patch_ib(num_ibs) = tmp_patch + end if + end do + end do + + deallocate (send_buf, recv_bufs) + end if +#endif + + end subroutine s_handoff_ib_ownership + + subroutine s_get_neighborhood_idx(gbl_idx, neighborhood_idx) + + $:GPU_ROUTINE(parallelism='[seq]') + + integer, intent(in) :: gbl_idx + integer, intent(out) :: neighborhood_idx + integer :: i + + neighborhood_idx = -1 + + do i = 1, num_ibs + if (patch_ib(i)%gbl_patch_id == gbl_idx) then + neighborhood_idx = i + exit + end if + end do + + end subroutine s_get_neighborhood_idx + end module m_ibm diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp index cf08cbc709..e67fcc41a8 100644 --- a/src/simulation/m_mpi_proxy.fpp +++ b/src/simulation/m_mpi_proxy.fpp @@ -76,7 +76,7 @@ contains & 'num_probes', 'num_integrals', 'bubble_model', 'thermal', & & 'num_source', 'relax_model', 'num_ibs', 'n_start', & & 'num_bc_patches', 'num_igr_iters', 'num_igr_warm_start_iters', & - & 'adap_dt_max_iters', 'collision_model' ] + & 'adap_dt_max_iters', 'collision_model', 'ib_neighborhood_radius' ] call MPI_BCAST(${VAR}$, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) #:endfor diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp index 310595249b..02ce17c6f4 100644 --- a/src/simulation/m_start_up.fpp +++ b/src/simulation/m_start_up.fpp @@ -39,6 +39,8 @@ module m_start_up use m_nvtx use m_ibm + use m_model + use 
m_collisions use m_compile_specific use m_checker_common use m_checker @@ -90,7 +92,7 @@ contains x_a, y_a, z_a, x_b, y_b, z_b, & x_domain, y_domain, z_domain, & hypoelasticity, & - ib, num_ibs, patch_ib, & + ib, num_ibs, ib_neighborhood_radius, patch_ib, & collision_model, coefficient_of_restitution, collision_time, & ib_coefficient_of_friction, ib_state_wrt, & fluid_pp, bub_pp, probe_wrt, prim_vars_wrt, & @@ -919,6 +921,8 @@ contains else if (t_step_start > 0) then call s_read_ib_restart_data(t_step_start) end if + call s_instantiate_STL_models() + call s_reduce_ib_patch_array() call s_ibm_setup() if (t_step_start == 0 .or. (cfl_dt .and. n_start == 0)) then call s_write_ib_data_file(0) @@ -1014,6 +1018,8 @@ contains #else "on CPUs" #endif + else + allocate (patch_ib(num_ib_patches_max)) end if call s_mpi_bcast_user_inputs() @@ -1196,4 +1202,274 @@ contains end subroutine s_read_ib_restart_data + !> @brief takes the patch_ib struct array that contains all global IB patches and reduces to only contain patches that are in + !! the local computational domain. + subroutine s_reduce_ib_patch_array() + + type(ib_patch_parameters), dimension(num_ib_patches_max) :: patch_ib_gbl + real(wp), dimension(3) :: centroid + integer :: i, j + integer :: num_aware_ibs + logical :: is_in_neighborhood, is_local + + ! do all set up for moving immersed boundaries + + moving_immersed_boundary_flag = .false. + do i = 1, num_ibs + if (patch_ib(i)%moving_ibm /= 0) then + moving_immersed_boundary_flag = .true. + exit + end if + end do + + patch_ib_gbl(1:num_ibs) = patch_ib(1:num_ibs) + call get_neighbor_bounds() ! make sure the bounds of the neighbors are correctly set up + call s_compute_ib_neighbor_ranks() ! build lookup of all neighbor MPI ranks + + deallocate (patch_ib) + num_aware_ibs = min(num_local_ibs_max*(2*ib_neighborhood_radius + 1)**num_dims, num_ib_patches_max) + allocate (patch_ib(num_aware_ibs)) + + ! 
assign defaults to all values + num_gbl_ibs = num_ibs + num_local_ibs = num_ibs + do i = 1, num_local_ibs_max + local_ib_patch_ids(i) = i + end do + +#ifdef MFC_MPI + ! fallback for 1-rank case + if (num_procs == 1) then + patch_ib(:) = patch_ib_gbl(1:num_aware_ibs) + else + ! determine the set of patches owned by local rank + num_local_ibs = 0 + num_ibs = 0 + do i = 1, num_gbl_ibs + ! catch the edge case where the collision lies just outside the computational domain + is_in_neighborhood = .true. + is_local = .true. + centroid = [patch_ib_gbl(i)%x_centroid, patch_ib_gbl(i)%y_centroid, 0._wp] + if (num_dims == 3) centroid(3) = patch_ib_gbl(i)%z_centroid + + if (f_neighborhood_ranks_own_location(centroid)) then + num_ibs = num_ibs + 1 + patch_ib(num_ibs) = patch_ib_gbl(i) + patch_ib(num_ibs)%gbl_patch_id = i + if (f_local_rank_owns_location(centroid)) then + num_local_ibs = num_local_ibs + 1 + local_ib_patch_ids(num_local_ibs) = num_ibs + end if + end if + end do + end if +#else + ! reduce the size of the array for local simulation in no-MPI case + patch_ib(:) = patch_ib_gbl(1:num_aware_ibs) +#endif + + end subroutine s_reduce_ib_patch_array + + !> Build ib_neighbor_ranks(-1:1,-1:1,-1:1): MPI ranks of all neighbor domains. Uses two rounds of MPI_SENDRECV cascades - face + !! neighbors are known from bc_*, edge neighbors are obtained in round 1, and (3D) corner neighbors in round 2. + subroutine s_compute_ib_neighbor_ranks() + + integer :: ax, k, nbr_idx, nreqs, sx, sy, sz, dx, dy, dz + integer, allocatable :: send_table(:,:,:), recv_tables(:,:,:,:) + integer, dimension(52) :: requests + +#ifdef MFC_MPI + integer :: ierr + integer, dimension(4) :: buf4, rbuf4 + integer, dimension(2) :: buf2, rbuf2 + + ax = ib_neighborhood_radius + + if (allocated(ib_neighbor_ranks)) deallocate (ib_neighbor_ranks) + allocate (ib_neighbor_ranks(-ax:ax,-ax:ax,-ax:ax)) + ib_neighbor_ranks = MPI_PROC_NULL + ib_neighbor_ranks(0, 0, 0) = proc_rank + + ! 
Fill radius-1 entries: face neighbors are known from domain decomposition + ib_neighbor_ranks(-1, 0, 0) = bc_x%beg + ib_neighbor_ranks(+1, 0, 0) = bc_x%end + if (num_dims >= 2) then + ib_neighbor_ranks(0, -1, 0) = bc_y%beg + ib_neighbor_ranks(0, +1, 0) = bc_y%end + end if + if (num_dims == 3) then + ib_neighbor_ranks(0, 0, -1) = bc_z%beg + ib_neighbor_ranks(0, 0, +1) = bc_z%end + end if + + if (num_dims >= 2) then + ! Round 1a: exchange y/z face ranks with +/-x face neighbors -> xy and xz edge ranks + buf4 = [bc_y%beg, bc_y%end, bc_z%beg, bc_z%end] + + ! Send to -x, receive from +x -> edges (+1,+/-1,0) and (+1,0,+/-1) + call MPI_SENDRECV(buf4, 4, MPI_INTEGER, merge(bc_x%beg, MPI_PROC_NULL, bc_x%beg >= 0), 310, rbuf4, 4, MPI_INTEGER, & + & merge(bc_x%end, MPI_PROC_NULL, bc_x%end >= 0), 310, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + if (bc_x%end >= 0) then + ib_neighbor_ranks(+1, -1, 0) = rbuf4(1) + ib_neighbor_ranks(+1, +1, 0) = rbuf4(2) + ib_neighbor_ranks(+1, 0, -1) = rbuf4(3) + ib_neighbor_ranks(+1, 0, +1) = rbuf4(4) + end if + + call MPI_SENDRECV(buf4, 4, MPI_INTEGER, merge(bc_x%end, MPI_PROC_NULL, bc_x%end >= 0), 311, rbuf4, 4, MPI_INTEGER, & + & merge(bc_x%beg, MPI_PROC_NULL, bc_x%beg >= 0), 311, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + if (bc_x%beg >= 0) then + ib_neighbor_ranks(-1, -1, 0) = rbuf4(1) + ib_neighbor_ranks(-1, +1, 0) = rbuf4(2) + ib_neighbor_ranks(-1, 0, -1) = rbuf4(3) + ib_neighbor_ranks(-1, 0, +1) = rbuf4(4) + end if + end if + + if (num_dims == 3) then + ! 
Round 1b: exchange z face ranks with +/-y face neighbors -> yz edge ranks + buf2 = [bc_z%beg, bc_z%end] + + call MPI_SENDRECV(buf2, 2, MPI_INTEGER, merge(bc_y%beg, MPI_PROC_NULL, bc_y%beg >= 0), 312, rbuf2, 2, MPI_INTEGER, & + & merge(bc_y%end, MPI_PROC_NULL, bc_y%end >= 0), 312, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + if (bc_y%end >= 0) then + ib_neighbor_ranks(0, +1, -1) = rbuf2(1) + ib_neighbor_ranks(0, +1, +1) = rbuf2(2) + end if + + call MPI_SENDRECV(buf2, 2, MPI_INTEGER, merge(bc_y%end, MPI_PROC_NULL, bc_y%end >= 0), 313, rbuf2, 2, MPI_INTEGER, & + & merge(bc_y%beg, MPI_PROC_NULL, bc_y%beg >= 0), 313, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + if (bc_y%beg >= 0) then + ib_neighbor_ranks(0, -1, -1) = rbuf2(1) + ib_neighbor_ranks(0, -1, +1) = rbuf2(2) + end if + + ! Round 2: exchange z face ranks with xy-diagonal edge neighbors -> corner ranks. Each of the 4 xy diagonals gives 2 + ! corners (the +/-z variants). Pattern: send buf2 to mirror diagonal, receive from this diagonal -> that edge's z face + ! ranks. + #:for DX, DY, MDX, MDY, TAG in [(1,1,-1,-1,320), (1,-1,-1,1,321), (-1,1,1,-1,322), (-1,-1,1,1,323)] + call MPI_SENDRECV(buf2, 2, MPI_INTEGER, merge(ib_neighbor_ranks(${MDX}$, ${MDY}$, 0), MPI_PROC_NULL, & + & ib_neighbor_ranks(${MDX}$, ${MDY}$, 0) >= 0), ${TAG}$, rbuf2, 2, MPI_INTEGER, & + & merge(ib_neighbor_ranks(${DX}$, ${DY}$, 0), MPI_PROC_NULL, ib_neighbor_ranks(${DX}$, ${DY}$, & + & 0) >= 0), ${TAG}$, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + if (ib_neighbor_ranks(${DX}$, ${DY}$, 0) >= 0) then + ib_neighbor_ranks(${DX}$, ${DY}$, -1) = rbuf2(1) + ib_neighbor_ranks(${DX}$, ${DY}$, +1) = rbuf2(2) + end if + #:endfor + end if + + ! For radius > 1: extend the table by iterative 26-neighbor full-table exchanges. In each round, every rank broadcasts its + ! current table to all 26 immediate neighbors. Their entry at offset (dx,dy,dz) from them = our entry at + ! (dx+sx,dy+sy,dz+sz). 
One extension round fills the entire next shell, so ax-1 rounds suffice. + if (ax > 1) then + allocate (send_table(-ax:ax,-ax:ax,-ax:ax)) + allocate (recv_tables(-ax:ax,-ax:ax,-ax:ax,1:26)) + + do k = 2, ax + send_table = ib_neighbor_ranks + + nreqs = 0 + nbr_idx = 0 + do sz = -1, 1 + do sy = -1, 1 + do sx = -1, 1 + if (sx == 0 .and. sy == 0 .and. sz == 0) cycle + nbr_idx = nbr_idx + 1 + if (ib_neighbor_ranks(sx, sy, sz) < 0) cycle + nreqs = nreqs + 1 + call MPI_IRECV(recv_tables(:,:,:,nbr_idx), (2*ax + 1)**3, MPI_INTEGER, ib_neighbor_ranks(sx, sy, sz), & + & 400, MPI_COMM_WORLD, requests(nreqs), ierr) + end do + end do + end do + + do sz = -1, 1 + do sy = -1, 1 + do sx = -1, 1 + if (sx == 0 .and. sy == 0 .and. sz == 0) cycle + if (ib_neighbor_ranks(sx, sy, sz) < 0) cycle + nreqs = nreqs + 1 + call MPI_ISEND(send_table, (2*ax + 1)**3, MPI_INTEGER, ib_neighbor_ranks(sx, sy, sz), 400, & + & MPI_COMM_WORLD, requests(nreqs), ierr) + end do + end do + end do + + call MPI_WAITALL(nreqs, requests, MPI_STATUSES_IGNORE, ierr) + + nbr_idx = 0 + do sz = -1, 1 + do sy = -1, 1 + do sx = -1, 1 + if (sx == 0 .and. sy == 0 .and. sz == 0) cycle + nbr_idx = nbr_idx + 1 + if (ib_neighbor_ranks(sx, sy, sz) < 0) cycle + do dz = -ax, ax + do dy = -ax, ax + do dx = -ax, ax + if (recv_tables(dx, dy, dz, nbr_idx) == MPI_PROC_NULL) cycle + if (dx + sx < -ax .or. dx + sx > ax) cycle + if (dy + sy < -ax .or. dy + sy > ax) cycle + if (dz + sz < -ax .or. dz + sz > ax) cycle + if (ib_neighbor_ranks(dx + sx, dy + sy, dz + sz) /= MPI_PROC_NULL) cycle + ib_neighbor_ranks(dx + sx, dy + sy, dz + sz) = recv_tables(dx, dy, dz, nbr_idx) + end do + end do + end do + end do + end do + end do + end do + + deallocate (send_table, recv_tables) + end if +#endif + + end subroutine s_compute_ib_neighbor_ranks + + subroutine get_neighbor_bounds() + + real(wp) :: beg_val, end_val, recv_val + integer :: k, send_neighbor, recv_neighbor, ierr + + ! 
Default: unbounded in all directions (covers single-rank and no-MPI cases) + + neighbor_domain_x%beg = -huge(0._wp) + neighbor_domain_x%end = huge(0._wp) + neighbor_domain_y%beg = -huge(0._wp) + neighbor_domain_y%end = huge(0._wp) + neighbor_domain_z%beg = -huge(0._wp) + neighbor_domain_z%end = huge(0._wp) + +#ifdef MFC_MPI + ! For each direction, propagate the left/right boundary edges outward ib_neighborhood_radius hops. After k rounds: beg_val = + ! left edge of the rank k hops to the left; end_val = right edge of the rank k hops to the right. + #:for X, ID, TAG, DIM in [('x', 1, 100, 'm'), ('y', 2, 102, 'n'), ('z', 3, 104, 'p')] + if (num_dims >= ${ID}$) then + beg_val = ${X}$_cb(-1) + end_val = ${X}$_cb(${DIM}$) + do k = 1, ib_neighborhood_radius + send_neighbor = merge(bc_${X}$%end, MPI_PROC_NULL, bc_${X}$%end >= 0) + recv_neighbor = merge(bc_${X}$%beg, MPI_PROC_NULL, bc_${X}$%beg >= 0) + recv_val = -huge(0._wp) + call MPI_SENDRECV(beg_val, 1, mpi_p, send_neighbor, ${TAG}$, recv_val, 1, mpi_p, recv_neighbor, ${TAG}$, & + & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + beg_val = recv_val + + send_neighbor = merge(bc_${X}$%beg, MPI_PROC_NULL, bc_${X}$%beg >= 0) + recv_neighbor = merge(bc_${X}$%end, MPI_PROC_NULL, bc_${X}$%end >= 0) + recv_val = huge(0._wp) + call MPI_SENDRECV(end_val, 1, mpi_p, send_neighbor, ${TAG + 1}$, recv_val, 1, mpi_p, recv_neighbor, & + & ${TAG + 1}$, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + end_val = recv_val + end do + neighbor_domain_${X}$%beg = beg_val + neighbor_domain_${X}$%end = end_val + end if + #:endfor +#endif + + end subroutine get_neighbor_bounds + end module m_start_up diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index ce0ae48c4b..701d7a3cb0 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -552,7 +552,6 @@ contains ! 
check if any IBMS are moving, and if so, update the markers, ghost points, levelsets, and levelset norms if (moving_immersed_boundary_flag) then call s_propagate_immersed_boundaries(s) - ! compute ib forces for fixed immersed boundaries if requested for output end if ! update the ghost fluid properties point values based on IB state @@ -564,10 +563,11 @@ contains end if end do - ! if (ib) then - if (moving_immersed_boundary_flag) call s_wrap_periodic_ibs() - if (ib_state_wrt .and. (.not. moving_immersed_boundary_flag)) then + if (moving_immersed_boundary_flag) then + call s_wrap_periodic_ibs() ! wraps the positions of IBs to the local proc + call s_handoff_ib_ownership() ! recomputes which ranks own which IBs and communicate to neighbors + else if (ib_state_wrt) then call s_compute_ib_forces(q_prim_vf, fluid_pp) end if end if @@ -708,11 +708,11 @@ contains integer, intent(in) :: s integer :: i - logical :: forces_computed + integer :: gbl_id ! used for analytic ib patch motion call nvtxStartRange("PROPAGATE-IMMERSED-BOUNDARIES") - forces_computed = .false. + if (moving_immersed_boundary_flag) call s_compute_ib_forces(q_prim_vf, fluid_pp) do i = 1, num_ibs if (s == 1) then @@ -726,11 +726,6 @@ contains ! Compute forces BEFORE the RK velocity blend so the device copy of patch_ib%vel matches the host (pre-blend) when ! velocity-dependent collision damping forces are evaluated on the GPU. - if (patch_ib(i)%moving_ibm == 2 .and. .not. forces_computed) then - call s_compute_ib_forces(q_prim_vf, fluid_pp) - forces_computed = .true. - end if - if (patch_ib(i)%moving_ibm > 0) then patch_ib(i)%vel = (rk_coef(s, 1)*patch_ib(i)%step_vel + rk_coef(s, 2)*patch_ib(i)%vel)/rk_coef(s, 4) patch_ib(i)%angular_vel = (rk_coef(s, 1)*patch_ib(i)%step_angular_vel + rk_coef(s, & @@ -748,8 +743,7 @@ contains & 3)*dt*patch_ib(i)%torque/rk_coef(s, 4)) ! add the torque to the angular momentum call s_compute_moment_of_inertia(i, patch_ib(i)%angular_vel) ! 
update the moment of inertia to be based on the direction of the angular momentum - patch_ib(i)%angular_vel = patch_ib(i)%angular_vel/patch_ib(i) & - & %moment ! convert back to angular velocity with the new moment of inertia + patch_ib(i)%angular_vel = patch_ib(i)%angular_vel/patch_ib(i)%moment end if ! Update the angle of the IB diff --git a/toolchain/mfc/case.py b/toolchain/mfc/case.py index 65c49f8304..7aed9212dc 100644 --- a/toolchain/mfc/case.py +++ b/toolchain/mfc/case.py @@ -295,7 +295,7 @@ def rhs_replace(match): # each element separated by new line characters. Then write those # new lines as a fully concatenated string with fortran syntax srcs.append(f"""\ - if (i == {pid}) then + if (gbl_id == {pid}) then {f"{chr(10)}".join(lines)} end if\ """) @@ -305,6 +305,7 @@ def rhs_replace(match): ! parameterize the velocity and rotation rate of a moving IB. #:def mib_analytical() +gbl_id = patch_ib(i)%gbl_patch_id {f"{chr(10)}{chr(10)}".join(srcs)} #:enddef """ diff --git a/toolchain/mfc/params/definitions.py b/toolchain/mfc/params/definitions.py index ef04ad73c2..d9bd8a0016 100644 --- a/toolchain/mfc/params/definitions.py +++ b/toolchain/mfc/params/definitions.py @@ -226,6 +226,7 @@ def _fc(name: str) -> int: "hyperelasticity": "Enable hyperelastic model", "relativity": "Enable special relativity", "ib": "Enable immersed boundaries", + "ib_neighborhood_radius": "Neighborhood radius in ranks for IB awareness", "collision_model": "Collision model for immersed boundaries (0=none, 1=soft sphere)", "coefficient_of_restitution": "Coefficient of restitution for IB collisions", "collision_time": "Characteristic collision time for IB collisions", @@ -664,6 +665,7 @@ def get_value_label(param_name: str, value: int) -> str: "num_fluids": {"min": 1, "max": NF}, "num_patches": {"min": 0, "max": NUM_PATCHES_MAX}, "num_ibs": {"min": 0}, + "ib_neighborhood_radius": {"min": 1}, "num_source": {"min": 1}, "num_probes": {"min": 1}, "num_integrals": {"min": 1}, @@ -915,6 +917,7 @@ 
def _load(): # Immersed boundary _r("num_ibs", INT, {"ib"}) + _r("ib_neighborhood_radius", INT, {"ib"}) _r("ib", LOG, {"ib"}) _r("collision_model", INT, {"ib"}) _r("coefficient_of_restitution", REAL, {"ib"}) diff --git a/toolchain/mfc/params/descriptions.py b/toolchain/mfc/params/descriptions.py index dd542a8d1a..eebaf6d58a 100644 --- a/toolchain/mfc/params/descriptions.py +++ b/toolchain/mfc/params/descriptions.py @@ -132,6 +132,7 @@ # Immersed boundaries "ib": "Enable immersed boundary method", "num_ibs": "Number of immersed boundary patches", + "ib_neighborhood_radius": "Neighborhood radius in ranks for IB awareness", # Acoustic sources "acoustic_source": "Enable acoustic source terms", "num_source": "Number of acoustic sources",