diff --git a/.github/workflows/linux-ci.yml b/.github/workflows/linux-ci.yml
index c5e0c9aaf2..527a601dd9 100644
--- a/.github/workflows/linux-ci.yml
+++ b/.github/workflows/linux-ci.yml
@@ -692,4 +692,23 @@ jobs:
         make check
 
 
+  test-linux-lto:
+    runs-on: ubuntu-24.04
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Install clang, lld and llvm
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y clang lld llvm
+
+    - name: Build and test (whole-program ThinLTO, je_ prefix)
+      run: |
+        autoconf
+        CC=clang AR=llvm-ar NM=llvm-nm RANLIB=llvm-ranlib \
+          ./configure --with-jemalloc-prefix=je_ EXTRA_CFLAGS=-flto=thin
+        make -j3 EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld"
+        make -j3 tests EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld"
+        make check
+
 
diff --git a/Makefile.in b/Makefile.in
index 459f98fb04..ba24f84494 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -334,6 +334,14 @@ ifeq (@enable_experimental_smallocx@, 1)
 TESTS_INTEGRATION += \
   $(srcroot)test/integration/smallocx.c
 endif
+# tcache_fiber_migration is a standalone LTO reproducer (issue #2890): the
+# dedicated rule below links it against the static archive (--whole-archive) so
+# the allocator fastpath inlines next to the swapcontext.  Needs that archive,
+# hence the enable_static guard; reproduces the bug only under whole-program
+# LTO, a no-op guard otherwise.
+ifeq ($(enable_static), 1)
+TESTS_INTEGRATION += $(srcroot)test/integration/tcache_fiber_migration.c
+endif
 ifeq (@enable_cxx@, 1)
 CPP_SRCS := $(srcroot)src/jemalloc_cpp.cpp
 TESTS_INTEGRATION_CPP := $(srcroot)test/integration/cpp/basic.cpp \
@@ -564,6 +572,14 @@ $(objroot)test/integration/%$(EXE): $(objroot)test/integration/%.$(O) $(C_TESTLI
 	@mkdir -p $(@D)
 	$(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LJEMALLOC) $(LDFLAGS) $(filter-out -lm,$(filter -lrt -pthread -lstdc++,$(LIBS))) $(LM) $(EXTRA_LDFLAGS)
 
+# tcache_fiber_migration (issue #2890) must inline the allocator next to the
+# swapcontext, so link jemalloc statically (--whole-archive), without the test
+# harness/shared lib; whole-program LTO (from the build's flags) does the
+# inlining.  Explicit rule -- overrides the generic integration rule above.
+$(objroot)test/integration/tcache_fiber_migration$(EXE): $(objroot)test/integration/tcache_fiber_migration.$(O) $(objroot)lib/$(LIBJEMALLOC).$(A)
+	@mkdir -p $(@D)
+	$(CC) $(LDTARGET) $(objroot)test/integration/tcache_fiber_migration.$(O) -Wl,--whole-archive $(objroot)lib/$(LIBJEMALLOC).$(A) -Wl,--no-whole-archive $(LDFLAGS) -pthread $(filter -lrt -lstdc++,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)
+
 $(objroot)test/integration/cpp/%$(EXE): $(objroot)test/integration/cpp/%.$(O) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB)
 	@mkdir -p $(@D)
 	$(CXX) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) $(LDFLAGS) $(filter-out -lm,$(LIBS)) -lm $(EXTRA_LDFLAGS)
diff --git a/configure.ac b/configure.ac
index e57d0667e4..5ef440e8f3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2775,6 +2775,8 @@ if test "x${je_cv_tls_model}" = "xyes" -a \
   AC_DEFINE([JEMALLOC_TLS_MODEL],
             [__attribute__((tls_model("initial-exec")))],
             [ ])
+  AC_DEFINE([JEMALLOC_TLS_MODEL_INITIAL_EXEC], [ ],
+            [Defined when the TSD thread-locals use the initial-exec (static) TLS model, i.e. live at a fixed offset from the thread pointer.])
 else
   AC_DEFINE([JEMALLOC_TLS_MODEL], [ ], [ ])
 fi
diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in
index 31ae2e8ed2..841ae94233 100644
--- a/include/jemalloc/internal/jemalloc_internal_defs.h.in
+++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in
@@ -148,6 +148,13 @@
 /* Non-empty if the tls_model attribute is supported. */
 #undef JEMALLOC_TLS_MODEL
 
+/*
+ * Defined when the TSD thread-locals use the initial-exec (static) TLS model,
+ * i.e. live at a fixed offset from the thread pointer.  Gates the fast path of
+ * JEMALLOC_TLS_ADDR (see tsd_internals.h).
+ */
+#undef JEMALLOC_TLS_MODEL_INITIAL_EXEC
+
 /*
  * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables
  * inline functions.
diff --git a/include/jemalloc/internal/tsd_internals.h b/include/jemalloc/internal/tsd_internals.h
index f675587d73..55fac2c0e7 100644
--- a/include/jemalloc/internal/tsd_internals.h
+++ b/include/jemalloc/internal/tsd_internals.h
@@ -20,6 +20,92 @@
 #include "jemalloc/internal/util.h"
 #include "jemalloc/internal/witness.h"
 
+/*
+ * JEMALLOC_TLS_ADDR(tlsvar): the address of a thread-local, read so the compiler
+ * cannot hoist it across a user-space context switch (swapcontext, ucontext /
+ * coroutine runtimes).  A plain `&tlsvar` is loop-invariant, so under LTO the
+ * compiler may hoist the read out of the inlined `malloc`/`free` and keep it in
+ * a register; if the context then resumes on another OS thread, that stale
+ * tsd/tcache belongs to the previous thread and the two race on it -- heap
+ * corruption visible only under LTO.  See
+ * https://github.com/jemalloc/jemalloc/issues/2890
+ *
+ * Fast path (static TLS only, hence gated on JEMALLOC_TLS_MODEL_INITIAL_EXEC --
+ * jemalloc's default): the address is `thread_pointer + offset` with a
+ * thread-independent offset.  The offset is captured once at runtime into a
+ * global by a noinline helper (so both of its terms are evaluated at a single
+ * point on a single thread), and the thread pointer is re-read on every call
+ * with a `volatile` asm the optimizer may not hoist or CSE (per-arch reads from
+ * mimalloc's mi_prim_tls_slot).  The offset must NOT be computed inline as
+ * `&tlsvar - thread_pointer`: the compiler can hoist the (stale) `&tlsvar`
+ * while re-materializing the thread-pointer terms, and `fresh_tp + (stale_addr
+ * - fresh_tp)` cancels the fresh read back to the stale address.
+ *
+ * Otherwise (dynamic TLS models, unlisted arch, MinGW): a per-variable noinline
+ * accessor takes the address behind a `memory` barrier -- an opaque call the
+ * optimizer can neither inline, prove pure, nor hoist/CSE across the switch.
+ * Correct for every TLS model, at one real call per access.  Non-GNU compilers
+ * (MSVC) have no inline asm and keep the plain `&tlsvar`.
+ */
+#if defined(__GNUC__) && !defined(_WIN32) &&                                   \
+    defined(JEMALLOC_TLS_MODEL_INITIAL_EXEC) &&                                \
+    (defined(__aarch64__) || defined(__arm__) || defined(__x86_64__) ||        \
+    defined(__i386__))
+JEMALLOC_ALWAYS_INLINE char *
+jemalloc_thread_pointer(void) {
+	char *thread_pointer;
+#  if defined(__aarch64__) && defined(__APPLE__)
+	__asm__ __volatile__("mrs %0, tpidrro_el0\n\tbic %0, %0, #7" : "=r"(thread_pointer));
+#  elif defined(__aarch64__)
+	__asm__ __volatile__("mrs %0, tpidr_el0" : "=r"(thread_pointer));
+#  elif defined(__arm__)
+	__asm__ __volatile__("mrc p15, 0, %0, c13, c0, 3\n\tbic %0, %0, #3" : "=r"(thread_pointer));
+#  elif defined(__x86_64__) && defined(__APPLE__)
+	__asm__ __volatile__("movq %%gs:0, %0" : "=r"(thread_pointer));
+#  elif defined(__x86_64__)
+	__asm__ __volatile__("movq %%fs:0, %0" : "=r"(thread_pointer));
+#  else /* __i386__ */
+	__asm__ __volatile__("movl %%gs:0, %0" : "=r"(thread_pointer));
+#  endif
+	return thread_pointer;
+}
+/* 1 is unreachable: tlsvar and the thread pointer are at least 4-aligned. */
+#  define JEMALLOC_TLS_OFFSET_UNINITIALIZED 1
+#  define JEMALLOC_TLS_ADDR_DEFINE(tlsvar)                                      \
+	static UNUSED intptr_t jemalloc_tls_offset_##tlsvar =                   \
+	    JEMALLOC_TLS_OFFSET_UNINITIALIZED;                                  \
+	static UNUSED JEMALLOC_NOINLINE intptr_t                                \
+	jemalloc_tls_offset_init_##tlsvar(void) {                               \
+		intptr_t tls_offset = (intptr_t)((char *)&(tlsvar) -            \
+		    jemalloc_thread_pointer());                                 \
+		jemalloc_tls_offset_##tlsvar = tls_offset;                      \
+		return tls_offset;                                              \
+	}                                                                       \
+	JEMALLOC_ALWAYS_INLINE __typeof__(&(tlsvar))                            \
+	jemalloc_tls_addr_##tlsvar(void) {                                      \
+		intptr_t tls_offset = jemalloc_tls_offset_##tlsvar;             \
+		if (unlikely(tls_offset ==                                      \
+		    JEMALLOC_TLS_OFFSET_UNINITIALIZED)) {                       \
+			tls_offset = jemalloc_tls_offset_init_##tlsvar();       \
+		}                                                               \
+		return (__typeof__(&(tlsvar)))(jemalloc_thread_pointer() +     \
+		    tls_offset);                                                \
+	}
+#  define JEMALLOC_TLS_ADDR(tlsvar) (jemalloc_tls_addr_##tlsvar())
+#elif defined(__GNUC__)
+#  define JEMALLOC_TLS_ADDR_DEFINE(tlsvar)                                      \
+	static UNUSED JEMALLOC_NOINLINE __typeof__(&(tlsvar))                   \
+	jemalloc_tls_addr_##tlsvar(void) {                                      \
+		__typeof__(&(tlsvar)) tls_addr = &(tlsvar);                     \
+		__asm__ __volatile__("" : "+r"(tls_addr) : : "memory");         \
+		return tls_addr;                                                \
+	}
+#  define JEMALLOC_TLS_ADDR(tlsvar) (jemalloc_tls_addr_##tlsvar())
+#else
+#  define JEMALLOC_TLS_ADDR_DEFINE(tlsvar)
+#  define JEMALLOC_TLS_ADDR(tlsvar) (&(tlsvar))
+#endif
+
 /*
  * Thread-Specific-Data layout
  *
diff --git a/include/jemalloc/internal/tsd_malloc_thread_cleanup.h b/include/jemalloc/internal/tsd_malloc_thread_cleanup.h
index 00756df1d0..10d3579b02 100644
--- a/include/jemalloc/internal/tsd_malloc_thread_cleanup.h
+++ b/include/jemalloc/internal/tsd_malloc_thread_cleanup.h
@@ -13,6 +13,8 @@ extern JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls;
 extern JEMALLOC_TSD_TYPE_ATTR(bool) tsd_initialized;
 extern bool tsd_booted;
 
+JEMALLOC_TLS_ADDR_DEFINE(tsd_tls)
+
 /* Initialization/cleanup. */
 JEMALLOC_ALWAYS_INLINE bool
 tsd_cleanup_wrapper(void) {
@@ -53,7 +55,7 @@ tsd_get_allocates(void) {
 /* Get/set. */
 JEMALLOC_ALWAYS_INLINE tsd_t *
 tsd_get(bool init) {
-	return &tsd_tls;
+	return JEMALLOC_TLS_ADDR(tsd_tls);
 }
 JEMALLOC_ALWAYS_INLINE void
 tsd_set(tsd_t *val) {
diff --git a/include/jemalloc/internal/tsd_tls.h b/include/jemalloc/internal/tsd_tls.h
index 6536eb540c..968d1c0bdf 100644
--- a/include/jemalloc/internal/tsd_tls.h
+++ b/include/jemalloc/internal/tsd_tls.h
@@ -13,6 +13,8 @@ extern JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls;
 extern pthread_key_t tsd_tsd;
 extern bool          tsd_booted;
 
+JEMALLOC_TLS_ADDR_DEFINE(tsd_tls)
+
 /* Initialization/cleanup. */
 JEMALLOC_ALWAYS_INLINE bool
 tsd_boot0(void) {
@@ -46,7 +48,7 @@ tsd_get_allocates(void) {
 /* Get/set. */
 JEMALLOC_ALWAYS_INLINE tsd_t *
 tsd_get(bool init) {
-	return &tsd_tls;
+	return JEMALLOC_TLS_ADDR(tsd_tls);
 }
 
 JEMALLOC_ALWAYS_INLINE void
diff --git a/include/jemalloc/internal/tsd_win.h b/include/jemalloc/internal/tsd_win.h
index 8b22bec18d..d06cc27e7e 100644
--- a/include/jemalloc/internal/tsd_win.h
+++ b/include/jemalloc/internal/tsd_win.h
@@ -178,6 +178,8 @@ tsd_set(tsd_t *val) {
 extern JEMALLOC_TSD_TYPE_ATTR(tsd_wrapper_t) tsd_wrapper_tls;
 extern bool tsd_booted;
 
+JEMALLOC_TLS_ADDR_DEFINE(tsd_wrapper_tls)
+
 /* Initialization/cleanup. */
 JEMALLOC_ALWAYS_INLINE bool
 tsd_cleanup_wrapper(void) {
@@ -223,7 +225,7 @@ tsd_get_allocates(void) {
 /* Get/set. */
 JEMALLOC_ALWAYS_INLINE tsd_t *
 tsd_get(bool init) {
-	return &(tsd_wrapper_tls.val);
+	return &(JEMALLOC_TLS_ADDR(tsd_wrapper_tls)->val);
 }
 
 JEMALLOC_ALWAYS_INLINE void
diff --git a/scripts/gen_gh_actions.py b/scripts/gen_gh_actions.py
index 4c5474ab7f..d40fc7f321 100755
--- a/scripts/gen_gh_actions.py
+++ b/scripts/gen_gh_actions.py
@@ -632,6 +632,34 @@ def generate_freebsd_job(arch):
     return job
 
 
+def generate_linux_lto_job():
+    """Dedicated lane: whole-program ThinLTO + the je_ public prefix, statically
+    linked.  This is the configuration under which the tcache_fiber_migration
+    reproducer (issue #2890) actually exercises the bug -- the allocator
+    fastpath must be inlined next to the swapcontext, which only happens with
+    static linking under LTO.  llvm-ar/nm/ranlib are needed to archive the LTO
+    bitcode; -fuse-ld=lld to link it."""
+    return """  test-linux-lto:
+    runs-on: ubuntu-24.04
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Install clang, lld and llvm
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y clang lld llvm
+
+    - name: Build and test (whole-program ThinLTO, je_ prefix)
+      run: |
+        autoconf
+        CC=clang AR=llvm-ar NM=llvm-nm RANLIB=llvm-ranlib \\
+          ./configure --with-jemalloc-prefix=je_ EXTRA_CFLAGS=-flto=thin
+        make -j3 EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld"
+        make -j3 tests EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld"
+        make check
+"""
+
+
 def main():
     import sys
 
@@ -642,6 +670,7 @@ def main():
         jobs = '\n'.join((
             generate_linux_job(AMD64),
             generate_linux_job(ARM64),
+            generate_linux_lto_job(),
         ))
         print(GITHUB_ACTIONS_TEMPLATE.format(name='Linux CI', jobs=jobs))
 
@@ -665,6 +694,7 @@ def main():
         linux_jobs = '\n'.join((
             generate_linux_job(AMD64),
             generate_linux_job(ARM64),
+            generate_linux_lto_job(),
         ))
         macos_jobs = '\n'.join((
             generate_macos_job(AMD64),   # Intel
diff --git a/test/integration/tcache_fiber_migration.c b/test/integration/tcache_fiber_migration.c
new file mode 100644
index 0000000000..e374af1f60
--- /dev/null
+++ b/test/integration/tcache_fiber_migration.c
@@ -0,0 +1,139 @@
+/*
+ * Standalone regression test for the thread-pointer hoisting bug (issue #2890):
+ * under whole-program LTO the inlined allocator fastpath caches the TLS-derived
+ * tcache base in a callee-saved register; a fiber resumed on a different OS
+ * thread then frees/allocs against the previous thread's tcache -> heap
+ * corruption.  See JEMALLOC_TLS_ADDR in tsd_internals.h.
+ *
+ * Reproducing requires the allocator inlined next to the swapcontext, so:
+ *   - call jemalloc directly (via JEMALLOC_MANGLE, so malloc/free bind to the
+ *     configured-prefix symbols, not the libc wrappers, which do not inline);
+ *   - link jemalloc *statically* (Makefile.in links this one test against
+ *     libjemalloc.a with --whole-archive);
+ *   - build with whole-program LTO.
+ * Hence it is a deliberately standalone program (no test harness, so it can be
+ * static-linked without symbol clashes).  Without LTO it runs clean, so it is a
+ * no-op guard there.
+ */
+#include <pthread.h>
+#include <stdbool.h>
+#include <ucontext.h>
+
+#define JEMALLOC_MANGLE
+#include <jemalloc/jemalloc.h>
+
+enum {
+	num_fibers = 128,
+	num_workers = 8,
+	ops_per_fiber = 20 * 1000,
+	fiber_stack_size = 1 << 16,
+	ready_queue_capacity = 512 /* power of two, > num_fibers */
+};
+
+static ucontext_t fiber_context[num_fibers];
+/* Scheduler context to switch back to when a fiber yields; the resuming worker
+ * writes its own context here, so this changes when the fiber migrates. */
+static ucontext_t *return_context[num_fibers];
+static unsigned fiber_remaining_ops[num_fibers];
+static bool fiber_done[num_fibers];
+
+static unsigned ready_queue[ready_queue_capacity];
+static unsigned ready_queue_head;
+static unsigned ready_queue_tail;
+static unsigned live_fibers;
+static pthread_mutex_t queue_mtx = PTHREAD_MUTEX_INITIALIZER;
+
+static void
+push_ready_fiber(unsigned id) {
+	pthread_mutex_lock(&queue_mtx);
+	ready_queue[ready_queue_head++ & (ready_queue_capacity - 1)] = id;
+	pthread_mutex_unlock(&queue_mtx);
+}
+
+/* Returns a ready fiber id, -1 if none ready now, or -2 if all have finished. */
+static int
+pop_ready_fiber(void) {
+	int id;
+	pthread_mutex_lock(&queue_mtx);
+	if (live_fibers == 0) {
+		id = -2;
+	} else if (ready_queue_tail != ready_queue_head) {
+		id = (int)ready_queue[ready_queue_tail++ & (ready_queue_capacity - 1)];
+	} else {
+		id = -1;
+	}
+	pthread_mutex_unlock(&queue_mtx);
+	return id;
+}
+
+static void
+fiber_run(int id) {
+	void *p = malloc(64);
+	while (fiber_remaining_ops[id] > 0) {
+		fiber_remaining_ops[id]--;
+		free(p); /* push to the CURRENT thread's tcache */
+		/* Yield; may be resumed on a different worker thread. */
+		swapcontext(&fiber_context[id], return_context[id]);
+		p = malloc(64); /* pop; must come from the NEW thread's tcache */
+		*(char *)p = (char)id;
+	}
+	free(p);
+	pthread_mutex_lock(&queue_mtx);
+	fiber_done[id] = true;
+	live_fibers--;
+	pthread_mutex_unlock(&queue_mtx);
+	swapcontext(&fiber_context[id], return_context[id]);
+}
+
+static void *
+worker_thread(void *arg) {
+	ucontext_t scheduler_context;
+	(void)arg;
+	for (;;) {
+		int id = pop_ready_fiber();
+		if (id == -2) {
+			break;
+		}
+		if (id < 0) {
+			continue;
+		}
+		return_context[id] = &scheduler_context;
+		swapcontext(&scheduler_context, &fiber_context[id]);
+		if (!fiber_done[id]) {
+			push_ready_fiber((unsigned)id);
+		}
+	}
+	return NULL;
+}
+
+int
+main(void) {
+	void *stacks[num_fibers];
+	pthread_t threads[num_workers];
+	unsigned i;
+
+	live_fibers = num_fibers;
+	for (i = 0; i < num_fibers; i++) {
+		stacks[i] = malloc(fiber_stack_size);
+		fiber_remaining_ops[i] = ops_per_fiber;
+		getcontext(&fiber_context[i]);
+		fiber_context[i].uc_stack.ss_sp = stacks[i];
+		fiber_context[i].uc_stack.ss_size = fiber_stack_size;
+		fiber_context[i].uc_link = NULL;
+		makecontext(&fiber_context[i], (void (*)(void))fiber_run, 1, (int)i);
+		push_ready_fiber(i);
+	}
+
+	for (i = 0; i < num_workers; i++) {
+		pthread_create(&threads[i], NULL, worker_thread, NULL);
+	}
+	for (i = 0; i < num_workers; i++) {
+		pthread_join(threads[i], NULL);
+	}
+	for (i = 0; i < num_fibers; i++) {
+		free(stacks[i]);
+	}
+
+	/* Completing without a tcache-corruption crash is success. */
+	return live_fibers == 0 ? 0 : 1;
+}