From 07e318626a9de83a543e3c361fd674cc96c6ba07 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin <a3at.mail@gmail.com>
Date: Tue, 9 Jun 2026 21:47:54 +0200
Subject: [PATCH 1/5] test: add `tcache_fiber_migration` standalone LTO
 reproducer

A worker-thread pool runs ucontext fibers that each do free/swapcontext/malloc
in one frame, so a fiber is routinely resumed on a different OS thread than it
suspended on.  With the tsd hoisting bug and a whole-program-LTO build the
inlined fastpath frees/allocates against the previous thread's tcache and the
process crashes; with the fix it runs to completion.

Reproducing requires the allocator inlined next to the swapcontext, so:
  - it is a standalone program (no test harness, so it can be static-linked
    without symbol clashes) that calls jemalloc via JEMALLOC_MANGLE, so
    malloc/free bind to the configured-prefix symbols (the libc wrappers do not
    inline) -- independent of --with-jemalloc-prefix;
  - Makefile.in links this one test against libjemalloc.a with --whole-archive
    (guarded by enable_static), not the shared library;
  - the bug only manifests under whole-program LTO -- in any other build the
    test is just a no-op guard.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 Makefile.in                               |  16 +++
 test/integration/tcache_fiber_migration.c | 139 ++++++++++++++++++++++
 2 files changed, 155 insertions(+)
 create mode 100644 test/integration/tcache_fiber_migration.c

diff --git a/Makefile.in b/Makefile.in
index 459f98fb04..92cc307b4a 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -334,6 +334,14 @@ ifeq (@enable_experimental_smallocx@, 1)
 TESTS_INTEGRATION += \
   $(srcroot)test/integration/smallocx.c
 endif
+# tcache_fiber_migration is a standalone LTO reproducer (issue #2890): the
+# dedicated rule below links it against the static archive (--whole-archive) so
+# the allocator fastpath inlines next to the swapcontext.  Needs that archive
+# (enable_static) and an ELF target: macOS/Windows builds cannot rely on
+# --whole-archive plus <ucontext.h>.  A no-op guard unless built with LTO.
+ifeq ($(enable_static)-$(ABI), 1-elf)
+TESTS_INTEGRATION += $(srcroot)test/integration/tcache_fiber_migration.c
+endif
 ifeq (@enable_cxx@, 1)
 CPP_SRCS := $(srcroot)src/jemalloc_cpp.cpp
 TESTS_INTEGRATION_CPP := $(srcroot)test/integration/cpp/basic.cpp \
@@ -564,6 +572,14 @@ $(objroot)test/integration/%$(EXE): $(objroot)test/integration/%.$(O) $(C_TESTLI
 	@mkdir -p $(@D)
 	$(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LJEMALLOC) $(LDFLAGS) $(filter-out -lm,$(filter -lrt -pthread -lstdc++,$(LIBS))) $(LM) $(EXTRA_LDFLAGS)
 
+# tcache_fiber_migration (issue #2890) must inline the allocator next to the
+# swapcontext, so link jemalloc statically (--whole-archive), without the test
+# harness/shared lib; whole-program LTO (from the build's flags) does the
+# inlining.  Explicit rule -- overrides the generic integration rule above.
+$(objroot)test/integration/tcache_fiber_migration$(EXE): $(objroot)test/integration/tcache_fiber_migration.$(O) $(objroot)lib/$(LIBJEMALLOC).$(A)
+	@mkdir -p $(@D)
+	$(CC) $(LDTARGET) $(objroot)test/integration/tcache_fiber_migration.$(O) -Wl,--whole-archive $(objroot)lib/$(LIBJEMALLOC).$(A) -Wl,--no-whole-archive $(LDFLAGS) -pthread $(LM) $(EXTRA_LDFLAGS)
+
 $(objroot)test/integration/cpp/%$(EXE): $(objroot)test/integration/cpp/%.$(O) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB)
 	@mkdir -p $(@D)
 	$(CXX) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) $(LDFLAGS) $(filter-out -lm,$(LIBS)) -lm $(EXTRA_LDFLAGS)
diff --git a/test/integration/tcache_fiber_migration.c b/test/integration/tcache_fiber_migration.c
new file mode 100644
index 0000000000..e374af1f60
--- /dev/null
+++ b/test/integration/tcache_fiber_migration.c
@@ -0,0 +1,139 @@
+/*
+ * Standalone regression test for the thread-pointer hoisting bug (issue #2890):
+ * under whole-program LTO the inlined allocator fastpath caches the TLS-derived
+ * tcache base in a callee-saved register; a fiber resumed on a different OS
+ * thread then frees/allocs against the previous thread's tcache -> heap
+ * corruption.  See JEMALLOC_TLS_ADDR in tsd_internals.h.
+ *
+ * Reproducing requires the allocator inlined next to the swapcontext, so:
+ *   - call jemalloc directly (via JEMALLOC_MANGLE, so malloc/free bind to the
+ *     configured-prefix symbols, not the libc wrappers, which do not inline);
+ *   - link jemalloc *statically* (Makefile.in links this one test against
+ *     libjemalloc.a with --whole-archive);
+ *   - build with whole-program LTO.
+ * Hence it is a deliberately standalone program (no test harness, so it can be
+ * static-linked without symbol clashes).  Without LTO it runs clean, so it is a
+ * no-op guard there.
+ */
+#include <pthread.h>
+#include <stdbool.h>
+#include <ucontext.h>
+
+#define JEMALLOC_MANGLE
+#include <jemalloc/jemalloc.h>
+
+enum {
+	num_fibers = 128,
+	num_workers = 8,
+	ops_per_fiber = 20 * 1000,
+	fiber_stack_size = 1 << 16,
+	ready_queue_capacity = 512 /* power of two, > num_fibers */
+};
+
+static ucontext_t fiber_context[num_fibers];
+/* Scheduler context to switch back to when a fiber yields; the resuming worker
+ * writes its own context here, so this changes when the fiber migrates. */
+static ucontext_t *return_context[num_fibers];
+static unsigned fiber_remaining_ops[num_fibers];
+static bool fiber_done[num_fibers];
+
+static unsigned ready_queue[ready_queue_capacity];
+static unsigned ready_queue_head;
+static unsigned ready_queue_tail;
+static unsigned live_fibers;
+static pthread_mutex_t queue_mtx = PTHREAD_MUTEX_INITIALIZER;
+
+/* Enqueues a fiber id (ring capacity > num_fibers, so it cannot fill). */
+static void
+push_ready_fiber(unsigned id) {
+	pthread_mutex_lock(&queue_mtx);
+	ready_queue[ready_queue_head++ % ready_queue_capacity] = id;
+	pthread_mutex_unlock(&queue_mtx);
+}
+
+/* Dequeues a ready fiber id; -1 means none ready now, -2 means all done. */
+static int
+pop_ready_fiber(void) {
+	int id = -1;
+
+	pthread_mutex_lock(&queue_mtx);
+	if (live_fibers == 0) {
+		id = -2;
+	} else if (ready_queue_head != ready_queue_tail) {
+		id = (int)ready_queue[ready_queue_tail++ % ready_queue_capacity];
+	}
+	pthread_mutex_unlock(&queue_mtx);
+	return id;
+}
+
+static void
+fiber_run(int id) {
+	void *p = malloc(64); /* freed before, re-allocated after, each yield */
+	while (fiber_remaining_ops[id] > 0) {
+		fiber_remaining_ops[id]--;
+		free(p); /* push to the CURRENT thread's tcache */
+		/* Yield; may be resumed on a different worker thread. */
+		swapcontext(&fiber_context[id], return_context[id]);
+		p = malloc(64); /* pop; must come from the NEW thread's tcache */
+		*(char *)p = (char)id; /* write through the fresh block */
+	}
+	free(p);
+	pthread_mutex_lock(&queue_mtx);
+	fiber_done[id] = true; /* the resuming worker checks this flag */
+	live_fibers--; /* reaching 0 makes pop_ready_fiber() return -2 */
+	pthread_mutex_unlock(&queue_mtx);
+	swapcontext(&fiber_context[id], return_context[id]); /* final yield */
+}
+
+/* Worker loop: resume ready fibers, re-queueing each one that only yielded. */
+static void *
+worker_thread(void *arg) {
+	ucontext_t scheduler_context;
+	int id;
+
+	(void)arg;
+	while ((id = pop_ready_fiber()) != -2) {
+		if (id < 0) {
+			continue; /* nothing runnable right now; poll again */
+		}
+		return_context[id] = &scheduler_context;
+		swapcontext(&scheduler_context, &fiber_context[id]);
+		/* Back on this thread: the fiber yielded or finished. */
+		if (!fiber_done[id]) {
+			push_ready_fiber((unsigned)id);
+		}
+	}
+	return NULL;
+}
+
+int
+main(void) {
+	void *stacks[num_fibers];
+	pthread_t threads[num_workers];
+	unsigned i, started = 0;
+	live_fibers = num_fibers;
+	for (i = 0; i < num_fibers; i++) {
+		stacks[i] = malloc(fiber_stack_size);
+		if (stacks[i] == NULL || getcontext(&fiber_context[i]) != 0) {
+			return 2; /* setup failure, not the tcache crash */
+		}
+		fiber_remaining_ops[i] = ops_per_fiber;
+		fiber_context[i].uc_stack.ss_sp = stacks[i];
+		fiber_context[i].uc_stack.ss_size = fiber_stack_size;
+		fiber_context[i].uc_link = NULL;
+		makecontext(&fiber_context[i], (void (*)(void))fiber_run, 1, (int)i);
+		push_ready_fiber(i);
+	}
+	while (started < num_workers && pthread_create(&threads[started], NULL,
+	    worker_thread, NULL) == 0) {
+		started++; /* only threads counted here are joined below */
+	}
+	for (i = 0; i < started; i++) {
+		pthread_join(threads[i], NULL);
+	}
+	for (i = 0; i < num_fibers; i++) {
+		free(stacks[i]);
+	}
+	/* Clean completion is success; failure is 2 (test.sh reads 1 as skip). */
+	return (started == num_workers && live_fibers == 0) ? 0 : 2;
+}

From 89ee23c166cd3c1c5cb60d15e71d92c1f3a8b484 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin <a3at.mail@gmail.com>
Date: Tue, 9 Jun 2026 21:47:54 +0200
Subject: [PATCH 2/5] ci: add a Linux whole-program-LTO lane (`test-linux-lto`)

The `tcache_fiber_migration` reproducer (issue #2890) only exercises the bug
when the allocator is statically linked and inlined under whole-program LTO.
None of the existing lanes build that way, so add a dedicated Linux lane:
clang ThinLTO, `--with-jemalloc-prefix=je_`, and llvm-ar/nm/ranlib (to archive
the LTO bitcode) + `-fuse-ld=lld`, then `make check`.

Authored in scripts/gen_gh_actions.py; .github/workflows/linux-ci.yml
regenerated.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/linux-ci.yml | 19 +++++++++++++++++++
 scripts/gen_gh_actions.py      | 30 ++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/.github/workflows/linux-ci.yml b/.github/workflows/linux-ci.yml
index c5e0c9aaf2..527a601dd9 100644
--- a/.github/workflows/linux-ci.yml
+++ b/.github/workflows/linux-ci.yml
@@ -692,4 +692,23 @@ jobs:
         make check
 
 
+  test-linux-lto:
+    runs-on: ubuntu-24.04
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Install clang, lld and llvm
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y clang lld llvm
+
+    - name: Build and test (whole-program ThinLTO, je_ prefix)
+      run: |
+        autoconf
+        CC=clang AR=llvm-ar NM=llvm-nm RANLIB=llvm-ranlib \
+          ./configure --with-jemalloc-prefix=je_ EXTRA_CFLAGS=-flto=thin
+        make -j3 EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld"
+        make -j3 tests EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld"
+        make check
+
 
diff --git a/scripts/gen_gh_actions.py b/scripts/gen_gh_actions.py
index 4c5474ab7f..d40fc7f321 100755
--- a/scripts/gen_gh_actions.py
+++ b/scripts/gen_gh_actions.py
@@ -632,6 +632,34 @@ def generate_freebsd_job(arch):
     return job
 
 
+def generate_linux_lto_job():
+    """Return the YAML for the dedicated `test-linux-lto` lane: whole-program
+    ThinLTO + the je_ public prefix, with the reproducer statically linked.
+    This is the configuration under which the tcache_fiber_migration reproducer
+    (issue #2890) actually exercises the bug -- the allocator fastpath must be
+    inlined next to the swapcontext, which only happens with static linking
+    under LTO.  llvm-ar/nm/ranlib archive the LTO bitcode; lld links it."""
+    return """  test-linux-lto:
+    runs-on: ubuntu-24.04
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Install clang, lld and llvm
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y clang lld llvm
+
+    - name: Build and test (whole-program ThinLTO, je_ prefix)
+      run: |
+        autoconf
+        CC=clang AR=llvm-ar NM=llvm-nm RANLIB=llvm-ranlib \\
+          ./configure --with-jemalloc-prefix=je_ EXTRA_CFLAGS=-flto=thin
+        make -j3 EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld"
+        make -j3 tests EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld"
+        make check
+"""
+
+
 def main():
     import sys
 
@@ -642,6 +670,7 @@ def main():
         jobs = '\n'.join((
             generate_linux_job(AMD64),
             generate_linux_job(ARM64),
+            generate_linux_lto_job(),
         ))
         print(GITHUB_ACTIONS_TEMPLATE.format(name='Linux CI', jobs=jobs))
 
@@ -665,6 +694,7 @@ def main():
         linux_jobs = '\n'.join((
             generate_linux_job(AMD64),
             generate_linux_job(ARM64),
+            generate_linux_lto_job(),
         ))
         macos_jobs = '\n'.join((
             generate_macos_job(AMD64),   # Intel

From f8f2f28c92c2824eefec7a94939dff4dc80b8563 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin <a3at.mail@gmail.com>
Date: Tue, 9 Jun 2026 20:41:26 +0200
Subject: [PATCH 3/5] Guard TSD thread-locals against LTO hoisting
 (`JEMALLOC_TLS_ADDR`)

`tsd_get` returned `&tsd_tls`, a loop-invariant address the compiler forms as
`thread_pointer + const_offset`.  Under whole-program LTO it can be hoisted out
of the inlined `malloc`/`free` and kept in a callee-saved register across a
user-space context switch (`swapcontext`, ucontext / `boost::context` fibers,
coroutine runtimes).  If the context resumes on a *different* OS thread, the
cached tsd/tcache still belongs to the previous thread and the two race on one
tcache -- silent heap corruption that reproduces only under LTO and is invisible
to sanitizers.

Route every GNU TSD backend's `tsd_get` through `JEMALLOC_TLS_ADDR(tsd_tls)`,
backed by a per-variable accessor (`JEMALLOC_TLS_ADDR_DEFINE`) that takes the
address *inside* a `noinline` function holding a `memory` barrier.  The
optimizer can neither inline it, prove it pure, nor hoist/CSE the access across
the switch, so the thread-local is resolved afresh on the running thread.
Correct for every TLS model -- the access sits behind an opaque call boundary --
at the cost of one real call per public allocator entry.  Non-GNU compilers
(MSVC) lack GNU-style inline asm and keep the plain `&tsd_tls`, so they are not
protected by this change.

An earlier attempt re-read only the thread pointer with a `volatile` asm and
added the compile-time offset; that proved insufficient -- clang ThinLTO still
hoisted the access and corrupted the tcache (verified: crash with the asm path,
clean with this accessor, same source/model/compiler).  The address itself has
to be materialized behind the call boundary.

The accessor has a measurable cost:

Non-LTO:

| op        | no-fix  | fix     | overhead        |
|-----------|---------|---------|-----------------|
| malloc(1) | ~7.0 ns | 9.79 ns | +2.8 ns (~40%)  |
| free      | ~6.6 ns | 9.23 ns | +2.6 ns (~40%)  |

LTO:

| op        | no-fix   | fix      | overhead        |
|-----------|----------|----------|-----------------|
| malloc(1) | ~7.02 ns | ~8.99 ns | +1.97 ns (~28%) |
| free      | ~6.49 ns | ~8.42 ns | +1.9 ns (~29%)  |
---
 include/jemalloc/internal/tsd_internals.h     | 34 +++++++++++++++++++
 .../internal/tsd_malloc_thread_cleanup.h      |  4 ++-
 include/jemalloc/internal/tsd_tls.h           |  4 ++-
 include/jemalloc/internal/tsd_win.h           |  4 ++-
 4 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/include/jemalloc/internal/tsd_internals.h b/include/jemalloc/internal/tsd_internals.h
index f675587d73..196b17b743 100644
--- a/include/jemalloc/internal/tsd_internals.h
+++ b/include/jemalloc/internal/tsd_internals.h
@@ -20,6 +20,40 @@
 #include "jemalloc/internal/util.h"
 #include "jemalloc/internal/witness.h"
 
+/*
+ * JEMALLOC_TLS_ADDR(tlsvar): the address of a thread-local, read so the compiler
+ * cannot hoist it across a user-space context switch (swapcontext, ucontext /
+ * coroutine runtimes).  A plain `&tlsvar` is loop-invariant, so under LTO the
+ * compiler may hoist the read out of the inlined `malloc`/`free` and keep it in
+ * a register; if the context then resumes on another OS thread, that stale
+ * tsd/tcache belongs to the previous thread and the two race on it -- heap
+ * corruption visible only under LTO.  See
+ * https://github.com/jemalloc/jemalloc/issues/2890
+ *
+ * Each backend defines a per-variable accessor with JEMALLOC_TLS_ADDR_DEFINE
+ * (after declaring the thread-local) that takes the address *inside* a noinline
+ * function holding a `memory` barrier: opaque to the optimizer, which therefore
+ * can neither inline it, prove it pure, nor hoist/CSE the access across the
+ * switch.  Re-reading just the thread pointer (an asm + constant offset) is not
+ * enough -- the optimizer still defeats it under whole-program LTO -- so the
+ * access itself must sit behind the call boundary.  Correct for every TLS model,
+ * at one real call per access.  Non-GNU compilers (MSVC) have no inline asm and
+ * keep the plain `&tlsvar`.
+ */
+#if defined(__GNUC__)
+#  define JEMALLOC_TLS_ADDR_DEFINE(tlsvar)                                      \
+	static UNUSED JEMALLOC_NOINLINE __typeof__(&(tlsvar))                   \
+	jemalloc_tls_addr_##tlsvar(void) {                                      \
+		__typeof__(&(tlsvar)) tls_addr = &(tlsvar);                     \
+		__asm__ __volatile__("" : "+r"(tls_addr) : : "memory");         \
+		return tls_addr;                                                \
+	}
+#  define JEMALLOC_TLS_ADDR(tlsvar) (jemalloc_tls_addr_##tlsvar())
+#else
+#  define JEMALLOC_TLS_ADDR_DEFINE(tlsvar)
+#  define JEMALLOC_TLS_ADDR(tlsvar) (&(tlsvar))
+#endif
+
 /*
  * Thread-Specific-Data layout
  *
diff --git a/include/jemalloc/internal/tsd_malloc_thread_cleanup.h b/include/jemalloc/internal/tsd_malloc_thread_cleanup.h
index 00756df1d0..10d3579b02 100644
--- a/include/jemalloc/internal/tsd_malloc_thread_cleanup.h
+++ b/include/jemalloc/internal/tsd_malloc_thread_cleanup.h
@@ -13,6 +13,8 @@ extern JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls;
 extern JEMALLOC_TSD_TYPE_ATTR(bool) tsd_initialized;
 extern bool tsd_booted;
 
+JEMALLOC_TLS_ADDR_DEFINE(tsd_tls)
+
 /* Initialization/cleanup. */
 JEMALLOC_ALWAYS_INLINE bool
 tsd_cleanup_wrapper(void) {
@@ -53,7 +55,7 @@ tsd_get_allocates(void) {
 /* Get/set. */
 JEMALLOC_ALWAYS_INLINE tsd_t *
 tsd_get(bool init) {
-	return &tsd_tls;
+	return JEMALLOC_TLS_ADDR(tsd_tls);
 }
 JEMALLOC_ALWAYS_INLINE void
 tsd_set(tsd_t *val) {
diff --git a/include/jemalloc/internal/tsd_tls.h b/include/jemalloc/internal/tsd_tls.h
index 6536eb540c..968d1c0bdf 100644
--- a/include/jemalloc/internal/tsd_tls.h
+++ b/include/jemalloc/internal/tsd_tls.h
@@ -13,6 +13,8 @@ extern JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls;
 extern pthread_key_t tsd_tsd;
 extern bool          tsd_booted;
 
+JEMALLOC_TLS_ADDR_DEFINE(tsd_tls)
+
 /* Initialization/cleanup. */
 JEMALLOC_ALWAYS_INLINE bool
 tsd_boot0(void) {
@@ -46,7 +48,7 @@ tsd_get_allocates(void) {
 /* Get/set. */
 JEMALLOC_ALWAYS_INLINE tsd_t *
 tsd_get(bool init) {
-	return &tsd_tls;
+	return JEMALLOC_TLS_ADDR(tsd_tls);
 }
 
 JEMALLOC_ALWAYS_INLINE void
diff --git a/include/jemalloc/internal/tsd_win.h b/include/jemalloc/internal/tsd_win.h
index 8b22bec18d..d06cc27e7e 100644
--- a/include/jemalloc/internal/tsd_win.h
+++ b/include/jemalloc/internal/tsd_win.h
@@ -178,6 +178,8 @@ tsd_set(tsd_t *val) {
 extern JEMALLOC_TSD_TYPE_ATTR(tsd_wrapper_t) tsd_wrapper_tls;
 extern bool tsd_booted;
 
+JEMALLOC_TLS_ADDR_DEFINE(tsd_wrapper_tls)
+
 /* Initialization/cleanup. */
 JEMALLOC_ALWAYS_INLINE bool
 tsd_cleanup_wrapper(void) {
@@ -223,7 +225,7 @@ tsd_get_allocates(void) {
 /* Get/set. */
 JEMALLOC_ALWAYS_INLINE tsd_t *
 tsd_get(bool init) {
-	return &(tsd_wrapper_tls.val);
+	return &(JEMALLOC_TLS_ADDR(tsd_wrapper_tls)->val);
 }
 
 JEMALLOC_ALWAYS_INLINE void

From 9de81c67d8bdfc5b88a4947f7f2e60d6716ba146 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin <a3at.mail@gmail.com>
Date: Tue, 9 Jun 2026 23:55:56 +0200
Subject: [PATCH 4/5] Add an inline fast path to `JEMALLOC_TLS_ADDR` for static
 TLS

The noinline accessor puts an opaque call on the malloc/free fastpath, which
the caller cannot CSE, schedule across, or keep registers alive over.
test/stress/microbench (malloc(1)/free pairs, pinned core, clang):

                     unpatched      accessor       this commit
  malloc, no LTO     7.0 ns/op      9.8 (+40%)     7.0 (+-0%)
  free,   no LTO     6.5 ns/op      9.2 (+40%)     6.5 (+-0%)
  malloc, ThinLTO    7.0 ns/op      9.0 (+28%)     7.6 (~+8%)
  free,   ThinLTO    6.5 ns/op      8.4 (+28%)     7.0 (~+8%)

Under a static TLS model the address is `thread_pointer + offset` with a
thread-independent offset, so inline the access: capture the offset once at
runtime into a global via a noinline helper (lazy, sentinel-initialized), and
re-read the thread pointer on every call with a `volatile` asm the optimizer
may not hoist or CSE (per-arch reads from mimalloc's `mi_prim_tls_slot`).
Both fast-path inputs are safe across a context switch: the thread-pointer read
is a `volatile` asm, which is re-executed on every call, and the offset global,
although hoistable, is thread-independent, so a hoisted copy is still correct.

Note the offset must NOT be computed inline as
`&tsd_tls - __builtin_thread_pointer()`: the two terms are independently
hoistable, and clang ThinLTO keeps the (stale) `&tsd_tls` in a callee-saved
register across the switch while re-materializing the thread-pointer terms,
producing `fresh_tp + (stale_addr - fresh_tp)` -- the volatile read is
algebraically cancelled back to the stale address:

    # preheader                          # loop body
    movq %fs:0, %rax                     movq %fs:0, %rax     <- volatile asm
    leaq tsd@TPOFF(%rax), %r14           movq %r14, %rcx
                                         subq %fs:0, %rcx
                                         addl (%rax,%rcx), .. <- = stale r14

Capturing the offset behind a call boundary evaluates both terms at one point
on one thread, where the difference really is the tpoff constant.

The fast path is gated on `JEMALLOC_TLS_MODEL_INITIAL_EXEC` (a new configure
define, set whenever jemalloc applies its default initial-exec model): under
the dynamic models the offset is not thread-independent.  It also requires GNU
asm, a known architecture, and !_WIN32 (MinGW's thread pointer lives in the
TEB, not fs/gs:0).  Everything else keeps the noinline accessor.

Verified with the fiber-migration reproducer (clang ThinLTO, static link,
je_-prefixed entry points): clean, with the inlined fastpath re-reading the
thread pointer after every `swapcontext`.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 configure.ac                                  |  2 +
 .../internal/jemalloc_internal_defs.h.in      |  7 ++
 include/jemalloc/internal/tsd_internals.h     | 72 ++++++++++++++++---
 3 files changed, 71 insertions(+), 10 deletions(-)

diff --git a/configure.ac b/configure.ac
index e57d0667e4..5ef440e8f3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2775,6 +2775,8 @@ if test "x${je_cv_tls_model}" = "xyes" -a \
   AC_DEFINE([JEMALLOC_TLS_MODEL],
             [__attribute__((tls_model("initial-exec")))],
             [ ])
+  AC_DEFINE([JEMALLOC_TLS_MODEL_INITIAL_EXEC], [ ],
+            [Defined when the TSD thread-locals use the initial-exec (static) TLS model, i.e. live at a fixed offset from the thread pointer.])
 else
   AC_DEFINE([JEMALLOC_TLS_MODEL], [ ], [ ])
 fi
diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in
index 31ae2e8ed2..841ae94233 100644
--- a/include/jemalloc/internal/jemalloc_internal_defs.h.in
+++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in
@@ -148,6 +148,13 @@
 /* Non-empty if the tls_model attribute is supported. */
 #undef JEMALLOC_TLS_MODEL
 
+/*
+ * Defined when the TSD thread-locals use the initial-exec (static) TLS model,
+ * i.e. live at a fixed offset from the thread pointer.  Gates the fast path of
+ * JEMALLOC_TLS_ADDR (see tsd_internals.h).
+ */
+#undef JEMALLOC_TLS_MODEL_INITIAL_EXEC
+
 /*
  * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables
  * inline functions.
diff --git a/include/jemalloc/internal/tsd_internals.h b/include/jemalloc/internal/tsd_internals.h
index 196b17b743..55fac2c0e7 100644
--- a/include/jemalloc/internal/tsd_internals.h
+++ b/include/jemalloc/internal/tsd_internals.h
@@ -30,17 +30,69 @@
  * corruption visible only under LTO.  See
  * https://github.com/jemalloc/jemalloc/issues/2890
  *
- * Each backend defines a per-variable accessor with JEMALLOC_TLS_ADDR_DEFINE
- * (after declaring the thread-local) that takes the address *inside* a noinline
- * function holding a `memory` barrier: opaque to the optimizer, which therefore
- * can neither inline it, prove it pure, nor hoist/CSE the access across the
- * switch.  Re-reading just the thread pointer (an asm + constant offset) is not
- * enough -- the optimizer still defeats it under whole-program LTO -- so the
- * access itself must sit behind the call boundary.  Correct for every TLS model,
- * at one real call per access.  Non-GNU compilers (MSVC) have no inline asm and
- * keep the plain `&tlsvar`.
+ * Fast path (static TLS only, hence gated on JEMALLOC_TLS_MODEL_INITIAL_EXEC --
+ * jemalloc's default): the address is `thread_pointer + offset` with a
+ * thread-independent offset.  The offset is captured once at runtime into a
+ * global by a noinline helper (so both of its terms are evaluated at a single
+ * point on a single thread), and the thread pointer is re-read on every call
+ * with a `volatile` asm the optimizer may not hoist or CSE (per-arch reads from
+ * mimalloc's mi_prim_tls_slot).  The offset must NOT be computed inline as
+ * `&tlsvar - thread_pointer`: the compiler can hoist the (stale) `&tlsvar`
+ * while re-materializing the thread-pointer terms, and `fresh_tp + (stale_addr
+ * - fresh_tp)` cancels the fresh read back to the stale address.
+ *
+ * Otherwise (dynamic TLS models, unlisted arch, MinGW): a per-variable noinline
+ * accessor takes the address behind a `memory` barrier -- an opaque call the
+ * optimizer can neither inline, prove pure, nor hoist/CSE across the switch.
+ * Correct for every TLS model, at one real call per access.  Non-GNU compilers
+ * (MSVC) lack GNU-style inline asm and keep the plain, unguarded `&tlsvar`.
  */
-#if defined(__GNUC__)
+#if defined(__GNUC__) && !defined(_WIN32) &&                                   \
+    defined(JEMALLOC_TLS_MODEL_INITIAL_EXEC) &&                                \
+    (defined(__aarch64__) || defined(__arm__) || defined(__x86_64__) ||        \
+    defined(__i386__))
+JEMALLOC_ALWAYS_INLINE char *
+jemalloc_thread_pointer(void) {
+	char *thread_pointer;
+#  if defined(__aarch64__) && defined(__APPLE__)
+	__asm__ __volatile__("mrs %0, tpidrro_el0\n\tbic %0, %0, #7" : "=r"(thread_pointer));
+#  elif defined(__aarch64__)
+	__asm__ __volatile__("mrs %0, tpidr_el0" : "=r"(thread_pointer));
+#  elif defined(__arm__)
+	__asm__ __volatile__("mrc p15, 0, %0, c13, c0, 3\n\tbic %0, %0, #3" : "=r"(thread_pointer));
+#  elif defined(__x86_64__) && defined(__APPLE__)
+	__asm__ __volatile__("movq %%gs:0, %0" : "=r"(thread_pointer));
+#  elif defined(__x86_64__)
+	__asm__ __volatile__("movq %%fs:0, %0" : "=r"(thread_pointer));
+#  else /* __i386__ */
+	__asm__ __volatile__("movl %%gs:0, %0" : "=r"(thread_pointer));
+#  endif
+	return thread_pointer;
+}
+/* 1 is unreachable: tlsvar and the thread pointer are at least 4-aligned. */
+#  define JEMALLOC_TLS_OFFSET_UNINITIALIZED 1
+#  define JEMALLOC_TLS_ADDR_DEFINE(tlsvar)                                      \
+	static UNUSED intptr_t jemalloc_tls_offset_##tlsvar =                   \
+	    JEMALLOC_TLS_OFFSET_UNINITIALIZED;                                  \
+	static UNUSED JEMALLOC_NOINLINE intptr_t                                \
+	jemalloc_tls_offset_init_##tlsvar(void) {                               \
+		intptr_t tls_offset = (intptr_t)((char *)&(tlsvar) -            \
+		    jemalloc_thread_pointer());                                 \
+		jemalloc_tls_offset_##tlsvar = tls_offset;                      \
+		return tls_offset;                                              \
+	}                                                                       \
+	JEMALLOC_ALWAYS_INLINE __typeof__(&(tlsvar))                            \
+	jemalloc_tls_addr_##tlsvar(void) {                                      \
+		intptr_t tls_offset = jemalloc_tls_offset_##tlsvar;             \
+		if (unlikely(tls_offset ==                                      \
+		    JEMALLOC_TLS_OFFSET_UNINITIALIZED)) {                       \
+			tls_offset = jemalloc_tls_offset_init_##tlsvar();       \
+		}                                                               \
+		return (__typeof__(&(tlsvar)))(jemalloc_thread_pointer() +     \
+		    tls_offset);                                                \
+	}
+#  define JEMALLOC_TLS_ADDR(tlsvar) (jemalloc_tls_addr_##tlsvar())
+#elif defined(__GNUC__)
 #  define JEMALLOC_TLS_ADDR_DEFINE(tlsvar)                                      \
 	static UNUSED JEMALLOC_NOINLINE __typeof__(&(tlsvar))                   \
 	jemalloc_tls_addr_##tlsvar(void) {                                      \

From 0af5fdda9cc4b65fdf7751c32c49765a844164dc Mon Sep 17 00:00:00 2001
From: Azat Khuzhin <a3at.mail@gmail.com>
Date: Wed, 10 Jun 2026 00:26:18 +0200
Subject: [PATCH 5/5] test: link tcache_fiber_migration with -lstdc++ when cxx
 is enabled

The test links the whole libjemalloc.a (--whole-archive), which contains
jemalloc_cpp.o under --enable-cxx, so the link needs libstdc++
(std::set_new_handler, __cxa_*, typeinfo for std::bad_alloc,
__gxx_personality_v0).  Pull it from $(LIBS) the same way the generic
integration rule does; a no-cxx build is unaffected since $(LIBS) then
has no -lstdc++.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 Makefile.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.in b/Makefile.in
index 92cc307b4a..ba24f84494 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -578,7 +578,7 @@ $(objroot)test/integration/%$(EXE): $(objroot)test/integration/%.$(O) $(C_TESTLI
 # inlining.  Explicit rule -- overrides the generic integration rule above.
 $(objroot)test/integration/tcache_fiber_migration$(EXE): $(objroot)test/integration/tcache_fiber_migration.$(O) $(objroot)lib/$(LIBJEMALLOC).$(A)
 	@mkdir -p $(@D)
-	$(CC) $(LDTARGET) $(objroot)test/integration/tcache_fiber_migration.$(O) -Wl,--whole-archive $(objroot)lib/$(LIBJEMALLOC).$(A) -Wl,--no-whole-archive $(LDFLAGS) -pthread $(LM) $(EXTRA_LDFLAGS)
+	$(CC) $(LDTARGET) $(objroot)test/integration/tcache_fiber_migration.$(O) -Wl,--whole-archive $(objroot)lib/$(LIBJEMALLOC).$(A) -Wl,--no-whole-archive $(LDFLAGS) -pthread $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)
 
 $(objroot)test/integration/cpp/%$(EXE): $(objroot)test/integration/cpp/%.$(O) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB)
 	@mkdir -p $(@D)