From 07e318626a9de83a543e3c361fd674cc96c6ba07 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 9 Jun 2026 21:47:54 +0200 Subject: [PATCH 1/5] test: add `tcache_fiber_migration` standalone LTO reproducer A worker-thread pool runs ucontext fibers that each do free/swapcontext/malloc in one frame, so a fiber is routinely resumed on a different OS thread than it suspended on. With the tsd hoisting bug and a whole-program-LTO build the inlined fastpath frees/allocates against the previous thread's tcache and the process crashes; with the fix it runs to completion. Reproducing requires the allocator inlined next to the swapcontext, so: - it is a standalone program (no test harness, so it can be static-linked without symbol clashes) that calls jemalloc via JEMALLOC_MANGLE, so malloc/free bind to the configured-prefix symbols (the libc wrappers do not inline) -- independent of --with-jemalloc-prefix; - Makefile.in links this one test against libjemalloc.a with --whole-archive (guarded by enable_static), not the shared library; - it only manifests under whole-program LTO -- a no-op guard otherwise. Co-Authored-By: Claude Opus 4.8 (1M context) --- Makefile.in | 16 +++ test/integration/tcache_fiber_migration.c | 139 ++++++++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100644 test/integration/tcache_fiber_migration.c diff --git a/Makefile.in b/Makefile.in index 459f98fb04..92cc307b4a 100644 --- a/Makefile.in +++ b/Makefile.in @@ -334,6 +334,14 @@ ifeq (@enable_experimental_smallocx@, 1) TESTS_INTEGRATION += \ $(srcroot)test/integration/smallocx.c endif +# tcache_fiber_migration is a standalone LTO reproducer (issue #2890): the +# dedicated rule below links it against the static archive (--whole-archive) so +# the allocator fastpath inlines next to the swapcontext. Needs that archive, +# hence the enable_static guard; reproduces the bug only under whole-program +# LTO, a no-op guard otherwise. +ifeq ($(enable_static), 1) +TESTS_INTEGRATION += $(srcroot)test/integration/tcache_fiber_migration.c +endif ifeq (@enable_cxx@, 1) CPP_SRCS := $(srcroot)src/jemalloc_cpp.cpp TESTS_INTEGRATION_CPP := $(srcroot)test/integration/cpp/basic.cpp \ @@ -564,6 +572,14 @@ $(objroot)test/integration/%$(EXE): $(objroot)test/integration/%.$(O) $(C_TESTLI @mkdir -p $(@D) $(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LJEMALLOC) $(LDFLAGS) $(filter-out -lm,$(filter -lrt -pthread -lstdc++,$(LIBS))) $(LM) $(EXTRA_LDFLAGS) +# tcache_fiber_migration (issue #2890) must inline the allocator next to the +# swapcontext, so link jemalloc statically (--whole-archive), without the test +# harness/shared lib; whole-program LTO (from the build's flags) does the +# inlining. Explicit rule -- overrides the generic integration rule above. +$(objroot)test/integration/tcache_fiber_migration$(EXE): $(objroot)test/integration/tcache_fiber_migration.$(O) $(objroot)lib/$(LIBJEMALLOC).$(A) + @mkdir -p $(@D) + $(CC) $(LDTARGET) $(objroot)test/integration/tcache_fiber_migration.$(O) -Wl,--whole-archive $(objroot)lib/$(LIBJEMALLOC).$(A) -Wl,--no-whole-archive $(LDFLAGS) -pthread $(LM) $(EXTRA_LDFLAGS) + $(objroot)test/integration/cpp/%$(EXE): $(objroot)test/integration/cpp/%.$(O) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) @mkdir -p $(@D) $(CXX) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) $(LDFLAGS) $(filter-out -lm,$(LIBS)) -lm $(EXTRA_LDFLAGS) diff --git a/test/integration/tcache_fiber_migration.c b/test/integration/tcache_fiber_migration.c new file mode 100644 index 0000000000..e374af1f60 --- /dev/null +++ b/test/integration/tcache_fiber_migration.c @@ -0,0 +1,139 @@ +/* + * Standalone regression test for the thread-pointer hoisting bug (issue #2890): + * under whole-program LTO the inlined allocator fastpath caches the TLS-derived + * tcache base in a callee-saved register; a fiber resumed on a different OS + * thread then frees/allocs against the previous thread's tcache -> heap + * corruption. See JEMALLOC_TLS_ADDR in tsd_internals.h. + * + * Reproducing requires the allocator inlined next to the swapcontext, so: + * - call jemalloc directly (via JEMALLOC_MANGLE, so malloc/free bind to the + * configured-prefix symbols, not the libc wrappers, which do not inline); + * - link jemalloc *statically* (Makefile.in links this one test against + * libjemalloc.a with --whole-archive); + * - build with whole-program LTO. + * Hence it is a deliberately standalone program (no test harness, so it can be + * static-linked without symbol clashes). Without LTO it runs clean, so it is a + * no-op guard there. + */ +#include +#include +#include + +#define JEMALLOC_MANGLE +#include + +enum { + num_fibers = 128, + num_workers = 8, + ops_per_fiber = 20 * 1000, + fiber_stack_size = 1 << 16, + ready_queue_capacity = 512 /* power of two, > num_fibers */ +}; + +static ucontext_t fiber_context[num_fibers]; +/* Scheduler context to switch back to when a fiber yields; the resuming worker + * writes its own context here, so this changes when the fiber migrates. */ +static ucontext_t *return_context[num_fibers]; +static unsigned fiber_remaining_ops[num_fibers]; +static bool fiber_done[num_fibers]; + +static unsigned ready_queue[ready_queue_capacity]; +static unsigned ready_queue_head; +static unsigned ready_queue_tail; +static unsigned live_fibers; +static pthread_mutex_t queue_mtx = PTHREAD_MUTEX_INITIALIZER; + +static void +push_ready_fiber(unsigned id) { + pthread_mutex_lock(&queue_mtx); + ready_queue[ready_queue_head++ & (ready_queue_capacity - 1)] = id; + pthread_mutex_unlock(&queue_mtx); +} + +/* Returns a ready fiber id, -1 if none ready now, or -2 if all have finished. */ +static int +pop_ready_fiber(void) { + int id; + pthread_mutex_lock(&queue_mtx); + if (live_fibers == 0) { + id = -2; + } else if (ready_queue_tail != ready_queue_head) { + id = (int)ready_queue[ready_queue_tail++ & (ready_queue_capacity - 1)]; + } else { + id = -1; + } + pthread_mutex_unlock(&queue_mtx); + return id; +} + +static void +fiber_run(int id) { + void *p = malloc(64); + while (fiber_remaining_ops[id] > 0) { + fiber_remaining_ops[id]--; + free(p); /* push to the CURRENT thread's tcache */ + /* Yield; may be resumed on a different worker thread. */ + swapcontext(&fiber_context[id], return_context[id]); + p = malloc(64); /* pop; must come from the NEW thread's tcache */ + *(char *)p = (char)id; + } + free(p); + pthread_mutex_lock(&queue_mtx); + fiber_done[id] = true; + live_fibers--; + pthread_mutex_unlock(&queue_mtx); + swapcontext(&fiber_context[id], return_context[id]); +} + +static void * +worker_thread(void *arg) { + ucontext_t scheduler_context; + (void)arg; + for (;;) { + int id = pop_ready_fiber(); + if (id == -2) { + break; + } + if (id < 0) { + continue; + } + return_context[id] = &scheduler_context; + swapcontext(&scheduler_context, &fiber_context[id]); + if (!fiber_done[id]) { + push_ready_fiber((unsigned)id); + } + } + return NULL; +} + +int +main(void) { + void *stacks[num_fibers]; + pthread_t threads[num_workers]; + unsigned i; + + live_fibers = num_fibers; + for (i = 0; i < num_fibers; i++) { + stacks[i] = malloc(fiber_stack_size); + fiber_remaining_ops[i] = ops_per_fiber; + getcontext(&fiber_context[i]); + fiber_context[i].uc_stack.ss_sp = stacks[i]; + fiber_context[i].uc_stack.ss_size = fiber_stack_size; + fiber_context[i].uc_link = NULL; + makecontext(&fiber_context[i], (void (*)(void))fiber_run, 1, (int)i); + push_ready_fiber(i); + } + + for (i = 0; i < num_workers; i++) { + pthread_create(&threads[i], NULL, worker_thread, NULL); + } + for (i = 0; i < num_workers; i++) { + pthread_join(threads[i], NULL); + } + for (i = 0; i < num_fibers; i++) { + free(stacks[i]); + } + + /* Completing without a tcache-corruption crash is success. */ + return live_fibers == 0 ? 0 : 1; +} From 89ee23c166cd3c1c5cb60d15e71d92c1f3a8b484 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 9 Jun 2026 21:47:54 +0200 Subject: [PATCH 2/5] ci: add a Linux whole-program-LTO lane (`test-linux-lto`) The `tcache_fiber_migration` reproducer (issue #2890) only exercises the bug when the allocator is statically linked and inlined under whole-program LTO. None of the existing lanes build that way, so add a dedicated Linux lane: clang ThinLTO, `--with-jemalloc-prefix=je_`, and llvm-ar/nm/ranlib (to archive the LTO bitcode) + `-fuse-ld=lld`, then `make check`. Authored in scripts/gen_gh_actions.py; .github/workflows/linux-ci.yml regenerated. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/linux-ci.yml | 19 +++++++++++++++++++ scripts/gen_gh_actions.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/.github/workflows/linux-ci.yml b/.github/workflows/linux-ci.yml index c5e0c9aaf2..527a601dd9 100644 --- a/.github/workflows/linux-ci.yml +++ b/.github/workflows/linux-ci.yml @@ -692,4 +692,23 @@ jobs: make check + test-linux-lto: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + + - name: Install clang, lld and llvm + run: | + sudo apt-get update + sudo apt-get install -y clang lld llvm + + - name: Build and test (whole-program ThinLTO, je_ prefix) + run: | + autoconf + CC=clang AR=llvm-ar NM=llvm-nm RANLIB=llvm-ranlib \ + ./configure --with-jemalloc-prefix=je_ EXTRA_CFLAGS=-flto=thin + make -j3 EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld" + make -j3 tests EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld" + make check + diff --git a/scripts/gen_gh_actions.py b/scripts/gen_gh_actions.py index 4c5474ab7f..d40fc7f321 100755 --- a/scripts/gen_gh_actions.py +++ b/scripts/gen_gh_actions.py @@ -632,6 +632,34 @@ def generate_freebsd_job(arch): return job +def generate_linux_lto_job(): + """Dedicated lane: whole-program ThinLTO + the je_ public prefix, statically + linked. This is the configuration under which the tcache_fiber_migration + reproducer (issue #2890) actually exercises the bug -- the allocator + fastpath must be inlined next to the swapcontext, which only happens with + static linking under LTO. llvm-ar/nm/ranlib are needed to archive the LTO + bitcode; -fuse-ld=lld to link it.""" + return """ test-linux-lto: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + + - name: Install clang, lld and llvm + run: | + sudo apt-get update + sudo apt-get install -y clang lld llvm + + - name: Build and test (whole-program ThinLTO, je_ prefix) + run: | + autoconf + CC=clang AR=llvm-ar NM=llvm-nm RANLIB=llvm-ranlib \\ + ./configure --with-jemalloc-prefix=je_ EXTRA_CFLAGS=-flto=thin + make -j3 EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld" + make -j3 tests EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld" + make check +""" + + def main(): import sys @@ -642,6 +670,7 @@ def main(): jobs = '\n'.join(( generate_linux_job(AMD64), generate_linux_job(ARM64), + generate_linux_lto_job(), )) print(GITHUB_ACTIONS_TEMPLATE.format(name='Linux CI', jobs=jobs)) @@ -665,6 +694,7 @@ def main(): linux_jobs = '\n'.join(( generate_linux_job(AMD64), generate_linux_job(ARM64), + generate_linux_lto_job(), )) macos_jobs = '\n'.join(( generate_macos_job(AMD64), # Intel From f8f2f28c92c2824eefec7a94939dff4dc80b8563 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 9 Jun 2026 20:41:26 +0200 Subject: [PATCH 3/5] Guard TSD thread-locals against LTO hoisting (`JEMALLOC_TLS_ADDR`) `tsd_get` returned `&tsd_tls`, a loop-invariant address the compiler forms as `thread_pointer + const_offset`. Under whole-program LTO it can be hoisted out of the inlined `malloc`/`free` and kept in a callee-saved register across a user-space context switch (`swapcontext`, ucontext / `boost::context` fibers, coroutine runtimes). If the context resumes on a *different* OS thread, the cached tsd/tcache still belongs to the previous thread and the two race on one tcache -- silent heap corruption that reproduces only under LTO and is invisible to sanitizers. Route every GNU TSD backend's `tsd_get` through `JEMALLOC_TLS_ADDR(tsd_tls)`, backed by a per-variable accessor (`JEMALLOC_TLS_ADDR_DEFINE`) that takes the address *inside* a `noinline` function holding a `memory` barrier. The optimizer can neither inline it, prove it pure, nor hoist/CSE the access across the switch, so the thread-local is resolved afresh on the running thread. Correct for every TLS model -- the access sits behind an opaque call boundary -- at the cost of one real call per public allocator entry. Non-GNU compilers (MSVC) have no inline asm and keep the plain `&tsd_tls`. An earlier attempt re-read only the thread pointer with a `volatile` asm and added the compile-time offset; that proved insufficient -- clang ThinLTO still hoisted the access and corrupted the tcache (verified: crash with the asm path, clean with this accessor, same source/model/compiler). The address itself has to be materialized behind the call boundary. The accessor has a measurable cost: Non-LTO: | op | no-fix | fix | overhead | |-----------|---------|---------|-----------------| | malloc(1) | ~7.0 ns | 9.79 ns | +2.8 ns (~40%) | | free | ~6.6 ns | 9.23 ns | +2.6 ns (~40%) | LTO: | op | no-fix | fix | overhead | |-----------|----------|----------|-----------------| | malloc(1) | ~7.02 ns | ~8.99 ns | +1.97 ns (~28%) | | free | ~6.49 ns | ~8.42 ns | +1.9 ns (~29%) | --- include/jemalloc/internal/tsd_internals.h | 34 +++++++++++++++++++ .../internal/tsd_malloc_thread_cleanup.h | 4 ++- include/jemalloc/internal/tsd_tls.h | 4 ++- include/jemalloc/internal/tsd_win.h | 4 ++- 4 files changed, 43 insertions(+), 3 deletions(-) diff --git a/include/jemalloc/internal/tsd_internals.h b/include/jemalloc/internal/tsd_internals.h index f675587d73..196b17b743 100644 --- a/include/jemalloc/internal/tsd_internals.h +++ b/include/jemalloc/internal/tsd_internals.h @@ -20,6 +20,40 @@ #include "jemalloc/internal/util.h" #include "jemalloc/internal/witness.h" +/* + * JEMALLOC_TLS_ADDR(tlsvar): the address of a thread-local, read so the compiler + * cannot hoist it across a user-space context switch (swapcontext, ucontext / + * coroutine runtimes). A plain `&tlsvar` is loop-invariant, so under LTO the + * compiler may hoist the read out of the inlined `malloc`/`free` and keep it in + * a register; if the context then resumes on another OS thread, that stale + * tsd/tcache belongs to the previous thread and the two race on it -- heap + * corruption visible only under LTO. See + * https://github.com/jemalloc/jemalloc/issues/2890 + * + * Each backend defines a per-variable accessor with JEMALLOC_TLS_ADDR_DEFINE + * (after declaring the thread-local) that takes the address *inside* a noinline + * function holding a `memory` barrier: opaque to the optimizer, which therefore + * can neither inline it, prove it pure, nor hoist/CSE the access across the + * switch. Re-reading just the thread pointer (an asm + constant offset) is not + * enough -- the optimizer still defeats it under whole-program LTO -- so the + * access itself must sit behind the call boundary. Correct for every TLS model, + * at one real call per access. Non-GNU compilers (MSVC) have no inline asm and + * keep the plain `&tlsvar`. + */ +#if defined(__GNUC__) +# define JEMALLOC_TLS_ADDR_DEFINE(tlsvar) \ + static UNUSED JEMALLOC_NOINLINE __typeof__(&(tlsvar)) \ + jemalloc_tls_addr_##tlsvar(void) { \ + __typeof__(&(tlsvar)) tls_addr = &(tlsvar); \ + __asm__ __volatile__("" : "+r"(tls_addr) : : "memory"); \ + return tls_addr; \ + } +# define JEMALLOC_TLS_ADDR(tlsvar) (jemalloc_tls_addr_##tlsvar()) +#else +# define JEMALLOC_TLS_ADDR_DEFINE(tlsvar) +# define JEMALLOC_TLS_ADDR(tlsvar) (&(tlsvar)) +#endif + /* * Thread-Specific-Data layout * diff --git a/include/jemalloc/internal/tsd_malloc_thread_cleanup.h b/include/jemalloc/internal/tsd_malloc_thread_cleanup.h index 00756df1d0..10d3579b02 100644 --- a/include/jemalloc/internal/tsd_malloc_thread_cleanup.h +++ b/include/jemalloc/internal/tsd_malloc_thread_cleanup.h @@ -13,6 +13,8 @@ extern JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls; extern JEMALLOC_TSD_TYPE_ATTR(bool) tsd_initialized; extern bool tsd_booted; +JEMALLOC_TLS_ADDR_DEFINE(tsd_tls) + /* Initialization/cleanup. */ JEMALLOC_ALWAYS_INLINE bool tsd_cleanup_wrapper(void) { @@ -53,7 +55,7 @@ tsd_get_allocates(void) { /* Get/set. */ JEMALLOC_ALWAYS_INLINE tsd_t * tsd_get(bool init) { - return &tsd_tls; + return JEMALLOC_TLS_ADDR(tsd_tls); } JEMALLOC_ALWAYS_INLINE void tsd_set(tsd_t *val) { diff --git a/include/jemalloc/internal/tsd_tls.h b/include/jemalloc/internal/tsd_tls.h index 6536eb540c..968d1c0bdf 100644 --- a/include/jemalloc/internal/tsd_tls.h +++ b/include/jemalloc/internal/tsd_tls.h @@ -13,6 +13,8 @@ extern JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls; extern pthread_key_t tsd_tsd; extern bool tsd_booted; +JEMALLOC_TLS_ADDR_DEFINE(tsd_tls) + /* Initialization/cleanup. */ JEMALLOC_ALWAYS_INLINE bool tsd_boot0(void) { @@ -46,7 +48,7 @@ tsd_get_allocates(void) { /* Get/set. */ JEMALLOC_ALWAYS_INLINE tsd_t * tsd_get(bool init) { - return &tsd_tls; + return JEMALLOC_TLS_ADDR(tsd_tls); } JEMALLOC_ALWAYS_INLINE void diff --git a/include/jemalloc/internal/tsd_win.h b/include/jemalloc/internal/tsd_win.h index 8b22bec18d..d06cc27e7e 100644 --- a/include/jemalloc/internal/tsd_win.h +++ b/include/jemalloc/internal/tsd_win.h @@ -178,6 +178,8 @@ tsd_set(tsd_t *val) { extern JEMALLOC_TSD_TYPE_ATTR(tsd_wrapper_t) tsd_wrapper_tls; extern bool tsd_booted; +JEMALLOC_TLS_ADDR_DEFINE(tsd_wrapper_tls) + /* Initialization/cleanup. */ JEMALLOC_ALWAYS_INLINE bool tsd_cleanup_wrapper(void) { @@ -223,7 +225,7 @@ tsd_get_allocates(void) { /* Get/set. */ JEMALLOC_ALWAYS_INLINE tsd_t * tsd_get(bool init) { - return &(tsd_wrapper_tls.val); + return &(JEMALLOC_TLS_ADDR(tsd_wrapper_tls)->val); } JEMALLOC_ALWAYS_INLINE void From 9de81c67d8bdfc5b88a4947f7f2e60d6716ba146 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 9 Jun 2026 23:55:56 +0200 Subject: [PATCH 4/5] Add an inline fast path to `JEMALLOC_TLS_ADDR` for static TLS The noinline accessor puts an opaque call on the malloc/free fastpath, which the caller cannot CSE, schedule across, or keep registers alive over. test/stress/microbench (malloc(1)/free pairs, pinned core, clang): unpatched accessor this commit malloc, no LTO 7.0 ns/op 9.8 (+40%) 7.0 (+-0%) free, no LTO 6.5 ns/op 9.2 (+40%) 6.5 (+-0%) malloc, ThinLTO 7.0 ns/op 9.0 (+28%) 7.6 (~+8%) free, ThinLTO 6.5 ns/op 8.4 (+28%) 7.0 (~+8%) Under a static TLS model the address is `thread_pointer + offset` with a thread-independent offset, so inline the access: capture the offset once at runtime into a global via a noinline helper (lazy, sentinel-initialized), and re-read the thread pointer on every call with a `volatile` asm the optimizer may not hoist or CSE (per-arch reads from mimalloc's `mi_prim_tls_slot`). Both fast-path inputs are hoist-proof: the tp read is volatile, and the offset global is thread-independent, so hoisting it across a context switch is harmless. Note the offset must NOT be computed inline as `&tsd_tls - __builtin_thread_pointer()`: the two terms are independently hoistable, and clang ThinLTO keeps the (stale) `&tsd_tls` in a callee-saved register across the switch while re-materializing the thread-pointer terms, producing `fresh_tp + (stale_addr - fresh_tp)` -- the volatile read is algebraically cancelled back to the stale address: # preheader # loop body movq %fs:0, %rax movq %fs:0, %rax <- volatile asm leaq tsd@TPOFF(%rax), %r14 movq %r14, %rcx subq %fs:0, %rcx addl (%rax,%rcx), .. <- = stale r14 Capturing the offset behind a call boundary evaluates both terms at one point on one thread, where the difference really is the tpoff constant. The fast path is gated on `JEMALLOC_TLS_MODEL_INITIAL_EXEC` (a new configure define, set whenever jemalloc applies its default initial-exec model): under the dynamic models the offset is not thread-independent. It also requires GNU asm, a known architecture, and !_WIN32 (MinGW's thread pointer lives in the TEB, not fs/gs:0). Everything else keeps the noinline accessor. Verified with the fiber-migration reproducer (clang ThinLTO, static link, je_-prefixed entry points): clean, with the inlined fastpath re-reading the thread pointer after every `swapcontext`. Co-Authored-By: Claude Opus 4.8 (1M context) --- configure.ac | 2 + .../internal/jemalloc_internal_defs.h.in | 7 ++ include/jemalloc/internal/tsd_internals.h | 72 ++++++++++++++++--- 3 files changed, 71 insertions(+), 10 deletions(-) diff --git a/configure.ac b/configure.ac index e57d0667e4..5ef440e8f3 100644 --- a/configure.ac +++ b/configure.ac @@ -2775,6 +2775,8 @@ if test "x${je_cv_tls_model}" = "xyes" -a \ AC_DEFINE([JEMALLOC_TLS_MODEL], [__attribute__((tls_model("initial-exec")))], [ ]) + AC_DEFINE([JEMALLOC_TLS_MODEL_INITIAL_EXEC], [ ], + [Defined when the TSD thread-locals use the initial-exec (static) TLS model, i.e. live at a fixed offset from the thread pointer.]) else AC_DEFINE([JEMALLOC_TLS_MODEL], [ ], [ ]) fi diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 31ae2e8ed2..841ae94233 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -148,6 +148,13 @@ /* Non-empty if the tls_model attribute is supported. */ #undef JEMALLOC_TLS_MODEL +/* + * Defined when the TSD thread-locals use the initial-exec (static) TLS model, + * i.e. live at a fixed offset from the thread pointer. Gates the fast path of + * JEMALLOC_TLS_ADDR (see tsd_internals.h). + */ +#undef JEMALLOC_TLS_MODEL_INITIAL_EXEC + /* * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables * inline functions. diff --git a/include/jemalloc/internal/tsd_internals.h b/include/jemalloc/internal/tsd_internals.h index 196b17b743..55fac2c0e7 100644 --- a/include/jemalloc/internal/tsd_internals.h +++ b/include/jemalloc/internal/tsd_internals.h @@ -30,17 +30,69 @@ * corruption visible only under LTO. See * https://github.com/jemalloc/jemalloc/issues/2890 * - * Each backend defines a per-variable accessor with JEMALLOC_TLS_ADDR_DEFINE - * (after declaring the thread-local) that takes the address *inside* a noinline - * function holding a `memory` barrier: opaque to the optimizer, which therefore - * can neither inline it, prove it pure, nor hoist/CSE the access across the - * switch. Re-reading just the thread pointer (an asm + constant offset) is not - * enough -- the optimizer still defeats it under whole-program LTO -- so the - * access itself must sit behind the call boundary. Correct for every TLS model, - * at one real call per access. Non-GNU compilers (MSVC) have no inline asm and - * keep the plain `&tlsvar`. + * Fast path (static TLS only, hence gated on JEMALLOC_TLS_MODEL_INITIAL_EXEC -- + * jemalloc's default): the address is `thread_pointer + offset` with a + * thread-independent offset. The offset is captured once at runtime into a + * global by a noinline helper (so both of its terms are evaluated at a single + * point on a single thread), and the thread pointer is re-read on every call + * with a `volatile` asm the optimizer may not hoist or CSE (per-arch reads from + * mimalloc's mi_prim_tls_slot). The offset must NOT be computed inline as + * `&tlsvar - thread_pointer`: the compiler can hoist the (stale) `&tlsvar` + * while re-materializing the thread-pointer terms, and `fresh_tp + (stale_addr + * - fresh_tp)` cancels the fresh read back to the stale address. + * + * Otherwise (dynamic TLS models, unlisted arch, MinGW): a per-variable noinline + * accessor takes the address behind a `memory` barrier -- an opaque call the + * optimizer can neither inline, prove pure, nor hoist/CSE across the switch. + * Correct for every TLS model, at one real call per access. Non-GNU compilers + * (MSVC) have no inline asm and keep the plain `&tlsvar`. */ -#if defined(__GNUC__) +#if defined(__GNUC__) && !defined(_WIN32) && \ + defined(JEMALLOC_TLS_MODEL_INITIAL_EXEC) && \ + (defined(__aarch64__) || defined(__arm__) || defined(__x86_64__) || \ + defined(__i386__)) +JEMALLOC_ALWAYS_INLINE char * +jemalloc_thread_pointer(void) { + char *thread_pointer; +# if defined(__aarch64__) && defined(__APPLE__) + __asm__ __volatile__("mrs %0, tpidrro_el0\n\tbic %0, %0, #7" : "=r"(thread_pointer)); +# elif defined(__aarch64__) + __asm__ __volatile__("mrs %0, tpidr_el0" : "=r"(thread_pointer)); +# elif defined(__arm__) + __asm__ __volatile__("mrc p15, 0, %0, c13, c0, 3\n\tbic %0, %0, #3" : "=r"(thread_pointer)); +# elif defined(__x86_64__) && defined(__APPLE__) + __asm__ __volatile__("movq %%gs:0, %0" : "=r"(thread_pointer)); +# elif defined(__x86_64__) + __asm__ __volatile__("movq %%fs:0, %0" : "=r"(thread_pointer)); +# else /* __i386__ */ + __asm__ __volatile__("movl %%gs:0, %0" : "=r"(thread_pointer)); +# endif + return thread_pointer; +} +/* 1 is unreachable: tlsvar and the thread pointer are at least 4-aligned. */ +# define JEMALLOC_TLS_OFFSET_UNINITIALIZED 1 +# define JEMALLOC_TLS_ADDR_DEFINE(tlsvar) \ + static UNUSED intptr_t jemalloc_tls_offset_##tlsvar = \ + JEMALLOC_TLS_OFFSET_UNINITIALIZED; \ + static UNUSED JEMALLOC_NOINLINE intptr_t \ + jemalloc_tls_offset_init_##tlsvar(void) { \ + intptr_t tls_offset = (intptr_t)((char *)&(tlsvar) - \ + jemalloc_thread_pointer()); \ + jemalloc_tls_offset_##tlsvar = tls_offset; \ + return tls_offset; \ + } \ + JEMALLOC_ALWAYS_INLINE __typeof__(&(tlsvar)) \ + jemalloc_tls_addr_##tlsvar(void) { \ + intptr_t tls_offset = jemalloc_tls_offset_##tlsvar; \ + if (unlikely(tls_offset == \ + JEMALLOC_TLS_OFFSET_UNINITIALIZED)) { \ + tls_offset = jemalloc_tls_offset_init_##tlsvar(); \ + } \ + return (__typeof__(&(tlsvar)))(jemalloc_thread_pointer() + \ + tls_offset); \ + } +# define JEMALLOC_TLS_ADDR(tlsvar) (jemalloc_tls_addr_##tlsvar()) +#elif defined(__GNUC__) # define JEMALLOC_TLS_ADDR_DEFINE(tlsvar) \ static UNUSED JEMALLOC_NOINLINE __typeof__(&(tlsvar)) \ jemalloc_tls_addr_##tlsvar(void) { \ From 0af5fdda9cc4b65fdf7751c32c49765a844164dc Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 10 Jun 2026 00:26:18 +0200 Subject: [PATCH 5/5] test: link tcache_fiber_migration with -lstdc++ when cxx is enabled The test links the whole libjemalloc.a (--whole-archive), which contains jemalloc_cpp.o under --enable-cxx, so the link needs libstdc++ (std::set_new_handler, __cxa_*, typeinfo for std::bad_alloc, __gxx_personality_v0). Pull it from $(LIBS) the same way the generic integration rule does; a no-cxx build is unaffected since $(LIBS) then has no -lstdc++. Co-Authored-By: Claude Fable 5 --- Makefile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.in b/Makefile.in index 92cc307b4a..ba24f84494 100644 --- a/Makefile.in +++ b/Makefile.in @@ -578,7 +578,7 @@ $(objroot)test/integration/%$(EXE): $(objroot)test/integration/%.$(O) $(C_TESTLI # inlining. Explicit rule -- overrides the generic integration rule above. $(objroot)test/integration/tcache_fiber_migration$(EXE): $(objroot)test/integration/tcache_fiber_migration.$(O) $(objroot)lib/$(LIBJEMALLOC).$(A) @mkdir -p $(@D) - $(CC) $(LDTARGET) $(objroot)test/integration/tcache_fiber_migration.$(O) -Wl,--whole-archive $(objroot)lib/$(LIBJEMALLOC).$(A) -Wl,--no-whole-archive $(LDFLAGS) -pthread $(LM) $(EXTRA_LDFLAGS) + $(CC) $(LDTARGET) $(objroot)test/integration/tcache_fiber_migration.$(O) -Wl,--whole-archive $(objroot)lib/$(LIBJEMALLOC).$(A) -Wl,--no-whole-archive $(LDFLAGS) -pthread $(filter -lrt -lstdc++,$(LIBS)) $(LM) $(EXTRA_LDFLAGS) $(objroot)test/integration/cpp/%$(EXE): $(objroot)test/integration/cpp/%.$(O) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) @mkdir -p $(@D)