diff --git a/.github/workflows/linux-ci.yml b/.github/workflows/linux-ci.yml index c5e0c9aaf2..527a601dd9 100644 --- a/.github/workflows/linux-ci.yml +++ b/.github/workflows/linux-ci.yml @@ -692,4 +692,23 @@ jobs: make check + test-linux-lto: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + + - name: Install clang, lld and llvm + run: | + sudo apt-get update + sudo apt-get install -y clang lld llvm + + - name: Build and test (whole-program ThinLTO, je_ prefix) + run: | + autoconf + CC=clang AR=llvm-ar NM=llvm-nm RANLIB=llvm-ranlib \ + ./configure --with-jemalloc-prefix=je_ EXTRA_CFLAGS=-flto=thin + make -j3 EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld" + make -j3 tests EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld" + make check + diff --git a/Makefile.in b/Makefile.in index 459f98fb04..ba24f84494 100644 --- a/Makefile.in +++ b/Makefile.in @@ -334,6 +334,14 @@ ifeq (@enable_experimental_smallocx@, 1) TESTS_INTEGRATION += \ $(srcroot)test/integration/smallocx.c endif +# tcache_fiber_migration is a standalone LTO reproducer (issue #2890): the +# dedicated rule below links it against the static archive (--whole-archive) so +# the allocator fastpath inlines next to the swapcontext. Needs that archive, +# hence the enable_static guard; reproduces the bug only under whole-program +# LTO, a no-op guard otherwise. +ifeq ($(enable_static), 1) +TESTS_INTEGRATION += $(srcroot)test/integration/tcache_fiber_migration.c +endif ifeq (@enable_cxx@, 1) CPP_SRCS := $(srcroot)src/jemalloc_cpp.cpp TESTS_INTEGRATION_CPP := $(srcroot)test/integration/cpp/basic.cpp \ @@ -564,6 +572,14 @@ $(objroot)test/integration/%$(EXE): $(objroot)test/integration/%.$(O) $(C_TESTLI @mkdir -p $(@D) $(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LJEMALLOC) $(LDFLAGS) $(filter-out -lm,$(filter -lrt -pthread -lstdc++,$(LIBS))) $(LM) $(EXTRA_LDFLAGS) +# tcache_fiber_migration (issue #2890) must inline the allocator next to the +# swapcontext, so link jemalloc statically (--whole-archive), without the test +# harness/shared lib; whole-program LTO (from the build's flags) does the +# inlining. Explicit rule -- overrides the generic integration rule above. +$(objroot)test/integration/tcache_fiber_migration$(EXE): $(objroot)test/integration/tcache_fiber_migration.$(O) $(objroot)lib/$(LIBJEMALLOC).$(A) + @mkdir -p $(@D) + $(CC) $(LDTARGET) $(objroot)test/integration/tcache_fiber_migration.$(O) -Wl,--whole-archive $(objroot)lib/$(LIBJEMALLOC).$(A) -Wl,--no-whole-archive $(LDFLAGS) -pthread $(filter -lrt -lstdc++,$(LIBS)) $(LM) $(EXTRA_LDFLAGS) + $(objroot)test/integration/cpp/%$(EXE): $(objroot)test/integration/cpp/%.$(O) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) @mkdir -p $(@D) $(CXX) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) $(LDFLAGS) $(filter-out -lm,$(LIBS)) -lm $(EXTRA_LDFLAGS) diff --git a/configure.ac b/configure.ac index e57d0667e4..5ef440e8f3 100644 --- a/configure.ac +++ b/configure.ac @@ -2775,6 +2775,8 @@ if test "x${je_cv_tls_model}" = "xyes" -a \ AC_DEFINE([JEMALLOC_TLS_MODEL], [__attribute__((tls_model("initial-exec")))], [ ]) + AC_DEFINE([JEMALLOC_TLS_MODEL_INITIAL_EXEC], [ ], + [Defined when the TSD thread-locals use the initial-exec (static) TLS model, i.e. live at a fixed offset from the thread pointer.]) else AC_DEFINE([JEMALLOC_TLS_MODEL], [ ], [ ]) fi diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 31ae2e8ed2..841ae94233 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -148,6 +148,13 @@ /* Non-empty if the tls_model attribute is supported. */ #undef JEMALLOC_TLS_MODEL +/* + * Defined when the TSD thread-locals use the initial-exec (static) TLS model, + * i.e. live at a fixed offset from the thread pointer. Gates the fast path of + * JEMALLOC_TLS_ADDR (see tsd_internals.h). + */ +#undef JEMALLOC_TLS_MODEL_INITIAL_EXEC + /* * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables * inline functions. diff --git a/include/jemalloc/internal/tsd_internals.h b/include/jemalloc/internal/tsd_internals.h index f675587d73..55fac2c0e7 100644 --- a/include/jemalloc/internal/tsd_internals.h +++ b/include/jemalloc/internal/tsd_internals.h @@ -20,6 +20,92 @@ #include "jemalloc/internal/util.h" #include "jemalloc/internal/witness.h" +/* + * JEMALLOC_TLS_ADDR(tlsvar): the address of a thread-local, read so the compiler + * cannot hoist it across a user-space context switch (swapcontext, ucontext / + * coroutine runtimes). A plain `&tlsvar` is loop-invariant, so under LTO the + * compiler may hoist the read out of the inlined `malloc`/`free` and keep it in + * a register; if the context then resumes on another OS thread, that stale + * tsd/tcache belongs to the previous thread and the two race on it -- heap + * corruption visible only under LTO. See + * https://github.com/jemalloc/jemalloc/issues/2890 + * + * Fast path (static TLS only, hence gated on JEMALLOC_TLS_MODEL_INITIAL_EXEC -- + * jemalloc's default): the address is `thread_pointer + offset` with a + * thread-independent offset. The offset is captured once at runtime into a + * global by a noinline helper (so both of its terms are evaluated at a single + * point on a single thread), and the thread pointer is re-read on every call + * with a `volatile` asm the optimizer may not hoist or CSE (per-arch reads from + * mimalloc's mi_prim_tls_slot). The offset must NOT be computed inline as + * `&tlsvar - thread_pointer`: the compiler can hoist the (stale) `&tlsvar` + * while re-materializing the thread-pointer terms, and `fresh_tp + (stale_addr + * - fresh_tp)` cancels the fresh read back to the stale address. + * + * Otherwise (dynamic TLS models, unlisted arch, MinGW): a per-variable noinline + * accessor takes the address behind a `memory` barrier -- an opaque call the + * optimizer can neither inline, prove pure, nor hoist/CSE across the switch. + * Correct for every TLS model, at one real call per access. Non-GNU compilers + * (MSVC) have no inline asm and keep the plain `&tlsvar`. + */ +#if defined(__GNUC__) && !defined(_WIN32) && \ + defined(JEMALLOC_TLS_MODEL_INITIAL_EXEC) && \ + (defined(__aarch64__) || defined(__arm__) || defined(__x86_64__) || \ + defined(__i386__)) +JEMALLOC_ALWAYS_INLINE char * +jemalloc_thread_pointer(void) { + char *thread_pointer; +# if defined(__aarch64__) && defined(__APPLE__) + __asm__ __volatile__("mrs %0, tpidrro_el0\n\tbic %0, %0, #7" : "=r"(thread_pointer)); +# elif defined(__aarch64__) + __asm__ __volatile__("mrs %0, tpidr_el0" : "=r"(thread_pointer)); +# elif defined(__arm__) + __asm__ __volatile__("mrc p15, 0, %0, c13, c0, 3\n\tbic %0, %0, #3" : "=r"(thread_pointer)); +# elif defined(__x86_64__) && defined(__APPLE__) + __asm__ __volatile__("movq %%gs:0, %0" : "=r"(thread_pointer)); +# elif defined(__x86_64__) + __asm__ __volatile__("movq %%fs:0, %0" : "=r"(thread_pointer)); +# else /* __i386__ */ + __asm__ __volatile__("movl %%gs:0, %0" : "=r"(thread_pointer)); +# endif + return thread_pointer; +} +/* 1 is unreachable: tlsvar and the thread pointer are at least 4-aligned. */ +# define JEMALLOC_TLS_OFFSET_UNINITIALIZED 1 +# define JEMALLOC_TLS_ADDR_DEFINE(tlsvar) \ + static UNUSED intptr_t jemalloc_tls_offset_##tlsvar = \ + JEMALLOC_TLS_OFFSET_UNINITIALIZED; \ + static UNUSED JEMALLOC_NOINLINE intptr_t \ + jemalloc_tls_offset_init_##tlsvar(void) { \ + intptr_t tls_offset = (intptr_t)((char *)&(tlsvar) - \ + jemalloc_thread_pointer()); \ + jemalloc_tls_offset_##tlsvar = tls_offset; \ + return tls_offset; \ + } \ + JEMALLOC_ALWAYS_INLINE __typeof__(&(tlsvar)) \ + jemalloc_tls_addr_##tlsvar(void) { \ + intptr_t tls_offset = jemalloc_tls_offset_##tlsvar; \ + if (unlikely(tls_offset == \ + JEMALLOC_TLS_OFFSET_UNINITIALIZED)) { \ + tls_offset = jemalloc_tls_offset_init_##tlsvar(); \ + } \ + return (__typeof__(&(tlsvar)))(jemalloc_thread_pointer() + \ + tls_offset); \ + } +# define JEMALLOC_TLS_ADDR(tlsvar) (jemalloc_tls_addr_##tlsvar()) +#elif defined(__GNUC__) +# define JEMALLOC_TLS_ADDR_DEFINE(tlsvar) \ + static UNUSED JEMALLOC_NOINLINE __typeof__(&(tlsvar)) \ + jemalloc_tls_addr_##tlsvar(void) { \ + __typeof__(&(tlsvar)) tls_addr = &(tlsvar); \ + __asm__ __volatile__("" : "+r"(tls_addr) : : "memory"); \ + return tls_addr; \ + } +# define JEMALLOC_TLS_ADDR(tlsvar) (jemalloc_tls_addr_##tlsvar()) +#else +# define JEMALLOC_TLS_ADDR_DEFINE(tlsvar) +# define JEMALLOC_TLS_ADDR(tlsvar) (&(tlsvar)) +#endif + /* * Thread-Specific-Data layout * diff --git a/include/jemalloc/internal/tsd_malloc_thread_cleanup.h b/include/jemalloc/internal/tsd_malloc_thread_cleanup.h index 00756df1d0..10d3579b02 100644 --- a/include/jemalloc/internal/tsd_malloc_thread_cleanup.h +++ b/include/jemalloc/internal/tsd_malloc_thread_cleanup.h @@ -13,6 +13,8 @@ extern JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls; extern JEMALLOC_TSD_TYPE_ATTR(bool) tsd_initialized; extern bool tsd_booted; +JEMALLOC_TLS_ADDR_DEFINE(tsd_tls) + /* Initialization/cleanup. */ JEMALLOC_ALWAYS_INLINE bool tsd_cleanup_wrapper(void) { @@ -53,7 +55,7 @@ tsd_get_allocates(void) { /* Get/set. */ JEMALLOC_ALWAYS_INLINE tsd_t * tsd_get(bool init) { - return &tsd_tls; + return JEMALLOC_TLS_ADDR(tsd_tls); } JEMALLOC_ALWAYS_INLINE void tsd_set(tsd_t *val) { diff --git a/include/jemalloc/internal/tsd_tls.h b/include/jemalloc/internal/tsd_tls.h index 6536eb540c..968d1c0bdf 100644 --- a/include/jemalloc/internal/tsd_tls.h +++ b/include/jemalloc/internal/tsd_tls.h @@ -13,6 +13,8 @@ extern JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls; extern pthread_key_t tsd_tsd; extern bool tsd_booted; +JEMALLOC_TLS_ADDR_DEFINE(tsd_tls) + /* Initialization/cleanup. */ JEMALLOC_ALWAYS_INLINE bool tsd_boot0(void) { @@ -46,7 +48,7 @@ tsd_get_allocates(void) { /* Get/set. */ JEMALLOC_ALWAYS_INLINE tsd_t * tsd_get(bool init) { - return &tsd_tls; + return JEMALLOC_TLS_ADDR(tsd_tls); } JEMALLOC_ALWAYS_INLINE void diff --git a/include/jemalloc/internal/tsd_win.h b/include/jemalloc/internal/tsd_win.h index 8b22bec18d..d06cc27e7e 100644 --- a/include/jemalloc/internal/tsd_win.h +++ b/include/jemalloc/internal/tsd_win.h @@ -178,6 +178,8 @@ tsd_set(tsd_t *val) { extern JEMALLOC_TSD_TYPE_ATTR(tsd_wrapper_t) tsd_wrapper_tls; extern bool tsd_booted; +JEMALLOC_TLS_ADDR_DEFINE(tsd_wrapper_tls) + /* Initialization/cleanup. */ JEMALLOC_ALWAYS_INLINE bool tsd_cleanup_wrapper(void) { @@ -223,7 +225,7 @@ tsd_get_allocates(void) { /* Get/set. */ JEMALLOC_ALWAYS_INLINE tsd_t * tsd_get(bool init) { - return &(tsd_wrapper_tls.val); + return &(JEMALLOC_TLS_ADDR(tsd_wrapper_tls)->val); } JEMALLOC_ALWAYS_INLINE void diff --git a/scripts/gen_gh_actions.py b/scripts/gen_gh_actions.py index 4c5474ab7f..d40fc7f321 100755 --- a/scripts/gen_gh_actions.py +++ b/scripts/gen_gh_actions.py @@ -632,6 +632,34 @@ def generate_freebsd_job(arch): return job +def generate_linux_lto_job(): + """Dedicated lane: whole-program ThinLTO + the je_ public prefix, statically + linked. This is the configuration under which the tcache_fiber_migration + reproducer (issue #2890) actually exercises the bug -- the allocator + fastpath must be inlined next to the swapcontext, which only happens with + static linking under LTO. llvm-ar/nm/ranlib are needed to archive the LTO + bitcode; -fuse-ld=lld to link it.""" + return """ test-linux-lto: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + + - name: Install clang, lld and llvm + run: | + sudo apt-get update + sudo apt-get install -y clang lld llvm + + - name: Build and test (whole-program ThinLTO, je_ prefix) + run: | + autoconf + CC=clang AR=llvm-ar NM=llvm-nm RANLIB=llvm-ranlib \\ + ./configure --with-jemalloc-prefix=je_ EXTRA_CFLAGS=-flto=thin + make -j3 EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld" + make -j3 tests EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld" + make check +""" + + def main(): import sys @@ -642,6 +670,7 @@ def main(): jobs = '\n'.join(( generate_linux_job(AMD64), generate_linux_job(ARM64), + generate_linux_lto_job(), )) print(GITHUB_ACTIONS_TEMPLATE.format(name='Linux CI', jobs=jobs)) @@ -665,6 +694,7 @@ def main(): linux_jobs = '\n'.join(( generate_linux_job(AMD64), generate_linux_job(ARM64), + generate_linux_lto_job(), )) macos_jobs = '\n'.join(( generate_macos_job(AMD64), # Intel diff --git a/test/integration/tcache_fiber_migration.c b/test/integration/tcache_fiber_migration.c new file mode 100644 index 0000000000..e374af1f60 --- /dev/null +++ b/test/integration/tcache_fiber_migration.c @@ -0,0 +1,139 @@ +/* + * Standalone regression test for the thread-pointer hoisting bug (issue #2890): + * under whole-program LTO the inlined allocator fastpath caches the TLS-derived + * tcache base in a callee-saved register; a fiber resumed on a different OS + * thread then frees/allocs against the previous thread's tcache -> heap + * corruption. See JEMALLOC_TLS_ADDR in tsd_internals.h. + * + * Reproducing requires the allocator inlined next to the swapcontext, so: + * - call jemalloc directly (via JEMALLOC_MANGLE, so malloc/free bind to the + * configured-prefix symbols, not the libc wrappers, which do not inline); + * - link jemalloc *statically* (Makefile.in links this one test against + * libjemalloc.a with --whole-archive); + * - build with whole-program LTO. + * Hence it is a deliberately standalone program (no test harness, so it can be + * static-linked without symbol clashes). Without LTO it runs clean, so it is a + * no-op guard there. + */ +#include +#include +#include + +#define JEMALLOC_MANGLE +#include + +enum { + num_fibers = 128, + num_workers = 8, + ops_per_fiber = 20 * 1000, + fiber_stack_size = 1 << 16, + ready_queue_capacity = 512 /* power of two, > num_fibers */ +}; + +static ucontext_t fiber_context[num_fibers]; +/* Scheduler context to switch back to when a fiber yields; the resuming worker + * writes its own context here, so this changes when the fiber migrates. */ +static ucontext_t *return_context[num_fibers]; +static unsigned fiber_remaining_ops[num_fibers]; +static bool fiber_done[num_fibers]; + +static unsigned ready_queue[ready_queue_capacity]; +static unsigned ready_queue_head; +static unsigned ready_queue_tail; +static unsigned live_fibers; +static pthread_mutex_t queue_mtx = PTHREAD_MUTEX_INITIALIZER; + +static void +push_ready_fiber(unsigned id) { + pthread_mutex_lock(&queue_mtx); + ready_queue[ready_queue_head++ & (ready_queue_capacity - 1)] = id; + pthread_mutex_unlock(&queue_mtx); +} + +/* Returns a ready fiber id, -1 if none ready now, or -2 if all have finished. */ +static int +pop_ready_fiber(void) { + int id; + pthread_mutex_lock(&queue_mtx); + if (live_fibers == 0) { + id = -2; + } else if (ready_queue_tail != ready_queue_head) { + id = (int)ready_queue[ready_queue_tail++ & (ready_queue_capacity - 1)]; + } else { + id = -1; + } + pthread_mutex_unlock(&queue_mtx); + return id; +} + +static void +fiber_run(int id) { + void *p = malloc(64); + while (fiber_remaining_ops[id] > 0) { + fiber_remaining_ops[id]--; + free(p); /* push to the CURRENT thread's tcache */ + /* Yield; may be resumed on a different worker thread. */ + swapcontext(&fiber_context[id], return_context[id]); + p = malloc(64); /* pop; must come from the NEW thread's tcache */ + *(char *)p = (char)id; + } + free(p); + pthread_mutex_lock(&queue_mtx); + fiber_done[id] = true; + live_fibers--; + pthread_mutex_unlock(&queue_mtx); + swapcontext(&fiber_context[id], return_context[id]); +} + +static void * +worker_thread(void *arg) { + ucontext_t scheduler_context; + (void)arg; + for (;;) { + int id = pop_ready_fiber(); + if (id == -2) { + break; + } + if (id < 0) { + continue; + } + return_context[id] = &scheduler_context; + swapcontext(&scheduler_context, &fiber_context[id]); + if (!fiber_done[id]) { + push_ready_fiber((unsigned)id); + } + } + return NULL; +} + +int +main(void) { + void *stacks[num_fibers]; + pthread_t threads[num_workers]; + unsigned i; + + live_fibers = num_fibers; + for (i = 0; i < num_fibers; i++) { + stacks[i] = malloc(fiber_stack_size); + fiber_remaining_ops[i] = ops_per_fiber; + getcontext(&fiber_context[i]); + fiber_context[i].uc_stack.ss_sp = stacks[i]; + fiber_context[i].uc_stack.ss_size = fiber_stack_size; + fiber_context[i].uc_link = NULL; + makecontext(&fiber_context[i], (void (*)(void))fiber_run, 1, (int)i); + push_ready_fiber(i); + } + + for (i = 0; i < num_workers; i++) { + pthread_create(&threads[i], NULL, worker_thread, NULL); + } + for (i = 0; i < num_workers; i++) { + pthread_join(threads[i], NULL); + } + for (i = 0; i < num_fibers; i++) { + free(stacks[i]); + } + + /* Completing without a tcache-corruption crash is success. */ + return live_fibers == 0 ? 0 : 1; +}