Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .github/workflows/linux-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -692,4 +692,23 @@ jobs:
make check


# Whole-program ThinLTO lane: builds with clang and the LLVM binutils, the je_
# public prefix, and -flto=thin / -fuse-ld=lld, then runs `make check`.
# The same job text is returned by generate_linux_lto_job() in
# scripts/gen_gh_actions.py; keep the two in sync.
# NOTE(review): presumably this file is regenerated from that script -- confirm
# before editing the job here by hand.
test-linux-lto:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4

- name: Install clang, lld and llvm
run: |
sudo apt-get update
sudo apt-get install -y clang lld llvm

- name: Build and test (whole-program ThinLTO, je_ prefix)
run: |
autoconf
CC=clang AR=llvm-ar NM=llvm-nm RANLIB=llvm-ranlib \
./configure --with-jemalloc-prefix=je_ EXTRA_CFLAGS=-flto=thin
make -j3 EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld"
make -j3 tests EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld"
make check


16 changes: 16 additions & 0 deletions Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,14 @@ ifeq (@enable_experimental_smallocx@, 1)
TESTS_INTEGRATION += \
$(srcroot)test/integration/smallocx.c
endif
# tcache_fiber_migration is the standalone LTO reproducer for issue #2890.  Its
# dedicated link rule (further down) links the test against the static archive
# with --whole-archive so that the allocator fastpath is inlined next to the
# swapcontext.  That rule needs the archive, which is why the test is only
# registered when enable_static is 1.  The bug reproduces only under
# whole-program LTO; in any other build the test is a no-op guard.
ifeq ($(enable_static), 1)
TESTS_INTEGRATION += \
  $(srcroot)test/integration/tcache_fiber_migration.c
endif
ifeq (@enable_cxx@, 1)
CPP_SRCS := $(srcroot)src/jemalloc_cpp.cpp
TESTS_INTEGRATION_CPP := $(srcroot)test/integration/cpp/basic.cpp \
Expand Down Expand Up @@ -564,6 +572,14 @@ $(objroot)test/integration/%$(EXE): $(objroot)test/integration/%.$(O) $(C_TESTLI
@mkdir -p $(@D)
$(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LJEMALLOC) $(LDFLAGS) $(filter-out -lm,$(filter -lrt -pthread -lstdc++,$(LIBS))) $(LM) $(EXTRA_LDFLAGS)

# Link rule for tcache_fiber_migration (issue #2890).  The test must have the
# allocator inlined next to the swapcontext, so jemalloc is linked statically
# (the archive is wrapped in --whole-archive) and neither the test harness
# objects nor the shared library are put on the link line.  The inlining itself
# comes from whole-program LTO, i.e. from the flags the build was given.
# Being an explicit rule, this takes precedence over the generic
# test/integration/% pattern rule above.
#   $<                    -> the test's object file (first prerequisite)
#   $(filter %.$(A),$^)   -> the static jemalloc archive
$(objroot)test/integration/tcache_fiber_migration$(EXE): $(objroot)test/integration/tcache_fiber_migration.$(O) $(objroot)lib/$(LIBJEMALLOC).$(A)
	@mkdir -p $(@D)
	$(CC) $(LDTARGET) $< -Wl,--whole-archive $(filter %.$(A),$^) -Wl,--no-whole-archive $(LDFLAGS) -pthread $(filter -lrt -lstdc++,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)

$(objroot)test/integration/cpp/%$(EXE): $(objroot)test/integration/cpp/%.$(O) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB)
@mkdir -p $(@D)
$(CXX) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) $(LDFLAGS) $(filter-out -lm,$(LIBS)) -lm $(EXTRA_LDFLAGS)
Expand Down
2 changes: 2 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -2775,6 +2775,8 @@ if test "x${je_cv_tls_model}" = "xyes" -a \
AC_DEFINE([JEMALLOC_TLS_MODEL],
[__attribute__((tls_model("initial-exec")))],
[ ])
AC_DEFINE([JEMALLOC_TLS_MODEL_INITIAL_EXEC], [ ],
[Defined when the TSD thread-locals use the initial-exec (static) TLS model, i.e. live at a fixed offset from the thread pointer.])
else
AC_DEFINE([JEMALLOC_TLS_MODEL], [ ], [ ])
fi
Expand Down
7 changes: 7 additions & 0 deletions include/jemalloc/internal/jemalloc_internal_defs.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,13 @@
/* Non-empty if the tls_model attribute is supported. */
#undef JEMALLOC_TLS_MODEL

/*
* Defined when the TSD thread-locals use the initial-exec (static) TLS model,
* i.e. live at a fixed offset from the thread pointer. Gates the fast path of
* JEMALLOC_TLS_ADDR (see tsd_internals.h).
*/
#undef JEMALLOC_TLS_MODEL_INITIAL_EXEC

/*
* JEMALLOC_DEBUG enables assertions and other sanity checks, and disables
* inline functions.
Expand Down
86 changes: 86 additions & 0 deletions include/jemalloc/internal/tsd_internals.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,92 @@
#include "jemalloc/internal/util.h"
#include "jemalloc/internal/witness.h"

/*
* JEMALLOC_TLS_ADDR(tlsvar): the address of a thread-local, read so the compiler
* cannot hoist it across a user-space context switch (swapcontext, ucontext /
* coroutine runtimes). A plain `&tlsvar` is loop-invariant, so under LTO the
* compiler may hoist the read out of the inlined `malloc`/`free` and keep it in
* a register; if the context then resumes on another OS thread, that stale
* tsd/tcache belongs to the previous thread and the two race on it -- heap
* corruption visible only under LTO. See
* https://github.com/jemalloc/jemalloc/issues/2890
*
* Fast path (static TLS only, hence gated on JEMALLOC_TLS_MODEL_INITIAL_EXEC --
* jemalloc's default): the address is `thread_pointer + offset` with a
* thread-independent offset. The offset is captured once at runtime into a
* global by a noinline helper (so both of its terms are evaluated at a single
* point on a single thread), and the thread pointer is re-read on every call
* with a `volatile` asm the optimizer may not hoist or CSE (per-arch reads from
* mimalloc's mi_prim_tls_slot). The offset must NOT be computed inline as
* `&tlsvar - thread_pointer`: the compiler can hoist the (stale) `&tlsvar`
* while re-materializing the thread-pointer terms, and `fresh_tp + (stale_addr
* - fresh_tp)` cancels the fresh read back to the stale address.
*
* Otherwise (dynamic TLS models, unlisted arch, MinGW): a per-variable noinline
* accessor takes the address behind a `memory` barrier -- an opaque call the
* optimizer can neither inline, prove pure, nor hoist/CSE across the switch.
* Correct for every TLS model, at one real call per access. Non-GNU compilers
* (MSVC) have no inline asm and keep the plain `&tlsvar`.
*/
#if defined(__GNUC__) && !defined(_WIN32) && \
defined(JEMALLOC_TLS_MODEL_INITIAL_EXEC) && \
(defined(__aarch64__) || defined(__arm__) || defined(__x86_64__) || \
defined(__i386__))
/*
 * Read the calling thread's thread pointer and return it as a `char *`, so
 * callers can do byte-granular pointer arithmetic on it.
 *
 * Every branch is a single `__volatile__` asm statement with one output
 * operand and no inputs or clobbers; exactly one branch is compiled, selected
 * by architecture (and, for aarch64/x86_64, by __APPLE__).  The enclosing #if
 * limits this function to aarch64, arm, x86_64 and i386.
 */
JEMALLOC_ALWAYS_INLINE char *
jemalloc_thread_pointer(void) {
char *thread_pointer;
# if defined(__aarch64__) && defined(__APPLE__)
/* Apple aarch64: read tpidrro_el0, then clear the low three bits. */
__asm__ __volatile__("mrs %0, tpidrro_el0\n\tbic %0, %0, #7" : "=r"(thread_pointer));
# elif defined(__aarch64__)
/* Other aarch64: read tpidr_el0 unmodified. */
__asm__ __volatile__("mrs %0, tpidr_el0" : "=r"(thread_pointer));
# elif defined(__arm__)
/* 32-bit arm: read coprocessor register c13/c0/3, then clear the low two bits. */
__asm__ __volatile__("mrc p15, 0, %0, c13, c0, 3\n\tbic %0, %0, #3" : "=r"(thread_pointer));
# elif defined(__x86_64__) && defined(__APPLE__)
/*
 * Apple x86_64: load the 64-bit word at %gs:0.
 * NOTE(review): presumably the thread's self pointer -- confirm against ABI.
 */
__asm__ __volatile__("movq %%gs:0, %0" : "=r"(thread_pointer));
# elif defined(__x86_64__)
/* Other x86_64: load the 64-bit word at %fs:0. */
__asm__ __volatile__("movq %%fs:0, %0" : "=r"(thread_pointer));
# else /* __i386__ */
/* i386: load the 32-bit word at %gs:0. */
__asm__ __volatile__("movl %%gs:0, %0" : "=r"(thread_pointer));
# endif
return thread_pointer;
}
/* 1 is unreachable: tlsvar and the thread pointer are at least 4-aligned. */
# define JEMALLOC_TLS_OFFSET_UNINITIALIZED 1
/*
 * JEMALLOC_TLS_ADDR_DEFINE(tlsvar) -- fast-path variant.  For the thread-local
 * `tlsvar` it emits three static entities:
 *
 *   jemalloc_tls_offset_<tlsvar>
 *       Cached byte offset of `tlsvar` from the thread pointer.  Starts out as
 *       JEMALLOC_TLS_OFFSET_UNINITIALIZED.
 *   jemalloc_tls_offset_init_<tlsvar>()
 *       Noinline helper: computes `(char *)&tlsvar - jemalloc_thread_pointer()`,
 *       stores the result in the cache and returns it.
 *   jemalloc_tls_addr_<tlsvar>()
 *       Always-inline accessor: loads the cached offset, calls the helper if
 *       the value is still the sentinel, then returns a freshly read thread
 *       pointer plus that offset, cast to the type of `&tlsvar`.
 *
 * NOTE(review): the cache is a plain static that is read and written without
 * any synchronization; presumably this relies on every thread computing the
 * same offset -- confirm.
 */
# define JEMALLOC_TLS_ADDR_DEFINE(tlsvar) \
static UNUSED intptr_t jemalloc_tls_offset_##tlsvar = \
JEMALLOC_TLS_OFFSET_UNINITIALIZED; \
static UNUSED JEMALLOC_NOINLINE intptr_t \
jemalloc_tls_offset_init_##tlsvar(void) { \
intptr_t tls_offset = (intptr_t)((char *)&(tlsvar) - \
jemalloc_thread_pointer()); \
jemalloc_tls_offset_##tlsvar = tls_offset; \
return tls_offset; \
} \
JEMALLOC_ALWAYS_INLINE __typeof__(&(tlsvar)) \
jemalloc_tls_addr_##tlsvar(void) { \
intptr_t tls_offset = jemalloc_tls_offset_##tlsvar; \
if (unlikely(tls_offset == \
JEMALLOC_TLS_OFFSET_UNINITIALIZED)) { \
tls_offset = jemalloc_tls_offset_init_##tlsvar(); \
} \
return (__typeof__(&(tlsvar)))(jemalloc_thread_pointer() + \
tls_offset); \
}
/*
 * JEMALLOC_TLS_ADDR(tlsvar) expands to a call of the accessor that
 * JEMALLOC_TLS_ADDR_DEFINE(tlsvar) emitted, so the matching DEFINE must be in
 * scope before the first use.
 */
# define JEMALLOC_TLS_ADDR(tlsvar) (jemalloc_tls_addr_##tlsvar())
#elif defined(__GNUC__)
/*
 * Generic GNU-C variant, used when the fast-path condition above is not met.
 * JEMALLOC_TLS_ADDR_DEFINE(tlsvar) emits a noinline function
 * jemalloc_tls_addr_<tlsvar>() that takes `&tlsvar`, passes the pointer
 * through an empty `__volatile__` asm with a "+r" operand and a "memory"
 * clobber, and returns it.  JEMALLOC_TLS_ADDR(tlsvar) calls that function.
 */
# define JEMALLOC_TLS_ADDR_DEFINE(tlsvar) \
static UNUSED JEMALLOC_NOINLINE __typeof__(&(tlsvar)) \
jemalloc_tls_addr_##tlsvar(void) { \
__typeof__(&(tlsvar)) tls_addr = &(tlsvar); \
__asm__ __volatile__("" : "+r"(tls_addr) : : "memory"); \
return tls_addr; \
}
# define JEMALLOC_TLS_ADDR(tlsvar) (jemalloc_tls_addr_##tlsvar())
#else
/*
 * Non-GNU compilers: JEMALLOC_TLS_ADDR_DEFINE expands to nothing and
 * JEMALLOC_TLS_ADDR is the plain address-of expression.
 */
# define JEMALLOC_TLS_ADDR_DEFINE(tlsvar)
# define JEMALLOC_TLS_ADDR(tlsvar) (&(tlsvar))
#endif

/*
* Thread-Specific-Data layout
*
Expand Down
4 changes: 3 additions & 1 deletion include/jemalloc/internal/tsd_malloc_thread_cleanup.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ extern JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls;
extern JEMALLOC_TSD_TYPE_ATTR(bool) tsd_initialized;
extern bool tsd_booted;

JEMALLOC_TLS_ADDR_DEFINE(tsd_tls)

/* Initialization/cleanup. */
JEMALLOC_ALWAYS_INLINE bool
tsd_cleanup_wrapper(void) {
Expand Down Expand Up @@ -53,7 +55,7 @@ tsd_get_allocates(void) {
/* Get/set. */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_get(bool init) {
return &tsd_tls;
return JEMALLOC_TLS_ADDR(tsd_tls);
}
JEMALLOC_ALWAYS_INLINE void
tsd_set(tsd_t *val) {
Expand Down
4 changes: 3 additions & 1 deletion include/jemalloc/internal/tsd_tls.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ extern JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls;
extern pthread_key_t tsd_tsd;
extern bool tsd_booted;

JEMALLOC_TLS_ADDR_DEFINE(tsd_tls)

/* Initialization/cleanup. */
JEMALLOC_ALWAYS_INLINE bool
tsd_boot0(void) {
Expand Down Expand Up @@ -46,7 +48,7 @@ tsd_get_allocates(void) {
/* Get/set. */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_get(bool init) {
return &tsd_tls;
return JEMALLOC_TLS_ADDR(tsd_tls);
}

JEMALLOC_ALWAYS_INLINE void
Expand Down
4 changes: 3 additions & 1 deletion include/jemalloc/internal/tsd_win.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,8 @@ tsd_set(tsd_t *val) {
extern JEMALLOC_TSD_TYPE_ATTR(tsd_wrapper_t) tsd_wrapper_tls;
extern bool tsd_booted;

JEMALLOC_TLS_ADDR_DEFINE(tsd_wrapper_tls)

/* Initialization/cleanup. */
JEMALLOC_ALWAYS_INLINE bool
tsd_cleanup_wrapper(void) {
Expand Down Expand Up @@ -223,7 +225,7 @@ tsd_get_allocates(void) {
/* Get/set. */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_get(bool init) {
return &(tsd_wrapper_tls.val);
return &(JEMALLOC_TLS_ADDR(tsd_wrapper_tls)->val);
}

JEMALLOC_ALWAYS_INLINE void
Expand Down
30 changes: 30 additions & 0 deletions scripts/gen_gh_actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,34 @@ def generate_freebsd_job(arch):
return job


def generate_linux_lto_job():
    """Return the YAML text of the dedicated `test-linux-lto` CI job.

    The lane builds with whole-program ThinLTO and the je_ public prefix,
    statically linked. This is the configuration under which the
    tcache_fiber_migration reproducer (issue #2890) actually exercises the
    bug -- the allocator fastpath must be inlined next to the swapcontext,
    which only happens with static linking under LTO. llvm-ar/nm/ranlib are
    needed to archive the LTO bitcode; -fuse-ld=lld to link it.

    Takes no arguments: the job is a fixed template. main() joins the
    returned string with the other Linux jobs. The doubled backslash in the
    template is a Python escape for the single shell line-continuation
    backslash in the emitted YAML.
    """
    return """ test-linux-lto:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4

- name: Install clang, lld and llvm
run: |
sudo apt-get update
sudo apt-get install -y clang lld llvm

- name: Build and test (whole-program ThinLTO, je_ prefix)
run: |
autoconf
CC=clang AR=llvm-ar NM=llvm-nm RANLIB=llvm-ranlib \\
./configure --with-jemalloc-prefix=je_ EXTRA_CFLAGS=-flto=thin
make -j3 EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld"
make -j3 tests EXTRA_LDFLAGS="-flto=thin -fuse-ld=lld"
make check
"""


def main():
import sys

Expand All @@ -642,6 +670,7 @@ def main():
jobs = '\n'.join((
generate_linux_job(AMD64),
generate_linux_job(ARM64),
generate_linux_lto_job(),
))
print(GITHUB_ACTIONS_TEMPLATE.format(name='Linux CI', jobs=jobs))

Expand All @@ -665,6 +694,7 @@ def main():
linux_jobs = '\n'.join((
generate_linux_job(AMD64),
generate_linux_job(ARM64),
generate_linux_lto_job(),
))
macos_jobs = '\n'.join((
generate_macos_job(AMD64), # Intel
Expand Down
Loading
Loading