Skip to content
Draft
68 changes: 61 additions & 7 deletions include/jemalloc/internal/bin_inlines.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,31 @@ bin_slab_regind_impl(
size_t diff, regind;

/* Freeing a pointer outside the slab can cause assertion failure. */
assert((uintptr_t)ptr >= (uintptr_t)edata_addr_get(slab));
assert((uintptr_t)ptr < (uintptr_t)edata_past_get(slab));
/* Freeing an interior pointer can cause assertion failure. */
assert(((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab))
% (uintptr_t)bin_infos[binind].reg_size
== 0);
if (unlikely((uintptr_t)ptr < (uintptr_t)edata_addr_get(slab)
|| (uintptr_t)ptr >= (uintptr_t)edata_past_get(slab))) {
safety_check_fail(
"bin_slab_regind: ptr %p outside slab [%p, %p)\n",
ptr, edata_addr_get(slab), edata_past_get(slab));
}

diff = (size_t)((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab));

if (unlikely(diff % (uintptr_t)bin_infos[binind].reg_size != 0)) {
safety_check_fail(
"bin_slab_regind: ptr %p not aligned to reg_size "
"%zu (diff=%zu)\n",
ptr, bin_infos[binind].reg_size, diff);
}

/* Avoid doing division with a variable divisor. */
regind = div_compute(div_info, diff);
assert(regind < bin_infos[binind].nregs);

if (unlikely(regind >= bin_infos[binind].nregs)) {
safety_check_fail(
"bin_slab_regind: regind %zu >= nregs %u for "
"binind %u\n",
regind, bin_infos[binind].nregs, binind);
}
return regind;
}

Expand Down Expand Up @@ -80,9 +93,50 @@ bin_dalloc_locked_step(tsdn_t *tsdn, bool is_auto, bin_t *bin,
/* Freeing an unallocated pointer can cause assertion failure. */
assert(bitmap_get(slab_data->bitmap, &bin_info->bitmap_info, regind));

/* Debug: snapshot bitmap group before unset. */
size_t goff_dbg = regind >> LG_BITMAP_GROUP_NBITS;
bitmap_t before_dbg = *(volatile bitmap_t *)&slab_data->bitmap[goff_dbg];

bitmap_unset(slab_data->bitmap, &bin_info->bitmap_info, regind);

/* Debug: verify the bit was actually flipped. */
bitmap_t after_dbg = *(volatile bitmap_t *)&slab_data->bitmap[goff_dbg];
bitmap_t expected_bit = ZU(1) << (regind & BITMAP_GROUP_NBITS_MASK);
if (unlikely((before_dbg | expected_bit) != after_dbg)) {
safety_check_fail(
"bitmap_unset lost: binind %u regind %zu "
"goff %zu before %lx after %lx expected_bit %lx\n",
binind, regind, goff_dbg,
(unsigned long)before_dbg,
(unsigned long)after_dbg,
(unsigned long)expected_bit);
}

edata_nfree_inc(slab);

/* Debug: verify nfree/bitmap consistency after free. */
{
unsigned actual_free = 0;
unsigned ngroups =
#ifdef BITMAP_USE_TREE
bin_info->bitmap_info.levels[
bin_info->bitmap_info.nlevels].group_offset;
#else
bin_info->bitmap_info.ngroups;
#endif
for (unsigned gi = 0; gi < ngroups; gi++) {
actual_free += popcount_lu(slab_data->bitmap[gi]);
}
if (unlikely(actual_free != edata_nfree_get(slab))) {
safety_check_fail(
"bin_dalloc_locked_step: post-free "
"nfree/bitmap mismatch for binind %u "
"regind %zu: nfree=%u actual=%u\n",
binind, regind,
edata_nfree_get(slab), actual_free);
}
}

if (config_stats) {
info->ndalloc++;
}
Expand Down
6 changes: 3 additions & 3 deletions include/jemalloc/internal/bit_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,19 @@
*/
/*
 * ffs_llu: returns JEMALLOC_INTERNAL_FFSLL(x) - 1 for an unsigned long long.
 * Presumably this is the zero-based index of the lowest set bit of x (the
 * "- 1" suggests the underlying primitive is one-based) -- confirm against
 * the definition of JEMALLOC_INTERNAL_FFSLL.
 *
 * x must be nonzero.
 */
static inline unsigned
ffs_llu(unsigned long long x) {
/*
 * NOTE(review): the same precondition is stated twice, once via util_assume
 * and once via assert.  This looks like the before/after lines of a change
 * shown together; probably only one of the two is intended -- confirm.
 */
util_assume(x != 0);
assert(x != 0);
return JEMALLOC_INTERNAL_FFSLL(x) - 1;
}

/*
 * ffs_lu: returns JEMALLOC_INTERNAL_FFSL(x) - 1 for an unsigned long.
 * Presumably this is the zero-based index of the lowest set bit of x (the
 * "- 1" suggests the underlying primitive is one-based) -- confirm against
 * the definition of JEMALLOC_INTERNAL_FFSL.
 *
 * x must be nonzero.
 */
static inline unsigned
ffs_lu(unsigned long x) {
/*
 * NOTE(review): the same precondition is stated twice, once via util_assume
 * and once via assert.  This looks like the before/after lines of a change
 * shown together; probably only one of the two is intended -- confirm.
 */
util_assume(x != 0);
assert(x != 0);
return JEMALLOC_INTERNAL_FFSL(x) - 1;
}

/*
 * ffs_u: returns JEMALLOC_INTERNAL_FFS(x) - 1 for an unsigned int.
 * Presumably this is the zero-based index of the lowest set bit of x (the
 * "- 1" suggests the underlying primitive is one-based) -- confirm against
 * the definition of JEMALLOC_INTERNAL_FFS.
 *
 * x must be nonzero.
 */
static inline unsigned
ffs_u(unsigned x) {
/*
 * NOTE(review): the same precondition is stated twice, once via util_assume
 * and once via assert.  This looks like the before/after lines of a change
 * shown together; probably only one of the two is intended -- confirm.
 */
util_assume(x != 0);
assert(x != 0);
return JEMALLOC_INTERNAL_FFS(x) - 1;
}

Expand Down
46 changes: 42 additions & 4 deletions include/jemalloc/internal/bitmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/bit_util.h"
#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/safety_check.h"

typedef unsigned long bitmap_t;
#define LG_SIZEOF_BITMAP LG_SIZEOF_LONG
Expand Down Expand Up @@ -224,6 +225,12 @@ bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) {
assert(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK)));
g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK);
*gp = g;
/* Verify the store was not eliminated by DSE. */
if (unlikely(*(volatile bitmap_t *)gp != g)) {
safety_check_fail(
"bitmap_set: leaf store eliminated at group %zu, "
"expected %lx\n", goff, (unsigned long)g);
}
assert(bitmap_get(bitmap, binfo, bit));
#ifdef BITMAP_USE_TREE
/* Propagate group state transitions up the tree. */
Expand All @@ -237,6 +244,13 @@ bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) {
assert(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK)));
g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK);
*gp = g;
/* Verify tree store was not eliminated. */
if (unlikely(*(volatile bitmap_t *)gp != g)) {
safety_check_fail(
"bitmap_set: tree store eliminated at "
"level %u group %zu, expected %lx\n",
i, goff, (unsigned long)g);
}
if (g != 0) {
break;
}
Expand Down Expand Up @@ -319,19 +333,43 @@ bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo) {

#ifdef BITMAP_USE_TREE
i = binfo->nlevels - 1;
g = bitmap[binfo->levels[i].group_offset];
/*
* Use volatile reads to prevent LTO from hoisting/caching bitmap
* loads across iterations when bitmap_sfu is called in a loop
* (e.g. bin_slab_reg_alloc_batch). Without volatile, the compiler
* may reuse a stale bitmap value from before bitmap_set's store,
* causing the same bit to be returned twice.
*/
g = *(volatile bitmap_t *)&bitmap[binfo->levels[i].group_offset];
if (unlikely(g == 0)) {
safety_check_fail(
"bitmap_sfu: tree root is zero (bitmap full), "
"nlevels %u\n", binfo->nlevels);
}
bit = ffs_lu(g);
while (i > 0) {
i--;
g = bitmap[binfo->levels[i].group_offset + bit];
g = *(volatile bitmap_t *)&bitmap[
binfo->levels[i].group_offset + bit];
if (unlikely(g == 0)) {
safety_check_fail(
"bitmap_sfu: tree level %u group is zero "
"at offset %zu\n", i,
binfo->levels[i].group_offset + bit);
}
bit = (bit << LG_BITMAP_GROUP_NBITS) + ffs_lu(g);
}
#else
i = 0;
g = bitmap[0];
g = *(volatile bitmap_t *)&bitmap[0];
while (g == 0) {
i++;
g = bitmap[i];
if (unlikely(i >= BITMAP_BITS2GROUPS(binfo->nbits))) {
safety_check_fail(
"bitmap_sfu: all %u groups are zero "
"(bitmap full)\n", i);
}
g = *(volatile bitmap_t *)&bitmap[i];
}
bit = (i << LG_BITMAP_GROUP_NBITS) + ffs_lu(g);
#endif
Expand Down
15 changes: 12 additions & 3 deletions include/jemalloc/internal/cache_bin.h
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,7 @@ cache_bin_alloc_impl(cache_bin_t *bin, bool *success, bool adjust_low_water) {
* This may read from the empty position; however the loaded value won't
* be used. It's safe because the stack has one more slot reserved.
*/
void *ret = *bin->stack_head;
void *ret = *(void * volatile *)bin->stack_head;
cache_bin_sz_t low_bits = (cache_bin_sz_t)(uintptr_t)bin->stack_head;
void **new_head = bin->stack_head + 1;

Expand All @@ -403,6 +403,7 @@ cache_bin_alloc_impl(cache_bin_t *bin, bool *success, bool adjust_low_water) {
if (likely(low_bits != bin->low_bits_low_water)) {
bin->stack_head = new_head;
*success = true;
tcache_debug_on_pop(ret);
return ret;
}
if (!adjust_low_water) {
Expand All @@ -418,6 +419,7 @@ cache_bin_alloc_impl(cache_bin_t *bin, bool *success, bool adjust_low_water) {
bin->stack_head = new_head;
bin->low_bits_low_water = (cache_bin_sz_t)(uintptr_t)new_head;
*success = true;
tcache_debug_on_pop(ret);
return ret;
}
*success = false;
Expand Down Expand Up @@ -711,8 +713,15 @@ static inline void
cache_bin_finish_flush(
cache_bin_t *bin, cache_bin_ptr_array_t *arr, cache_bin_sz_t nflushed) {
unsigned rem = cache_bin_ncached_get_local(bin) - nflushed;
memmove(
bin->stack_head + nflushed, bin->stack_head, rem * sizeof(void *));
/*
* Use volatile pointers to prevent LTO from optimizing this
* memmove based on built-in memcpy/memmove/memset knowledge.
* Misoptimization here can leave stale flushed pointers in the
* bin, causing duplicate allocations.
*/
volatile void *dst = bin->stack_head + nflushed;
volatile void *src = bin->stack_head;
memmove((void *)dst, (void *)src, rem * sizeof(void *));
bin->stack_head += nflushed;
cache_bin_low_water_adjust(bin);
/* Reset the bin stats as it's merged during flush. */
Expand Down
9 changes: 5 additions & 4 deletions include/jemalloc/internal/div.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/safety_check.h"

/*
* This module does the division that computes the index of a region in a slab,
Expand All @@ -25,17 +26,17 @@ void div_init(div_info_t *div_info, size_t divisor);

/*
 * div_compute: computes a quotient for n without a division instruction, by
 * multiplying n with div_info->magic in 64-bit arithmetic and keeping the
 * bits above the low 32.
 *
 * div_info: precomputed division state; div_info->d is presumably the
 *           divisor that was passed to div_init -- confirm there.
 * n:        the dividend.  It must fit in 32 bits; this is checked below.
 *
 * Returns the computed quotient.  When JEMALLOC_DEBUG is defined, the result
 * is additionally asserted to be exact (i * div_info->d == n), i.e. n is
 * expected to be a multiple of the divisor.
 */
static inline size_t
div_compute(div_info_t *div_info, size_t n) {
/*
 * NOTE(review): the assert and the safety check below test the same bound.
 * Where the assert is active it fires first, so the safety_check_fail
 * message cannot be reached in that configuration.  This looks like the
 * before/after lines of a change shown together -- confirm which one is
 * intended to remain.
 */
assert(n <= (uint32_t)-1);
if (unlikely(n > (uint32_t)-1)) {
safety_check_fail(
"div_compute: n=%zu exceeds uint32 range\n", n);
}
/*
* This generates, e.g. mov; imul; shr on x86-64. On a 32-bit machine,
* the compilers I tried were all smart enough to turn this into the
* appropriate "get the high 32 bits of the result of a multiply" (e.g.
* mul; mov edx eax; on x86, umull on arm, etc.).
*/
size_t i = ((uint64_t)n * (uint64_t)div_info->magic) >> 32;
/* Debug-only exactness check: the quotient times the divisor must give n. */
#ifdef JEMALLOC_DEBUG
assert(i * div_info->d == n);
#endif
return i;
}

Expand Down
3 changes: 3 additions & 0 deletions include/jemalloc/internal/edata.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "jemalloc/internal/slab_data.h"
#include "jemalloc/internal/sz.h"
#include "jemalloc/internal/typed_list.h"
#include <jemalloc/internal/util.h>

/*
* sizeof(edata_t) is 128 bytes on 64-bit architectures. Ensure the alignment
Expand Down Expand Up @@ -572,6 +573,8 @@ edata_nfree_inc(edata_t *edata) {
/*
 * Decrement the free-region counter packed into edata->e_bits by removing
 * one unit at EDATA_BITS_NFREE_SHIFT.
 *
 * The extent is asserted to be a slab.  If edata_nfree_get() already reports
 * zero, the function traps instead of performing the subtraction, which
 * would otherwise underflow the counter.
 */
static inline void
edata_nfree_dec(edata_t *edata) {
	/* One count of the nfree field, positioned within e_bits. */
	const uint64_t nfree_unit = (uint64_t)1U << EDATA_BITS_NFREE_SHIFT;

	assert(edata_slab_get(edata));

	/* Refuse to go below zero; this check is active in all builds. */
	if (unlikely(edata_nfree_get(edata) == 0)) {
		__builtin_trap();
	}

	edata->e_bits -= nfree_unit;
}

Expand Down
3 changes: 1 addition & 2 deletions include/jemalloc/internal/emap.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,6 @@ emap_assert_not_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {

JEMALLOC_ALWAYS_INLINE bool
emap_edata_in_transition(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
assert(config_debug);
emap_assert_mapped(tsdn, emap, edata);

EMAP_DECLARE_RTREE_CTX;
Expand All @@ -171,7 +170,7 @@ JEMALLOC_ALWAYS_INLINE bool
emap_edata_is_acquired(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
if (!config_debug) {
/* For assertions only. */
return false;
return true;
}

/*
Expand Down
9 changes: 9 additions & 0 deletions include/jemalloc/internal/safety_check.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@ typedef void (*safety_check_abort_hook_t)(const char *message);
/* Can set to NULL for a default. */
void safety_check_set_abort(safety_check_abort_hook_t abort_fn);

/*
 * Debug-only tcache instrumentation hooks.  Their definitions are not
 * visible from this header, so the notes below restate the declared intent
 * only.
 * NOTE(review): these are tcache-specific but declared in the safety-check
 * header -- confirm that this placement is intended.
 */
/*
 * Debug: pre-allocate the backtrace table.  Intended to be called from
 * tcache_init.
 */
void tcache_bt_ensure_table(void);
/* Debug: record a backtrace for ptr when it is pushed onto a tcache. */
void tcache_debug_bt_record(void *ptr);
/* Debug: remove the backtrace record for ptr when it is popped. */
void tcache_debug_on_pop(void *ptr);
/*
 * Debug: scan for duplicate pointers during a tcache flush and print their
 * backtraces.  ptrs is the array being flushed; nflush is presumably the
 * number of entries in ptrs -- confirm against the definition.
 */
void tcache_debug_check_flush(void **ptrs, unsigned nflush);

#define REDZONE_SIZE ((size_t)32)
#define REDZONE_FILL_VALUE 0xBC

Expand Down
6 changes: 6 additions & 0 deletions include/jemalloc/internal/tcache_inlines.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,12 @@ tcache_dalloc_small(
cache_bin_sz_t max = cache_bin_ncached_max_get(bin);
unsigned remain = max >> opt_lg_tcache_flush_small_div;
tcache_bin_flush_small(tsd, tcache, bin, binind, remain);
/*
* Compiler barrier: force reload of bin->stack_head after
* flush. Without this, LTO may cache stack_head from before
* the flush and use a stale value in the second dalloc_easy.
*/
__asm__ volatile("" : "+m"(*bin));
bool ret = cache_bin_dalloc_easy(bin, ptr);
assert(ret);
}
Expand Down
Loading
Loading