From 82d5d2abf966da0bcf87302cb3de657d040d2d49 Mon Sep 17 00:00:00 2001 From: struct Date: Mon, 4 May 2026 22:18:52 -0400 Subject: [PATCH 1/3] optionally use arm intrinsics to debug functions --- src/iso_alloc_profiler.c | 87 ++++++++++++++++++++++++++++++++++++++++ src/iso_alloc_sanity.c | 54 ++++++++++++++++++++++++- 2 files changed, 140 insertions(+), 1 deletion(-) diff --git a/src/iso_alloc_profiler.c b/src/iso_alloc_profiler.c index 9e4078d..0811b91 100644 --- a/src/iso_alloc_profiler.c +++ b/src/iso_alloc_profiler.c @@ -125,6 +125,92 @@ INTERNAL_HIDDEN uint64_t _iso_alloc_zone_leak_detector(iso_alloc_zone_t *zone, b uint32_t was_used = 0; int64_t bms = zone->bitmap_size / sizeof(bitmap_index_t); +#if USE_NEON + /* Process the bitmap two qwords at a time. The common case is that a + * qword is entirely zero (every chunk in those 32 slots is free and + * never used) — vectorising the zero-check lets us skip both qwords at + * once. For each non-zero qword we popcount the was_used pattern in + * O(1) and walk only the chunks whose low bit is set with ctz, instead + * of scanning all 32 pairs. */ + int64_t i = 0; + const int64_t bms_pair = bms & ~(int64_t) 1; + + for(; i < bms_pair; i += 2) { + int64x2_t v = vld1q_s64((const int64_t *) &bm[i]); + const uint64_t lane0 = (uint64_t) vgetq_lane_s64(v, 0); + const uint64_t lane1 = (uint64_t) vgetq_lane_s64(v, 1); + + if((lane0 | lane1) == 0) { + continue; + } + + const uint64_t lanes[2] = {lane0, lane1}; + for(int k = 0; k < 2; k++) { + uint64_t bts = lanes[k]; + if(bts == 0) { + continue; + } + + /* was_used: chunks encoded 01 (low=0, high=1). The mask of + * odd-bit positions whose paired even bit is clear is + * (~bts) & (bts >> 1) & USED_BIT_VECTOR. */ + uint64_t was_used_mask = (~bts) & (bts >> 1) & USED_BIT_VECTOR; + was_used += __builtin_popcountll(was_used_mask); + + /* Chunks with low bit set are either in-use (10) or canary + * (11). Walk just those positions to verify canaries. */ + uint64_t in_use_low = bts & USED_BIT_VECTOR; + while(in_use_low) { + int j = __builtin_ctzll(in_use_low); + in_use_low &= in_use_low - 1; + + int64_t bit_two = GET_BIT(bts, (j + 1)); + bit_slot_t bit_slot = (((bitmap_index_t)(i + k)) * BITS_PER_QWORD) + j; + const void *leak = (zone->user_pages_start + ((bit_slot >> 1) * zone->chunk_size)); + + if(bit_two == 1 && (check_canary_no_abort(zone, leak) != ERR)) { + continue; + } else { + in_use++; + + if(profile == false) { + LOG("Leaked chunk (%d) in zone[%d] of %d bytes detected at 0x%p (bit position = %d)", in_use, zone->index, zone->chunk_size, leak, bit_slot); + } + } + } + } + } + + for(; i < bms; i++) { + uint64_t bts = (uint64_t) bm[i]; + if(bts == 0) { + continue; + } + + uint64_t was_used_mask = (~bts) & (bts >> 1) & USED_BIT_VECTOR; + was_used += __builtin_popcountll(was_used_mask); + + uint64_t in_use_low = bts & USED_BIT_VECTOR; + while(in_use_low) { + int j = __builtin_ctzll(in_use_low); + in_use_low &= in_use_low - 1; + + int64_t bit_two = GET_BIT(bts, (j + 1)); + bit_slot_t bit_slot = ((bitmap_index_t) i * BITS_PER_QWORD) + j; + const void *leak = (zone->user_pages_start + ((bit_slot >> 1) * zone->chunk_size)); + + if(bit_two == 1 && (check_canary_no_abort(zone, leak) != ERR)) { + continue; + } else { + in_use++; + + if(profile == false) { + LOG("Leaked chunk (%d) in zone[%d] of %d bytes detected at 0x%p (bit position = %d)", in_use, zone->index, zone->chunk_size, leak, bit_slot); + } + } + } + } +#else for(bitmap_index_t i = 0; i < bms; i++) { for(int j = 0; j < BITS_PER_QWORD; j += BITS_PER_CHUNK) { @@ -161,6 +247,7 @@ INTERNAL_HIDDEN uint64_t _iso_alloc_zone_leak_detector(iso_alloc_zone_t *zone, b } } } +#endif if(profile == false) { LOG("Zone[%d] Total number of %d byte chunks(%d) used and free'd (%d) (%d percent), in use = %d", zone->index, zone->chunk_size, zone->chunk_count, diff --git a/src/iso_alloc_sanity.c b/src/iso_alloc_sanity.c index abe8e76..1acc390 100644 --- a/src/iso_alloc_sanity.c +++ b/src/iso_alloc_sanity.c @@ -103,7 +103,58 @@ INTERNAL_HIDDEN void _verify_zone(iso_alloc_zone_t *zone) { } } - for(bitmap_index_t i = 0; i < zone->max_bitmap_idx; i++) { + const bitmap_index_t max = zone->max_bitmap_idx; + +#if USE_NEON + /* A chunk needs its canary verified when the high bit of its 2-bit + * slot is set (was-used-now-free=01 or canary=11). The mask of all + * odd bit positions is ~USED_BIT_VECTOR. We process two qwords at a + * time and quick-reject any pair that has no odd bits set, then walk + * just the set odd-bit positions with ctz instead of checking all 32. */ + const uint64_t CANARY_BIT_VECTOR = ~(uint64_t) USED_BIT_VECTOR; + bitmap_index_t i = 0; + const bitmap_index_t max_pair = max & ~(bitmap_index_t) 1; + + for(; i < max_pair; i += 2) { + int64x2_t v = vld1q_s64((const int64_t *) &bm[i]); + const uint64_t lo = (uint64_t) vgetq_lane_s64(v, 0); + const uint64_t hi = (uint64_t) vgetq_lane_s64(v, 1); + + if(((lo | hi) & CANARY_BIT_VECTOR) == 0) { + continue; + } + + uint64_t mask0 = lo & CANARY_BIT_VECTOR; + while(mask0) { + int j = __builtin_ctzll(mask0); + mask0 &= mask0 - 1; + bit_slot = ((bit_slot_t) i << BITS_PER_QWORD_SHIFT) + j; + const void *p = POINTER_FROM_BITSLOT(zone, bit_slot); + check_canary(zone, p); + } + + uint64_t mask1 = hi & CANARY_BIT_VECTOR; + while(mask1) { + int j = __builtin_ctzll(mask1); + mask1 &= mask1 - 1; + bit_slot = ((bit_slot_t)(i + 1) << BITS_PER_QWORD_SHIFT) + j; + const void *p = POINTER_FROM_BITSLOT(zone, bit_slot); + check_canary(zone, p); + } + } + + for(; i < max; i++) { + uint64_t mask = (uint64_t) bm[i] & CANARY_BIT_VECTOR; + while(mask) { + int j = __builtin_ctzll(mask); + mask &= mask - 1; + bit_slot = ((bit_slot_t) i << BITS_PER_QWORD_SHIFT) + j; + const void *p = POINTER_FROM_BITSLOT(zone, bit_slot); + check_canary(zone, p); + } + } +#else + for(bitmap_index_t i = 0; i < max; i++) { bit_slot_t bsl = bm[i]; for(int64_t j = 1; j < BITS_PER_QWORD; j += BITS_PER_CHUNK) { /* If this bit is set it is either a free chunk or @@ -116,6 +167,7 @@ INTERNAL_HIDDEN void _verify_zone(iso_alloc_zone_t *zone) { } } } +#endif MASK_ZONE_PTRS(zone); } From f23083b7c61d2553fe278d26ab577e23b3db1438 Mon Sep 17 00:00:00 2001 From: struct Date: Tue, 5 May 2026 06:55:28 -0400 Subject: [PATCH 2/3] clang format --- src/iso_alloc_profiler.c | 2 +- src/iso_alloc_sanity.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/iso_alloc_profiler.c b/src/iso_alloc_profiler.c index 0811b91..67217c2 100644 --- a/src/iso_alloc_profiler.c +++ b/src/iso_alloc_profiler.c @@ -165,7 +165,7 @@ INTERNAL_HIDDEN uint64_t _iso_alloc_zone_leak_detector(iso_alloc_zone_t *zone, b in_use_low &= in_use_low - 1; int64_t bit_two = GET_BIT(bts, (j + 1)); - bit_slot_t bit_slot = (((bitmap_index_t)(i + k)) * BITS_PER_QWORD) + j; + bit_slot_t bit_slot = (((bitmap_index_t) (i + k)) * BITS_PER_QWORD) + j; const void *leak = (zone->user_pages_start + ((bit_slot >> 1) * zone->chunk_size)); if(bit_two == 1 && (check_canary_no_abort(zone, leak) != ERR)) { diff --git a/src/iso_alloc_sanity.c b/src/iso_alloc_sanity.c index 1acc390..4c27a61 100644 --- a/src/iso_alloc_sanity.c +++ b/src/iso_alloc_sanity.c @@ -137,7 +137,7 @@ INTERNAL_HIDDEN void _verify_zone(iso_alloc_zone_t *zone) { while(mask1) { int j = __builtin_ctzll(mask1); mask1 &= mask1 - 1; - bit_slot = ((bit_slot_t)(i + 1) << BITS_PER_QWORD_SHIFT) + j; + bit_slot = ((bit_slot_t) (i + 1) << BITS_PER_QWORD_SHIFT) + j; const void *p = POINTER_FROM_BITSLOT(zone, bit_slot); check_canary(zone, p); } From 7021c1e96e8956970c2ac820f5b89c19926bc9b0 Mon Sep 17 00:00:00 2001 From: struct Date: Tue, 5 May 2026 07:00:55 -0400 Subject: [PATCH 3/3] condense leak detector and zone verify loops with optional NEON quick-reject Both bitmap scans now share a single per-qword body using popcount/ctz on USED_BIT_VECTOR (and its complement for canary-bearing slots). The USE_NEON path is reduced to a 2-qword load + zero/canary quick-reject that lets us skip 64 chunk slots at a time when the bitmap range is empty of relevant bits, matching the structure used in iso_scan_zone_free_slot_slow. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/iso_alloc_profiler.c | 108 ++++++--------------------------------- src/iso_alloc_sanity.c | 68 ++++++------------------ 2 files changed, 31 insertions(+), 145 deletions(-) diff --git a/src/iso_alloc_profiler.c b/src/iso_alloc_profiler.c index 67217c2..3fea22b 100644 --- a/src/iso_alloc_profiler.c +++ b/src/iso_alloc_profiler.c @@ -125,71 +125,30 @@ INTERNAL_HIDDEN uint64_t _iso_alloc_zone_leak_detector(iso_alloc_zone_t *zone, b uint32_t was_used = 0; int64_t bms = zone->bitmap_size / sizeof(bitmap_index_t); + for(int64_t i = 0; i < bms;) { #if USE_NEON - /* Process the bitmap two qwords at a time. The common case is that a - * qword is entirely zero (every chunk in those 32 slots is free and - * never used) — vectorising the zero-check lets us skip both qwords at - * once. For each non-zero qword we popcount the was_used pattern in - * O(1) and walk only the chunks whose low bit is set with ctz, instead - * of scanning all 32 pairs. */ - int64_t i = 0; - const int64_t bms_pair = bms & ~(int64_t) 1; - - for(; i < bms_pair; i += 2) { - int64x2_t v = vld1q_s64((const int64_t *) &bm[i]); - const uint64_t lane0 = (uint64_t) vgetq_lane_s64(v, 0); - const uint64_t lane1 = (uint64_t) vgetq_lane_s64(v, 1); - - if((lane0 | lane1) == 0) { - continue; - } - - const uint64_t lanes[2] = {lane0, lane1}; - for(int k = 0; k < 2; k++) { - uint64_t bts = lanes[k]; - if(bts == 0) { + /* Two-qword quick-reject: load 16 bytes of bitmap and skip both + * qwords when every chunk in those 64 slots is free and never used. */ + if(i + 1 < bms) { + int64x2_t v = vld1q_s64((const int64_t *) &bm[i]); + if((vgetq_lane_s64(v, 0) | vgetq_lane_s64(v, 1)) == 0) { + i += 2; continue; } - - /* was_used: chunks encoded 01 (low=0, high=1). The mask of - * odd-bit positions whose paired even bit is clear is - * (~bts) & (bts >> 1) & USED_BIT_VECTOR. */ - uint64_t was_used_mask = (~bts) & (bts >> 1) & USED_BIT_VECTOR; - was_used += __builtin_popcountll(was_used_mask); - - /* Chunks with low bit set are either in-use (10) or canary - * (11). Walk just those positions to verify canaries. */ - uint64_t in_use_low = bts & USED_BIT_VECTOR; - while(in_use_low) { - int j = __builtin_ctzll(in_use_low); - in_use_low &= in_use_low - 1; - - int64_t bit_two = GET_BIT(bts, (j + 1)); - bit_slot_t bit_slot = (((bitmap_index_t) (i + k)) * BITS_PER_QWORD) + j; - const void *leak = (zone->user_pages_start + ((bit_slot >> 1) * zone->chunk_size)); - - if(bit_two == 1 && (check_canary_no_abort(zone, leak) != ERR)) { - continue; - } else { - in_use++; - - if(profile == false) { - LOG("Leaked chunk (%d) in zone[%d] of %d bytes detected at 0x%p (bit position = %d)", in_use, zone->index, zone->chunk_size, leak, bit_slot); - } - } - } } - } - - for(; i < bms; i++) { +#endif uint64_t bts = (uint64_t) bm[i]; if(bts == 0) { + i++; continue; } - uint64_t was_used_mask = (~bts) & (bts >> 1) & USED_BIT_VECTOR; - was_used += __builtin_popcountll(was_used_mask); + /* was_used (encoding 01: low=0, high=1) — popcount the odd-bit + * positions whose paired even bit is clear. */ + was_used += __builtin_popcountll((~bts) & (bts >> 1) & USED_BIT_VECTOR); + /* Chunks with low bit set are either in-use (10) or canary (11). + * Walk just those positions with ctz instead of testing all 32. */ uint64_t in_use_low = bts & USED_BIT_VECTOR; while(in_use_low) { int j = __builtin_ctzll(in_use_low); @@ -209,45 +168,8 @@ INTERNAL_HIDDEN uint64_t _iso_alloc_zone_leak_detector(iso_alloc_zone_t *zone, b } } } + i++; } -#else - for(bitmap_index_t i = 0; i < bms; i++) { - for(int j = 0; j < BITS_PER_QWORD; j += BITS_PER_CHUNK) { - - if(bm[i] == 0) { - continue; - } - - int64_t bit = GET_BIT(bm[i], j); - int64_t bit_two = GET_BIT(bm[i], (j + 1)); - - /* Chunk was used but is now free */ - if(bit == 0 && bit_two == 1) { - was_used++; - } - - if(bit == 1) { - /* Theres no difference between a leaked and previously - * used chunk (11) and a canary chunk (11). So in order - * to accurately report on leaks we need to verify the - * canary value. If it doesn't validate then we assume - * its a true leak and increment the in_use counter */ - bit_slot_t bit_slot = (i * BITS_PER_QWORD) + j; - const void *leak = (zone->user_pages_start + ((bit_slot >> 1) * zone->chunk_size)); - - if(bit_two == 1 && (check_canary_no_abort(zone, leak) != ERR)) { - continue; - } else { - in_use++; - - if(profile == false) { - LOG("Leaked chunk (%d) in zone[%d] of %d bytes detected at 0x%p (bit position = %d)", in_use, zone->index, zone->chunk_size, leak, bit_slot); - } - } - } - } - } -#endif if(profile == false) { LOG("Zone[%d] Total number of %d byte chunks(%d) used and free'd (%d) (%d percent), in use = %d", zone->index, zone->chunk_size, zone->chunk_count, diff --git a/src/iso_alloc_sanity.c b/src/iso_alloc_sanity.c index 4c27a61..56cea1a 100644 --- a/src/iso_alloc_sanity.c +++ b/src/iso_alloc_sanity.c @@ -103,47 +103,25 @@ INTERNAL_HIDDEN void _verify_zone(iso_alloc_zone_t *zone) { } } + /* A chunk needs canary verification when the high bit of its 2-bit + * slot is set (was-used-now-free=01 or canary=11) — the odd-bit mask. */ + const uint64_t CANARY_BIT_VECTOR = ~(uint64_t) USED_BIT_VECTOR; const bitmap_index_t max = zone->max_bitmap_idx; + for(bitmap_index_t i = 0; i < max;) { #if USE_NEON - /* A chunk needs its canary verified when the high bit of its 2-bit - * slot is set (was-used-now-free=01 or canary=11). The mask of all - * odd bit positions is ~USED_BIT_VECTOR. We process two qwords at a - * time and quick-reject any pair that has no odd bits set, then walk - * just the set odd-bit positions with ctz instead of checking all 32. */ - const uint64_t CANARY_BIT_VECTOR = ~(uint64_t) USED_BIT_VECTOR; - bitmap_index_t i = 0; - const bitmap_index_t max_pair = max & ~(bitmap_index_t) 1; - - for(; i < max_pair; i += 2) { - int64x2_t v = vld1q_s64((const int64_t *) &bm[i]); - const uint64_t lo = (uint64_t) vgetq_lane_s64(v, 0); - const uint64_t hi = (uint64_t) vgetq_lane_s64(v, 1); - - if(((lo | hi) & CANARY_BIT_VECTOR) == 0) { - continue; - } - - uint64_t mask0 = lo & CANARY_BIT_VECTOR; - while(mask0) { - int j = __builtin_ctzll(mask0); - mask0 &= mask0 - 1; - bit_slot = ((bit_slot_t) i << BITS_PER_QWORD_SHIFT) + j; - const void *p = POINTER_FROM_BITSLOT(zone, bit_slot); - check_canary(zone, p); - } - - uint64_t mask1 = hi & CANARY_BIT_VECTOR; - while(mask1) { - int j = __builtin_ctzll(mask1); - mask1 &= mask1 - 1; - bit_slot = ((bit_slot_t) (i + 1) << BITS_PER_QWORD_SHIFT) + j; - const void *p = POINTER_FROM_BITSLOT(zone, bit_slot); - check_canary(zone, p); + /* Two-qword quick-reject: skip 64 chunk slots when none of them + * have the high bit set (no canaries to verify in this range). */ + if(i + 1 < max) { + int64x2_t v = vld1q_s64((const int64_t *) &bm[i]); + uint64_t lo = (uint64_t) vgetq_lane_s64(v, 0); + uint64_t hi = (uint64_t) vgetq_lane_s64(v, 1); + if(((lo | hi) & CANARY_BIT_VECTOR) == 0) { + i += 2; + continue; + } } - } - - for(; i < max; i++) { +#endif uint64_t mask = (uint64_t) bm[i] & CANARY_BIT_VECTOR; while(mask) { int j = __builtin_ctzll(mask); @@ -152,22 +130,8 @@ INTERNAL_HIDDEN void _verify_zone(iso_alloc_zone_t *zone) { const void *p = POINTER_FROM_BITSLOT(zone, bit_slot); check_canary(zone, p); } + i++; } -#else - for(bitmap_index_t i = 0; i < max; i++) { - bit_slot_t bsl = bm[i]; - for(int64_t j = 1; j < BITS_PER_QWORD; j += BITS_PER_CHUNK) { - /* If this bit is set it is either a free chunk or - * a canary chunk. Either way it should have a set - * of canaries we can verify */ - if((GET_BIT(bsl, j)) == 1) { - bit_slot = (i << BITS_PER_QWORD_SHIFT) + j; - const void *p = POINTER_FROM_BITSLOT(zone, bit_slot); - check_canary(zone, p); - } - } - } -#endif MASK_ZONE_PTRS(zone); }