From ef7665770a53a6f278078d337c6b1f1bafe97dcd Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 4 May 2026 08:34:56 -0700 Subject: [PATCH 1/4] Use adaptive collection threshold for free-threaded GC. --- Include/internal/pycore_interp_structs.h | 10 +- Python/gc_free_threading.c | 344 ++++++++--------------- 2 files changed, 129 insertions(+), 225 deletions(-) diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index 2d04c173e85abe..dd37463a03818a 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -264,12 +264,10 @@ struct _gc_runtime_state { /* True if gc.freeze() has been used. */ int freeze_active; - /* Memory usage of the process (RSS + swap) after last GC. */ - Py_ssize_t last_mem; - - /* This accumulates the new object count whenever collection is deferred - due to the RSS increase condition not being meet. Reset on collection. */ - Py_ssize_t deferred_count; + /* Adaptive threshold used to decide when to trigger a collection. + Adjusted after each collection based on the fraction of objects found to + be trash. */ + int adaptive_threshold; /* Mutex held for gc_should_collect_mem_usage(). */ PyMutex mutex; diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index b4fcd365592aa5..1a380d2f221309 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -17,30 +17,22 @@ #include "pydtrace.h" -// Platform-specific includes for get_process_mem_usage(). -#ifdef _WIN32 - #include - #include // For GetProcessMemoryInfo -#elif defined(__linux__) - #include // For sysconf, getpid -#elif defined(__APPLE__) - #include - #include // Required for TASK_VM_INFO - #include // For sysconf, getpid -#elif defined(__FreeBSD__) - #include - #include - #include // Requires sys/user.h for kinfo_proc definition - #include - #include // For sysconf, getpid - #include // For O_RDONLY - #include // For _POSIX2_LINE_MAX -#elif defined(__OpenBSD__) - #include - #include - #include // For kinfo_proc - #include // For sysconf, getpid -#endif +// Upper bound on the adaptive threshold, expressed as long_lived_total / N +// (where long_lived_total is the count of objects in the mimalloc GC heap). +// Scaling with the heap size keeps the amortized GC cost roughly linear in +// total allocations: when the heap is large we can afford to wait longer +// between passes, since each pass costs O(long_lived_total) for the +// mark-alive walk. At divisor 2, no more than one GC pass fires per heap +// doubling in the no-trash limit. +#define GC_THRESHOLD_MAX_DIVISOR 2 + +// Decay constant for mapping the trash ratio (collected / long_lived_total) +// to a target threshold via 1 / (1 + K * ratio). With K=8: ratio=0.05 maps +// to ~71% of the max range, ratio=0.25 to ~33%, ratio=0.5 to ~20%, +// ratio=1.0 to ~11%. Higher K decays faster. The 1/4-up / 3/4-down step +// applied later does most of the noise filtering, so the exact shape here +// matters less than the monotonicity. +#define GC_THRESHOLD_DECAY_K 8 // enable the "mark alive" pass of GC #define GC_ENABLE_MARK_ALIVE 1 @@ -1690,6 +1682,7 @@ _PyGC_InitState(GCState *gcstate) { // TODO: move to pycore_runtime_init.h once the incremental GC lands. gcstate->young.threshold = 2000; + gcstate->adaptive_threshold = gcstate->young.threshold; } @@ -2016,205 +2009,32 @@ cleanup_worklist(struct worklist *worklist) } } -// Return the memory usage (typically RSS + swap) of the process, in units of -// KB. 
Returns -1 if this operation is not supported or on failure. -static Py_ssize_t -get_process_mem_usage(void) -{ -#ifdef _WIN32 - // Windows implementation using GetProcessMemoryInfo - // Returns WorkingSetSize + PagefileUsage - PROCESS_MEMORY_COUNTERS pmc; - HANDLE hProcess = GetCurrentProcess(); - if (NULL == hProcess) { - // Should not happen for the current process - return -1; - } - - // GetProcessMemoryInfo returns non-zero on success - if (GetProcessMemoryInfo(hProcess, &pmc, sizeof(pmc))) { - // Values are in bytes, convert to KB. - return (Py_ssize_t)((pmc.WorkingSetSize + pmc.PagefileUsage) / 1024); - } - else { - return -1; - } - -#elif __linux__ - FILE* fp = fopen("/proc/self/status", "r"); - if (fp == NULL) { - return -1; - } - - char line_buffer[256]; - long long rss_kb = -1; - long long swap_kb = -1; - - while (fgets(line_buffer, sizeof(line_buffer), fp) != NULL) { - if (rss_kb == -1 && strncmp(line_buffer, "VmRSS:", 6) == 0) { - sscanf(line_buffer + 6, "%lld", &rss_kb); - } - else if (swap_kb == -1 && strncmp(line_buffer, "VmSwap:", 7) == 0) { - sscanf(line_buffer + 7, "%lld", &swap_kb); - } - if (rss_kb != -1 && swap_kb != -1) { - break; // Found both - } - } - fclose(fp); - - if (rss_kb != -1 && swap_kb != -1) { - return (Py_ssize_t)(rss_kb + swap_kb); - } - return -1; - -#elif defined(__APPLE__) - // --- MacOS (Darwin) --- - // Returns phys_footprint (RAM + compressed memory) - task_vm_info_data_t vm_info; - mach_msg_type_number_t count = TASK_VM_INFO_COUNT; - kern_return_t kerr; - - kerr = task_info(mach_task_self(), TASK_VM_INFO, (task_info_t)&vm_info, &count); - if (kerr != KERN_SUCCESS) { - return -1; - } - // phys_footprint is in bytes. Convert to KB. - return (Py_ssize_t)(vm_info.phys_footprint / 1024); - -#elif defined(__FreeBSD__) - // NOTE: Returns RSS only. Per-process swap usage isn't readily available - long page_size_kb = sysconf(_SC_PAGESIZE) / 1024; - if (page_size_kb <= 0) { - return -1; - } - - // Using /dev/null for vmcore avoids needing dump file. - // NULL for kernel file uses running kernel. - char errbuf[_POSIX2_LINE_MAX]; // For kvm error messages - kvm_t *kd = kvm_openfiles(NULL, "/dev/null", NULL, O_RDONLY, errbuf); - if (kd == NULL) { - return -1; - } - - // KERN_PROC_PID filters for the specific process ID - // n_procs will contain the number of processes returned (should be 1 or 0) - pid_t pid = getpid(); - int n_procs; - struct kinfo_proc *kp = kvm_getprocs(kd, KERN_PROC_PID, pid, &n_procs); - if (kp == NULL) { - kvm_close(kd); - return -1; - } - - Py_ssize_t rss_kb = -1; - if (n_procs > 0) { - // kp[0] contains the info for our process - // ki_rssize is in pages. Convert to KB. - rss_kb = (Py_ssize_t)kp->ki_rssize * page_size_kb; - } - else { - // Process with PID not found, shouldn't happen for self. - rss_kb = -1; - } - - kvm_close(kd); - return rss_kb; - -#elif defined(__OpenBSD__) - // NOTE: Returns RSS only. Per-process swap usage isn't readily available - long page_size_kb = sysconf(_SC_PAGESIZE) / 1024; - if (page_size_kb <= 0) { - return -1; - } - - struct kinfo_proc kp; - pid_t pid = getpid(); - int mib[6]; - size_t len = sizeof(kp); - - mib[0] = CTL_KERN; - mib[1] = KERN_PROC; - mib[2] = KERN_PROC_PID; - mib[3] = pid; - mib[4] = sizeof(struct kinfo_proc); // size of the structure we want - mib[5] = 1; // want 1 structure back - if (sysctl(mib, 6, &kp, &len, NULL, 0) == -1) { - return -1; - } - - if (len > 0) { - // p_vm_rssize is in pages on OpenBSD. Convert to KB. 
- return (Py_ssize_t)kp.p_vm_rssize * page_size_kb; - } - else { - // Process info not returned - return -1; - } -#else - // Unsupported platform - return -1; -#endif -} - static bool -gc_should_collect_mem_usage(GCState *gcstate) -{ - Py_ssize_t mem = get_process_mem_usage(); - if (mem < 0) { - // Reading process memory usage is not support or failed. - return true; - } - int threshold = gcstate->young.threshold; - Py_ssize_t deferred = _Py_atomic_load_ssize_relaxed(&gcstate->deferred_count); - if (deferred > threshold * 40) { - // Too many new container objects since last GC, even though memory use - // might not have increased much. This is intended to avoid resource - // exhaustion if some objects consume resources but don't result in a - // memory usage increase. We use 40x as the factor here because older - // versions of Python would do full collections after roughly every - // 70,000 new container objects. - return true; - } - Py_ssize_t last_mem = _Py_atomic_load_ssize_relaxed(&gcstate->last_mem); - Py_ssize_t mem_threshold = Py_MAX(last_mem / 10, 128); - if ((mem - last_mem) > mem_threshold) { - // The process memory usage has increased too much, do a collection. - return true; - } - else { - // The memory usage has not increased enough, defer the collection and - // clear the young object count so we don't check memory usage again - // on the next call to gc_should_collect(). - PyMutex_Lock(&gcstate->mutex); - int young_count = _Py_atomic_exchange_int(&gcstate->young.count, 0); - _Py_atomic_store_ssize_relaxed(&gcstate->deferred_count, - gcstate->deferred_count + young_count); - PyMutex_Unlock(&gcstate->mutex); - return false; - } -} - -static bool -gc_should_collect(GCState *gcstate) +gc_should_collect(PyThreadState *tstate) { + GCState *gcstate = &tstate->interp->gc; int count = _Py_atomic_load_int_relaxed(&gcstate->young.count); - int threshold = gcstate->young.threshold; + int base = gcstate->young.threshold; + int adaptive = gcstate->adaptive_threshold; int gc_enabled = _Py_atomic_load_int_relaxed(&gcstate->enabled); - if (count <= threshold || threshold == 0 || !gc_enabled) { + if (base == 0 || !gc_enabled) { return false; } if (gcstate->old[0].threshold == 0) { - // A few tests rely on immediate scheduling of the GC so we ignore the - // extra conditions if generations[1].threshold is set to zero. - return true; + // A few tests rely on immediate scheduling of the GC so we ignore + // the adaptive threshold if generations[1].threshold is set to zero + // and just trigger when the base is exceeded. + return count > base; + } + if (count <= adaptive) { + return false; } if (count < gcstate->long_lived_total / 4) { - // Avoid quadratic behavior by scaling threshold to the number of live - // objects. + // Avoid quadratic behavior by scaling the trigger to the number of + // live objects. return false; } - return gc_should_collect_mem_usage(gcstate); + return true; } static void @@ -2231,7 +2051,7 @@ record_allocation(PyThreadState *tstate) _Py_atomic_add_int(&gcstate->young.count, (int)gc->alloc_count); gc->alloc_count = 0; - if (gc_should_collect(gcstate) && + if (gc_should_collect(tstate) && !_Py_atomic_load_int_relaxed(&gcstate->collecting)) { _Py_ScheduleGC(tstate); @@ -2264,6 +2084,89 @@ record_deallocation(PyThreadState *tstate) } } +// Update the adaptive threshold for the next collection based on how +// much trash this pass found relative to the cost of the pass. 
The +// GC cost is dominated by the mark-alive walk, which is O(objects in +// the mimalloc GC heap) -- that's exactly what long_lived_total +// counts (including untracked and frozen objects in the heap). So +// the productive ratio is collected / long_lived_total: the fraction +// of GC work that actually freed memory. A high ratio means we +// should collect sooner; a low ratio means GC work was largely wasted +// and we can afford to wait longer. We map the ratio through a +// hyperbolic decay to a target in [base, max_threshold]: +// target = base + (max - base) * total / (total + K * collected) +// where max_threshold scales with long_lived_total so that amortized +// GC cost stays linear in total allocations on large heaps. +// +// We adapt the threshold asymmetrically: slowly when raising it and +// quickly when lowering it. The two directions have very different +// failure modes -- raising too aggressively risks heap blowup (and +// possibly OOM in memory-constrained environments like containers), +// while lowering too slowly only costs a few extra GC passes. So we +// err on the side of more frequent collection. When trash appears, +// we snap toward the new (lower) target in a single big step; when +// trash disappears, we creep up gradually so that one fortunate pass +// doesn't push us into a long deferral. +// +// Both updates are weighted moves toward the target rather than +// direct assignments, to avoid "hunting" -- bouncing around due to +// pass-to-pass noise. Up: 1/4 step. Down: 3/4 step. +static void +update_adaptive_threshold(GCState *gcstate, Py_ssize_t collected, + Py_ssize_t total) +{ + int base = gcstate->young.threshold; + if (base <= 0) { + return; + } + Py_ssize_t max_threshold = total / GC_THRESHOLD_MAX_DIVISOR; + if (max_threshold > INT_MAX) { + max_threshold = INT_MAX; + } + if (max_threshold < base) { + // For small heaps the heap-scaled max would be below the + // user-configured base; fall back to base in that case. + max_threshold = base; + } + // Scale total/collected down if needed to keep the multiply below + // from overflowing. Only the ratio matters here, not the scale. + Py_ssize_t r_total = total; + Py_ssize_t r_collected = collected; + while (r_total > ((Py_ssize_t)1 << 30)) { + r_total >>= 1; + r_collected >>= 1; + } + Py_ssize_t denom = r_total + (Py_ssize_t)GC_THRESHOLD_DECAY_K * r_collected; + Py_ssize_t target = denom > 0 + ? base + (max_threshold - base) * r_total / denom + : max_threshold; + int target_i = target > INT_MAX ? INT_MAX : (int)target; + int adaptive = gcstate->adaptive_threshold; + if (adaptive < base) { + // User changed the base via gc.set_threshold; resync. + adaptive = base; + } + if (target_i >= adaptive) { + // Raising the threshold: cautious 1/4 step. + adaptive = (int)(((long long)adaptive * 3 + (long long)target_i) / 4); + } + else { + // Lowering the threshold: aggressive 3/4 step. 
+ adaptive = (int)(((long long)adaptive + (long long)target_i * 3) / 4); + } + if (adaptive < base) { + adaptive = base; + } + else if (adaptive > max_threshold) { + adaptive = (int)max_threshold; + } + gcstate->adaptive_threshold = adaptive; +#if 0 + fprintf(stderr, "gc adapt collected %zd long_lived %zd max %zd target %zd adaptive %d\n", + collected, total, max_threshold, target, adaptive); +#endif +} + static void gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, int generation) { @@ -2275,7 +2178,6 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, } state->gcstate->young.count = 0; - state->gcstate->deferred_count = 0; for (int i = 1; i <= generation; ++i) { state->gcstate->old[i-1].count = 0; } @@ -2379,10 +2281,14 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, // to be freed. delete_garbage(state); - // Store the current memory usage, can be smaller now if breaking cycles - // freed some memory. - Py_ssize_t last_mem = get_process_mem_usage(); - _Py_atomic_store_ssize_relaxed(&state->gcstate->last_mem, last_mem); + // Only update the adaptive threshold for collections triggered by + // hitting the threshold itself. Manual gc.collect() calls and + // shutdown collections are not representative of the steady-state + // trash ratio and would skew the adaptation. + if (state->reason == _Py_GC_REASON_HEAP) { + update_adaptive_threshold(state->gcstate, state->collected, + state->long_lived_total); + } // Append objects with legacy finalizers to the "gc.garbage" list. handle_legacy_finalizers(state); @@ -2423,7 +2329,7 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) return 0; } - if (reason == _Py_GC_REASON_HEAP && !gc_should_collect(gcstate)) { + if (reason == _Py_GC_REASON_HEAP && !gc_should_collect(tstate)) { // Don't collect if the threshold is not exceeded. _Py_atomic_store_int(&gcstate->collecting, 0); return 0; From b1dd2f83d6d9a457a8b5083d0c88506bb05e4941 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 4 May 2026 15:54:17 -0700 Subject: [PATCH 2/4] More robust threshold calculations. Remove smoothing, set adaptive threshold directly. --- Python/gc_free_threading.c | 154 +++++++++++++++++++------------------ 1 file changed, 80 insertions(+), 74 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 1a380d2f221309..1dd3ea707646fa 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -18,21 +18,37 @@ #include "pydtrace.h" // Upper bound on the adaptive threshold, expressed as long_lived_total / N -// (where long_lived_total is the count of objects in the mimalloc GC heap). -// Scaling with the heap size keeps the amortized GC cost roughly linear in -// total allocations: when the heap is large we can afford to wait longer -// between passes, since each pass costs O(long_lived_total) for the -// mark-alive walk. At divisor 2, no more than one GC pass fires per heap -// doubling in the no-trash limit. +// (where long_lived_total is the count of *surviving* objects in the +// mimalloc GC heap after the most recent pass -- it is decremented as +// unreachable objects are identified). Scaling with the survivor count +// keeps the amortized GC cost roughly linear in total allocations: when +// the live heap is large we can afford to wait longer between passes, +// since each pass costs O(long_lived_total) for the mark-alive walk. 
+#ifndef GC_THRESHOLD_MAX_DIVISOR #define GC_THRESHOLD_MAX_DIVISOR 2 +#endif -// Decay constant for mapping the trash ratio (collected / long_lived_total) -// to a target threshold via 1 / (1 + K * ratio). With K=8: ratio=0.05 maps -// to ~71% of the max range, ratio=0.25 to ~33%, ratio=0.5 to ~20%, -// ratio=1.0 to ~11%. Higher K decays faster. The 1/4-up / 3/4-down step -// applied later does most of the noise filtering, so the exact shape here -// matters less than the monotonicity. +// Decay constant for mapping the trash ratio collected/long_lived_total +// (i.e. trash collected per surviving live object, equivalently C/(N-C) +// in pre-collection terms -- unbounded above) to a target threshold via +// 1 / (1 + K * ratio). With K=8, expressing the input as the fraction +// of pre-collection heap that was trash: 5% trash maps to ~70% of the +// [min, max] range, 20% to ~33%, 50% to ~11%, 75% to ~4%, 90% to +// ~1.4%. Higher K decays faster. The lower endpoint of the range is +// base (so the user's gc.set_threshold value is a hard floor); see +// GC_THRESHOLD_MIN_DIVISOR if you want to change that. +#ifndef GC_THRESHOLD_DECAY_K #define GC_THRESHOLD_DECAY_K 8 +#endif + +// Lower asymptote of the adaptive curve, expressed as base / N. N=1 +// makes the user's threshold a hard floor: the adaptive system +// never collects more often than the user asked via gc.set_threshold. +// Larger N treats base as a pivot, allowing heavy-trash workloads to +// collect more frequently than requested. +#ifndef GC_THRESHOLD_MIN_DIVISOR +#define GC_THRESHOLD_MIN_DIVISOR 1 +#endif // enable the "mark alive" pass of GC #define GC_ENABLE_MARK_ALIVE 1 @@ -2085,86 +2101,76 @@ record_deallocation(PyThreadState *tstate) } // Update the adaptive threshold for the next collection based on how -// much trash this pass found relative to the cost of the pass. The -// GC cost is dominated by the mark-alive walk, which is O(objects in -// the mimalloc GC heap) -- that's exactly what long_lived_total -// counts (including untracked and frozen objects in the heap). So -// the productive ratio is collected / long_lived_total: the fraction -// of GC work that actually freed memory. A high ratio means we -// should collect sooner; a low ratio means GC work was largely wasted -// and we can afford to wait longer. We map the ratio through a -// hyperbolic decay to a target in [base, max_threshold]: -// target = base + (max - base) * total / (total + K * collected) -// where max_threshold scales with long_lived_total so that amortized -// GC cost stays linear in total allocations on large heaps. -// -// We adapt the threshold asymmetrically: slowly when raising it and -// quickly when lowering it. The two directions have very different -// failure modes -- raising too aggressively risks heap blowup (and -// possibly OOM in memory-constrained environments like containers), -// while lowering too slowly only costs a few extra GC passes. So we -// err on the side of more frequent collection. When trash appears, -// we snap toward the new (lower) target in a single big step; when -// trash disappears, we creep up gradually so that one fortunate pass -// doesn't push us into a long deferral. -// -// Both updates are weighted moves toward the target rather than -// direct assignments, to avoid "hunting" -- bouncing around due to -// pass-to-pass noise. Up: 1/4 step. Down: 3/4 step. +// much trash this pass found relative to the cost of the pass. 
static void -update_adaptive_threshold(GCState *gcstate, Py_ssize_t collected, - Py_ssize_t total) -{ +update_adaptive_threshold(GCState *gcstate, long long collected, + long long live) +{ + // The GC cost is dominated by the mark-alive walk, which is O(objects in + // the mimalloc GC heap) -- that's exactly what long_lived_total counts + // (including untracked and frozen objects in the heap). By the time we + // are called it has already been decremented for the objects this pass + // identified as unreachable, so it is the survivor count L (= N - C in + // pre-collection terms). The productive ratio is collected/live = C/L, + // i.e. trash freed per surviving live object; equivalently C/(N-C). This + // is unbounded above: as a pass approaches collecting everything, L + // shrinks toward zero and the ratio grows without bound, which is what we + // want -- a 99%-trash pass should drive the threshold to its floor. A + // high ratio means we should collect sooner; a low ratio means GC work + // was largely wasted and we can afford to wait longer. We map the ratio + // through a hyperbolic decay to a target in [min, max_threshold]: target + // = min + (max - min) * live / (live + K * collected) where max_threshold + // scales with long_lived_total so that amortized GC cost stays linear + // in total allocations on large heaps, and min_threshold = base / + // GC_THRESHOLD_MIN_DIVISOR acts as the curve's lower asymptote and hard + // floor. The default MIN_DIVISOR=1 makes the user's gc.set_threshold + // value a true minimum interval between collections. int base = gcstate->young.threshold; if (base <= 0) { return; } - Py_ssize_t max_threshold = total / GC_THRESHOLD_MAX_DIVISOR; + int min_threshold = base / GC_THRESHOLD_MIN_DIVISOR; + if (min_threshold < 1) { + min_threshold = 1; + } + if (collected < 0) { + collected = 0; + } + if (live < 0) { + live = 0; + } + long long max_threshold = live / GC_THRESHOLD_MAX_DIVISOR; if (max_threshold > INT_MAX) { max_threshold = INT_MAX; } if (max_threshold < base) { - // For small heaps the heap-scaled max would be below the - // user-configured base; fall back to base in that case. max_threshold = base; } - // Scale total/collected down if needed to keep the multiply below + // Scale live/collected down if needed to keep the multiply below // from overflowing. Only the ratio matters here, not the scale. - Py_ssize_t r_total = total; - Py_ssize_t r_collected = collected; - while (r_total > ((Py_ssize_t)1 << 30)) { - r_total >>= 1; - r_collected >>= 1; - } - Py_ssize_t denom = r_total + (Py_ssize_t)GC_THRESHOLD_DECAY_K * r_collected; - Py_ssize_t target = denom > 0 - ? base + (max_threshold - base) * r_total / denom + // Cap at 2^30 so that K*collected and (max-min)*live both fit + // comfortably in long long. + while (live > (1LL << 30)) { + live >>= 1; + collected >>= 1; + } + long long denom = live + GC_THRESHOLD_DECAY_K * collected; + long long target = denom > 0 + ? min_threshold + (max_threshold - min_threshold) * live / denom : max_threshold; - int target_i = target > INT_MAX ? INT_MAX : (int)target; - int adaptive = gcstate->adaptive_threshold; - if (adaptive < base) { - // User changed the base via gc.set_threshold; resync. - adaptive = base; - } - if (target_i >= adaptive) { - // Raising the threshold: cautious 1/4 step. - adaptive = (int)(((long long)adaptive * 3 + (long long)target_i) / 4); - } - else { - // Lowering the threshold: aggressive 3/4 step. 
- adaptive = (int)(((long long)adaptive + (long long)target_i * 3) / 4); - } - if (adaptive < base) { - adaptive = base; + int adaptive = target > INT_MAX ? INT_MAX : (int)target; + if (adaptive < min_threshold) { + adaptive = min_threshold; } else if (adaptive > max_threshold) { adaptive = (int)max_threshold; } + // The new threshold is set directly to the computed target -- no + // smoothing. Software workloads can change abruptly (a program may go + // from zero cyclic trash to millions/sec and back within seconds), and in + // that regime the most recent pass is a better predictor of the next pass + // than a moving average. gcstate->adaptive_threshold = adaptive; -#if 0 - fprintf(stderr, "gc adapt collected %zd long_lived %zd max %zd target %zd adaptive %d\n", - collected, total, max_threshold, target, adaptive); -#endif } static void From 6a12aef2fdee16aefa77c309e2d9f0e98685edde Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Tue, 5 May 2026 03:34:17 -0700 Subject: [PATCH 3/4] Remove unneeded 'mutex' member. --- Include/internal/pycore_gc.h | 1 + Include/internal/pycore_interp_structs.h | 3 -- Python/gc_free_threading.c | 41 ++++++++++++++++++++++++ Python/sysmodule.c | 29 +++++++++++++++++ 4 files changed, 71 insertions(+), 3 deletions(-) diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h index bfe52f42f1141c..77745266ebee75 100644 --- a/Include/internal/pycore_gc.h +++ b/Include/internal/pycore_gc.h @@ -341,6 +341,7 @@ extern int _PyGC_VisitStackRef(union _PyStackRef *ref, visitproc visit, void *ar #ifdef Py_GIL_DISABLED extern void _PyGC_VisitObjectsWorldStopped(PyInterpreterState *interp, gcvisitobjects_t callback, void *arg); +extern Py_ssize_t _PyGC_GetMimallocAllocatedBytes(PyInterpreterState *interp); #endif #ifdef __cplusplus diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index dd37463a03818a..445399859d2dba 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -268,9 +268,6 @@ struct _gc_runtime_state { Adjusted after each collection based on the fraction of objects found to be trash. */ int adaptive_threshold; - - /* Mutex held for gc_should_collect_mem_usage(). */ - PyMutex mutex; #else PyGC_Head *generation0; #endif diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 1dd3ea707646fa..80d923d0bcc468 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -463,6 +463,47 @@ gc_visit_heaps(PyInterpreterState *interp, mi_block_visit_fun *visitor, return err; } +// Visitor for _PyGC_GetMimallocAllocatedBytes(): called once per heap area +// when visit_blocks=false. Sums area->used * area->block_size. +static bool +mimalloc_used_area_visitor(const mi_heap_t *heap, const mi_heap_area_t *area, + void *block, size_t block_size, void *arg) +{ + if (block == NULL) { + *(Py_ssize_t *)arg += (Py_ssize_t)(area->used * area->block_size); + } + return true; +} + +// Return the total bytes in use across all mimalloc heaps for all threads in +// the interpreter, plus the per-interp abandoned pool. 
+Py_ssize_t +_PyGC_GetMimallocAllocatedBytes(PyInterpreterState *interp) +{ + Py_ssize_t total = 0; + _PyEval_StopTheWorld(interp); + HEAD_LOCK(&_PyRuntime); + _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { + struct _mimalloc_thread_state *m = + &((_PyThreadStateImpl *)p)->mimalloc; + if (!_Py_atomic_load_int(&m->initialized)) { + continue; + } + for (int h = 0; h < _Py_MIMALLOC_HEAP_COUNT; h++) { + mi_heap_visit_blocks(&m->heaps[h], false, + mimalloc_used_area_visitor, &total); + } + } + mi_abandoned_pool_t *pool = &interp->mimalloc.abandoned_pool; + for (uint8_t tag = 0; tag < _Py_MIMALLOC_HEAP_COUNT; tag++) { + _mi_abandoned_pool_visit_blocks(pool, tag, false, + mimalloc_used_area_visitor, &total); + } + HEAD_UNLOCK(&_PyRuntime); + _PyEval_StartTheWorld(interp); + return total; +} + static inline void gc_visit_stackref(_PyStackRef stackref) { diff --git a/Python/sysmodule.c b/Python/sysmodule.c index c6447d03369a94..2ced66c85a5a0c 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -19,6 +19,7 @@ Data members: #include "pycore_call.h" // _PyObject_CallNoArgs() #include "pycore_ceval.h" // _PyEval_SetAsyncGenFinalizer() #include "pycore_frame.h" // _PyInterpreterFrame +#include "pycore_gc.h" // _PyGC_GetMimallocAllocatedBytes() #include "pycore_import.h" // _PyImport_SetDLOpenFlags() #include "pycore_initconfig.h" // _PyStatus_EXCEPTION() #include "pycore_interpframe.h" // _PyFrame_GetFirstComplete() @@ -2060,6 +2061,32 @@ sys_getallocatedblocks_impl(PyObject *module) return _Py_GetGlobalAllocatedBlocks(); } +PyDoc_STRVAR(sys__get_mimalloc_allocated_bytes__doc__, +"_get_mimalloc_allocated_bytes($module, /)\n" +"--\n" +"\n" +"Return total bytes allocated across all mimalloc heaps in this interpreter.\n" +"\n" +"Free-threaded build only. Stops the world while reading per-thread heap\n" +"structures. Intended for benchmarking: the OS RSS does not reliably reflect\n" +"Python's live memory because mimalloc retains freed pages.\n" +"Raises NotImplementedError on the GIL-enabled build."); + +static PyObject * +sys__get_mimalloc_allocated_bytes(PyObject *module, PyObject *Py_UNUSED(ignored)) +{ +#ifdef Py_GIL_DISABLED + PyInterpreterState *interp = _PyInterpreterState_GET(); + Py_ssize_t total = _PyGC_GetMimallocAllocatedBytes(interp); + return PyLong_FromSsize_t(total); +#else + PyErr_SetString(PyExc_NotImplementedError, + "sys._get_mimalloc_allocated_bytes() is only available " + "on the free-threaded build"); + return NULL; +#endif +} + /*[clinic input] sys.getunicodeinternedsize -> Py_ssize_t @@ -2927,6 +2954,8 @@ static PyMethodDef sys_methods[] = { SYS_GETDEFAULTENCODING_METHODDEF SYS_GETDLOPENFLAGS_METHODDEF SYS_GETALLOCATEDBLOCKS_METHODDEF + {"_get_mimalloc_allocated_bytes", sys__get_mimalloc_allocated_bytes, + METH_NOARGS, sys__get_mimalloc_allocated_bytes__doc__}, SYS_GETUNICODEINTERNEDSIZE_METHODDEF SYS_GETFILESYSTEMENCODING_METHODDEF SYS_GETFILESYSTEMENCODEERRORS_METHODDEF From 67a2a6625c9d9f23aa29aafbcd2d0129cfdfaa8a Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Tue, 5 May 2026 04:01:32 -0700 Subject: [PATCH 4/4] Add blurb, update docs, remove debug func. Remove the sys._get_mimalloc_allocated_bytes() function. 
--- Doc/library/gc.rst | 10 +- Include/internal/pycore_gc.h | 1 - InternalDocs/garbage_collector.md | 118 ++++++++++++++++++ ...-05-05-03-40-24.gh-issue-148937.2EvYx-.rst | 3 + Python/gc_free_threading.c | 69 +--------- Python/sysmodule.c | 29 ----- 6 files changed, 130 insertions(+), 100 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2026-05-05-03-40-24.gh-issue-148937.2EvYx-.rst diff --git a/Doc/library/gc.rst b/Doc/library/gc.rst index 701af579453ce3..444e7df028c183 100644 --- a/Doc/library/gc.rst +++ b/Doc/library/gc.rst @@ -133,10 +133,12 @@ The :mod:`!gc` module provides the following functions: With the third generation, things are a bit more complicated, see `Collecting the oldest generation `_ for more information. - In the free-threaded build, the increase in process memory usage is also - checked before running the collector. If the memory usage has not increased - by 10% since the last collection and the net number of object allocations - has not exceeded 40 times *threshold0*, the collection is not run. + In the free-threaded build, the effective collection threshold is adapted + based on how much cyclic trash the last collection found. If few trash + cycles were found, the threshold is adjusted higher, up to half the count + of live objects. If many were found, the threshold is adjusted lower, down + to a minimum of *threshold0*. Setting *threshold1* to zero disables this + adaptation and causes *threshold0* to be used directly. See `Garbage collector design `_ for more information. diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h index 77745266ebee75..bfe52f42f1141c 100644 --- a/Include/internal/pycore_gc.h +++ b/Include/internal/pycore_gc.h @@ -341,7 +341,6 @@ extern int _PyGC_VisitStackRef(union _PyStackRef *ref, visitproc visit, void *ar #ifdef Py_GIL_DISABLED extern void _PyGC_VisitObjectsWorldStopped(PyInterpreterState *interp, gcvisitobjects_t callback, void *arg); -extern Py_ssize_t _PyGC_GetMimallocAllocatedBytes(PyInterpreterState *interp); #endif #ifdef __cplusplus diff --git a/InternalDocs/garbage_collector.md b/InternalDocs/garbage_collector.md index 0ef45ff8e02bc5..f85113f4cd72db 100644 --- a/InternalDocs/garbage_collector.md +++ b/InternalDocs/garbage_collector.md @@ -458,6 +458,124 @@ in the total number of objects (the effect of which can be summarized thusly: grows, but we do fewer and fewer of them"). +Adaptive collection threshold (free-threaded build) +=================================================== + +> [!NOTE] +> This section applies only to the free-threaded build. The default +> (GIL) build uses the generational thresholds described above. + +The free-threaded GC is non-generational: every collection scans the entire +heap. It therefore needs a different mechanism than `threshold0` / +`threshold1` to decide when to run. Instead, it maintains an *adaptive* +trigger that scales with the size of the live heap and adjusts itself based +on how much trash recent collections actually found. The logic lives in +`update_adaptive_threshold()` in `Python/gc_free_threading.c`, which is +called after each collection that fired because the threshold was reached +(`reason == _Py_GC_REASON_HEAP`). Manual `gc.collect()` calls and shutdown +collections do not update the adaptive state — they aren't representative of +the steady-state trash rate. + +Every allocation increments `young.count`. A collection is considered when +`count` exceeds `gcstate->adaptive_threshold` (subject to the quadratic +guard below). 
The job of `update_adaptive_threshold()` is to choose a good +value for `adaptive_threshold` for the *next* pass. + +The cost model +-------------- + +A free-threaded GC pass is dominated by the mark-alive walk over the +mimalloc GC heap, whose cost is roughly `O(L)` where `L` is the count of +*surviving* live objects (this is what `long_lived_total` records — by the +time `update_adaptive_threshold()` runs it has already been decremented for +the unreachable objects identified this pass). If `T` is the number of +allocations between passes, the amortized GC cost per allocation is +proportional to `L / T`. To keep amortized cost roughly linear in total +allocations as the program grows, `T` should scale with `L`. This gives an +upper bound: + + T_max = L / GC_THRESHOLD_MAX_DIVISOR + +`T_max` alone is wrong, however: a program churning short-lived cycles +wants GC to run often, not just once per heap doubling. We also have a +user-configured pivot — the value of `gc.set_threshold()`, called `base` +below — and a derived lower bound: + + T_min = base / GC_THRESHOLD_MIN_DIVISOR + +The adaptive threshold lives in `[T_min, T_max]`, and `update_adaptive_threshold()` +chooses where in that range to sit based on recent trash productivity. + +Trash ratio and hyperbolic decay +-------------------------------- + +After a threshold-triggered collection we know two numbers: how many +objects the pass collected, `C`, and the survivor count `L` (so the +pre-collection heap size was `N = L + C`). The trash ratio + + r = C / L + +measures trash freed per surviving live object — equivalently, how many +extra walk units the next pass would do as a multiple of the walk units +already paid for in survivors. A high ratio means the pass paid for +itself; a low ratio means the walk was largely wasted. We use `C/L` +rather than `C/N` because (a) `L` is what the *next* pass will walk, not +`N`, and (b) `C/L` is unbounded above (as `C` approaches `N`, `L` shrinks +toward zero and `r` grows without bound), which lets the curve drive the +threshold all the way to its floor in genuinely high-trash regimes. + +We map `r` to a target threshold via a hyperbolic decay: + + target = T_min + (T_max - T_min) / (1 + K * r) + +with `K = GC_THRESHOLD_DECAY_K`. At `r = 0` (no trash) the target equals +`T_max`; as `r` grows the target decays smoothly toward the asymptote +`T_min`. In the implementation this is rearranged to keep the math in +integers: + + target = T_min + (T_max - T_min) * L / (L + K * C) + +`L` and `C` are scaled down (right-shifted) ahead of the multiply if `L` +exceeds 2^30, since only the ratio matters. If a pass somehow collects +everything (`L == 0`), the rearranged form would have a zero denominator; +in that case we fall back to `T_max`. + +The new threshold is set directly to the computed target — there is no +EMA or weighted step. Software workloads can change abruptly (a program +may go from zero cyclic trash to millions per second and back within +seconds), and in that regime the most recent pass is a better predictor +of the next than a long-history average. + +Tunables +-------- + +Three compile-time `#define`s in `Python/gc_free_threading.c` control the +shape of the curve. All three are `#ifndef`-guarded so a build can +override them with `-DGC_THRESHOLD_*=value`: + +| Macro | Default | Meaning | +|---|---|---| +| `GC_THRESHOLD_MAX_DIVISOR` | 2 | `T_max = L / N`. Larger N collects less often on big heaps. | +| `GC_THRESHOLD_DECAY_K` | 8 | Decay rate of the hyperbolic curve. 
Larger K reaches `T_min` faster. | +| `GC_THRESHOLD_MIN_DIVISOR` | 1 | `T_min = base / N`. N=1 makes the user's `gc.set_threshold` value a hard minimum interval between collections. | + +If `T_max` (i.e. `L / GC_THRESHOLD_MAX_DIVISOR`) falls below `base`, it is +clamped up to `base`: on a small heap the curve runs over `[T_min, base]` +rather than over `[T_min, L/N]` — which would otherwise collapse below +`base` for tiny heaps. + +Quadratic-behavior guard +------------------------ + +Even if `count` exceeds `adaptive_threshold`, GC will not actually fire +unless `count >= long_lived_total / 4` (see `gc_should_collect()`). This +pre-existing guard prevents pathological behavior on heaps that are +growing in pure-non-trash regions: it gives `T` a second floor proportional +to the live heap so that no matter how aggressively the adaptive math +pushes the threshold down, we never collect so often that GC cost +dominates allocation cost. + + Optimization: excluding reachable objects ========================================= diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-05-05-03-40-24.gh-issue-148937.2EvYx-.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-05-05-03-40-24.gh-issue-148937.2EvYx-.rst new file mode 100644 index 00000000000000..e618641b6fef71 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-05-05-03-40-24.gh-issue-148937.2EvYx-.rst @@ -0,0 +1,3 @@ +For the free-threaded build, the cyclic GC now adapts the collection +threshold based on how successful the last automatic collection was in +finding trash. diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 80d923d0bcc468..0003d8818621c1 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -463,47 +463,6 @@ gc_visit_heaps(PyInterpreterState *interp, mi_block_visit_fun *visitor, return err; } -// Visitor for _PyGC_GetMimallocAllocatedBytes(): called once per heap area -// when visit_blocks=false. Sums area->used * area->block_size. -static bool -mimalloc_used_area_visitor(const mi_heap_t *heap, const mi_heap_area_t *area, - void *block, size_t block_size, void *arg) -{ - if (block == NULL) { - *(Py_ssize_t *)arg += (Py_ssize_t)(area->used * area->block_size); - } - return true; -} - -// Return the total bytes in use across all mimalloc heaps for all threads in -// the interpreter, plus the per-interp abandoned pool. -Py_ssize_t -_PyGC_GetMimallocAllocatedBytes(PyInterpreterState *interp) -{ - Py_ssize_t total = 0; - _PyEval_StopTheWorld(interp); - HEAD_LOCK(&_PyRuntime); - _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { - struct _mimalloc_thread_state *m = - &((_PyThreadStateImpl *)p)->mimalloc; - if (!_Py_atomic_load_int(&m->initialized)) { - continue; - } - for (int h = 0; h < _Py_MIMALLOC_HEAP_COUNT; h++) { - mi_heap_visit_blocks(&m->heaps[h], false, - mimalloc_used_area_visitor, &total); - } - } - mi_abandoned_pool_t *pool = &interp->mimalloc.abandoned_pool; - for (uint8_t tag = 0; tag < _Py_MIMALLOC_HEAP_COUNT; tag++) { - _mi_abandoned_pool_visit_blocks(pool, tag, false, - mimalloc_used_area_visitor, &total); - } - HEAD_UNLOCK(&_PyRuntime); - _PyEval_StartTheWorld(interp); - return total; -} - static inline void gc_visit_stackref(_PyStackRef stackref) { @@ -2142,30 +2101,13 @@ record_deallocation(PyThreadState *tstate) } // Update the adaptive threshold for the next collection based on how -// much trash this pass found relative to the cost of the pass. +// much trash this pass found relative to the cost of the pass. 
See +// InternalDocs/garbage_collector.md for additional explaination of this +// calculation. static void update_adaptive_threshold(GCState *gcstate, long long collected, long long live) { - // The GC cost is dominated by the mark-alive walk, which is O(objects in - // the mimalloc GC heap) -- that's exactly what long_lived_total counts - // (including untracked and frozen objects in the heap). By the time we - // are called it has already been decremented for the objects this pass - // identified as unreachable, so it is the survivor count L (= N - C in - // pre-collection terms). The productive ratio is collected/live = C/L, - // i.e. trash freed per surviving live object; equivalently C/(N-C). This - // is unbounded above: as a pass approaches collecting everything, L - // shrinks toward zero and the ratio grows without bound, which is what we - // want -- a 99%-trash pass should drive the threshold to its floor. A - // high ratio means we should collect sooner; a low ratio means GC work - // was largely wasted and we can afford to wait longer. We map the ratio - // through a hyperbolic decay to a target in [min, max_threshold]: target - // = min + (max - min) * live / (live + K * collected) where max_threshold - // scales with long_lived_total so that amortized GC cost stays linear - // in total allocations on large heaps, and min_threshold = base / - // GC_THRESHOLD_MIN_DIVISOR acts as the curve's lower asymptote and hard - // floor. The default MIN_DIVISOR=1 makes the user's gc.set_threshold - // value a true minimum interval between collections. int base = gcstate->young.threshold; if (base <= 0) { return; @@ -2206,11 +2148,6 @@ update_adaptive_threshold(GCState *gcstate, long long collected, else if (adaptive > max_threshold) { adaptive = (int)max_threshold; } - // The new threshold is set directly to the computed target -- no - // smoothing. Software workloads can change abruptly (a program may go - // from zero cyclic trash to millions/sec and back within seconds), and in - // that regime the most recent pass is a better predictor of the next pass - // than a moving average. gcstate->adaptive_threshold = adaptive; } diff --git a/Python/sysmodule.c b/Python/sysmodule.c index 2ced66c85a5a0c..c6447d03369a94 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -19,7 +19,6 @@ Data members: #include "pycore_call.h" // _PyObject_CallNoArgs() #include "pycore_ceval.h" // _PyEval_SetAsyncGenFinalizer() #include "pycore_frame.h" // _PyInterpreterFrame -#include "pycore_gc.h" // _PyGC_GetMimallocAllocatedBytes() #include "pycore_import.h" // _PyImport_SetDLOpenFlags() #include "pycore_initconfig.h" // _PyStatus_EXCEPTION() #include "pycore_interpframe.h" // _PyFrame_GetFirstComplete() @@ -2061,32 +2060,6 @@ sys_getallocatedblocks_impl(PyObject *module) return _Py_GetGlobalAllocatedBlocks(); } -PyDoc_STRVAR(sys__get_mimalloc_allocated_bytes__doc__, -"_get_mimalloc_allocated_bytes($module, /)\n" -"--\n" -"\n" -"Return total bytes allocated across all mimalloc heaps in this interpreter.\n" -"\n" -"Free-threaded build only. Stops the world while reading per-thread heap\n" -"structures. 
Intended for benchmarking: the OS RSS does not reliably reflect\n" -"Python's live memory because mimalloc retains freed pages.\n" -"Raises NotImplementedError on the GIL-enabled build."); - -static PyObject * -sys__get_mimalloc_allocated_bytes(PyObject *module, PyObject *Py_UNUSED(ignored)) -{ -#ifdef Py_GIL_DISABLED - PyInterpreterState *interp = _PyInterpreterState_GET(); - Py_ssize_t total = _PyGC_GetMimallocAllocatedBytes(interp); - return PyLong_FromSsize_t(total); -#else - PyErr_SetString(PyExc_NotImplementedError, - "sys._get_mimalloc_allocated_bytes() is only available " - "on the free-threaded build"); - return NULL; -#endif -} - /*[clinic input] sys.getunicodeinternedsize -> Py_ssize_t @@ -2954,8 +2927,6 @@ static PyMethodDef sys_methods[] = { SYS_GETDEFAULTENCODING_METHODDEF SYS_GETDLOPENFLAGS_METHODDEF SYS_GETALLOCATEDBLOCKS_METHODDEF - {"_get_mimalloc_allocated_bytes", sys__get_mimalloc_allocated_bytes, - METH_NOARGS, sys__get_mimalloc_allocated_bytes__doc__}, SYS_GETUNICODEINTERNEDSIZE_METHODDEF SYS_GETFILESYSTEMENCODING_METHODDEF SYS_GETFILESYSTEMENCODEERRORS_METHODDEF
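
The decay curve introduced by this series is easy to sanity-check outside the interpreter. Below is a minimal standalone sketch of the target computation performed by update_adaptive_threshold(): it reuses the patch's default constants and the rearranged integer form of the curve, target = T_min + (T_max - T_min) * L / (L + K * C), but the helper name adaptive_target, the driver in main() and the sample (collected, live) pairs are illustrative assumptions, not part of the patch.

/*
 * Standalone sketch of the adaptive-threshold target computation.
 * Mirrors the arithmetic of update_adaptive_threshold() with the default
 * tunables; adaptive_target() and the sample inputs are hypothetical.
 */
#include <stdio.h>
#include <limits.h>

#define GC_THRESHOLD_MAX_DIVISOR 2
#define GC_THRESHOLD_DECAY_K     8
#define GC_THRESHOLD_MIN_DIVISOR 1

/* base: gc.set_threshold() value; collected: trash found by the pass (C);
   live: survivors after the pass (L).  Returns the allocation count that
   should trigger the next collection. */
static long long
adaptive_target(long long base, long long collected, long long live)
{
    long long min_t = base / GC_THRESHOLD_MIN_DIVISOR;
    if (min_t < 1) {
        min_t = 1;
    }
    long long max_t = live / GC_THRESHOLD_MAX_DIVISOR;
    if (max_t > INT_MAX) {
        max_t = INT_MAX;
    }
    if (max_t < base) {
        max_t = base;   /* small heap: curve runs over [min_t, base] */
    }
    /* Scale down so the multiply cannot overflow; only the ratio matters. */
    while (live > (1LL << 30)) {
        live >>= 1;
        collected >>= 1;
    }
    long long denom = live + GC_THRESHOLD_DECAY_K * collected;
    return denom > 0
        ? min_t + (max_t - min_t) * live / denom
        : max_t;
}

int
main(void)
{
    long long base = 2000;   /* default young.threshold */
    /* (collected, live): no trash, 5% trash, 50% trash, 90% trash
       (percentages relative to the pre-collection heap N = L + C). */
    long long cases[4][2] = {
        {0, 1000000}, {50000, 950000}, {500000, 500000}, {900000, 100000},
    };
    for (int i = 0; i < 4; i++) {
        printf("collected=%lld live=%lld -> next threshold %lld\n",
               cases[i][0], cases[i][1],
               adaptive_target(base, cases[i][0], cases[i][1]));
    }
    return 0;
}

With base 2000 the four cases print 500000, 334851, 29555 and 2657, i.e. roughly 100%, 70%, 11% and 1.4% of the [min, max] range, matching the figures quoted in the GC_THRESHOLD_DECAY_K comment above.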