dakingffo · caomengxuan666 · May 3, 2026 · May 3, 2026 · May 6, 2026 · May 6, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -51,6 +51,12 @@ set(HDR_HISTOGRAM_BUILD_SHARED OFF CACHE BOOL "" FORCE)
 set(HDR_LOG_REQUIRED "DISABLED" CACHE STRING "" FORCE)
 FetchContent_MakeAvailable(hdrhistogram)
 
+# Silence two Clang-only warnings when compiling the fetched HdrHistogram C
+# target. The options are PRIVATE, so they do not propagate to our own targets.
+# The generator expression is quoted and its flags are separated by ';':
+# unquoted, the embedded space splits it into two command arguments, neither
+# of which is a complete generator expression.
+if(TARGET hdr_histogram_static)
+    target_compile_options(hdr_histogram_static PRIVATE
+        "$<$<C_COMPILER_ID:Clang>:-Wno-incompatible-pointer-types;-Wno-deprecated-declarations>"
+    )
+endif()
+
 # moodycamel ConcurrentQueue
 FetchContent_Declare(
     concurrentqueue
@@ -85,6 +91,34 @@ target_link_libraries(mpsc_bench_throughput
 )
 target_compile_options(mpsc_bench_throughput ${COMMON_TARGET_PROPERTIES})
 
+# Memory lifecycle benchmark, built from benchmarks/bench_memory_cycle.cpp.
+add_executable(mpsc_bench_memory_cycle benchmarks/bench_memory_cycle.cpp)
+target_include_directories(mpsc_bench_memory_cycle PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+    ${CMAKE_CURRENT_SOURCE_DIR}
+)
+target_link_libraries(mpsc_bench_memory_cycle PRIVATE
+    benchmark::benchmark_main
+    Threads::Threads
+    ${ATOMIC_LIBRARY}
+)
+# Same shared option list as the other benchmark targets (defined outside this hunk).
+target_compile_options(mpsc_bench_memory_cycle ${COMMON_TARGET_PROPERTIES})
+
+# Reclaim benchmark, built from benchmarks/bench_reclaim.cpp.
+add_executable(mpsc_bench_reclaim benchmarks/bench_reclaim.cpp)
+target_include_directories(mpsc_bench_reclaim PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+    ${CMAKE_CURRENT_SOURCE_DIR}
+)
+target_link_libraries(mpsc_bench_reclaim PRIVATE
+    benchmark::benchmark_main
+    Threads::Threads
+    ${ATOMIC_LIBRARY}
+)
+# Same shared option list as the other benchmark targets (defined outside this hunk).
+target_compile_options(mpsc_bench_reclaim ${COMMON_TARGET_PROPERTIES})
+
 add_executable(mpsc_bench_latency benchmarks/bench_latency.cpp)
 target_include_directories(mpsc_bench_latency 
     PRIVATE 

diff --git a/README.md b/README.md
@@ -203,47 +203,69 @@ Command:
 ```powershell
 cmake -S . -B out/build/clang-local -G Ninja -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
 cmake --build out/build/clang-local --target mpsc_vs_mpmc_benchmark
-.\out\build\clang-local\mpsc_vs_mpmc_benchmark.exe --benchmark_min_time=0.1s --benchmark_repetitions=1 --benchmark_counters_tabular=true
+.\out\build\clang-local\mpsc_vs_mpmc_benchmark.exe --benchmark_min_time=0.5s --benchmark_repetitions=1 --benchmark_counters_tabular=true
 ```
 
 Uniform single-element enqueue:
 
 | Queue | P (Producers) | C (Consumer) | **Throughput (M items/s)** |
 | :--- | :--- | :--- | :--- |
-| **daking** | 1 | 1 | **86.78** |
-| daking | 2 | 1 | **32.29** |
-| daking | 4 | 1 | **32.52** |
-| daking | 8 | 1 | 31.16 |
-| moodycamel | 1 | 1 | 22.74 |
-| moodycamel | 2 | 1 | 27.82 |
-| moodycamel | 4 | 1 | 29.79 |
-| **moodycamel** | 8 | 1 | **32.07** |
+| **daking** | 1 | 1 | **82.21** |
+| daking | 2 | 1 | **29.69** |
+| daking | 4 | 1 | **32.22** |
+| daking | 8 | 1 | 28.22 |
+| moodycamel | 1 | 1 | 23.21 |
+| moodycamel | 2 | 1 | 28.50 |
+| moodycamel | 4 | 1 | 30.75 |
+| **moodycamel** | 8 | 1 | **30.86** |
 
 Uneven sequential burst:
 
 | Queue | P (Producers) | C (Consumer) | Relay % | **Throughput (M items/s)** |
 | :--- | :--- | :--- | :--- | :--- |
-| daking | 4 | 1 | $50.0\%$ | **33.89** |
-| daking | 4 | 1 | $90.0\%$ | **61.72** |
-| daking | 4 | 1 | $98.0\%$ | **73.72** |
-| moodycamel | 4 | 1 | $50.0\%$ | 23.84 |
-| moodycamel | 4 | 1 | $90.0\%$ | 22.25 |
-| moodycamel | 4 | 1 | $98.0\%$ | 17.64 |
+| daking | 4 | 1 | $50.0\%$ | **32.62** |
+| daking | 4 | 1 | $90.0\%$ | **58.54** |
+| daking | 4 | 1 | $98.0\%$ | **68.39** |
+| moodycamel | 4 | 1 | $50.0\%$ | 25.31 |
+| moodycamel | 4 | 1 | $90.0\%$ | 22.75 |
+| moodycamel | 4 | 1 | $98.0\%$ | 21.69 |
 
 Bulk enqueue:
 
 | Queue | P (Producers) | C (Consumer) | **Throughput (M items/s)** |
 | :--- | :--- | :--- | :--- |
-| **daking** | 1 | 1 | **102.25** |
-| **daking** | 2 | 1 | **102.37** |
-| **daking** | 4 | 1 | **85.23** |
-| **daking** | 8 | 1 | **77.16** |
-| moodycamel | 1 | 1 | 17.02 |
-| moodycamel | 2 | 1 | 18.85 |
-| moodycamel | 4 | 1 | 18.06 |
-| moodycamel | 8 | 1 | 16.02 |
+| **daking** | 1 | 1 | **167.47** |
+| **daking** | 2 | 1 | **191.27** |
+| **daking** | 4 | 1 | **195.14** |
+| **daking** | 8 | 1 | **159.11** |
+| moodycamel | 1 | 1 | 36.24 |
+| moodycamel | 2 | 1 | 36.23 |
+| moodycamel | 4 | 1 | 34.82 |
+| moodycamel | 8 | 1 | 33.99 |
 
-**Part V: Enqueue/Dequeue Latency**
+**Part V: Memory Lifecycle Benchmark (Stable vs Idle Reclaim)**
+
+This benchmark runs two production/drain bursts on the same queue. The `stable` mode keeps the warm global pool between bursts. The `idle_reclaim` mode calls `shrink_to_fit()` after the first burst and then measures the next burst from the reclaimed state.
+
+Command:
+
+```powershell
+cmake --build out/build/clang-local --target mpsc_bench_memory_cycle
+.\out\build\clang-local\mpsc_bench_memory_cycle.exe --benchmark_min_time=0.2s --benchmark_repetitions=1 --benchmark_counters_tabular=true
+```
+
+| Mode | P (Producers) | Nodes Before | Nodes After | **Throughput (M items/s)** | Shrink Time (us) |
+| :--- | :--- | :--- | :--- | :--- | :--- |
+| stable | 1 | 16.384k | 16.384k | 96.69 | - |
+| stable | 4 | 16.384k | 16.384k | 32.20 | - |
+| stable | 16 | 32.768k | 32.768k | 37.54 | - |
+| idle_reclaim | 1 | 32.768k | 256 | 93.94 | 3.5 |
+| idle_reclaim | 4 | 32.768k | 256 | 31.98 | 5.5 |
+| idle_reclaim | 16 | 262.144k | 256 | 37.01 | 135.2 |
+
+The idle reclaim path aggressively returns the global pool to the minimum chunk count while keeping the next burst in the same throughput band. It should still be treated as a quiescent-state operation, not an online elastic reclamation mechanism.
+
+**Part VI: Enqueue/Dequeue Latency**
 
 (Based on HdrHistogram, Test on Linux)
 We get below performance：
@@ -284,6 +306,8 @@ We get below performance：
 2.  `ThreadLocalCapacity` (thread-local capacity) is fixed at **compile time**.
 3.  Pointer chasing cannot be avoided because it is a pure linked-list structure.
 
+If you need to reclaim memory while keeping the queue instance alive, call `shrink_to_fit()` only after the queue has been fully drained and all producers have stopped. This is an idle-time reclamation path, not a concurrent shrink path. It is intentionally conservative and is not designed for online elastic memory reclamation.
+
 ## Features
 
 1.  Multiple-Producer, Single-Consumer (MPSC). The closer the contention scenario is to SPSC, the closer the throughput gets to the SPSC benchmark performance.

diff --git a/README.zh.md b/README.zh.md
@@ -203,47 +203,69 @@ Compiler: Clang 22.1.1 Release
 ```powershell
 cmake -S . -B out/build/clang-local -G Ninja -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
 cmake --build out/build/clang-local --target mpsc_vs_mpmc_benchmark
-.\out\build\clang-local\mpsc_vs_mpmc_benchmark.exe --benchmark_min_time=0.1s --benchmark_repetitions=1 --benchmark_counters_tabular=true
+.\out\build\clang-local\mpsc_vs_mpmc_benchmark.exe --benchmark_min_time=0.5s --benchmark_repetitions=1 --benchmark_counters_tabular=true
 ```
 
 均匀单元素入队：
 
 | 队列 | P (生产者) | C (消费者) | **吞吐量 (M items/s)** |
 | :--- | :--- | :--- | :--- |
-| **daking** | 1 | 1 | **86.78** |
-| daking | 2 | 1 | **32.29** |
-| daking | 4 | 1 | **32.52** |
-| daking | 8 | 1 | 31.16 |
-| moodycamel | 1 | 1 | 22.74 |
-| moodycamel | 2 | 1 | 27.82 |
-| moodycamel | 4 | 1 | 29.79 |
-| **moodycamel** | 8 | 1 | **32.07** |
+| **daking** | 1 | 1 | **82.21** |
+| daking | 2 | 1 | **29.69** |
+| daking | 4 | 1 | **32.22** |
+| daking | 8 | 1 | 28.22 |
+| moodycamel | 1 | 1 | 23.21 |
+| moodycamel | 2 | 1 | 28.50 |
+| moodycamel | 4 | 1 | 30.75 |
+| **moodycamel** | 8 | 1 | **30.86** |
 
 不均匀顺序爆发：
 
 | 队列 | P (生产者) | C (消费者) | 接力百分比 | **吞吐量 (M items/s)** |
 | :--- | :--- | :--- | :--- | :--- |
-| daking | 4 | 1 | $50.0\%$ | **33.89** |
-| daking | 4 | 1 | $90.0\%$ | **61.72** |
-| daking | 4 | 1 | $98.0\%$ | **73.72** |
-| moodycamel | 4 | 1 | $50.0\%$ | 23.84 |
-| moodycamel | 4 | 1 | $90.0\%$ | 22.25 |
-| moodycamel | 4 | 1 | $98.0\%$ | 17.64 |
+| daking | 4 | 1 | $50.0\%$ | **32.62** |
+| daking | 4 | 1 | $90.0\%$ | **58.54** |
+| daking | 4 | 1 | $98.0\%$ | **68.39** |
+| moodycamel | 4 | 1 | $50.0\%$ | 25.31 |
+| moodycamel | 4 | 1 | $90.0\%$ | 22.75 |
+| moodycamel | 4 | 1 | $98.0\%$ | 21.69 |
 
 批量入队：
 
 | 队列 | P (生产者) | C (消费者) | **吞吐量 (M items/s)** |
 | :--- | :--- | :--- | :--- |
-| **daking** | 1 | 1 | **102.25** |
-| **daking** | 2 | 1 | **102.37** |
-| **daking** | 4 | 1 | **85.23** |
-| **daking** | 8 | 1 | **77.16** |
-| moodycamel | 1 | 1 | 17.02 |
-| moodycamel | 2 | 1 | 18.85 |
-| moodycamel | 4 | 1 | 18.06 |
-| moodycamel | 8 | 1 | 16.02 |
+| **daking** | 1 | 1 | **167.47** |
+| **daking** | 2 | 1 | **191.27** |
+| **daking** | 4 | 1 | **195.14** |
+| **daking** | 8 | 1 | **159.11** |
+| moodycamel | 1 | 1 | 36.24 |
+| moodycamel | 2 | 1 | 36.23 |
+| moodycamel | 4 | 1 | 34.82 |
+| moodycamel | 8 | 1 | 33.99 |
 
-**第五部分：Enqueue/Dequeue Latency**
+**第五部分：内存生命周期基准测试（稳定模式 vs 空闲回收）**
+
+这个基准测试在同一个队列上连续执行两轮生产/清空 burst。`stable` 模式会保留第一轮之后的全局池热状态；`idle_reclaim` 模式会在第一轮后调用 `shrink_to_fit()`，再从回收后的状态执行下一轮 burst。
+
+运行命令：
+
+```powershell
+cmake --build out/build/clang-local --target mpsc_bench_memory_cycle
+.\out\build\clang-local\mpsc_bench_memory_cycle.exe --benchmark_min_time=0.2s --benchmark_repetitions=1 --benchmark_counters_tabular=true
+```
+
+| 模式 | P (生产者) | 回收前节点数 | 回收后节点数 | **吞吐量 (M items/s)** | Shrink 耗时 (us) |
+| :--- | :--- | :--- | :--- | :--- | :--- |
+| stable | 1 | 16.384k | 16.384k | 96.69 | - |
+| stable | 4 | 16.384k | 16.384k | 32.20 | - |
+| stable | 16 | 32.768k | 32.768k | 37.54 | - |
+| idle_reclaim | 1 | 32.768k | 256 | 93.94 | 3.5 |
+| idle_reclaim | 4 | 32.768k | 256 | 31.98 | 5.5 |
+| idle_reclaim | 16 | 262.144k | 256 | 37.01 | 135.2 |
+
+空闲回收路径可以把全局池激进地回收到最小 chunk 数，同时下一轮 burst 的吞吐仍处在同一档。但它仍然应该被视为静止期操作，不是在线弹性回收机制。
+
+**第六部分：Enqueue/Dequeue Latency**
 
 (此部分基于HdrHistogram，在Linux平台测试)
 我们得到了以下延迟表现：
@@ -286,6 +308,8 @@ cmake --build out/build/clang-local --target mpsc_vs_mpmc_benchmark
 2.  `ThreadLocalCapacity`（线程本地容量）在**编译时**已固定。
 3.  无法避免指针追逐，因为是纯链表结构。
 
+如果你需要在队列实例仍然存在时回收内存，可以在队列已经完全清空并且所有生产者停止之后调用 `shrink_to_fit()`。它是一个空闲期回收接口，不是并发缩容接口，也不是在线弹性回收设计。
+
 ## 特性 (FEATURES)
 
 1.  多生产者，单消费者（MPSC）， 若竞争场景越接近SPSC，吞吐量越接近SPSC基准测试性能。 

diff --git a/benchmarks/bench_latency.cpp b/benchmarks/bench_latency.cpp
@@ -132,8 +132,36 @@ static void BM_MPSC_PureDequeueLatency(benchmark::State& state) {
     hdr_close(hist);
 }
 
+// Measures the latency of shrink_to_fit() on a queue that this thread has
+// filled with 10000 items and then drained until try_dequeue() fails.
+//
+// Only the shrink_to_fit() call sits between the two __rdtsc() reads, so the
+// histogram holds TSC cycle deltas for that call alone. Queue construction,
+// reserve_global_chunk(64) and the fill/drain phase run inside the benchmark
+// loop but outside the recorded interval.
+//
+// The histogram tracks values from 1 to 1,000,000 with 3 significant figures.
+// The P99 and P99.9 values, divided by CYCLES_PER_NS, are published as the
+// "P99_ns" and "P99.9_ns" counters.
+static void BM_MPSC_ShrinkToFitLatency(benchmark::State& state) {
+    hdr_histogram* hist = nullptr;
+    // hdr_init reports failure through a non-zero return value; without this
+    // check the code below would use an unset histogram pointer.
+    if (hdr_init(1, 1000000, 3, &hist) != 0) {
+        state.SkipWithError("hdr_init failed");
+        return;
+    }
+
+    for (auto _ : state) {
+        TestQueue q;
+        TestQueue::reserve_global_chunk(64);
+        for (int i = 0; i < 10000; ++i) {
+            q.enqueue(i);
+        }
+        int val = 0;
+        // Drain everything so shrink_to_fit() runs on an empty queue.
+        while (q.try_dequeue(val)) {}
+        auto start = __rdtsc();
+        bool ok = q.shrink_to_fit();
+        auto end = __rdtsc();
+        if (!ok) {
+            // A failed shrink is treated as a benchmark error, not a sample.
+            state.SkipWithError("shrink_to_fit failed unexpectedly");
+            break;
+        }
+        hdr_record_value(hist, end - start);
+    }
+
+    state.counters["P99_ns"] = hdr_value_at_percentile(hist, 99.0) / CYCLES_PER_NS;
+    state.counters["P99.9_ns"] = hdr_value_at_percentile(hist, 99.9) / CYCLES_PER_NS;
+    hdr_close(hist);
+}
+
 BENCHMARK(BM_MPSC_PureEnqueueLatency)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16)->Unit(benchmark::kMicrosecond);
 BENCHMARK(BM_MPSC_PureDequeueLatency)->Unit(benchmark::kMicrosecond);
+// Registered without Arg(): BM_MPSC_ShrinkToFitLatency does not read state.range().
+BENCHMARK(BM_MPSC_ShrinkToFitLatency)->Unit(benchmark::kMicrosecond);
 
 BENCHMARK_MAIN();
 
@@ -181,4 +209,4 @@ int main(int argc, char** argv) {
     return 0;
 }
 
-#endif
+#endif