From 2c285ff2d097d2852b111196e0948876bd85dc28 Mon Sep 17 00:00:00 2001 From: huangjun Date: Sat, 25 Apr 2026 21:59:03 +0800 Subject: [PATCH] Roll back LocalityAwareLoadBalancer to gettimeofday_us to match callers PR #3268 ("Use monotonic time instead of wall time") switched LocalityAwareLoadBalancer::Weight::Update's end_time_us and LocalityAwareLoadBalancer::Describe's now to butil::cpuwide_time_us(), but every caller that supplies CallInfo::begin_time_us still uses butil::gettimeofday_us(): - Channel::CallMethod (channel.cpp:451) -> Controller::IssueRPC -> Controller::Call::begin_time_us -> SelectIn::begin_time_us -> CallInfo::begin_time_us - Controller::OnVersionedRPCReturned retry sites (controller.cpp:672, 715) call IssueRPC(gettimeofday_us()) on backup-request and regular retries The mismatched time domains make latency = end_time_us - ci.begin_time_us = cpuwide_now - wallclock_begin ~= -1.7e15 us trigger the `if (latency <= 0) { /* time skews, ignore */ return 0; }` short-circuit on every call. _time_q never accumulates samples, _avg_latency stays at 0, and locality-aware weight feedback is silently disabled. Visible downstream symptom: cold-start `list://` channels with `lb=la` and 2 backends occasionally fail RPCs with EHOSTDOWN ("Fail to select server from list://...") on retry even when one backend is healthy. Bisected reproduction in xsky/brpc fork: - 51 commit range c41e838..604dad0c (1.16.1 .. 1.17.0-rc2) - master code + LA-driven multipath probe at 2 backends, max_retry=1, repeat 500x: * commit 771de31e (one before #3268): 0/500 fail * commit 12fb539a (#3268): 25/500 fail * commit 12fb539a + revert only Weight::Update::end_time_us to gettimeofday_us: 0/500 fail This commit reverts the LA-side of #3268's clock change so the LB lines up with its existing callers again. Channel::CallMethod and the retry paths in Controller stay on butil::gettimeofday_us(), which preserves the wall-clock semantics of Controller::_begin_time_us / Controller::latency_us() that public users rely on. Adds test/brpc_load_balancer_unittest.cpp::la_records_latency_with_consistent_time_source which drives a series of SelectServer + Feedback cycles against LocalityAwareLoadBalancer (no Server / Channel needed) and asserts that _avg_latency reflects the elapsed time, rather than being stuck at 0 because of a time-source mismatch. --- .../policy/locality_aware_load_balancer.cpp | 6 +- test/brpc_load_balancer_unittest.cpp | 77 +++++++++++++++++++ 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/src/brpc/policy/locality_aware_load_balancer.cpp b/src/brpc/policy/locality_aware_load_balancer.cpp index 455f6fc397..beea51690e 100644 --- a/src/brpc/policy/locality_aware_load_balancer.cpp +++ b/src/brpc/policy/locality_aware_load_balancer.cpp @@ -18,7 +18,7 @@ #include // numeric_limits #include -#include "butil/time.h" // cpuwide_time_us +#include "butil/time.h" // gettimeofday_us #include "butil/fast_rand.h" #include "brpc/log.h" #include "brpc/socket.h" @@ -376,7 +376,7 @@ void LocalityAwareLoadBalancer::Feedback(const CallInfo& info) { int64_t LocalityAwareLoadBalancer::Weight::Update( const CallInfo& ci, size_t index) { - const int64_t end_time_us = butil::cpuwide_time_us(); + const int64_t end_time_us = butil::gettimeofday_us(); const int64_t latency = end_time_us - ci.begin_time_us; BAIDU_SCOPED_LOCK(_mutex); if (Disabled()) { @@ -524,7 +524,7 @@ void LocalityAwareLoadBalancer::Describe( if (_db_servers.Read(&s) != 0) { os << "fail to read _db_servers"; } else { - const int64_t now = butil::cpuwide_time_us(); + const int64_t now = butil::gettimeofday_us(); const size_t n = s->weight_tree.size(); os << '['; for (size_t i = 0; i < n; ++i) { diff --git a/test/brpc_load_balancer_unittest.cpp b/test/brpc_load_balancer_unittest.cpp index 2a2be242aa..76ad005eac 100644 --- a/test/brpc_load_balancer_unittest.cpp +++ b/test/brpc_load_balancer_unittest.cpp @@ -1303,6 +1303,83 @@ TEST_F(LoadBalancerTest, revived_from_all_failed_intergrated) { } #endif // BUTIL_USE_ASAN +// Regression for #3268's incomplete migration of LocalityAwareLoadBalancer. +// +// #3268 switched `LocalityAwareLoadBalancer::Weight::Update::end_time_us` and +// `LocalityAwareLoadBalancer::Describe::now` to `butil::cpuwide_time_us()` +// while every caller that supplies `CallInfo::begin_time_us` (the RPC entry +// in `Channel::CallMethod` and the retry sites in +// `Controller::OnVersionedRPCReturned`) still uses `butil::gettimeofday_us()`. +// The resulting time-source mismatch makes +// +// latency = end_time_us - ci.begin_time_us +// = cpuwide_now - wallclock_begin +// ~= -1.7e15 us (huge negative) +// +// trigger the +// +// if (latency <= 0) { /* time skews, ignore the sample */ return 0; } +// +// short-circuit on every call. `_time_q` never accumulates samples, +// `_avg_latency` stays at 0, and locality-aware weight feedback is silently +// disabled. Visible downstream symptom: cold-start `list://` channels with +// `lb=la` and 2 backends occasionally fail RPCs with `EHOSTDOWN` +// ("Fail to select server") on retry even when one backend is healthy. +// +// This commit reverts the LA side of #3268, so `Weight::Update` and +// `Describe` once again use `butil::gettimeofday_us()` to match every +// existing caller of `CallInfo::begin_time_us`. +// +// The test below runs entirely against `LocalityAwareLoadBalancer` (no +// Server / Channel is involved), so it is hermetic. It supplies a +// gettimeofday-based `begin_time_us` (matching what `Channel::CallMethod` +// passes today) and asserts that the LB records a positive `_avg_latency`, +// rather than tripping the time-skew short-circuit. +TEST_F(LoadBalancerTest, la_records_latency_with_consistent_time_source) { + LALB lalb; + char addr[] = "192.168.1.1:8080"; + butil::EndPoint dummy; + ASSERT_EQ(0, str2endpoint(addr, &dummy)); + brpc::ServerId id(8888); + brpc::SocketOptions options; + options.remote_side = dummy; + ASSERT_EQ(0, brpc::Socket::Create(options, &id.id)); + ASSERT_TRUE(lalb.AddServer(id)); + + auto avg_latency = [&]() -> int64_t { + std::ostringstream os; + brpc::DescribeOptions opts; + opts.verbose = true; + lalb.Describe(os, opts); + const std::string s = os.str(); + const size_t p = s.find("avg_latency="); + if (p == std::string::npos) return -1; + return strtoll(s.c_str() + p + strlen("avg_latency="), NULL, 10); + }; + + // Drive a few "RPCs": pick a server, sleep ~2ms, feed back. begin_time_us + // comes from gettimeofday_us(), matching what Channel::CallMethod and the + // retry sites in Controller::OnVersionedRPCReturned pass on every RPC. + for (int i = 0; i < 8; ++i) { + const int64_t begin_us = butil::gettimeofday_us(); + brpc::SocketUniquePtr ptr; + brpc::LoadBalancer::SelectIn in = { begin_us, true, false, 0u, NULL }; + brpc::LoadBalancer::SelectOut out(&ptr); + ASSERT_EQ(0, lalb.SelectServer(in, &out)); + bthread_usleep(2000); + brpc::LoadBalancer::CallInfo ci = { begin_us, id.id, 0, NULL }; + lalb.Feedback(ci); + } + + // _avg_latency must reflect actual elapsed time. If this is 0, either + // Weight::Update::end_time_us was changed away from gettimeofday_us + // again (re-introducing the time-source mismatch) or some caller of + // CallInfo::begin_time_us drifted to a different clock domain. + EXPECT_GT(avg_latency(), 0); + + ASSERT_EQ(0, brpc::Socket::SetFailed(id.id)); +} + TEST_F(LoadBalancerTest, la_selection_too_long) { brpc::GlobalInitializeOrDie(); brpc::LoadBalancerWithNaming lb;