/* * This file is open source software, licensed to you under the terms * of the Apache License, Version 2.0 (the "License"). See the NOTICE file * distributed with this work for additional information regarding copyright * ownership. You may not use this file except in compliance with the License. * * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ /* * Copyright (C) 2023 ScyllaDB Ltd. */ #include #include #include #include #include #include #include // The test allows measuring if the shared_token_bucket<> allows the tokens // consumers to get tokens at the rate the bucket is configured with. // // Report example: // // effective rate is 1016575.0t/s, [239668 ... 263522] // // The last line "effective rate" is the tokens-per-second rate all workers were // able to get. It should be equal to the configured rate (context::rate below) // // In braces there are minimal and maximum rate of individual shards. These numbers // should not differ too much from each other, if they do it means that the t.b. // is not fair using clock_type = std::chrono::steady_clock; // The test uses uint64_t tokens with per-second time-measurement and can use // capped and non-capped token buckets. 
using capped_token_bucket_t = internal::shared_token_bucket, internal::capped_release::yes>; using pure_token_bucket_t = internal::shared_token_bucket, internal::capped_release::no>; // The test entry point calls map_reduce on the sharded<> workers set and this // is what each worker produces, so that the entry point could accumulate the // final result from struct work_result { uint64_t tokens; uint64_t released; }; struct statistics { uint64_t total = 0; uint64_t min = std::numeric_limits::max(); uint64_t max = std::numeric_limits::min(); }; statistics accumulate(statistics acc, const work_result& val) { return statistics { .total = acc.total + val.tokens, .min = std::min(acc.min, val.tokens), .max = std::max(acc.max, val.tokens), }; } // The worker itself. Has a reference on the shared token bucket and tries to // consume as many tokens as it can without artificial delays. Reports the number // of tokens grabbed from bucket while running template struct worker : public seastar::peering_sharded_service> { TokenBucket& tb; // Capped bucket requires that t.b. user releases the tokens so that // they could be replenished. Respectively, the worker keeps track of // total number of grabbed tokens as well as the number of not-yet-released // tokens and the total number of released tokens. The "available" number // of tokens is used to decide if the worker can release them or not uint64_t tokens = 0; uint64_t available = 0; uint64_t released = 0; // Tokens are released in a timer. This mimics default reactor workflow // when requests are reaped from the kernel each task-quota -- 0.5ms static constexpr auto release_period = std::chrono::microseconds(500); const uint64_t release_per_tick = 0; clock_type::time_point last_release; timer<> release_tokens; std::optional> head; // The IO-scheduler doesn't get more than this number of tokens per tick. // The test tries to mimic this behavior const uint64_t threshold; // The number of tokens to grab at a time. 
It's a distribution to resemble // IO queue that tries to grab different amount of tokens for requests of // different sizes and direction std::uniform_int_distribution size; // Per-tick statistics. Collected, but not reported by default struct tick_data { std::chrono::microseconds delay; std::chrono::microseconds sleep; uint64_t defic; uint64_t tokens; uint64_t total; template tick_data(D1 dur, D2 slp, uint64_t def, uint64_t lt, uint64_t tt) noexcept : delay(std::chrono::duration_cast(dur)) , sleep(std::chrono::duration_cast(slp)) , defic(def) , tokens(lt) , total(tt) {} }; std::deque ticks; worker(TokenBucket& tb_) noexcept : tb(tb_) , release_per_tick(double(tb.rate()) / smp::count * std::chrono::duration_cast>(release_period).count()) , last_release(clock_type::now()) , release_tokens([this] { do_release(); }) , threshold(tb.limit() / smp::count) , size(1, std::min(threshold, 128)) { release_tokens.arm_periodic(std::chrono::duration_cast(release_period)); fmt::print("{} worker, threshold {}, release-per-tick {}\n", this_shard_id(), threshold, release_per_tick); } void do_release(uint64_t tokens) { available -= tokens; released += tokens; if constexpr (TokenBucket::is_capped == internal::capped_release::yes) { tb.release(tokens); } } void do_release() { // Timer can fire later than programmed because of hogs not yielding in a timely // manner. If that was an IO queue more requests would have been reaped from the // kernel, so do the same here -- scale the number of releasable tokens proportionally auto now = clock_type::now(); auto real_delay = std::chrono::duration_cast>(now - last_release); last_release = now; uint64_t to_release = real_delay.count() * release_per_tick / std::chrono::duration_cast>(release_period).count(); do_release(std::min(to_release, available)); } future work(std::function(std::chrono::duration d)> do_sleep) { assert(tokens == 0); auto start = clock_type::now(); // Run for 1 second. 
The perf suite would restart this method several times return do_until([end = start + std::chrono::seconds(1)] { return clock_type::now() >= end; }, [this, start, do_sleep = std::move(do_sleep)] { uint64_t d = 0; uint64_t l_tokens = 0; int sz; while (l_tokens < threshold) { if (head) { tb.replenish(clock_type::now()); d = tb.deficiency(head->second); if (d > 0) { break; } sz = head->first; head.reset(); } else { sz = size(testing::local_random_engine); auto h = tb.grab(sz); d = tb.deficiency(h); if (d > 0) { head = std::make_pair(sz, h); break; } } tokens += sz; l_tokens += sz; available += sz; } auto p = tb.duration_for(d); ticks.emplace_back(clock_type::now() - start, p, d, l_tokens, tokens); if (ticks.size() > 2048) { ticks.pop_front(); } return do_sleep(p); } ).then([this, start] { // Reports: // - shard-id // - total number of tokens and total time taken // - effective speed // - expected speed (token-bucket.rate() / smp::count) // - ticks -- the number of times the worker had change to grab tokens // - the info about tokens releasing auto delay = std::chrono::duration_cast>(clock_type::now() - start).count(); fmt::print("{} {}t/{:.3f}s, speed is {:.1f}t/s goal {:.1f}t/s, {} ticks, released {} (accumulated {})\n", this_shard_id(), tokens, delay, double(tokens) / delay, double(tb.rate()) / smp::count, ticks.size(), released, available); do_release(available); work_result r { .tokens = std::exchange(this->tokens, 0), .released = std::exchange(this->released, 0), }; return make_ready_future(std::move(r)); }); } // The below two are how worker waits for the token-bucket deficiency // to disappear (i.e. 
-- when the requested number of tokens are replenished // // Two options -- poll infinitely or sleep for the estimated (by the // bucket method) duration future work_sleeping() { return work([] (std::chrono::duration d) { return seastar::sleep(std::chrono::duration_cast(d)); }); } future work_yielding() { return work([] (std::chrono::duration) { return seastar::yield(); }); } future<> print_and_clear_ticks() { fmt::print("{} {} ticks\n", this_shard_id(), ticks.size()); std::chrono::microseconds p(0); for (auto& td : ticks) { fmt::print(" {:8} +{:5} us {:5}/{:5} def {:3} sleep {:5} us\n", td.delay.count(), (td.delay - p).count(), td.tokens, td.total, td.defic, td.sleep.count()); p = td.delay; } ticks.clear(); if (this_shard_id() == smp::count - 1) { return make_ready_future<>(); } return this->container().invoke_on(this_shard_id() + 1, &worker::print_and_clear_ticks); } }; // CPU hog that occupies CPU for "busy" duration, then sleeps for "rest" duration // The actual periods are randomized to be thus "on average" struct hog { std::exponential_distribution busy; std::exponential_distribution rest; std::optional> stopped; bool keep_going = false; uint64_t _iterations = 0; template hog(T1 b, T2 r) noexcept : busy(1.0 / std::chrono::duration_cast>(b).count()) , rest(1.0 / std::chrono::duration_cast>(r).count()) {} void work() { assert(!stopped.has_value()); keep_going = true; stopped = do_until([this] { return !keep_going; }, [this] { auto p = std::chrono::duration(rest(testing::local_random_engine)); return seastar::sleep(std::chrono::duration_cast(p)).then([this] { _iterations++; auto until = clock_type::now() + std::chrono::duration(busy(testing::local_random_engine)); do { } while (clock_type::now() < until && keep_going); }); } ); } future<> terminate() { assert(stopped.has_value()); keep_going = false; auto f = std::move(*stopped); stopped.reset(); return f; } }; template struct context { using worker_t = worker; TokenBucket tb; seastar::sharded w; 
seastar::sharded h; static constexpr uint64_t rate = 1000000; static constexpr uint64_t limit = rate / 2000; static constexpr uint64_t threshold = 1; context() : tb(rate, limit, threshold) { w.start(std::ref(tb)).get(); h.start(std::chrono::microseconds(300), std::chrono::microseconds(100)).get(); fmt::print("Created tb {}t/s (limit {} threshold {})\n", tb.rate(), tb.limit(), tb.threshold()); } ~context() { h.stop().get(); w.stop().get(); } template future<> run_workers(Fn&& fn) { auto start = clock_type::now(); return w.map_reduce0(std::forward(fn), statistics{}, accumulate).then([start] (statistics st) { auto delay = std::chrono::duration_cast>(clock_type::now() - start).count(); fmt::print("effective rate is {:.1f}t/s, [{} ... {}]\n", st.total / delay, st.min, st.max); }); } future<> test_sleeping() { fmt::print("---8<---\n"); return run_workers(&worker_t::work_sleeping); } future<> test_yielding() { fmt::print("---8<---\n"); return run_workers(&worker_t::work_yielding); } future<> test_sleeping_with_hog() { fmt::print("---8<---\n"); return h.invoke_on_all(&hog::work).then([this] { return run_workers(&worker_t::work_sleeping).then([this] { return h.invoke_on_all(&hog::terminate); }); }); } }; struct perf_capped_context : public context {}; struct perf_pure_context : public context {}; // There are 3 tests run over 2 types of buckets: // // - poll token bucket for tokens in case of deficiency // - sleep in case token bucket reports deficiency // - sleep on deficiency, but run CPU hogs in the background // // All tests are run with capped and non-capped (called pure) token buckets PERF_TEST_F(perf_capped_context, yielding_throughput) { return test_yielding(); } PERF_TEST_F(perf_capped_context, sleeping_throughput) { return test_sleeping(); } PERF_TEST_F(perf_capped_context, sleeping_throughput_with_hog) { return test_sleeping_with_hog(); } PERF_TEST_F(perf_pure_context, yielding_throughput) { return test_yielding(); } PERF_TEST_F(perf_pure_context, 
sleeping_throughput) { return test_sleeping(); } PERF_TEST_F(perf_pure_context, sleeping_throughput_with_hog) { return test_sleeping_with_hog(); }