agent-enviroments/builder/libs/seastar/apps/iotune/iotune.cc
/*
* This file is open source software, licensed to you under the terms
* of the Apache License, Version 2.0 (the "License"). See the NOTICE file
* distributed with this work for additional information regarding copyright
* ownership. You may not use this file except in compliance with the License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (C) 2018 ScyllaDB
*
* The goal of this program is to allow a user to properly configure the Seastar I/O
* scheduler.
*/
#include <iostream>
#include <chrono>
#include <random>
#include <memory>
#include <vector>
#include <cmath>
#include <sys/vfs.h>
#include <sys/sysmacros.h>
#include <boost/range/irange.hpp>
#include <boost/program_options.hpp>
#include <boost/iterator/counting_iterator.hpp>
#include <fstream>
#include <wordexp.h>
#include <yaml-cpp/yaml.h>
#include <fmt/printf.h>
#include <seastar/core/seastar.hh>
#include <seastar/core/file.hh>
#include <seastar/core/thread.hh>
#include <seastar/core/sstring.hh>
#include <seastar/core/posix.hh>
#include <seastar/core/aligned_buffer.hh>
#include <seastar/core/sharded.hh>
#include <seastar/core/app-template.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/fsqual.hh>
#include <seastar/core/loop.hh>
#include <seastar/util/defer.hh>
#include <seastar/util/log.hh>
#include <seastar/util/std-compat.hh>
#include <seastar/util/read_first_line.hh>
using namespace seastar;
using namespace std::chrono_literals;
namespace fs = std::filesystem;
logger iotune_logger("iotune");
using iotune_clock = std::chrono::steady_clock;
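// Seed the per-thread random engine from the steady clock, so each shard
// (and each run) draws a different sequence of request positions.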
static thread_local std::default_random_engine random_generator(std::chrono::duration_cast<std::chrono::nanoseconds>(iotune_clock::now().time_since_epoch()).count());
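// Sanity-check the block device's sysfs settings that can skew the results.
// The active scheduler is the bracketed entry in the "scheduler" file (for
// example "[mq-deadline] none"); iotune prefers noop/none, merges disabled
// (nomerges=2), and a write cache that is not in write-back mode.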
void check_device_properties(fs::path dev_sys_file) {
auto sched_file = dev_sys_file / "queue" / "scheduler";
auto sched_string = read_first_line(sched_file);
auto beg = sched_string.find('[');
size_t len = sched_string.size();
if (beg == sstring::npos) {
beg = 0;
} else {
auto end = sched_string.find(']');
if (end != sstring::npos) {
len = end - beg - 1;
}
beg++;
}
auto scheduler = sched_string.substr(beg, len);
if ((scheduler != "noop") && (scheduler != "none")) {
iotune_logger.warn("Scheduler for {} set to {}. It is recommend to set it to noop before evaluation so as not to skew the results.",
sched_file.string(), scheduler);
}
auto nomerges_file = dev_sys_file / "queue" / "nomerges";
auto nomerges = read_first_line_as<unsigned>(nomerges_file);
if (nomerges != 2u) {
iotune_logger.warn("nomerges for {} set to {}. It is recommend to set it to 2 before evaluation so that merges are disabled. Results can be skewed otherwise.",
nomerges_file.string(), nomerges);
}
auto write_cache_file = dev_sys_file / "queue" / "write_cache";
auto write_cache = read_first_line_as<std::string>(write_cache_file);
if (write_cache == "write back") {
iotune_logger.warn("write_cache for {} is set to write back. Some disks have poor implementation of this mode, pay attention to the measurements accuracy.",
write_cache_file.string());
}
}
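// Describes the directory under evaluation. Walks sysfs from the directory's
// backing device down to the leaf disks to learn the aggregate queue depth
// (sum of nr_requests), the number of disks in the array, and the minimum
// IO size, plus the available space on the filesystem.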
struct evaluation_directory {
sstring _name;
// We know that if we issue more than this, the excess requests will just be blocked inside Linux anyway.
unsigned _max_iodepth = 0;
uint64_t _available_space;
uint64_t _min_data_transfer_size = 512;
unsigned _disks_per_array = 0;
void scan_device(unsigned dev_maj, unsigned dev_min) {
scan_device(fmt::format("{}:{}", dev_maj, dev_min));
}
void scan_device(std::string dev_str) {
scan_device(fs::path("/sys/dev/block") / dev_str);
}
void scan_device(fs::path sys_file) {
try {
sys_file = fs::canonical(sys_file);
bool is_leaf = true;
if (fs::exists(sys_file / "slaves")) {
for (auto& dev : fs::directory_iterator(sys_file / "slaves")) {
is_leaf = false;
scan_device(read_first_line(dev.path() / "dev"));
}
}
// If this device is not a leaf, our work is done here: we only tune the leaves.
if (!is_leaf) {
return;
}
if (fs::exists(sys_file / "partition")) {
scan_device(sys_file.remove_filename());
} else {
check_device_properties(sys_file);
auto queue_dir = sys_file / "queue";
auto disk_min_io_size = read_first_line_as<uint64_t>(queue_dir / "minimum_io_size");
_min_data_transfer_size = std::max(_min_data_transfer_size, disk_min_io_size);
_max_iodepth += read_first_line_as<uint64_t>(queue_dir / "nr_requests");
_disks_per_array++;
}
} catch (std::system_error& se) {
iotune_logger.error("Error while parsing sysfs. Will continue with guessed values: {}", se.what());
_max_iodepth = 128;
}
_disks_per_array = std::max(_disks_per_array, 1u);
}
public:
evaluation_directory(sstring name)
: _name(name)
, _available_space(fs::space(fs::path(_name)).available)
{}
unsigned max_iodepth() const {
return _max_iodepth;
}
fs::path path() const {
return fs::path(_name);
}
const sstring& name() const {
return _name;
}
unsigned disks_per_array() const {
return _disks_per_array;
}
uint64_t minimum_io_size() const {
return _min_data_transfer_size;
}
future<> discover_directory() {
return seastar::async([this] {
auto f = open_directory(_name).get();
auto st = f.stat().get();
f.close().get();
scan_device(major(st.st_dev), minor(st.st_dev));
});
}
uint64_t available_space() const {
return _available_space;
}
};
struct io_rates {
float bytes_per_sec = 0;
float iops = 0;
io_rates operator+(const io_rates& a) const {
return io_rates{bytes_per_sec + a.bytes_per_sec, iops + a.iops};
}
io_rates& operator+=(const io_rates& a) {
bytes_per_sec += a.bytes_per_sec;
iops += a.iops;
return *this;
}
};
struct row_stats {
size_t points;
double average;
double stdev;
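// Despite the name, this returns the relative deviation as a fraction;
// callers multiply by 100 when they need actual percents.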
float stdev_percents() const {
return (points > 0 && average != 0) ? stdev / average : 0.0;
}
};
template <typename T>
static row_stats get_row_stats_for(const std::vector<T>& v) {
if (v.size() == 0) {
return row_stats{0, 0.0, 0.0};
}
double avg = std::accumulate(v.begin(), v.end(), 0.0) / v.size();
double stdev = std::sqrt(std::transform_reduce(v.begin(), v.end(), 0.0,
std::plus<double>(), [avg] (auto& v) -> double { return (v - avg) * (v - avg); }) / v.size());
return row_stats{ v.size(), avg, stdev };
}
class invalid_position : public std::exception {
public:
virtual const char* what() const noexcept {
return "file access position invalid";
}
};
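// Strategy interface deciding where the next request goes: sequential
// offsets that wrap at a size limit, or uniformly random aligned offsets.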
struct position_generator {
virtual uint64_t get_pos() = 0;
virtual bool is_sequential() const = 0;
virtual ~position_generator() {}
};
class sequential_issuer : public position_generator {
size_t _buffer_size;
uint64_t _position = 0;
uint64_t _size_limit;
public:
sequential_issuer(size_t buffer_size, uint64_t size_limit)
: _buffer_size(buffer_size)
, _size_limit(size_limit)
{}
virtual bool is_sequential() const {
return true;
}
virtual uint64_t get_pos() {
if (_position >= _size_limit) {
// Wrap around on reaching EOF. Write bandwidth is lower than read
// bandwidth, and the write phase is also split among all shards while
// sequential reads come from shard 0 only, so shard 0's file may not
// be large enough to read from for the whole test.
_position = 0;
}
auto pos = _position;
_position += _buffer_size;
return pos;
}
};
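// Picks buffer-aligned offsets uniformly at random in [0, last_position).
// get_pos() throws invalid_position as a guard if an offset ever lands
// past the end of the readable range.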
class random_issuer : public position_generator {
size_t _buffer_size;
uint64_t _last_position;
std::uniform_int_distribution<uint64_t> _pos_distribution;
public:
random_issuer(size_t buffer_size, uint64_t last_position)
: _buffer_size(buffer_size)
, _last_position(last_position)
, _pos_distribution(0, (last_position / buffer_size) - 1)
{}
virtual bool is_sequential() const {
return false;
}
virtual uint64_t get_pos() {
uint64_t pos = _pos_distribution(random_generator) * _buffer_size;
if (pos >= _last_position) {
throw invalid_position();
}
return pos;
}
};
class request_issuer {
public:
virtual future<size_t> issue_request(uint64_t pos, char* buf, uint64_t size) = 0;
virtual ~request_issuer() {}
};
class write_request_issuer : public request_issuer {
file _file;
public:
explicit write_request_issuer(file f) : _file(f) {}
future<size_t> issue_request(uint64_t pos, char* buf, uint64_t size) override {
return _file.dma_write(pos, buf, size);
}
};
class read_request_issuer : public request_issuer {
file _file;
public:
explicit read_request_issuer(file f) : _file(f) {}
future<size_t> issue_request(uint64_t pos, char* buf, uint64_t size) override {
return _file.dma_read(pos, buf, size);
}
};
class io_worker {
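// Samples the number of completed requests once per second (for runs longer
// than four seconds), so per-second rates and their deviation can be
// computed; shorter runs just record a single total on destruction.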
class requests_rate_meter {
std::vector<unsigned>& _rates;
const unsigned& _requests;
unsigned _prev_requests = 0;
timer<> _tick;
static constexpr auto period = 1s;
public:
requests_rate_meter(std::chrono::duration<double> duration, std::vector<unsigned>& rates, const unsigned& requests)
: _rates(rates)
, _requests(requests)
, _tick([this] {
_rates.push_back(_requests - _prev_requests);
_prev_requests = _requests;
})
{
_rates.reserve(256); // ~2 minutes
if (duration > 4 * period) {
_tick.arm_periodic(period);
}
}
~requests_rate_meter() {
if (_tick.armed()) {
_tick.cancel();
} else {
_rates.push_back(_requests);
}
}
};
uint64_t _bytes = 0;
uint64_t _max_offset = 0;
unsigned _requests = 0;
size_t _buffer_size;
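// The measurement window: requests issued right after construction are
// warm-up; only completions between _start_measuring (10ms in) and
// _end_measuring are counted, and the load keeps running until _end_load
// (10ms later) so the tail of the window is still under full load.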
std::chrono::time_point<iotune_clock, std::chrono::duration<double>> _start_measuring;
std::chrono::time_point<iotune_clock, std::chrono::duration<double>> _end_measuring;
std::chrono::time_point<iotune_clock, std::chrono::duration<double>> _end_load;
// track separately because in the sequential case we may exhaust the file before _duration
std::chrono::time_point<iotune_clock, std::chrono::duration<double>> _last_time_seen;
requests_rate_meter _rr_meter;
std::unique_ptr<position_generator> _pos_impl;
std::unique_ptr<request_issuer> _req_impl;
public:
bool is_sequential() const {
return _pos_impl->is_sequential();
}
bool should_stop() const {
return iotune_clock::now() >= _end_load;
}
io_worker(size_t buffer_size, std::chrono::duration<double> duration, std::unique_ptr<request_issuer> reqs, std::unique_ptr<position_generator> pos, std::vector<unsigned>& rates)
: _buffer_size(buffer_size)
, _start_measuring(iotune_clock::now() + std::chrono::duration<double>(10ms))
, _end_measuring(_start_measuring + duration)
, _end_load(_end_measuring + 10ms)
, _last_time_seen(_start_measuring)
, _rr_meter(duration, rates, _requests)
, _pos_impl(std::move(pos))
, _req_impl(std::move(reqs))
{}
std::unique_ptr<char[], free_deleter> get_buffer() {
return allocate_aligned_buffer<char>(_buffer_size, _buffer_size);
}
future<> issue_request(char* buf) {
uint64_t pos = _pos_impl->get_pos();
return _req_impl->issue_request(pos, buf, _buffer_size).then([this, pos] (size_t size) {
auto now = iotune_clock::now();
_max_offset = std::max(_max_offset, pos + size);
if ((now > _start_measuring) && (now < _end_measuring)) {
_last_time_seen = now;
_bytes += size;
_requests++;
}
});
}
uint64_t max_offset() const noexcept { return _max_offset; }
io_rates get_io_rates() const {
io_rates rates;
auto t = _last_time_seen - _start_measuring;
if (!t.count()) {
throw std::runtime_error("No data collected");
}
rates.bytes_per_sec = _bytes / t.count();
rates.iops = _requests / t.count();
return rates;
}
};
class test_file {
public:
enum class pattern { sequential, random };
private:
fs::path _dirpath;
uint64_t _file_size;
file _file;
uint64_t _forced_random_io_buffer_size;
std::unique_ptr<position_generator> get_position_generator(size_t buffer_size, pattern access_pattern) {
if (access_pattern == pattern::sequential) {
return std::make_unique<sequential_issuer>(buffer_size, _file_size);
} else {
return std::make_unique<random_issuer>(buffer_size, _file_size);
}
}
uint64_t calculate_buffer_size(pattern access_pattern, uint64_t buffer_size, uint64_t operation_alignment) const {
if (access_pattern == pattern::random && _forced_random_io_buffer_size != 0u) {
return _forced_random_io_buffer_size;
}
return std::max(buffer_size, operation_alignment);
}
public:
test_file(const ::evaluation_directory& dir, uint64_t maximum_size, uint64_t random_io_buffer_size)
: _dirpath(dir.path() / fs::path(fmt::format("ioqueue-discovery-{}", this_shard_id())))
, _file_size(maximum_size)
, _forced_random_io_buffer_size(random_io_buffer_size)
{}
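// Creates the per-shard test file and immediately unlinks it (and its
// directory) while keeping the open handle alive, so no on-disk cleanup is
// needed if iotune is interrupted. The file is then truncated to its
// target size.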
future<> create_data_file() {
// XFS handles access spread over many directories better.
return make_directory(_dirpath.string()).then([this] {
auto testfile = _dirpath / fs::path("testfile");
file_open_options options;
options.extent_allocation_size_hint = _file_size;
return open_file_dma(testfile.string(), open_flags::rw | open_flags::create, std::move(options)).then([this, testfile] (file file) {
_file = file;
if (this_shard_id() == 0) {
iotune_logger.info("Filesystem parameters: read alignment {}, write alignment {}", _file.disk_read_dma_alignment(), _file.disk_write_dma_alignment());
}
return remove_file(testfile.string()).then([this] {
return remove_file(_dirpath.string());
});
}).then([this] {
return _file.truncate(_file_size);
});
});
}
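// Runs max_os_concurrency parallel fibers, each issuing requests back to
// back through the worker until the time budget expires, then collects the
// measured rates. invalid_position is tolerated for sequential workloads.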
future<io_rates> do_workload(std::unique_ptr<io_worker> worker_ptr, unsigned max_os_concurrency, bool update_file_size = false) {
if (update_file_size) {
_file_size = 0;
}
auto worker = worker_ptr.get();
auto concurrency = boost::irange<unsigned, unsigned>(0, max_os_concurrency, 1);
return parallel_for_each(std::move(concurrency), [worker] (unsigned idx) {
auto bufptr = worker->get_buffer();
auto buf = bufptr.get();
return do_until([worker] { return worker->should_stop(); }, [buf, worker] {
return worker->issue_request(buf);
}).finally([alive = std::move(bufptr)] {});
}).then_wrapped([this, worker = std::move(worker_ptr), update_file_size] (future<> f) {
try {
f.get();
} catch (invalid_position& ip) {
// Expected for sequential workloads, e.g. a read that ran past the end of the file.
if (!worker->is_sequential()) {
throw;
}
}
if (update_file_size) {
_file_size = worker->max_offset();
}
return make_ready_future<io_rates>(worker->get_io_rates());
});
}
future<io_rates> read_workload(size_t buffer_size, pattern access_pattern, unsigned max_os_concurrency, std::chrono::duration<double> duration, std::vector<unsigned>& rates) {
buffer_size = calculate_buffer_size(access_pattern, buffer_size, _file.disk_read_dma_alignment());
auto worker = std::make_unique<io_worker>(buffer_size, duration, std::make_unique<read_request_issuer>(_file), get_position_generator(buffer_size, access_pattern), rates);
return do_workload(std::move(worker), max_os_concurrency);
}
future<io_rates> write_workload(size_t buffer_size, pattern access_pattern, unsigned max_os_concurrency, std::chrono::duration<double> duration, std::vector<unsigned>& rates) {
buffer_size = calculate_buffer_size(access_pattern, buffer_size, _file.disk_write_dma_alignment());
auto worker = std::make_unique<io_worker>(buffer_size, duration, std::make_unique<write_request_issuer>(_file), get_position_generator(buffer_size, access_pattern), rates);
bool update_file_size = worker->is_sequential();
return do_workload(std::move(worker), max_os_concurrency, update_file_size).then([this] (io_rates r) {
return _file.flush().then([r = std::move(r)] () mutable {
return make_ready_future<io_rates>(std::move(r));
});
});
}
future<> stop() {
return _file ? _file.close() : make_ready_future<>();
}
};
class iotune_multi_shard_context {
::evaluation_directory _test_directory;
uint64_t _random_io_buffer_size;
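// Spread the device's total queue depth evenly across shards (earlier
// shards take the remainder), capping each shard at 128 requests in flight.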
unsigned per_shard_io_depth() const {
auto iodepth = _test_directory.max_iodepth() / smp::count;
if (this_shard_id() < _test_directory.max_iodepth() % smp::count) {
iodepth++;
}
return std::min(iodepth, 128u);
}
seastar::sharded<test_file> _iotune_test_file;
std::vector<unsigned> serial_rates;
seastar::sharded<std::vector<unsigned>> sharded_rates;
public:
future<> stop() {
return _iotune_test_file.stop().then([this] { return sharded_rates.stop(); });
}
future<> start() {
const auto maximum_size = (_test_directory.available_space() / (2 * smp::count));
return _iotune_test_file.start(_test_directory, maximum_size, _random_io_buffer_size).then([this] {
return sharded_rates.start();
});
}
future<row_stats> get_serial_rates() {
row_stats ret = get_row_stats_for<unsigned>(serial_rates);
serial_rates.clear();
return make_ready_future<row_stats>(ret);
}
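// Across all shards, keep the per-second statistics of the shard whose
// rates fluctuated the most (largest standard deviation).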
future<row_stats> get_sharded_worst_rates() {
return sharded_rates.map_reduce0([] (std::vector<unsigned>& rates) {
row_stats ret = get_row_stats_for<unsigned>(rates);
rates.clear();
return ret;
}, row_stats{0, 0.0, 0.0},
[] (const row_stats& res, row_stats lres) {
return res.stdev < lres.stdev ? lres : res;
});
}
future<> create_data_file() {
return _iotune_test_file.invoke_on_all([] (test_file& tf) {
return tf.create_data_file();
});
}
future<io_rates> write_sequential_data(unsigned shard, size_t buffer_size, std::chrono::duration<double> duration) {
return _iotune_test_file.invoke_on(shard, [this, buffer_size, duration] (test_file& tf) {
return tf.write_workload(buffer_size, test_file::pattern::sequential, 4 * _test_directory.disks_per_array(), duration, serial_rates);
});
}
future<io_rates> read_sequential_data(unsigned shard, size_t buffer_size, std::chrono::duration<double> duration) {
return _iotune_test_file.invoke_on(shard, [this, buffer_size, duration] (test_file& tf) {
return tf.read_workload(buffer_size, test_file::pattern::sequential, 4 * _test_directory.disks_per_array(), duration, serial_rates);
});
}
future<io_rates> write_random_data(size_t buffer_size, std::chrono::duration<double> duration) {
return _iotune_test_file.map_reduce0([buffer_size, this, duration] (test_file& tf) {
const auto shard_io_depth = per_shard_io_depth();
if (shard_io_depth == 0) {
return make_ready_future<io_rates>();
} else {
return tf.write_workload(buffer_size, test_file::pattern::random, shard_io_depth, duration, sharded_rates.local());
}
}, io_rates(), std::plus<io_rates>());
}
future<io_rates> read_random_data(size_t buffer_size, std::chrono::duration<double> duration) {
return _iotune_test_file.map_reduce0([buffer_size, this, duration] (test_file& tf) {
const auto shard_io_depth = per_shard_io_depth();
if (shard_io_depth == 0) {
return make_ready_future<io_rates>();
} else {
return tf.read_workload(buffer_size, test_file::pattern::random, shard_io_depth, duration, sharded_rates.local());
}
}, io_rates(), std::plus<io_rates>());
}
private:
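// Probes for the saturation length: runs a single-fiber sequential workload
// on shard 0 and keeps halving the buffer size until throughput falls below
// rate_threshold; the result is the last buffer size that still sustained
// the threshold.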
template <typename Fn>
future<uint64_t> saturate(float rate_threshold, size_t buffer_size, std::chrono::duration<double> duration, Fn&& workload) {
return _iotune_test_file.invoke_on(0, [this, rate_threshold, buffer_size, duration, workload] (test_file& tf) {
return (tf.*workload)(buffer_size, test_file::pattern::sequential, 1, duration, serial_rates).then([this, rate_threshold, buffer_size, duration, workload] (io_rates rates) {
serial_rates.clear();
if (rates.bytes_per_sec < rate_threshold) {
// The throughput with the given buffer size has already dropped below
// the threshold, so report the previous buffer size (twice the current
// one): it was the last one to keep up.
return make_ready_future<uint64_t>(buffer_size * 2);
} else {
return saturate(rate_threshold, buffer_size / 2, duration, workload);
}
});
});
}
public:
future<uint64_t> saturate_write(float rate_threshold, size_t buffer_size, std::chrono::duration<double> duration) {
return saturate(rate_threshold, buffer_size, duration, &test_file::write_workload);
}
future<uint64_t> saturate_read(float rate_threshold, size_t buffer_size, std::chrono::duration<double> duration) {
return saturate(rate_threshold, buffer_size, duration, &test_file::read_workload);
}
iotune_multi_shard_context(::evaluation_directory dir, uint64_t random_io_buffer_size)
: _test_directory(dir)
, _random_io_buffer_size(random_io_buffer_size)
{}
};
struct disk_descriptor {
std::string mountpoint;
uint64_t read_iops;
uint64_t read_bw;
uint64_t write_iops;
uint64_t write_bw;
std::optional<uint64_t> read_sat_len;
std::optional<uint64_t> write_sat_len;
};
void string_to_file(sstring conf_file, sstring buf) {
auto f = file_desc::open(conf_file, O_WRONLY | O_CLOEXEC | O_CREAT | O_TRUNC, 0664);
auto ret = f.write(buf.data(), buf.size());
if (!ret || (*ret != buf.size())) {
// Note: don't dereference ret unconditionally; it may be disengaged if the write failed.
throw std::runtime_error(fmt::format("Can't write {}: {} out of {} bytes written", conf_file, ret.value_or(0), buf.size()));
}
}
void write_configuration_file(sstring conf_file, std::string format, sstring properties_file) {
sstring buf;
if (format == "seastar") {
buf = fmt::format("io-properties-file={}\n", properties_file);
} else {
buf = fmt::format("SEASTAR_IO=\"--io-properties-file={}\"\n", properties_file);
}
string_to_file(conf_file, buf);
}
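// Emits the io-properties YAML consumed via --io-properties-file. The
// output has this shape (values illustrative):
//
// disks:
//   - mountpoint: /var/lib/data
//     read_iops: 95000
//     read_bandwidth: 1953125000
//     write_iops: 85000
//     write_bandwidth: 976562500
//
// The *_saturation_length keys appear only when saturation was measured.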
void write_property_file(sstring conf_file, std::vector<disk_descriptor> disk_descriptors) {
YAML::Emitter out;
out << YAML::BeginMap;
out << YAML::Key << "disks";
out << YAML::BeginSeq;
for (auto& desc : disk_descriptors) {
out << YAML::BeginMap;
out << YAML::Key << "mountpoint" << YAML::Value << desc.mountpoint;
out << YAML::Key << "read_iops" << YAML::Value << desc.read_iops;
out << YAML::Key << "read_bandwidth" << YAML::Value << desc.read_bw;
out << YAML::Key << "write_iops" << YAML::Value << desc.write_iops;
out << YAML::Key << "write_bandwidth" << YAML::Value << desc.write_bw;
if (desc.read_sat_len) {
out << YAML::Key << "read_saturation_length" << YAML::Value << *desc.read_sat_len;
}
if (desc.write_sat_len) {
out << YAML::Key << "write_saturation_length" << YAML::Value << *desc.write_sat_len;
}
out << YAML::EndMap;
}
out << YAML::EndSeq;
out << YAML::EndMap;
out << YAML::Newline;
string_to_file(conf_file, sstring(out.c_str(), out.size()));
}
// Returns the mountpoint of a path. It works by walking backwards from the canonical path
// (absolute, with symlinks resolved), until we find a point that crosses a device ID.
fs::path mountpoint_of(sstring filename) {
fs::path mnt_candidate = fs::canonical(fs::path(filename));
std::optional<dev_t> candidate_id = {};
auto current = mnt_candidate;
do {
auto f = open_directory(current.string()).get();
auto st = f.stat().get();
f.close().get();
if ((candidate_id) && (*candidate_id != st.st_dev)) {
return mnt_candidate;
}
mnt_candidate = current;
candidate_id = st.st_dev;
current = current.parent_path();
} while (mnt_candidate != current);
return mnt_candidate;
}
int main(int ac, char** av) {
namespace bpo = boost::program_options;
bool fs_check = false;
app_template::config app_cfg;
app_cfg.name = "IOTune";
app_template app(std::move(app_cfg));
auto opt_add = app.add_options();
opt_add
("evaluation-directory", bpo::value<std::vector<sstring>>()->required(), "directory where to execute the evaluation")
("properties-file", bpo::value<sstring>(), "path in which to write the YAML file")
("options-file", bpo::value<sstring>(), "path in which to write the legacy conf file")
("duration", bpo::value<unsigned>()->default_value(120), "time, in seconds, for which to run the test")
("format", bpo::value<sstring>()->default_value("seastar"), "Configuration file format (seastar | envfile)")
("fs-check", bpo::bool_switch(&fs_check), "perform FS check only")
("accuracy", bpo::value<unsigned>()->default_value(3), "acceptable deviation of measurements (percents)")
("saturation", bpo::value<sstring>()->default_value(""), "measure saturation lengths (read | write | both) (this is very slow!)")
("random-io-buffer-size", bpo::value<unsigned>()->default_value(0), "force buffer size for random write and random read")
;
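// Example invocation (binary name and paths illustrative):
//   iotune --evaluation-directory /var/lib/data --properties-file io_properties.yaml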
return app.run(ac, av, [&] {
return seastar::async([&] {
auto& configuration = app.configuration();
auto eval_dirs = configuration["evaluation-directory"].as<std::vector<sstring>>();
auto format = configuration["format"].as<sstring>();
auto duration = std::chrono::duration<double>(configuration["duration"].as<unsigned>() * 1s);
auto accuracy = configuration["accuracy"].as<unsigned>();
auto saturation = configuration["saturation"].as<sstring>();
auto random_io_buffer_size = configuration["random-io-buffer-size"].as<unsigned>();
bool read_saturation, write_saturation;
if (saturation == "") {
read_saturation = false;
write_saturation = false;
} else if (saturation == "both") {
read_saturation = true;
write_saturation = true;
} else if (saturation == "read") {
read_saturation = true;
write_saturation = false;
} else if (saturation == "write") {
read_saturation = false;
write_saturation = true;
} else {
fmt::print("Bad --saturation value\n");
return 1;
}
std::vector<disk_descriptor> disk_descriptors;
std::unordered_map<sstring, sstring> mountpoint_map;
// We want to evaluate once per mountpoint, but we still want to write in one of the
// directories that we were provided - we may not have permissions to write into the
// mountpoint itself. If we are passed more than one directory per mountpoint, we don't
// really care which one we write to, so this simple hash map will do.
for (auto& eval_dir : eval_dirs) {
mountpoint_map[mountpoint_of(eval_dir).string()] = eval_dir;
}
for (auto eval: mountpoint_map) {
auto mountpoint = eval.first;
auto eval_dir = eval.second;
if (!filesystem_has_good_aio_support(eval_dir, false)) {
iotune_logger.error("Exception when qualifying filesystem at {}", eval_dir);
return 1;
}
auto rec = 10000000000ULL;
auto avail = fs_avail(eval_dir).get();
if (avail < rec) {
uint64_t val;
const char* units;
if (avail >= 1000000000) {
val = (avail + 500000000) / 1000000000;
units = "GB";
} else if (avail >= 1000000) {
val = (avail + 500000) / 1000000;
units = "MB";
} else {
val = avail;
units = "bytes";
}
iotune_logger.warn("Available space on filesystem at {}: {} {}: is less than recommended: {} GB",
eval_dir, val, units, rec / 1000000000ULL);
}
iotune_logger.info("{} passed sanity checks", eval_dir);
if (fs_check) {
continue;
}
// Directory is the same object for all tests.
::evaluation_directory test_directory(eval_dir);
test_directory.discover_directory().get();
iotune_logger.info("Disk parameters: max_iodepth={} disks_per_array={} minimum_io_size={}",
test_directory.max_iodepth(), test_directory.disks_per_array(), test_directory.minimum_io_size());
if (test_directory.max_iodepth() < smp::count) {
iotune_logger.warn("smp::count={} is greater than max_iodepth={} - shards above max_io_depth "
"will be ignored during random read and random write measurements",
smp::count, test_directory.max_iodepth());
}
if (random_io_buffer_size != 0u) {
iotune_logger.info("Forcing buffer_size={} for random IO!", random_io_buffer_size);
}
::iotune_multi_shard_context iotune_tests(test_directory, random_io_buffer_size);
iotune_tests.start().get();
auto stop = defer([&iotune_tests] () noexcept {
try {
iotune_tests.stop().get();
} catch (...) {
fmt::print("Error occurred during iotune context shutdown: {}", std::current_exception());
abort();
}
});
row_stats rates;
auto accuracy_msg = [accuracy, &rates] {
auto stdev = rates.stdev_percents() * 100.0;
return (accuracy == 0 || stdev > accuracy) ? fmt::format(" (deviation {}%)", int(round(stdev))) : std::string("");
};
iotune_tests.create_data_file().get();
fmt::print("Starting Evaluation. This may take a while...\n");
fmt::print("Measuring sequential write bandwidth: ");
std::cout.flush();
io_rates write_bw;
size_t sequential_buffer_size = 1 << 20;
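// Time budget: sequential write gets 70% of the requested duration, split
// evenly across shards; sequential read and each random-IO phase get 10%.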
for (unsigned shard = 0; shard < smp::count; ++shard) {
write_bw += iotune_tests.write_sequential_data(shard, sequential_buffer_size, duration * 0.70 / smp::count).get();
}
write_bw.bytes_per_sec /= smp::count;
rates = iotune_tests.get_serial_rates().get();
fmt::print("{} MB/s{}\n", uint64_t(write_bw.bytes_per_sec / (1024 * 1024)), accuracy_msg());
std::optional<uint64_t> write_sat;
if (write_saturation) {
fmt::print("Measuring write saturation length: ");
std::cout.flush();
write_sat = iotune_tests.saturate_write(write_bw.bytes_per_sec * (1.0 - rates.stdev_percents()), sequential_buffer_size/2, duration * 0.70).get();
fmt::print("{}\n", *write_sat);
}
fmt::print("Measuring sequential read bandwidth: ");
std::cout.flush();
auto read_bw = iotune_tests.read_sequential_data(0, sequential_buffer_size, duration * 0.1).get();
rates = iotune_tests.get_serial_rates().get();
fmt::print("{} MB/s{}\n", uint64_t(read_bw.bytes_per_sec / (1024 * 1024)), accuracy_msg());
std::optional<uint64_t> read_sat;
if (read_saturation) {
fmt::print("Measuring read saturation length: ");
std::cout.flush();
read_sat = iotune_tests.saturate_read(read_bw.bytes_per_sec * (1.0 - rates.stdev_percents()), sequential_buffer_size/2, duration * 0.1).get();
fmt::print("{}\n", *read_sat);
}
fmt::print("Measuring random write IOPS: ");
std::cout.flush();
auto write_iops = iotune_tests.write_random_data(test_directory.minimum_io_size(), duration * 0.1).get();
rates = iotune_tests.get_sharded_worst_rates().get();
fmt::print("{} IOPS{}\n", uint64_t(write_iops.iops), accuracy_msg());
fmt::print("Measuring random read IOPS: ");
std::cout.flush();
auto read_iops = iotune_tests.read_random_data(test_directory.minimum_io_size(), duration * 0.1).get();
rates = iotune_tests.get_sharded_worst_rates().get();
fmt::print("{} IOPS{}\n", uint64_t(read_iops.iops), accuracy_msg());
struct disk_descriptor desc;
desc.mountpoint = mountpoint;
desc.read_iops = read_iops.iops;
desc.read_bw = read_bw.bytes_per_sec;
desc.read_sat_len = read_sat;
desc.write_iops = write_iops.iops;
desc.write_bw = write_bw.bytes_per_sec;
desc.write_sat_len = write_sat;
disk_descriptors.push_back(std::move(desc));
}
if (fs_check) {
return 0;
}
auto file = "properties file";
try {
if (configuration.count("properties-file")) {
fmt::print("Writing result to {}\n", configuration["properties-file"].as<sstring>());
write_property_file(configuration["properties-file"].as<sstring>(), disk_descriptors);
}
file = "configuration file";
if (configuration.count("options-file")) {
fmt::print("Writing result to {}\n", configuration["options-file"].as<sstring>());
write_configuration_file(configuration["options-file"].as<sstring>(), format, configuration["properties-file"].as<sstring>());
}
} catch (...) {
iotune_logger.error("Exception when writing {}: {}.\nPlease add the above values manually to your seastar command line.", file, std::current_exception());
return 1;
}
return 0;
});
});
}