/*
 * This file is open source software, licensed to you under the terms
 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
 * distributed with this work for additional information regarding copyright
 * ownership. You may not use this file except in compliance with the License.
 *
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/*
 * Copyright (C) 2018 ScyllaDB
 *
 * The goal of this program is to allow a user to properly configure the Seastar I/O
 * scheduler.
 */
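/*
 * Example invocation (paths are illustrative):
 *
 *   iotune --evaluation-directory /var/lib/myapp --properties-file io_properties.yaml
 *
 * The resulting YAML file can then be fed back to a Seastar application via
 * --io-properties-file.
 */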
#include <iostream>
#include <chrono>
#include <random>
#include <memory>
#include <vector>
#include <cmath>
#include <numeric>        // std::accumulate, std::transform_reduce (used below)
#include <unordered_map>  // std::unordered_map (used in main)
#include <sys/vfs.h>
#include <sys/sysmacros.h>
#include <boost/range/irange.hpp>
#include <boost/program_options.hpp>
#include <boost/iterator/counting_iterator.hpp>
#include <fstream>
#include <wordexp.h>
#include <yaml-cpp/yaml.h>
#include <fmt/printf.h>
#include <seastar/core/seastar.hh>
#include <seastar/core/file.hh>
#include <seastar/core/thread.hh>
#include <seastar/core/sstring.hh>
#include <seastar/core/posix.hh>
#include <seastar/core/aligned_buffer.hh>
#include <seastar/core/sharded.hh>
#include <seastar/core/app-template.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/fsqual.hh>
#include <seastar/core/loop.hh>
#include <seastar/util/defer.hh>
#include <seastar/util/log.hh>
#include <seastar/util/std-compat.hh>
#include <seastar/util/read_first_line.hh>

using namespace seastar;
using namespace std::chrono_literals;
namespace fs = std::filesystem;

logger iotune_logger("iotune");

using iotune_clock = std::chrono::steady_clock;
static thread_local std::default_random_engine random_generator(std::chrono::duration_cast<std::chrono::nanoseconds>(iotune_clock::now().time_since_epoch()).count());

void check_device_properties(fs::path dev_sys_file) {
    auto sched_file = dev_sys_file / "queue" / "scheduler";
    auto sched_string = read_first_line(sched_file);
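    // The active scheduler is shown in brackets, e.g. "noop deadline [cfq]";
    // extract the bracketed name, or the whole line when no brackets are present.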
    auto beg = sched_string.find('[');
    size_t len = sched_string.size();
    if (beg == sstring::npos) {
        beg = 0;
    } else {
        auto end = sched_string.find(']');
        if (end != sstring::npos) {
            len = end - beg - 1;
        }
        beg++;
    }
    auto scheduler = sched_string.substr(beg, len);
    if ((scheduler != "noop") && (scheduler != "none")) {
        iotune_logger.warn("Scheduler for {} set to {}. It is recommended to set it to noop before evaluation so as not to skew the results.",
                sched_file.string(), scheduler);
    }

    auto nomerges_file = dev_sys_file / "queue" / "nomerges";
    auto nomerges = read_first_line_as<unsigned>(nomerges_file);
    if (nomerges != 2u) {
        iotune_logger.warn("nomerges for {} set to {}. It is recommended to set it to 2 before evaluation so that merges are disabled. Results can be skewed otherwise.",
                nomerges_file.string(), nomerges);
    }

    auto write_cache_file = dev_sys_file / "queue" / "write_cache";
    auto write_cache = read_first_line_as<std::string>(write_cache_file);
    if (write_cache == "write back") {
        iotune_logger.warn("write_cache for {} is set to write back. Some disks have poor implementations of this mode, so pay attention to the accuracy of the measurements.",
                write_cache_file.string());
    }
}

struct evaluation_directory {
    sstring _name;
    // Any requests beyond this iodepth would just sit blocked in the Linux
    // queue anyway.
    unsigned _max_iodepth = 0;
    uint64_t _available_space;
    uint64_t _min_data_transfer_size = 512;
    unsigned _disks_per_array = 0;

    void scan_device(unsigned dev_maj, unsigned dev_min) {
        scan_device(fmt::format("{}:{}", dev_maj, dev_min));
    }

    void scan_device(std::string dev_str) {
        scan_device(fs::path("/sys/dev/block") / dev_str);
    }
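
    // Walks sysfs from /sys/dev/block: any "slaves" entries (e.g. the members
    // of a dm/md array) are followed recursively so that only leaf devices are
    // inspected, and a partition is resolved to its parent device.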
    void scan_device(fs::path sys_file) {
        try {
            sys_file = fs::canonical(sys_file);
            bool is_leaf = true;
            if (fs::exists(sys_file / "slaves")) {
                for (auto& dev : fs::directory_iterator(sys_file / "slaves")) {
                    is_leaf = false;
                    scan_device(read_first_line(dev.path() / "dev"));
                }
            }

            // Our work is done if this is not a leaf: we only tune the leaves.
            if (!is_leaf) {
                return;
            }

            if (fs::exists(sys_file / "partition")) {
                scan_device(sys_file.remove_filename());
            } else {
                check_device_properties(sys_file);
                auto queue_dir = sys_file / "queue";
                auto disk_min_io_size = read_first_line_as<uint64_t>(queue_dir / "minimum_io_size");

                _min_data_transfer_size = std::max(_min_data_transfer_size, disk_min_io_size);
                _max_iodepth += read_first_line_as<uint64_t>(queue_dir / "nr_requests");
                _disks_per_array++;
            }
        } catch (std::system_error& se) {
            iotune_logger.error("Error while parsing sysfs. Will continue with guessed values: {}", se.what());
            _max_iodepth = 128;
        }
        _disks_per_array = std::max(_disks_per_array, 1u);
    }
public:
    evaluation_directory(sstring name)
        : _name(name)
        , _available_space(fs::space(fs::path(_name)).available)
    {}

    unsigned max_iodepth() const {
        return _max_iodepth;
    }

    fs::path path() const {
        return fs::path(_name);
    }

    const sstring& name() const {
        return _name;
    }

    unsigned disks_per_array() const {
        return _disks_per_array;
    }

    uint64_t minimum_io_size() const {
        return _min_data_transfer_size;
    }

    future<> discover_directory() {
        return seastar::async([this] {
            auto f = open_directory(_name).get();
            auto st = f.stat().get();
            f.close().get();

            scan_device(major(st.st_dev), minor(st.st_dev));
        });
    }

    uint64_t available_space() const {
        return _available_space;
    }
};

struct io_rates {
    float bytes_per_sec = 0;
    float iops = 0;
    io_rates operator+(const io_rates& a) const {
        return io_rates{bytes_per_sec + a.bytes_per_sec, iops + a.iops};
    }

    io_rates& operator+=(const io_rates& a) {
        bytes_per_sec += a.bytes_per_sec;
        iops += a.iops;
        return *this;
    }
};

struct row_stats {
    size_t points;
    double average;
    double stdev;

    float stdev_percents() const {
        return points > 0 ? stdev / average : 0.0;
    }
};
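
// Returns the sample count, mean, and population standard deviation of v.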
template <typename T>
static row_stats get_row_stats_for(const std::vector<T>& v) {
    if (v.size() == 0) {
        return row_stats{0, 0.0, 0.0};
    }

    double avg = std::accumulate(v.begin(), v.end(), 0.0) / v.size();
    double stdev = std::sqrt(std::transform_reduce(v.begin(), v.end(), 0.0,
            std::plus<double>(), [avg] (auto& val) -> double { return (val - avg) * (val - avg); }) / v.size());

    return row_stats{ v.size(), avg, stdev };
}

class invalid_position : public std::exception {
public:
    virtual const char* what() const noexcept {
        return "file access position invalid";
    }
};

struct position_generator {
    virtual uint64_t get_pos() = 0;
    virtual bool is_sequential() const = 0;
    virtual ~position_generator() {}
};

class sequential_issuer : public position_generator {
    size_t _buffer_size;
    uint64_t _position = 0;
    uint64_t _size_limit;
public:
    sequential_issuer(size_t buffer_size, uint64_t size_limit)
        : _buffer_size(buffer_size)
        , _size_limit(size_limit)
    {}

    virtual bool is_sequential() const {
        return true;
    }

    virtual uint64_t get_pos() {
        if (_position >= _size_limit) {
            // Wrap around if reaching EOF. The write bandwidth is lower,
            // and we also split the write bandwidth among shards, while we
            // read only from shard 0, so shard 0's file may not be large
            // enough to read from.
            _position = 0;
        }
        auto pos = _position;
        _position += _buffer_size;
        return pos;
    }
};

class random_issuer : public position_generator {
    size_t _buffer_size;
    uint64_t _last_position;
    std::uniform_int_distribution<uint64_t> _pos_distribution;
public:
    random_issuer(size_t buffer_size, uint64_t last_position)
        : _buffer_size(buffer_size)
        , _last_position(last_position)
        , _pos_distribution(0, (last_position / buffer_size) - 1)
    {}

    virtual bool is_sequential() const {
        return false;
    }

    virtual uint64_t get_pos() {
        uint64_t pos = _pos_distribution(random_generator) * _buffer_size;
        if (pos >= _last_position) {
            throw invalid_position();
        }
        return pos;
    }
};

class request_issuer {
public:
    virtual future<size_t> issue_request(uint64_t pos, char* buf, uint64_t size) = 0;
    virtual ~request_issuer() {}
};


class write_request_issuer : public request_issuer {
    file _file;
public:
    explicit write_request_issuer(file f) : _file(f) {}
    future<size_t> issue_request(uint64_t pos, char* buf, uint64_t size) override {
        return _file.dma_write(pos, buf, size);
    }
};

class read_request_issuer : public request_issuer {
    file _file;
public:
    explicit read_request_issuer(file f) : _file(f) {}
    future<size_t> issue_request(uint64_t pos, char* buf, uint64_t size) override {
        return _file.dma_read(pos, buf, size);
    }
};

class io_worker {
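    // Samples the running request counter once per second so that the
    // stability of the rate (its standard deviation) can be reported
    // alongside the average.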
    class requests_rate_meter {
        std::vector<unsigned>& _rates;
        const unsigned& _requests;
        unsigned _prev_requests = 0;
        timer<> _tick;

        static constexpr auto period = 1s;

    public:
        requests_rate_meter(std::chrono::duration<double> duration, std::vector<unsigned>& rates, const unsigned& requests)
            : _rates(rates)
            , _requests(requests)
            , _tick([this] {
                _rates.push_back(_requests - _prev_requests);
                _prev_requests = _requests;
            })
        {
            _rates.reserve(256); // ~2 minutes
            if (duration > 4 * period) {
                _tick.arm_periodic(period);
            }
        }

        ~requests_rate_meter() {
            if (_tick.armed()) {
                _tick.cancel();
            } else {
                _rates.push_back(_requests);
            }
        }
    };

    uint64_t _bytes = 0;
    uint64_t _max_offset = 0;
    unsigned _requests = 0;
    size_t _buffer_size;
    std::chrono::time_point<iotune_clock, std::chrono::duration<double>> _start_measuring;
    std::chrono::time_point<iotune_clock, std::chrono::duration<double>> _end_measuring;
    std::chrono::time_point<iotune_clock, std::chrono::duration<double>> _end_load;
    // track separately because in the sequential case we may exhaust the file before _duration
    std::chrono::time_point<iotune_clock, std::chrono::duration<double>> _last_time_seen;

    requests_rate_meter _rr_meter;
    std::unique_ptr<position_generator> _pos_impl;
    std::unique_ptr<request_issuer> _req_impl;
public:
    bool is_sequential() const {
        return _pos_impl->is_sequential();
    }

    bool should_stop() const {
        return iotune_clock::now() >= _end_load;
    }

    io_worker(size_t buffer_size, std::chrono::duration<double> duration, std::unique_ptr<request_issuer> reqs, std::unique_ptr<position_generator> pos, std::vector<unsigned>& rates)
        : _buffer_size(buffer_size)
        , _start_measuring(iotune_clock::now() + std::chrono::duration<double>(10ms))
        , _end_measuring(_start_measuring + duration)
        , _end_load(_end_measuring + 10ms)
        , _last_time_seen(_start_measuring)
        , _rr_meter(duration, rates, _requests)
        , _pos_impl(std::move(pos))
        , _req_impl(std::move(reqs))
    {}
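
    // Requests are issued for slightly longer than the measured interval: the
    // first 10ms act as warm-up and the last 10ms as cool-down, and only
    // completions inside [_start_measuring, _end_measuring) are counted (see
    // issue_request() below).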

    std::unique_ptr<char[], free_deleter> get_buffer() {
        return allocate_aligned_buffer<char>(_buffer_size, _buffer_size);
    }

    future<> issue_request(char* buf) {
        uint64_t pos = _pos_impl->get_pos();
        return _req_impl->issue_request(pos, buf, _buffer_size).then([this, pos] (size_t size) {
            auto now = iotune_clock::now();
            _max_offset = std::max(_max_offset, pos + size);
            if ((now > _start_measuring) && (now < _end_measuring)) {
                _last_time_seen = now;
                _bytes += size;
                _requests++;
            }
        });
    }

    uint64_t max_offset() const noexcept { return _max_offset; }

    io_rates get_io_rates() const {
        io_rates rates;
        auto t = _last_time_seen - _start_measuring;
        if (!t.count()) {
            throw std::runtime_error("No data collected");
        }
        rates.bytes_per_sec = _bytes / t.count();
        rates.iops = _requests / t.count();
        return rates;
    }
};

class test_file {
public:
    enum class pattern { sequential, random };
private:
    fs::path _dirpath;
    uint64_t _file_size;
    file _file;
    uint64_t _forced_random_io_buffer_size;

    std::unique_ptr<position_generator> get_position_generator(size_t buffer_size, pattern access_pattern) {
        if (access_pattern == pattern::sequential) {
            return std::make_unique<sequential_issuer>(buffer_size, _file_size);
        } else {
            return std::make_unique<random_issuer>(buffer_size, _file_size);
        }
    }

    uint64_t calculate_buffer_size(pattern access_pattern, uint64_t buffer_size, uint64_t operation_alignment) const {
        if (access_pattern == pattern::random && _forced_random_io_buffer_size != 0u) {
            return _forced_random_io_buffer_size;
        }

        return std::max(buffer_size, operation_alignment);
    }

public:
    test_file(const ::evaluation_directory& dir, uint64_t maximum_size, uint64_t random_io_buffer_size)
        : _dirpath(dir.path() / fs::path(fmt::format("ioqueue-discovery-{}", this_shard_id())))
        , _file_size(maximum_size)
        , _forced_random_io_buffer_size(random_io_buffer_size)
    {}

    future<> create_data_file() {
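        // The test file and its directory are removed as soon as the file is
        // opened: the open descriptor keeps the unlinked file usable for the
        // whole run, and nothing is left on disk when iotune exits.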
        // XFS likes access in many directories better.
        return make_directory(_dirpath.string()).then([this] {
            auto testfile = _dirpath / fs::path("testfile");
            file_open_options options;
            options.extent_allocation_size_hint = _file_size;
            return open_file_dma(testfile.string(), open_flags::rw | open_flags::create, std::move(options)).then([this, testfile] (file file) {
                _file = file;
                if (this_shard_id() == 0) {
                    iotune_logger.info("Filesystem parameters: read alignment {}, write alignment {}", _file.disk_read_dma_alignment(), _file.disk_write_dma_alignment());
                }
                return remove_file(testfile.string()).then([this] {
                    return remove_file(_dirpath.string());
                });
            }).then([this] {
                return _file.truncate(_file_size);
            });
        });
    }

    future<io_rates> do_workload(std::unique_ptr<io_worker> worker_ptr, unsigned max_os_concurrency, bool update_file_size = false) {
        if (update_file_size) {
            _file_size = 0;
        }

        auto worker = worker_ptr.get();
        auto concurrency = boost::irange<unsigned, unsigned>(0, max_os_concurrency, 1);
        return parallel_for_each(std::move(concurrency), [worker] (unsigned idx) {
            auto bufptr = worker->get_buffer();
            auto buf = bufptr.get();
            return do_until([worker] { return worker->should_stop(); }, [buf, worker] {
                return worker->issue_request(buf);
            }).finally([alive = std::move(bufptr)] {});
        }).then_wrapped([this, worker = std::move(worker_ptr), update_file_size] (future<> f) {
            try {
                f.get();
            } catch (invalid_position& ip) {
                // Expected for sequential access, e.g. a read that ran past
                // the end of the file.
                if (!worker->is_sequential()) {
                    throw;
                }
            }

            if (update_file_size) {
                _file_size = worker->max_offset();
            }
            return make_ready_future<io_rates>(worker->get_io_rates());
        });
    }

    future<io_rates> read_workload(size_t buffer_size, pattern access_pattern, unsigned max_os_concurrency, std::chrono::duration<double> duration, std::vector<unsigned>& rates) {
        buffer_size = calculate_buffer_size(access_pattern, buffer_size, _file.disk_read_dma_alignment());
        auto worker = std::make_unique<io_worker>(buffer_size, duration, std::make_unique<read_request_issuer>(_file), get_position_generator(buffer_size, access_pattern), rates);
        return do_workload(std::move(worker), max_os_concurrency);
    }

    future<io_rates> write_workload(size_t buffer_size, pattern access_pattern, unsigned max_os_concurrency, std::chrono::duration<double> duration, std::vector<unsigned>& rates) {
        buffer_size = calculate_buffer_size(access_pattern, buffer_size, _file.disk_write_dma_alignment());
        auto worker = std::make_unique<io_worker>(buffer_size, duration, std::make_unique<write_request_issuer>(_file), get_position_generator(buffer_size, access_pattern), rates);
        bool update_file_size = worker->is_sequential();
        return do_workload(std::move(worker), max_os_concurrency, update_file_size).then([this] (io_rates r) {
            return _file.flush().then([r = std::move(r)] () mutable {
                return make_ready_future<io_rates>(std::move(r));
            });
        });
    }

    future<> stop() {
        return _file ? _file.close() : make_ready_future<>();
    }
};

class iotune_multi_shard_context {
    ::evaluation_directory _test_directory;
    uint64_t _random_io_buffer_size;

    unsigned per_shard_io_depth() const {
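        // Split the array's total queue depth evenly across shards, handing
        // the remainder to the lower-numbered shards, and cap each shard at
        // 128 in-flight requests.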
        auto iodepth = _test_directory.max_iodepth() / smp::count;
        if (this_shard_id() < _test_directory.max_iodepth() % smp::count) {
            iodepth++;
        }
        return std::min(iodepth, 128u);
    }
    seastar::sharded<test_file> _iotune_test_file;

    std::vector<unsigned> serial_rates;
    seastar::sharded<std::vector<unsigned>> sharded_rates;

public:
    future<> stop() {
        return _iotune_test_file.stop().then([this] { return sharded_rates.stop(); });
    }

    future<> start() {
        const auto maximum_size = (_test_directory.available_space() / (2 * smp::count));
        return _iotune_test_file.start(_test_directory, maximum_size, _random_io_buffer_size).then([this] {
            return sharded_rates.start();
        });
    }

    future<row_stats> get_serial_rates() {
        row_stats ret = get_row_stats_for<unsigned>(serial_rates);
        serial_rates.clear();
        return make_ready_future<row_stats>(ret);
    }

    future<row_stats> get_sharded_worst_rates() {
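        // Reduce to the per-shard sample row with the highest standard
        // deviation, i.e. report the noisiest shard as the worst case.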
        return sharded_rates.map_reduce0([] (std::vector<unsigned>& rates) {
            row_stats ret = get_row_stats_for<unsigned>(rates);
            rates.clear();
            return ret;
        }, row_stats{0, 0.0, 0.0},
        [] (const row_stats& res, row_stats lres) {
            return res.stdev < lres.stdev ? lres : res;
        });
    }

    future<> create_data_file() {
        return _iotune_test_file.invoke_on_all([] (test_file& tf) {
            return tf.create_data_file();
        });
    }

    future<io_rates> write_sequential_data(unsigned shard, size_t buffer_size, std::chrono::duration<double> duration) {
        return _iotune_test_file.invoke_on(shard, [this, buffer_size, duration] (test_file& tf) {
            return tf.write_workload(buffer_size, test_file::pattern::sequential, 4 * _test_directory.disks_per_array(), duration, serial_rates);
        });
    }

    future<io_rates> read_sequential_data(unsigned shard, size_t buffer_size, std::chrono::duration<double> duration) {
        return _iotune_test_file.invoke_on(shard, [this, buffer_size, duration] (test_file& tf) {
            return tf.read_workload(buffer_size, test_file::pattern::sequential, 4 * _test_directory.disks_per_array(), duration, serial_rates);
        });
    }

    future<io_rates> write_random_data(size_t buffer_size, std::chrono::duration<double> duration) {
        return _iotune_test_file.map_reduce0([buffer_size, this, duration] (test_file& tf) {
            const auto shard_io_depth = per_shard_io_depth();
            if (shard_io_depth == 0) {
                return make_ready_future<io_rates>();
            } else {
                return tf.write_workload(buffer_size, test_file::pattern::random, shard_io_depth, duration, sharded_rates.local());
            }
        }, io_rates(), std::plus<io_rates>());
    }

    future<io_rates> read_random_data(size_t buffer_size, std::chrono::duration<double> duration) {
        return _iotune_test_file.map_reduce0([buffer_size, this, duration] (test_file& tf) {
            const auto shard_io_depth = per_shard_io_depth();
            if (shard_io_depth == 0) {
                return make_ready_future<io_rates>();
            } else {
                return tf.read_workload(buffer_size, test_file::pattern::random, shard_io_depth, duration, sharded_rates.local());
            }
        }, io_rates(), std::plus<io_rates>());
    }

private:
    template <typename Fn>
    future<uint64_t> saturate(float rate_threshold, size_t buffer_size, std::chrono::duration<double> duration, Fn&& workload) {
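        // Keep halving the buffer size of a single sequential fiber until the
        // measured bandwidth drops below rate_threshold; the previous (2x)
        // buffer size is then reported as the saturation length.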
        return _iotune_test_file.invoke_on(0, [this, rate_threshold, buffer_size, duration, workload] (test_file& tf) {
            return (tf.*workload)(buffer_size, test_file::pattern::sequential, 1, duration, serial_rates).then([this, rate_threshold, buffer_size, duration, workload] (io_rates rates) {
                serial_rates.clear();
                if (rates.bytes_per_sec < rate_threshold) {
                    // The throughput with the given buffer size is already
                    // "small enough", so return its previous value.
                    return make_ready_future<uint64_t>(buffer_size * 2);
                } else {
                    return saturate(rate_threshold, buffer_size / 2, duration, workload);
                }
            });
        });
    }

public:
    future<uint64_t> saturate_write(float rate_threshold, size_t buffer_size, std::chrono::duration<double> duration) {
        return saturate(rate_threshold, buffer_size, duration, &test_file::write_workload);
    }

    future<uint64_t> saturate_read(float rate_threshold, size_t buffer_size, std::chrono::duration<double> duration) {
        return saturate(rate_threshold, buffer_size, duration, &test_file::read_workload);
    }

    iotune_multi_shard_context(::evaluation_directory dir, uint64_t random_io_buffer_size)
        : _test_directory(dir)
        , _random_io_buffer_size(random_io_buffer_size)
    {}
};

struct disk_descriptor {
    std::string mountpoint;
    uint64_t read_iops;
    uint64_t read_bw;
    uint64_t write_iops;
    uint64_t write_bw;
    std::optional<uint64_t> read_sat_len;
    std::optional<uint64_t> write_sat_len;
};

void string_to_file(sstring conf_file, sstring buf) {
    auto f = file_desc::open(conf_file, O_WRONLY | O_CLOEXEC | O_CREAT | O_TRUNC, 0664);
    auto ret = f.write(buf.data(), buf.size());
    if (!ret || (*ret != buf.size())) {
        throw std::runtime_error(fmt::format("Can't write {}: {}", conf_file, *ret));
    }
}

void write_configuration_file(sstring conf_file, std::string format, sstring properties_file) {
    sstring buf;
    if (format == "seastar") {
        buf = fmt::format("io-properties-file={}\n", properties_file);
    } else {
        buf = fmt::format("SEASTAR_IO=\"--io-properties-file={}\"\n", properties_file);
    }
    string_to_file(conf_file, buf);
}

void write_property_file(sstring conf_file, std::vector<disk_descriptor> disk_descriptors) {
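    // Emits YAML roughly of the following shape (values are illustrative):
    //
    //   disks:
    //     - mountpoint: /var/lib/myapp
    //       read_iops: 90000
    //       read_bandwidth: 536870912
    //       write_iops: 80000
    //       write_bandwidth: 268435456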
    YAML::Emitter out;
    out << YAML::BeginMap;
    out << YAML::Key << "disks";
    out << YAML::BeginSeq;
    for (auto& desc : disk_descriptors) {
        out << YAML::BeginMap;
        out << YAML::Key << "mountpoint" << YAML::Value << desc.mountpoint;
        out << YAML::Key << "read_iops" << YAML::Value << desc.read_iops;
        out << YAML::Key << "read_bandwidth" << YAML::Value << desc.read_bw;
        out << YAML::Key << "write_iops" << YAML::Value << desc.write_iops;
        out << YAML::Key << "write_bandwidth" << YAML::Value << desc.write_bw;
        if (desc.read_sat_len) {
            out << YAML::Key << "read_saturation_length" << YAML::Value << *desc.read_sat_len;
        }
        if (desc.write_sat_len) {
            out << YAML::Key << "write_saturation_length" << YAML::Value << *desc.write_sat_len;
        }
        out << YAML::EndMap;
    }
    out << YAML::EndSeq;
    out << YAML::EndMap;
    out << YAML::Newline;

    string_to_file(conf_file, sstring(out.c_str(), out.size()));
}

// Returns the mountpoint of a path. It works by walking backwards from the canonical path
// (absolute, with symlinks resolved), until we find a point that crosses a device ID.
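// For example (hypothetical layout): with /home mounted on its own device,
// mountpoint_of("/home/user/data") walks up from /home/user/data and stops at
// /home, where the device ID first differs from that of the parent directory.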
fs::path mountpoint_of(sstring filename) {
    fs::path mnt_candidate = fs::canonical(fs::path(filename));
    std::optional<dev_t> candidate_id = {};
    auto current = mnt_candidate;
    do {
        auto f = open_directory(current.string()).get();
        auto st = f.stat().get();
        if ((candidate_id) && (*candidate_id != st.st_dev)) {
            return mnt_candidate;
        }
        mnt_candidate = current;
        candidate_id = st.st_dev;
        current = current.parent_path();
    } while (mnt_candidate != current);

    return mnt_candidate;
}

int main(int ac, char** av) {
    namespace bpo = boost::program_options;
    bool fs_check = false;

    app_template::config app_cfg;
    app_cfg.name = "IOTune";

    app_template app(std::move(app_cfg));
    auto opt_add = app.add_options();
    opt_add
        ("evaluation-directory", bpo::value<std::vector<sstring>>()->required(), "directory in which to execute the evaluation")
        ("properties-file", bpo::value<sstring>(), "path in which to write the YAML file")
        ("options-file", bpo::value<sstring>(), "path in which to write the legacy conf file")
        ("duration", bpo::value<unsigned>()->default_value(120), "time, in seconds, for which to run the test")
        ("format", bpo::value<sstring>()->default_value("seastar"), "configuration file format (seastar | envfile)")
        ("fs-check", bpo::bool_switch(&fs_check), "perform FS check only")
        ("accuracy", bpo::value<unsigned>()->default_value(3), "acceptable deviation of measurements (in percent)")
        ("saturation", bpo::value<sstring>()->default_value(""), "measure saturation lengths (read | write | both) (this is very slow!)")
        ("random-io-buffer-size", bpo::value<unsigned>()->default_value(0), "force buffer size for random writes and random reads")
    ;

    return app.run(ac, av, [&] {
        return seastar::async([&] {
            auto& configuration = app.configuration();
            auto eval_dirs = configuration["evaluation-directory"].as<std::vector<sstring>>();
            auto format = configuration["format"].as<sstring>();
            auto duration = std::chrono::duration<double>(configuration["duration"].as<unsigned>() * 1s);
            auto accuracy = configuration["accuracy"].as<unsigned>();
            auto saturation = configuration["saturation"].as<sstring>();
            auto random_io_buffer_size = configuration["random-io-buffer-size"].as<unsigned>();

            bool read_saturation, write_saturation;
            if (saturation == "") {
                read_saturation = false;
                write_saturation = false;
            } else if (saturation == "both") {
                read_saturation = true;
                write_saturation = true;
            } else if (saturation == "read") {
                read_saturation = true;
                write_saturation = false;
            } else if (saturation == "write") {
                read_saturation = false;
                write_saturation = true;
            } else {
                fmt::print("Bad --saturation value\n");
                return 1;
            }

            std::vector<disk_descriptor> disk_descriptors;
            std::unordered_map<sstring, sstring> mountpoint_map;
            // We want to evaluate once per mountpoint, but we still want to write in one of the
            // directories that we were provided - we may not have permissions to write into the
            // mountpoint itself. If we are passed more than one directory per mountpoint, we don't
            // really care which one we write to, so this simple hash map will do.
            for (auto& eval_dir : eval_dirs) {
                mountpoint_map[mountpoint_of(eval_dir).string()] = eval_dir;
            }
            for (auto eval: mountpoint_map) {
                auto mountpoint = eval.first;
                auto eval_dir = eval.second;

                if (!filesystem_has_good_aio_support(eval_dir, false)) {
                    iotune_logger.error("Exception when qualifying filesystem at {}", eval_dir);
                    return 1;
                }

                auto rec = 10000000000ULL;
                auto avail = fs_avail(eval_dir).get();
                if (avail < rec) {
                    uint64_t val;
                    const char* units;
                    if (avail >= 1000000000) {
                        val = (avail + 500000000) / 1000000000;
                        units = "GB";
                    } else if (avail >= 1000000) {
                        val = (avail + 500000) / 1000000;
                        units = "MB";
                    } else {
                        val = avail;
                        units = "bytes";
                    }
                    iotune_logger.warn("Available space on filesystem at {} ({} {}) is less than the recommended {} GB",
                            eval_dir, val, units, rec / 1000000000ULL);
                }

                iotune_logger.info("{} passed sanity checks", eval_dir);
                if (fs_check) {
                    continue;
                }

                // Directory is the same object for all tests.
                ::evaluation_directory test_directory(eval_dir);
                test_directory.discover_directory().get();
                iotune_logger.info("Disk parameters: max_iodepth={} disks_per_array={} minimum_io_size={}",
                        test_directory.max_iodepth(), test_directory.disks_per_array(), test_directory.minimum_io_size());

                if (test_directory.max_iodepth() < smp::count) {
                    iotune_logger.warn("smp::count={} is greater than max_iodepth={} - shards above max_iodepth "
                            "will be ignored during random read and random write measurements",
                            smp::count, test_directory.max_iodepth());
                }

                if (random_io_buffer_size != 0u) {
                    iotune_logger.info("Forcing buffer_size={} for random IO!", random_io_buffer_size);
                }

                ::iotune_multi_shard_context iotune_tests(test_directory, random_io_buffer_size);
                iotune_tests.start().get();
                auto stop = defer([&iotune_tests] () noexcept {
                    try {
                        iotune_tests.stop().get();
                    } catch (...) {
                        fmt::print("Error occurred during iotune context shutdown: {}", std::current_exception());
                        abort();
                    }
                });

                row_stats rates;
                auto accuracy_msg = [accuracy, &rates] {
                    auto stdev = rates.stdev_percents() * 100.0;
                    return (accuracy == 0 || stdev > accuracy) ? fmt::format(" (deviation {}%)", int(round(stdev))) : std::string("");
                };

                iotune_tests.create_data_file().get();

                fmt::print("Starting evaluation. This may take a while...\n");
                fmt::print("Measuring sequential write bandwidth: ");
                std::cout.flush();
                io_rates write_bw;
                size_t sequential_buffer_size = 1 << 20;
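                // Time budget: 70% of --duration goes to the sequential write
                // pass (split across shards) and 10% each to the sequential
                // read, random write, and random read passes.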
                for (unsigned shard = 0; shard < smp::count; ++shard) {
                    write_bw += iotune_tests.write_sequential_data(shard, sequential_buffer_size, duration * 0.70 / smp::count).get();
                }
                write_bw.bytes_per_sec /= smp::count;
                rates = iotune_tests.get_serial_rates().get();
                fmt::print("{} MB/s{}\n", uint64_t(write_bw.bytes_per_sec / (1024 * 1024)), accuracy_msg());

                std::optional<uint64_t> write_sat;

                if (write_saturation) {
                    fmt::print("Measuring write saturation length: ");
                    std::cout.flush();
                    write_sat = iotune_tests.saturate_write(write_bw.bytes_per_sec * (1.0 - rates.stdev_percents()), sequential_buffer_size / 2, duration * 0.70).get();
                    fmt::print("{}\n", *write_sat);
                }

                fmt::print("Measuring sequential read bandwidth: ");
                std::cout.flush();
                auto read_bw = iotune_tests.read_sequential_data(0, sequential_buffer_size, duration * 0.1).get();
                rates = iotune_tests.get_serial_rates().get();
                fmt::print("{} MB/s{}\n", uint64_t(read_bw.bytes_per_sec / (1024 * 1024)), accuracy_msg());

                std::optional<uint64_t> read_sat;

                if (read_saturation) {
                    fmt::print("Measuring read saturation length: ");
                    std::cout.flush();
                    read_sat = iotune_tests.saturate_read(read_bw.bytes_per_sec * (1.0 - rates.stdev_percents()), sequential_buffer_size / 2, duration * 0.1).get();
                    fmt::print("{}\n", *read_sat);
                }

                fmt::print("Measuring random write IOPS: ");
                std::cout.flush();
                auto write_iops = iotune_tests.write_random_data(test_directory.minimum_io_size(), duration * 0.1).get();
                rates = iotune_tests.get_sharded_worst_rates().get();
                fmt::print("{} IOPS{}\n", uint64_t(write_iops.iops), accuracy_msg());

                fmt::print("Measuring random read IOPS: ");
                std::cout.flush();
                auto read_iops = iotune_tests.read_random_data(test_directory.minimum_io_size(), duration * 0.1).get();
                rates = iotune_tests.get_sharded_worst_rates().get();
                fmt::print("{} IOPS{}\n", uint64_t(read_iops.iops), accuracy_msg());

                struct disk_descriptor desc;
                desc.mountpoint = mountpoint;
                desc.read_iops = read_iops.iops;
                desc.read_bw = read_bw.bytes_per_sec;
                desc.read_sat_len = read_sat;
                desc.write_iops = write_iops.iops;
                desc.write_bw = write_bw.bytes_per_sec;
                desc.write_sat_len = write_sat;
                disk_descriptors.push_back(std::move(desc));
            }

            if (fs_check) {
                return 0;
            }

            auto file = "properties file";
            try {
                if (configuration.count("properties-file")) {
                    fmt::print("Writing result to {}\n", configuration["properties-file"].as<sstring>());
                    write_property_file(configuration["properties-file"].as<sstring>(), disk_descriptors);
                }

                file = "configuration file";
                if (configuration.count("options-file")) {
                    fmt::print("Writing result to {}\n", configuration["options-file"].as<sstring>());
                    write_configuration_file(configuration["options-file"].as<sstring>(), format, configuration["properties-file"].as<sstring>());
                }
            } catch (...) {
                iotune_logger.error("Exception when writing {}: {}.\nPlease add the above values manually to your seastar command line.", file, std::current_exception());
                return 1;
            }
            return 0;
        });
    });
}