#pragma once

#include <pybind11/pybind11.h>
#include <torch/csrc/jit/script/module.h>
#include <torch/csrc/utils/pybind.h>

#include <memory>
#include <string>
#include <vector>

namespace py = pybind11;

namespace torch {
namespace throughput_benchmark {

/**
 * This struct is used to provide results of a benchmark to the caller.
 * In the future all additional statistics should be added here.
 */
struct BenchmarkExecutionStats {
  float latency_avg_ms{-1};
  int64_t num_iters{-1};
};

/**
 * Use this struct in order to configure a throughput benchmark run.
 * This struct should include parameters related to threading, batching, number
 * of iterations, warm-up, etc. More configs can be added as needed.
 * The general rule here is that only things that C++ must(!) be aware of
 * should be here. If we can keep other parts in Python, we should keep them
 * there. This is typical for things that are not perf critical and don't
 * affect the execution statistics the benchmark returns.
 */
struct BenchmarkConfig {
 public:
  // Calling threads are those threads that are calling into a module in
  // parallel.
  int num_calling_threads{1};
  // Worker threads are not supported yet. This is just a placeholder: we plan
  // to support some sort of multi-threaded forward calls. We may change this
  // setting in the future to support different intra- and inter-op
  // parallelism, which is not available in PyTorch yet.
  int num_worker_threads{1};
  // Warmup iters are used to make sure we run a module a few times before
  // actually measuring things. This way we avoid cold caches and any other
  // similar problems.
  int num_warmup_iters{1};
  // Number of iterations the benchmark should run with. This number is
  // separate from the warmup iterations.
  int64_t num_iters{100};
};
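// Example of configuring a run (a minimal usage sketch, not part of the API
// surface; `bench` is a hypothetical ThroughputBenchmark instance that has
// already been fed example inputs via addInput()):
//
//   BenchmarkConfig config;
//   config.num_calling_threads = 4;  // emulate four concurrent clients
//   config.num_warmup_iters = 10;    // iterations discarded before measuring
//   config.num_iters = 1000;         // measured iterations
//   BenchmarkExecutionStats stats = bench.benchmark(config);
//   // stats.latency_avg_ms holds the average per-call latency in milliseconds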
namespace detail {

/**
 * A helper class to abstract out the different models we test the
 * throughput of.
 */
template <class Input, class Output, class Model>
class BenchmarkHelper {
 public:
  BenchmarkHelper() : initialized_{false} {}
  explicit BenchmarkHelper(Model model) : model_(model), initialized_(true) {}

  // This method is to be used in the benchmark() method.
  // Note that there is no result. This way we don't have to call this under
  // the GIL even when running in the nn.Module mode. Otherwise the destructor
  // of the result would race with Python.
  void runOnce(Input&&) const;
  // This method is to be used when calling from Python directly.
  Output runOnce(py::args&&, py::kwargs&&) const;
  // Aggregate inputs in the format the Model expects in order to avoid
  // further conversions at benchmark time.
  void addInput(py::args&&, py::kwargs&&);
  BenchmarkExecutionStats benchmark(const BenchmarkConfig& config) const;

  bool initialized() const { return initialized_; }

  // The destructor doesn't require the GIL because it is going to be executed
  // on the Python thread.
  std::vector<Input> inputs_;
  Model model_;
  bool initialized_{false};
};

struct C10_HIDDEN ModuleInput {
  ModuleInput(ModuleInput&& other) = default;
  ModuleInput(const ModuleInput&) = delete;
  ModuleInput& operator=(ModuleInput& other) = delete;
  ModuleInput& operator=(ModuleInput&& other) = delete;

  ModuleInput(py::args&& args, py::kwargs&& kwargs)
      : args(std::move(args)), kwargs(std::move(kwargs)) {}

  py::args args;
  py::kwargs kwargs;
};

typedef py::object ModuleOutput;
typedef std::vector<at::IValue> ScriptModuleInput;
typedef at::IValue ScriptModuleOutput;

template <class Input>
Input cloneInput(const Input& input);

typedef BenchmarkHelper<ScriptModuleInput, at::IValue, jit::script::Module>
    ScriptModuleBenchmark;
typedef BenchmarkHelper<ModuleInput, py::object, py::object> ModuleBenchmark;

template <>
void ScriptModuleBenchmark::runOnce(ScriptModuleInput&& input) const;
template <>
ScriptModuleOutput ScriptModuleBenchmark::runOnce(
    py::args&& args,
    py::kwargs&& kwargs) const;

template <>
void ModuleBenchmark::runOnce(ModuleInput&& input) const;
template <>
ModuleOutput ModuleBenchmark::runOnce(py::args&& args, py::kwargs&& kwargs)
    const;

template <>
void ScriptModuleBenchmark::addInput(py::args&& args, py::kwargs&& kwargs);
template <>
void ModuleBenchmark::addInput(py::args&& args, py::kwargs&& kwargs);

} // namespace detail

/**
 * This class is a small C++ component responsible for executing a PyTorch
 * module under an inference-server-like load. It can emulate multiple calling
 * threads for a single provided module. In the future we plan to enhance this
 * component to support inter- and intra-op parallelism as well as multiple
 * models running in a single process.
 *
 * For the currently available configurations refer to the BenchmarkConfig
 * documentation.
 *
 * The class supports working with either an nn.Module or a ScriptModule.
 * Under the hood it just dispatches to the corresponding specialization of
 * class BenchmarkHelper<Input, Output, Model>.
 */
class C10_HIDDEN ThroughputBenchmark {
 public:
  explicit ThroughputBenchmark(jit::script::Module module);
  explicit ThroughputBenchmark(py::object module);

  // Add one more input example. This input example should be in the exact
  // format the module under test expects. It is the responsibility of the
  // module to perform any such format checks; the benchmark doesn't perform
  // any validation of its own.
  void addInput(py::args args, py::kwargs kwargs);

  // Equivalent to just running the model directly on the given input.
  py::object runOnce(py::args&& args, py::kwargs&& kwargs);

  // The main method of the class: it performs a multi-threaded benchmark.
  // It returns a BenchmarkExecutionStats object with a lot of useful
  // statistics about runtime execution. We can enhance this class in the
  // future to provide more information to the user.
  BenchmarkExecutionStats benchmark(const BenchmarkConfig& config) const;

 private:
  detail::ScriptModuleBenchmark script_module_;
  detail::ModuleBenchmark module_;
};

} // namespace throughput_benchmark
} // namespace torch

#include <torch/csrc/utils/throughput_benchmark-inl.h>
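// End-to-end example from C++ (a hedged sketch, not a definitive recipe:
// `module` stands for a jit::script::Module loaded elsewhere, and
// `args`/`kwargs` for pybind11 objects that must be constructed while
// holding the GIL):
//
//   ThroughputBenchmark bench(module);
//   bench.addInput(std::move(args), std::move(kwargs)); // one input example
//   BenchmarkConfig config;
//   config.num_calling_threads = 2; // two emulated concurrent callers
//   BenchmarkExecutionStats stats = bench.benchmark(config);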