#pragma once

// TODO Also register c10 operators on mobile
#if !defined(CAFFE2_IS_XPLAT_BUILD)
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/ivalue.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/C++17.h>
#include <c10/util/Metaprogramming.h>
#include "caffe2/core/operator.h"
#include "caffe2/core/export_caffe2_op_to_c10.h"

namespace caffe2 {

/**
 * To make a c10 operator "C10Add" callable from caffe2 as "C2MyAddOp",
 * export its kernels with the macros defined below.
 *
 * To export the CPU kernel:
 *   C10_EXPORT_C10_OP_TO_CAFFE2_CPU("C10Add", C2MyAddOp)
 *
 * To export the CUDA kernel:
 *   C10_EXPORT_C10_OP_TO_CAFFE2_CUDA("C10Add", C2MyAddOp)
 */
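
// A fuller sketch, using a made-up operator name for illustration (call
// sites pass the schema name the c10 operator was registered under, e.g.
// something like "_c10_experimental::C10Add"):
//
//   C10_EXPORT_C10_OP_TO_CAFFE2_CPU("_c10_experimental::C10Add", C2MyAddOp)
//   C10_EXPORT_C10_OP_TO_CAFFE2_CUDA("_c10_experimental::C10Add", C2MyAddOp)
//   C10_EXPORT_C10_OP_TO_CAFFE2_HIP("_c10_experimental::C10Add", C2MyAddOp)
//
// Afterwards, "C2MyAddOp" can be used in caffe2 nets like any other caffe2
// operator.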

namespace detail {
template <class Context>
class C10OperatorWrapper final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
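
  // Wraps the c10 operator `op` so it can run as a caffe2 operator. The
  // asserts check that the c10 schema is compatible with the caffe2 operator
  // definition: a preallocated-outputs argument (if present) must be an
  // optional TensorList, the caffe2 output count must equal the number of
  // schema returns, and the caffe2 inputs must fit into the schema arguments.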
  C10OperatorWrapper(
      const c10::OperatorHandle& op,
      const OperatorDef& operator_def,
      Workspace* ws)
      : Operator<Context>(operator_def, ws),
        op_(op),
        has_preallocated_outputs_(
            op_.schema().arguments().size() != 0 &&
            op_.schema().arguments().back().name() ==
                detail::PREALLOCATED_OUTPUT_ARGNAME) {
    AT_ASSERT(
        !has_preallocated_outputs_ ||
        op_.schema().arguments().back().type()->isSubtypeOf(
            OptionalType::create(ListType::ofTensors())));

    AT_ASSERT(operator_def.output_size() == op_.schema().returns().size());
    AT_ASSERT(
        operator_def.input_size() + (has_preallocated_outputs_ ? 1 : 0) <=
        op_.schema().arguments().size()); // '<=' because there might be
                                          // caffe2 nontensor arguments
  }

  bool RunOnDevice() override {
    // Because stack_ is cached as a member, concurrent calls are not allowed.
    // TODO Making stack_ thread_local might fix this.
    std::lock_guard<std::mutex> lock(mutex_);

    pushInputs_();
    callKernel_();
    popOutputs_();

    return true;
  }

 private:
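  // Converts the caffe2 operator inputs and arguments into the IValue stack
  // expected by the c10 kernel, walking the schema arguments in order:
  // tensors come from the caffe2 inputs, everything else from the caffe2
  // arguments.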
  void pushInputs_() {
    AT_ASSERT(stack_.size() == 0);
    stack_.reserve(
        op_.schema().arguments().size() + (has_preallocated_outputs_ ? 1 : 0));

    size_t input_tensor_index = 0;

    for (const auto& argument : op_.schema().arguments()) {
      if (argument.name() == detail::PREALLOCATED_OUTPUT_ARGNAME) {
        // note: if detail::PREALLOCATED_OUTPUT_ARGNAME were at the end of the
        // argument list, then has_preallocated_outputs_ would be true.
        AT_ASSERTM(
            has_preallocated_outputs_,
            "Error in caffe2->c10 wrapper: Operator schema has a parameter named ",
            detail::PREALLOCATED_OUTPUT_ARGNAME,
            ", but it's not at the end of the argument list");

        AT_ASSERTM(
            argument.type()->isSubtypeOf(
                OptionalType::create(ListType::ofTensors())),
            "Error in caffe2->c10 wrapper: Operator schema has a parameter named ",
            detail::PREALLOCATED_OUTPUT_ARGNAME,
            ", but it's not of type optional TensorList");
        stack_.emplace_back(preallocated_outputs_());

      } else if (argument.type()->isSubtypeOf(TensorType::get())) {
        AT_ASSERTM(
            input_tensor_index < InputSize(),
            "Error in caffe2->c10 wrapper: Too few tensor arguments given (",
            InputSize(),
            "), operator schema expected more.");
        stack_.emplace_back(at::Tensor(Input(input_tensor_index++)));
      } else if (argument.type()->isSubtypeOf(OptionalType::ofTensor())) {
        if (input_tensor_index < InputSize()) {
          stack_.emplace_back(at::Tensor(Input(input_tensor_index++)));
        } else {
          stack_.emplace_back(IValue());
        }
      } else if (argument.type()->isSubtypeOf(ListType::ofTensors())) {
        AT_ASSERTM(
            input_tensor_index == 0,
            "Error in caffe2->c10 wrapper: Schema can have either one or more Tensor inputs or one TensorList input, but not both.");
        stack_.emplace_back(array_inputs_());
        input_tensor_index = InputSize();

      } else {
        stack_.emplace_back(get_nontensor_argument_(argument));
      }
    }
    AT_ASSERTM(
        input_tensor_index == InputSize(),
        "Error in caffe2->c10 wrapper: Number of caffe2 operator inputs (",
        InputSize(),
        ") doesn't match number of tensor arguments (",
        input_tensor_index,
        ") in the c10 operator schema.");
  }
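
  // Invokes the c10 kernel through the dispatcher's boxed calling convention;
  // stack_ holds the arguments on entry and the returns on exit.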
  void callKernel_() {
    AT_ASSERT(stack_.size() == op_.schema().arguments().size());
    c10::Dispatcher::singleton().callBoxed(op_, &stack_);
  }
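
  // Moves the returned tensors from stack_ into the caffe2 output slots.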
  void popOutputs_() {
    AT_ASSERT(stack_.size() == op_.schema().returns().size());
    for (size_t i = 0; i < op_.schema().returns().size(); ++i) {
      OperatorBase::SetOutputTensor(i, Tensor(std::move(stack_[i]).toTensor()));
    }
    stack_.clear();
  }
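
  // Collects all caffe2 inputs into one TensorList, for schemas that take a
  // single TensorList argument instead of individual tensors.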
  c10::List<at::Tensor> array_inputs_() {
    c10::List<at::Tensor> result;
    result.reserve(InputSize());
    for (size_t i = 0; i < InputSize(); ++i) {
      result.emplace_back(Input(i));
    }
    return result;
  }
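
  // Collects the (possibly undefined) caffe2 output tensors into a list so
  // the c10 kernel can write into preallocated storage.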
  c10::List<at::Tensor> preallocated_outputs_() {
    c10::List<at::Tensor> result;
    result.reserve(OutputSize());
    for (size_t i = 0; i < OutputSize(); ++i) {
      result.emplace_back(OperatorBase::OutputTensorOrUndefined(i));
    }
    return result;
  }
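
  // Reads a non-tensor schema argument from the caffe2 operator arguments,
  // dispatching on the argument's type.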
  IValue get_nontensor_argument_(const c10::Argument& argument) {
    if (argument.type()->isSubtypeOf(IntType::get())) {
      return get_nontensor_argument_<int>(
          argument.name(), argument.default_value());
    } else if (argument.type()->isSubtypeOf(FloatType::get())) {
      return get_nontensor_argument_<double>(
          argument.name(), argument.default_value());
    } else if (argument.type()->isSubtypeOf(BoolType::get())) {
      return get_nontensor_argument_<bool>(
          argument.name(), argument.default_value());
    } else {
      // TODO Support more types
      AT_ERROR(
          "Error in caffe2->c10 wrapper: Unsupported argument type ",
          argument.type()->str(),
          " in c10 operator schema");
    }
  }
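
  // Reads one argument of type T, falling back to the schema's default value
  // if the caffe2 operator doesn't specify it. In the no-default branch, the
  // dummy default `0` is never used, because the TORCH_CHECK already
  // guarantees the argument is present.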
  template <class T>
  IValue get_nontensor_argument_(
      const std::string& name,
      const c10::optional<IValue>& default_value) {
    if (default_value.has_value()) {
      return this->template GetSingleArgument<T>(name, default_value->to<T>());
    } else {
      TORCH_CHECK(
          this->template HasSingleArgumentOfType<T>(name),
          "Error in caffe2->c10 wrapper: Argument '",
          name,
          "' is missing or has the wrong type.");
      return this->template GetSingleArgument<T>(name, 0);
    }
  }

  c10::OperatorHandle op_;

  // has_preallocated_outputs_ is true iff the operator schema has a last
  // argument that is a TensorList whose name equals
  // detail::PREALLOCATED_OUTPUT_ARGNAME. This argument is then used to pass
  // in preallocated output tensors to the caffe2 operator.
  bool has_preallocated_outputs_;

  // stack_ is stored as a member to avoid re-allocating a stack for each
  // call. Between kernel calls, stack_.size() == 0, but its capacity should
  // not need to grow again after the first call.
  std::vector<IValue> stack_;
  std::mutex mutex_;
};
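
// Returns a factory that the caffe2 operator registries below use to create
// the wrapper. The c10 operator is looked up by name when a caffe2 operator
// instance is created, not when the factory itself is registered.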
template <class Context>
inline std::function<
    std::unique_ptr<OperatorBase>(const OperatorDef&, Workspace*)>
createC10OperatorWrapper(const c10::OperatorName& op_name) {
  return [op_name](const OperatorDef& op_def, Workspace* ws) {
    auto op_handle = c10::Dispatcher::singleton().findSchema(op_name);
    AT_ASSERTM(
        op_handle.has_value(),
        "Tried to register c10 operator ",
        op_name.name,
        ".",
        op_name.overload_name,
        " with caffe2, but didn't find the c10 operator.");
    return c10::guts::make_unique<C10OperatorWrapper<Context>>(
        *op_handle, op_def, ws);
  };
}

} // namespace detail
} // namespace caffe2

#define C10_EXPORT_C10_OP_TO_CAFFE2_CPU(OperatorName, Name)     \
  REGISTER_CPU_OPERATOR_CREATOR(                                \
      Name,                                                     \
      ::caffe2::detail::createC10OperatorWrapper<CPUContext>(   \
          {OperatorName, ""}))
#define C10_EXPORT_C10_OP_TO_CAFFE2_CUDA(OperatorName, Name)    \
  REGISTER_CUDA_OPERATOR_CREATOR(                               \
      Name,                                                     \
      ::caffe2::detail::createC10OperatorWrapper<CUDAContext>(  \
          {OperatorName, ""}))
#define C10_EXPORT_C10_OP_TO_CAFFE2_HIP(OperatorName, Name)     \
  REGISTER_HIP_OPERATOR_CREATOR(                                \
      Name,                                                     \
      ::caffe2::detail::createC10OperatorWrapper<HIPContext>(   \
          {OperatorName, ""}))
#else
// Dummy macros for xplat builds (see CAFFE2_IS_XPLAT_BUILD above), where the
// c10 dispatcher is not available and the export is a no-op.
#define C10_EXPORT_C10_OP_TO_CAFFE2_CPU(OperatorName, Name)
#define C10_EXPORT_C10_OP_TO_CAFFE2_CUDA(OperatorName, Name)
#define C10_EXPORT_C10_OP_TO_CAFFE2_HIP(OperatorName, Name)
#endif