~zhangmeng/libgowrapper/reid.git

#pragma once
 
#if !defined(CAFFE2_IS_XPLAT_BUILD)
#include <ATen/core/function_schema.h>
#include <ATen/core/grad_mode.h>
#include <ATen/core/op_registration/op_registration.h>
#include <c10/macros/Macros.h>
#include <torch/csrc/jit/script/function_schema_parser.h>
#include <vector>
 
namespace caffe2 {
namespace detail {
 
// Name of the extra trailing argument that make_function_schema_for_c10
// appends to every exported schema; callers use it to pass an optional list
// of preallocated output tensors.
constexpr const char* PREALLOCATED_OUTPUT_ARGNAME{
    "_caffe2_preallocated_outputs"};
 
using _CallCaffe2OpFunc = c10::List<at::Tensor>(
    const c10::FunctionSchema& schema,
    std::vector<c10::IValue>&& inputs,
    c10::List<at::Tensor>&& outputs);
 
template <class Caffe2Operator>
inline c10::List<at::Tensor> _call_caffe2_op(
    const c10::FunctionSchema& schema,
    std::vector<c10::IValue>&& inputs,
    c10::List<at::Tensor>&& outputs) {
  Caffe2Operator op(schema, std::move(inputs), std::move(outputs));
  op.Run();
  return std::move(op).move_newstyle_outputs();
}
 
// This function is inline in the hope that compilers optimizing for speed will
// inline it into call_caffe2_op_from_c10, allowing call_op to be inlined and
// avoiding the function pointer indirection, while compilers optimizing for
// binary size will keep it a separate function instead of inlining it into
// a template and will reuse the binary code of this function between ops.
// We measured and confirmed that binary size off the instagram ios app is
// reduced when having _call_caffe2_op_from_c10 separate from the templated
// call_caffe2_op_from_c10.
inline void _call_caffe2_op_from_c10(
    c10::Stack* stack,
    const c10::FunctionSchema& schema,
    _CallCaffe2OpFunc* call_op) {
  // precondition: on the stack, there's one IValue for each argument of the
  // c10 schema. The last argument is an optional tensor list that
  // (if not ivalue::None) contains a preallocated output tensor for each
  // operator output.
 
  // As an invariant, we don't want any autograd gradients to be tracked in
  // Caffe2 operators.
  at::NoGradGuard guard;
 
  AT_ASSERT(
      schema.arguments().size() != 0 &&
      schema.arguments().back().type()->isSubtypeOf(
          OptionalType::create(ListType::ofTensors())));
  IValue preallocated_outputs = torch::jit::pop(*stack);
 
  const size_t num_outputs = schema.returns().size();
  const size_t num_inputs = schema.arguments().size() -
      1; // -1 because the last argument is the list of preallocated tensors
 
  c10::List<at::Tensor> outputs;
  if (preallocated_outputs.isNone()) {
    // either the schema doesn't support preallocated outputs or it does but
    // they haven't been passed in. Pass a list of uninitialized tensors to
    // the caffe2 operator as preallocated outputs.
    outputs.resize(num_outputs);
  } else {
    AT_ASSERT(preallocated_outputs.isTensorList());
    outputs = std::move(preallocated_outputs).toTensorList();
  }
 
  // TODO Avoid vector allocation. One idea would be to keep the std::vector
  // instances in the cache.
  std::vector<IValue> inputs = torch::jit::pop(*stack, num_inputs);
 
  outputs = (*call_op)(schema, std::move(inputs), std::move(outputs));
 
  for (size_t i = 0; i < outputs.size(); ++i) {
    torch::jit::push(*stack, outputs.extract(i));
  }
 
  // postcondition: All inputs are cleared from the stack, there's now one
  //                IValue for each output which holds the result. This
  //                might reuse one of the preallocated tensors but doesn't have to.
}
 
template <const c10::FunctionSchema& (*Schema)(), class Caffe2Operator>
void call_caffe2_op_from_c10(
    c10::OperatorKernel* functor,
    c10::Stack* stack) {
  _call_caffe2_op_from_c10(stack, Schema(), &_call_caffe2_op<Caffe2Operator>);
}
 
inline FunctionSchema make_function_schema_for_c10(const char* schema_str) {
#if defined(CAFFE2_IS_XPLAT_BUILD)
  throw std::logic_error("We don't support registering c10 ops on mobile yet because the function schema parser isn't present in the mobile build.");
#else
  c10::FunctionSchema parsed_schema = torch::jit::parseSchema(schema_str);
  std::vector<c10::Argument> arguments = parsed_schema.arguments();
  arguments.emplace_back(
      PREALLOCATED_OUTPUT_ARGNAME,
      c10::OptionalType::create(c10::ListType::ofTensors()),
      nullopt,
      IValue());
 
  return FunctionSchema(
    parsed_schema.name(),
    parsed_schema.overload_name(),
    std::move(arguments),
    parsed_schema.returns(),
    parsed_schema.is_vararg(),
    parsed_schema.is_varret()
  );
#endif
}
 
}
}
 
 
/**
 * To register a caffe2 operator caffe2::MyOperator with the c10 dispatcher,
 * call:
 *
 * In caffe2/operators/MyOperator.h:
 *
 * > C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(C10MyOperator) // C10MyOperator is the name
 *                                              // used by c10 for this operator
 *
 * In caffe2/operators/MyOperator.cc
 *
 * > C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
 * >    C10MyOperator,
 * >    "_caffe2::C10MyOperator(Tensor input1, int argument2, float argument3) -> (Tensor output1, Tensor output2)",
 * >    caffe2::MyOperator<caffe2::CPUContext> // This is the caffe2 operator
 * >                                           // class template
 * > )
 *
 * In caffe2/operators/MyOperator.cu
 *
 * > C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(C10MyOperator,
 * >    caffe2::MyOperator<caffe2::CUDAContext>)
 *
 * Notes:
 * - all macros must be invoked in the top level namespace, not in namespace
 *   caffe2.
 * - all operators must call C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10 and
 *   C10_EXPORT_CAFFE2_OP_TO_C10_CPU.
 * - calling C10_EXPORT_CAFFE2_OP_TO_C10_CUDA is optional and can be omitted if
 *   you don't want to expose the operator for CUDA operations.
 * - caffe2 arguments must come after caffe2 inputs, in other words, any tensor
 *   inputs must precede any non-tensor inputs.
 *
 * More complex use cases:
 * - If your operator has a variable number of input tensors, make the first (!)
 *   input an input of type TensorList. There must be no other tensor inputs.
 */
#define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName)          \
  /* Declare caffe2::_c10_ops::schema_<OperatorName>(), the   */   \
  /* accessor for the operator's c10 schema. Its definition   */   \
  /* is emitted by C10_EXPORT_CAFFE2_OP_TO_C10_CPU.           */   \
  namespace caffe2 {                                               \
  namespace _c10_ops {                                             \
  CAFFE2_API const FunctionSchema& schema_##OperatorName();        \
  }                                                                \
  }
 
// Defines caffe2::_c10_ops::schema_<OperatorName>() (built once from
// OperatorSchema via make_function_schema_for_c10) and registers
// call_caffe2_op_from_c10<..., OperatorClass> as the CPU kernel for it.
//
// The registration handle is named through C10_ANONYMOUS_VARIABLE (from
// c10/macros/Macros.h). Pasting __COUNTER__ directly with ## would not expand
// it, so every registration of an operator would get the same identifier
// `registry_<OperatorName>___COUNTER__`, which is not unique and contains a
// reserved double underscore.
#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU(                                     \
    OperatorName, OperatorSchema, OperatorClass)                             \
  /* Register the op schema with the c10 dispatcher */                       \
  namespace caffe2 {                                                         \
  namespace _c10_ops {                                                       \
  C10_EXPORT const FunctionSchema& schema_##OperatorName() {                 \
    static const FunctionSchema schema =                                     \
        ::caffe2::detail::make_function_schema_for_c10(OperatorSchema);      \
    return schema;                                                           \
  }                                                                          \
  }                                                                          \
  }                                                                          \
  /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
  static auto C10_ANONYMOUS_VARIABLE(registry_##OperatorName##_) =           \
      ::c10::RegisterOperators().op(                                         \
          ::caffe2::_c10_ops::schema_##OperatorName(),                       \
          ::c10::RegisterOperators::options()                                \
              .kernel(                                                       \
                  ::c10::TensorTypeId::CPUTensorId,                          \
                  &::caffe2::detail::call_caffe2_op_from_c10<                \
                      ::caffe2::_c10_ops::schema_##OperatorName,             \
                      OperatorClass>));
 
// Registers call_caffe2_op_from_c10<..., OperatorClass> as the CUDA kernel
// for the schema returned by caffe2::_c10_ops::schema_<OperatorName>().
//
// The registration handle is named through C10_ANONYMOUS_VARIABLE (from
// c10/macros/Macros.h). Pasting __COUNTER__ directly with ## would not expand
// it and would yield the fixed identifier
// `registry_<OperatorName>___COUNTER__`.
#define C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(OperatorName, OperatorClass)        \
  /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
  static auto C10_ANONYMOUS_VARIABLE(registry_##OperatorName##_) =           \
      ::c10::RegisterOperators().op(                                         \
          ::caffe2::_c10_ops::schema_##OperatorName(),                       \
          ::c10::RegisterOperators::options()                                \
              .kernel(                                                       \
                  ::c10::TensorTypeId::CUDATensorId,                         \
                  &::caffe2::detail::call_caffe2_op_from_c10<                \
                      ::caffe2::_c10_ops::schema_##OperatorName,             \
                      OperatorClass>));
 
// You should never manually call the C10_EXPORT_CAFFE2_OP_TO_C10_HIP macro.
// The C10_EXPORT_CAFFE2_OP_TO_C10_CUDA macro from above will be automatically
// rewritten to C10_EXPORT_CAFFE2_OP_TO_C10_HIP by hipify.
//
// Registers call_caffe2_op_from_c10<..., OperatorClass> as the HIP kernel for
// the schema returned by caffe2::_c10_ops::schema_<OperatorName>().
// The registration handle is named through C10_ANONYMOUS_VARIABLE (from
// c10/macros/Macros.h) because pasting __COUNTER__ directly with ## would not
// expand it. The options object is obtained with the static call
// RegisterOperators::options(), matching the CPU and CUDA macros.
#define C10_EXPORT_CAFFE2_OP_TO_C10_HIP(OperatorName, OperatorClass)         \
  /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
  static auto C10_ANONYMOUS_VARIABLE(registry_##OperatorName##_) =           \
      ::c10::RegisterOperators().op(                                         \
          ::caffe2::_c10_ops::schema_##OperatorName(),                       \
          ::c10::RegisterOperators::options()                                \
              .kernel(                                                       \
                  ::c10::TensorTypeId::HIPTensorId,                          \
                  &::caffe2::detail::call_caffe2_op_from_c10<                \
                      ::caffe2::_c10_ops::schema_##OperatorName,             \
                      OperatorClass>));
 
#else
// Don't use c10 dispatcher on mobile because of binary size.
// When CAFFE2_IS_XPLAT_BUILD is defined, each export macro below expands to
// nothing, so uses of these macros produce no declarations and no operator
// registrations.
#define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName)
#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU(OperatorName, OperatorSchema, OperatorClass)
#define C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(OperatorName, OperatorClass)
#define C10_EXPORT_CAFFE2_OP_TO_C10_HIP(OperatorName, OperatorClass)
#endif