#pragma once // TODO Also register c10 operators on mobile #if !defined(CAFFE2_IS_XPLAT_BUILD) #include #include #include #include #include #include "caffe2/core/operator.h" #include "caffe2/core/export_caffe2_op_to_c10.h" namespace caffe2 { /** * To make a c10 operator "C10Add" callable from caffe2 as "C2MyAddOpName", just * write * * To export the CPU kernel * C10_EXPORT_C10_OP_TO_CAFFE2_CPU(C10Add, C2MyAddOp) * * To export the CUDA kernel * C10_EXPORT_C10_OP_TO_CAFFE2_CUDA(C10Add, C2MyAddOp) * */ namespace detail { template class C10OperatorWrapper final : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; C10OperatorWrapper( const c10::OperatorHandle& op, const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), op_(op), has_preallocated_outputs_( op_.schema().arguments().size() != 0 && op_.schema().arguments().back().name() == detail::PREALLOCATED_OUTPUT_ARGNAME) { AT_ASSERT( !has_preallocated_outputs_ || op_.schema().arguments().back().type()->isSubtypeOf( OptionalType::create(ListType::ofTensors()))); AT_ASSERT(operator_def.output_size() == op_.schema().returns().size()); AT_ASSERT( operator_def.input_size() + (has_preallocated_outputs_ ? 1 : 0) <= op_.schema() .arguments() .size()); // '<=' because there might be caffe2 nontensor arguments } bool RunOnDevice() override { // due to caching the stack_, concurrent calling is not allowed. // TODO thread_local might fix this std::lock_guard lock(mutex_); pushInputs_(); callKernel_(); popOutputs_(); return true; } private: void pushInputs_() { AT_ASSERT(stack_.size() == 0); stack_.reserve( op_.schema().arguments().size() + (has_preallocated_outputs_ ? 1 : 0)); size_t input_tensor_index = 0; for (const auto& argument : op_.schema().arguments()) { if (argument.name() == detail::PREALLOCATED_OUTPUT_ARGNAME) { // note: if detail::PREALLOCATED_OUTPUT_ARGNAME was at the end of the // argument list, then has_preallocated_outputs_ would be true. AT_ASSERTM( has_preallocated_outputs_, "Error in caffe2->c10 wrapper: Operator schema has a parameter named ", detail::PREALLOCATED_OUTPUT_ARGNAME, ", but it's not at the end of the argument list"); AT_ASSERTM( argument.type()->isSubtypeOf( OptionalType::create(ListType::ofTensors())), "Error in caffe2->c10 wrapper: Operator schema has a parameter named ", detail::PREALLOCATED_OUTPUT_ARGNAME, ", but it's not of type TensorList?"); stack_.emplace_back(preallocated_outputs_()); } else if (argument.type()->isSubtypeOf(TensorType::get())) { AT_ASSERTM( input_tensor_index < InputSize(), "Error in caffe2->c10 wrapper: Too few tensor arguments given (", InputSize(), "), operator schema expected more."); stack_.emplace_back(at::Tensor(Input(input_tensor_index++))); } else if (argument.type()->isSubtypeOf(OptionalType::ofTensor())) { if (input_tensor_index < InputSize()) { stack_.emplace_back(at::Tensor(Input(input_tensor_index++))); } else { stack_.emplace_back(IValue()); } } else if (argument.type()->isSubtypeOf(ListType::ofTensors())) { AT_ASSERTM( input_tensor_index == 0, "Error in caffe2->c10 wrapper: Schema can only have either one or more Tensor inputs or one TensorList input."); stack_.emplace_back(array_inputs_()); input_tensor_index = InputSize(); } else { stack_.emplace_back(get_nontensor_argument_(argument)); } } AT_ASSERTM( input_tensor_index == InputSize(), "Error in caffe2->c10 wrapper: Number of caffe2 operator inputs (", InputSize(), ") doesn't match number of tensor arguments (", input_tensor_index, ") in the c10 operator schema."); } void callKernel_() { AT_ASSERT(stack_.size() == op_.schema().arguments().size()); c10::Dispatcher::singleton().callBoxed(op_, &stack_); } void popOutputs_() { AT_ASSERT(stack_.size() == op_.schema().returns().size()); for (size_t i = 0; i < op_.schema().returns().size(); ++i) { OperatorBase::SetOutputTensor(i, Tensor(std::move(stack_[i]).toTensor())); } stack_.clear(); } c10::List array_inputs_() { c10::List result; result.reserve(InputSize()); for (size_t i = 0; i < InputSize(); ++i) { result.emplace_back(Input(i)); } return result; } c10::List preallocated_outputs_() { c10::List result; result.reserve(OutputSize()); for (size_t i = 0; i < OutputSize(); ++i) { result.emplace_back(OperatorBase::OutputTensorOrUndefined(i)); } return result; } IValue get_nontensor_argument_(const c10::Argument& argument) { if (argument.type()->isSubtypeOf(IntType::get())) { return get_nontensor_argument_( argument.name(), argument.default_value()); } else if (argument.type()->isSubtypeOf(FloatType::get())) { return get_nontensor_argument_( argument.name(), argument.default_value()); } else if (argument.type()->isSubtypeOf(BoolType::get())) { return get_nontensor_argument_( argument.name(), argument.default_value()); } else { // TODO Support more types AT_ERROR( "Error in caffe2->c10 wrapper: Unsupported argument type ", argument.type()->str(), " in c10 operator schema"); } } template IValue get_nontensor_argument_( const std::string& name, const c10::optional& default_value) { if (default_value.has_value()) { return this->template GetSingleArgument(name, default_value->to()); } else { TORCH_CHECK( this->template HasSingleArgumentOfType(name), "Error in caffe2->c10 wrapper: Expected argument '", name, "' missing or wrong type."); return this->template GetSingleArgument(name, 0); } } c10::OperatorHandle op_; // has_preallocated_outputs_ is true iff the operator schema has a last // argument that is a TensorList and has a name equal to with the name equal // to detail::PREALLOCATED_OUTPUT_ARGNAME. This argument is then used to pass // in preallocated output tensors to the caffe2 operator. bool has_preallocated_outputs_; // this is stored as a member here to avoid having to re-allocate a stack // for each call. Between kernel calls, stack_.size() == 0, but capacity // should not need to be grown anymore after the first call. std::vector stack_; std::mutex mutex_; }; template inline std::function< std::unique_ptr(const OperatorDef&, Workspace*)> createC10OperatorWrapper(const c10::OperatorName& op_name) { return [op_name](const OperatorDef& op_def, Workspace* ws) { auto op_handle = c10::Dispatcher::singleton().findSchema(op_name); AT_ASSERTM( op_handle.has_value(), "Tried to register c10 operator ", op_name.name, ".", op_name.overload_name, " with caffe2, but didn't find the c10 operator."); return c10::guts::make_unique>( *op_handle, op_def, ws); }; } } // namespace detail } // namespace caffe2 #define C10_EXPORT_C10_OP_TO_CAFFE2_CPU( \ OperatorName, Name) \ REGISTER_CPU_OPERATOR_CREATOR( \ Name, \ ::caffe2::detail::createC10OperatorWrapper( \ {OperatorName, ""})) #define C10_EXPORT_C10_OP_TO_CAFFE2_CUDA( \ OperatorName, Name) \ REGISTER_CUDA_OPERATOR_CREATOR( \ Name, \ ::caffe2::detail::createC10OperatorWrapper( \ {OperatorName, ""})) #define C10_EXPORT_C10_OP_TO_CAFFE2_HIP( \ OperatorName, Name) \ REGISTER_HIP_OPERATOR_CREATOR( \ Name, \ ::caffe2::detail::createC10OperatorWrapper( \ {OperatorName, ""})) #else #define C10_EXPORT_C10_OP_TO_CAFFE2_CPU( \ OperatorName, Name) #define C10_EXPORT_C10_OP_TO_CAFFE2_CUDA( \ OperatorName, Name) #define C10_EXPORT_C10_OP_TO_CAFFE2_HIP( \ OperatorName, Name) #endif