#pragma once

#include <unordered_map>

#include "onnx/onnx_pb.h"

#include "c10/util/SmallVector.h"
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/onnx/onnxifi_graph_info.h"
#include "caffe2/onnx/onnxifi_init.h"
#include "caffe2/opt/shape_info.h"
#include "caffe2/utils/proto_utils.h"
#include "caffe2/utils/string_utils.h"

namespace caffe2 {

template <typename Context>
class OnnxifiOp final : public Operator<Context> {
  struct TensorInfo {
    TensorInfo() {}
    TensorInfo(TensorInfo&&) = default;
    TensorInfo& operator=(TensorInfo&&) = default;
    std::vector<uint64_t> dims;
    uint64_t onnxifi_type;
  };

  struct OutputReshapeInfo {
    std::vector<Tensor> begins;
    std::vector<Tensor> ends;
    std::vector<bool> fast_path;
    bool skip{false};
  };

 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  explicit OnnxifiOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        use_onnx_(this->template GetSingleArgument<int>("use_onnx", 0)),
        max_batch_size_(
            this->template GetSingleArgument<int>("max_batch_size", 0)),
        max_seq_size_(
            this->template GetSingleArgument<int>("max_seq_size", 0)),
        nominal_batch_idx_(
            this->template GetSingleArgument<int>("nominal_batch_idx", 0)) {
    lib_ = onnx::initOnnxifiLibrary();
    backend_graph_map_ptr_ = onnx::getOnnxBackendGraphMap();
    CAFFE_ENFORCE(lib_, "Cannot initialize ONNXIFI library");
    auto onnx_model_str =
        this->template GetSingleArgument<std::string>("onnx_model", "");
    CAFFE_ENFORCE(!onnx_model_str.empty(), "onnx_model cannot be empty");
    if (!use_onnx_) {
      CAFFE_ENFORCE(ParseProtoFromLargeString(onnx_model_str, &netdef_));
    }

    // Set up input/output descriptor templates
    input_names_ =
        this->template GetRepeatedArgument<std::string>("input_names");
    output_names_ =
        this->template GetRepeatedArgument<std::string>("output_names");
    CAFFE_ENFORCE_EQ(input_names_.size(), operator_def.input_size());
    CAFFE_ENFORCE_EQ(output_names_.size(), operator_def.output_size());
    for (const auto& input : input_names_) {
      input_desc_.push_back(onnxTensorDescriptorV1());
      input_desc_.back().name = input.c_str();
    }
    all_offsets_.reserve(ws->Blobs().size());
    all_scales_.reserve(ws->Blobs().size());
    input_shapes_.resize(input_names_.size());
    output_shapes_.resize(output_names_.size());
    output_reshape_info_.begins.reserve(output_names_.size());
    output_reshape_info_.ends.reserve(output_names_.size());
    output_reshape_info_.fast_path.reserve(output_names_.size());
    int output_idx = 0;
    for (const auto& output : output_names_) {
      output_desc_.push_back(onnxTensorDescriptorV1());
      output_desc_.back().name = output.c_str();

      // For each output, try to get its output shape hint
      int64_t num_dims = 0;
      const std::string key = c10::str("output_shape_hint_", output_idx);
      auto output_shape_hint =
          this->template GetRepeatedArgument<int64_t>(key);
      if (!output_shape_hint.empty()) {
        TensorInfo info;
        info.onnxifi_type = output_shape_hint.front();
        for (size_t i = 1; i < output_shape_hint.size(); ++i) {
          info.dims.push_back(output_shape_hint[i]);
        }
        num_dims = info.dims.size();
        output_shape_hints_.emplace(output_idx, std::move(info));
      }

      // Initialize the tensors used to slice the output
      output_reshape_info_.begins.emplace_back();
      ReinitializeTensor(
          &output_reshape_info_.begins.back(),
          {num_dims},
          at::dtype<int32_t>().device(CPU));
      output_reshape_info_.ends.emplace_back();
      ReinitializeTensor(
          &output_reshape_info_.ends.back(),
          {num_dims},
          at::dtype<int32_t>().device(CPU));
      output_reshape_info_.fast_path.push_back(false);
      ++output_idx;
    }

    // Get output resizing hints
    adjust_output_batch_ =
        this->template GetSingleArgument<int>("adjust_output_batch", 0);

    // Encode arguments starting with "custom_" to the backend
    std::vector<uint64_t> property_pointers;
    std::vector<int64_t> int_args;
    std::vector<float> float_args;
    buildPropertyList(operator_def, &property_pointers, &int_args, &float_args);

    // Initialize the backend if it has not already been created. When we
    // initialize the backend, we fetch the weights (initializers) from the
    // workspace and offload them onto the backend. This should be done only
    // once; subsequent calls of this function with the same model id should
    // find a cached backend, so there is no need to repeat the above process.
    buildBackendAndGraph(ws, property_pointers, onnx_model_str);
  }
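  // As a minimal sketch (not part of this header), an OperatorDef that
  // satisfies the constructor above can be assembled with the raw protobuf
  // API. The blob names "X"/"Y" and `serialized_model_str` are hypothetical:
  //
  //   OperatorDef op_def;
  //   op_def.set_type("Onnxifi");
  //   op_def.add_input("X");
  //   op_def.add_output("Y");
  //   auto* model_arg = op_def.add_arg();
  //   model_arg->set_name("onnx_model");
  //   model_arg->set_s(serialized_model_str);
  //   auto* in_arg = op_def.add_arg();
  //   in_arg->set_name("input_names");
  //   in_arg->add_strings("X");
  //   auto* out_arg = op_def.add_arg();
  //   out_arg->set_name("output_names");
  //   out_arg->add_strings("Y");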
  ~OnnxifiOp() {
    backend_graph_shared_ptr_.reset();
    backend_graph_map_ptr_->remove(op_id_string_);
#ifdef ONNXIFI_ENABLE_EXT
    traces_.reset();
#endif
  }

  bool RunOnDevice() override;

  void setEnableTracing(bool b) {
    enable_tracing_ = b;
  }

#ifdef ONNXIFI_ENABLE_EXT
  std::shared_ptr<onnxTraceEventList> traces() const {
    return traces_;
  }
#endif

 private:
  uint64_t SetOutputShapeAndType(int output_idx, std::vector<size_t>* dims) {
    uint64_t type = ONNXIFI_DATATYPE_FLOAT32;
    const auto it = output_shape_hints_.find(output_idx);
    if (it != output_shape_hints_.end()) {
      std::copy(
          it->second.dims.begin(),
          it->second.dims.end(),
          std::back_inserter(*dims));
      type = it->second.onnxifi_type;
    }
    return type;
  }

  void buildPropertyList(
      const OperatorDef& /* unused */,
      std::vector<uint64_t>* property_list,
      std::vector<int64_t>* /* unused */,
      std::vector<float>* /* unused */) {
    property_list->push_back(ONNXIFI_BACKEND_PROPERTY_NONE);
  }

  void buildBackendAndGraph(
      Workspace* ws,
      const std::vector<uint64_t>& property_pointers,
      const std::string& onnx_model_str) {
    op_id_string_ =
        this->template GetSingleArgument<std::string>("model_id", "") + ":" +
        this->template GetSingleArgument<std::string>("net_pos", "");

    auto initializers =
        this->template GetRepeatedArgument<std::string>("initializers");
    // Build the Onnxifi engine
    auto backend_index =
        this->template GetSingleArgument<int>("backend_id", 0);
    auto creator = [this,
                    ws,
                    property_pointers,
                    backend_index,
                    &onnx_model_str,
                    &initializers]() {
      std::vector<onnxBackendID> backend_ids;
      size_t num_backends{0};
      CAFFE_ENFORCE_EQ(
          lib_->onnxGetBackendIDs(nullptr, &num_backends),
          ONNXIFI_STATUS_FALLBACK);
      CAFFE_ENFORCE_GT(
          num_backends, 0, "At least 1 onnxifi backend should be available");
      CAFFE_ENFORCE_LT(
          backend_index,
          num_backends,
          "Backend idx out of bound: ",
          backend_index,
          ", #backends: ",
          num_backends);
      backend_ids.resize(num_backends);
      CAFFE_ENFORCE_EQ(
          lib_->onnxGetBackendIDs(backend_ids.data(), &num_backends),
          ONNXIFI_STATUS_SUCCESS);

      onnxBackendID backend_id = backend_ids[backend_index];
      onnxBackend backend{nullptr};
      CAFFE_ENFORCE_EQ(
          lib_->onnxInitBackend(
              backend_id, property_pointers.data(), &backend),
          ONNXIFI_STATUS_SUCCESS);

      // Release unused backend ids.
      for (size_t i = 0; i < num_backends; ++i) {
        if (i == backend_index) {
          continue;
        }
        lib_->onnxReleaseBackendID(backend_ids[i]);
      }

      // Get weights
      std::vector<std::string> weight_names;
      std::vector<std::vector<uint64_t>> weight_shapes;
      auto weight_descs = buildInitializationList(
          ws,
          initializers,
          &weight_names,
          &weight_shapes,
          &all_scales_,
          &all_offsets_);

      // Extra weight shapes
      std::unordered_map<std::string, ShapeInfo> weight_shape_info;
      for (size_t i = 0; i < weight_names.size(); ++i) {
        TensorShape shape;
        const auto& shape0 = weight_shapes[i];
        for (const auto d : shape0) {
          shape.add_dims(d);
        }
        weight_shape_info[weight_names[i]] =
            ShapeInfo(ShapeInfo::DimType::CONSTANT, std::move(shape));
      }

      onnxGraph graph{nullptr};
      CAFFE_ENFORCE_EQ(
          lib_->onnxInitGraph(
              backend,
              nullptr,
              onnx_model_str.size(),
              (const void*)(onnx_model_str.c_str()),
              weight_descs.size(),
              weight_descs.data(),
              &graph),
          ONNXIFI_STATUS_SUCCESS);

      return std::make_shared<onnx::BackendGraphInfo>(
          backend_id, backend, graph, lib_, std::move(weight_shape_info));
    };
    backend_graph_shared_ptr_ =
        backend_graph_map_ptr_->insert(op_id_string_, creator);

    backend_id_ = backend_graph_shared_ptr_->backend_id;
    backend_ = backend_graph_shared_ptr_->backend;
    graph_ = backend_graph_shared_ptr_->graph;
    input_shape_info_ = backend_graph_shared_ptr_->weight_shape_info;

    getExtFunctionPointers();
  }
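  // The insert() call above implements a lookup-or-create contract: the
  // creator lambda only runs when no backend/graph has been built yet for
  // this op id. A simplified sketch of that contract (not the actual
  // OnnxBackendGraphMap implementation; mutex_ and map_ are illustrative):
  //
  //   SharedPtrBackendGraphInfo insert(
  //       const std::string& key,
  //       std::function<SharedPtrBackendGraphInfo()> creator) {
  //     std::lock_guard<std::mutex> guard(mutex_);
  //     auto it = map_.find(key);
  //     if (it != map_.end()) {
  //       return it->second; // Cached: the creator never runs.
  //     }
  //     // The first op instance for this model id pays the full cost of
  //     // backend init and graph compilation; later instances share it.
  //     return map_.emplace(key, creator()).first->second;
  //   }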
  /// Set up function pointers if onnxifi_ext is enabled
  void getExtFunctionPointers() {
#ifdef ONNXIFI_ENABLE_EXT
    onnxExtensionFunctionPointer p;
    if (lib_->onnxGetExtensionFunctionAddress(
            backend_id_, "onnxSetIOAndRunGraphFunction", &p) !=
        ONNXIFI_STATUS_SUCCESS) {
      onnxSetIOAndRunGraphPointer_ = nullptr;
    } else {
      onnxSetIOAndRunGraphPointer_ =
          reinterpret_cast<decltype(onnxSetIOAndRunGraphPointer_)>(p);
    }
    if (lib_->onnxGetExtensionFunctionAddress(
            backend_id_, "onnxReleaseTraceEventsFunction", &p) !=
        ONNXIFI_STATUS_SUCCESS) {
      onnxReleaseTraceEventsPointer_ = nullptr;
    } else {
      onnxReleaseTraceEventsPointer_ =
          reinterpret_cast<decltype(onnxReleaseTraceEventsPointer_)>(p);
    }
#endif
  }

  void extractOutputBatchSizes();

  // If needed, adjust output tensor shapes based on the real input batch
  // size. If the output shape is conditioned on the first dim (batch size),
  // we have a fast path that shrinks the tensor shape by manipulating its
  // metadata only. Otherwise, we have to slice the tensor in the middle of a
  // dimension, which invokes a copy. That is a slow path and we don't expect
  // it to happen very often. This step can be skipped entirely by setting
  // `adjust_output_batch_` to false.
  void maybeAdjustOutputBatchSizes();
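  // To make the fast/slow distinction above concrete, a hypothetical sketch
  // of what the adjustment amounts to (the real logic lives in
  // maybeAdjustOutputBatchSizes; `output_tensor`, `real_batch_size`, and the
  // loop index `i` are illustrative):
  //
  //   if (output_reshape_info_.fast_path[i]) {
  //     // Batch is the outermost dim: shrink metadata only, no data copy.
  //     output_tensor->ShrinkTo(real_batch_size);
  //   } else {
  //     // Batch sits in an inner dim: slice using the precomputed
  //     // begins/ends tensors, which copies data -- the slow path.
  //   }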
  std::vector<onnxTensorDescriptorV1> buildInitializationList(
      Workspace* ws,
      const std::vector<std::string>& initializers,
      std::vector<std::string>* weight_names,
      std::vector<std::vector<uint64_t>>* weight_shapes,
      std::vector<std::vector<float>>* all_scales,
      std::vector<std::vector<int32_t>>* all_offsets) const;

  // Pointer to the loaded onnxifi library
  onnxifi_library* lib_{nullptr};
  onnx::OnnxBackendGraphMap* backend_graph_map_ptr_;
  std::string op_id_string_;

  onnxBackendID backend_id_{nullptr};
  onnxBackend backend_{nullptr};
  onnxGraph graph_{nullptr};
  onnx::SharedPtrBackendGraphInfo backend_graph_shared_ptr_;

  // Input/output descriptors
  std::vector<onnxTensorDescriptorV1> input_desc_;
  std::vector<onnxTensorDescriptorV1> output_desc_;

  // Output reshape info
  OutputReshapeInfo output_reshape_info_;

#ifdef ONNXIFI_ENABLE_EXT
  // onnxifi extension mode function pointers
  onnxStatus (*onnxSetIOAndRunGraphPointer_)(
      onnxGraph,
      uint32_t,
      const onnxTensorDescriptorV1*,
      uint32_t,
      const onnxTensorDescriptorV1*,
      onnxMemoryFenceV1*,
      onnxTraceEventList*);

  onnxStatus (*onnxReleaseTraceEventsPointer_)(onnxTraceEventList*);

  std::shared_ptr<onnxTraceEventList> traces_{nullptr};
#endif

  // Whether the model is serialized as ONNX or as a Caffe2 NetDef
  bool use_onnx_{false};

  // Max batch size
  int max_batch_size_;

  // Max sequence lookup size
  int max_seq_size_;

  // Index of the input whose first dimension represents the batch size
  int nominal_batch_idx_{0};

  // We bind the op's inputs/outputs by position, while ONNXIFI binds them by
  // name. In addition, op input/output names can be rewritten by, for
  // example, memonger. We cache the original ONNX input/output names here and
  // bind them by position.
  std::vector<std::string> input_names_;
  std::vector<std::string> output_names_;

  // NetDef of the onnxifi subgraph for shape inference
  NetDef netdef_;

  std::vector<c10::SmallVector<int64_t, 4>> input_shapes_;
  std::vector<c10::SmallVector<int64_t, 4>> output_shapes_;

  // A cache vector to avoid repeated reallocation. Its existence is not
  // ideal; it is purely due to the fact that we use int64_t for c2 tensor
  // dims but uint64_t for onnxTensorDescriptor dims. Maybe we should just use
  // int64_t.
  c10::SmallVector<int64_t, 4> tensor_dims_int64_;

  // Multi-group quantization info
  std::vector<std::vector<float>> all_scales_;
  std::vector<std::vector<int32_t>> all_offsets_;

  // Output shape hints
  std::unordered_map<int, TensorInfo> output_shape_hints_;

  // Input shape info. Used by shape inference when inputs are not at
  // max_batch_size
  std::unordered_map<std::string, ShapeInfo> input_shape_info_;

  // Whether we need to resize outputs or not
  bool adjust_output_batch_{false};

  // Whether we enable tracing in one run of inference
  bool enable_tracing_{false};
};

} // namespace caffe2
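// End-to-end usage (a minimal, hypothetical sketch; assumes an ONNXIFI
// backend library is installed and that `op_def` was assembled as in the
// sketch after the constructor above):
//
//   caffe2::Workspace ws;
//   BlobSetTensor(ws.CreateBlob("X"), input_tensor.Clone());
//   caffe2::NetDef net;
//   net.add_op()->CopyFrom(op_def);
//   CAFFE_ENFORCE(ws.RunNetOnce(net));
//   const auto& y = ws.GetBlob("Y")->Get<caffe2::Tensor>();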