#pragma once

#include <unordered_map>

#include "onnx/onnx_pb.h"

#include "c10/util/SmallVector.h"
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/onnx/onnxifi_graph_info.h"
#include "caffe2/onnx/onnxifi_init.h"
#include "caffe2/opt/shape_info.h"
#include "caffe2/utils/proto_utils.h"
#include "caffe2/utils/string_utils.h"

namespace caffe2 {
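
// OnnxifiOp offloads the execution of its subgraph to an ONNXIFI backend.
// The serialized model is supplied through the "onnx_model" argument, and the
// initialized backend plus compiled graph are cached in the shared backend
// graph map keyed by "model_id:net_pos", so instances created for the same
// model reuse them instead of re-initializing the backend.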
template <typename Context>
class OnnxifiOp final : public Operator<Context> {
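  // Shape/type hint for a single output tensor, parsed from the
  // "output_shape_hint_<idx>" operator argument.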
  struct TensorInfo {
    TensorInfo() {}
    TensorInfo(TensorInfo&&) = default;
    TensorInfo& operator=(TensorInfo&&) = default;
    std::vector<uint64_t> dims;
    uint64_t onnxifi_type;
  };

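  // Per-output begin/end slice indices plus a flag telling whether the cheap
  // first-dim ("fast path") shrink applies when adjusting outputs back to the
  // real batch size.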
  struct OutputReshapeInfo {
    std::vector<Tensor> begins;
    std::vector<Tensor> ends;
    std::vector<bool> fast_path;
    bool skip{false};
  };

 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  explicit OnnxifiOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        use_onnx_(this->template GetSingleArgument<int>("use_onnx", 0)),
        max_batch_size_(
            this->template GetSingleArgument<int>("max_batch_size", 0)),
        max_seq_size_(this->template GetSingleArgument<int>("max_seq_size", 0)),
        nominal_batch_idx_(
            this->template GetSingleArgument<int>("nominal_batch_idx", 0)) {
    lib_ = onnx::initOnnxifiLibrary();
    backend_graph_map_ptr_ = onnx::getOnnxBackendGraphMap();
    CAFFE_ENFORCE(lib_, "Cannot initialize ONNXIFI library");
    auto onnx_model_str =
        this->template GetSingleArgument<std::string>("onnx_model", "");
    CAFFE_ENFORCE(!onnx_model_str.empty(), "onnx_model cannot be empty");
    if (!use_onnx_) {
      CAFFE_ENFORCE(ParseProtoFromLargeString(onnx_model_str, &netdef_));
    }

    // Set up input/output descriptor templates
    input_names_ =
        this->template GetRepeatedArgument<std::string>("input_names");
    output_names_ =
        this->template GetRepeatedArgument<std::string>("output_names");
    CAFFE_ENFORCE_EQ(input_names_.size(), operator_def.input_size());
    CAFFE_ENFORCE_EQ(output_names_.size(), operator_def.output_size());
    for (const auto& input : input_names_) {
      input_desc_.push_back(onnxTensorDescriptorV1());
      input_desc_.back().name = input.c_str();
    }
    all_offsets_.reserve(ws->Blobs().size());
    all_scales_.reserve(ws->Blobs().size());
    input_shapes_.resize(input_names_.size());
    output_shapes_.resize(output_names_.size());
    output_reshape_info_.begins.reserve(output_names_.size());
    output_reshape_info_.ends.reserve(output_names_.size());
    output_reshape_info_.fast_path.reserve(output_names_.size());
    int output_idx = 0;
    for (const auto& output : output_names_) {
      output_desc_.push_back(onnxTensorDescriptorV1());
      output_desc_.back().name = output.c_str();

      // For each output, try to fetch its shape and type hint.
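      // The hint comes from the repeated int argument "output_shape_hint_<i>":
      // its first element is the ONNXIFI datatype and the remaining elements
      // are the expected dims. For example (illustrative values only),
      // {ONNXIFI_DATATYPE_FLOAT32, 32, 1000} describes a float32 output of
      // shape [32, 1000].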
      int64_t num_dims = 0;
      const std::string key = c10::str("output_shape_hint_", output_idx);
      auto output_shape_hint = this->template GetRepeatedArgument<int>(key);
      if (!output_shape_hint.empty()) {
        TensorInfo info;
        info.onnxifi_type = output_shape_hint.front();
        for (size_t i = 1; i < output_shape_hint.size(); ++i) {
          info.dims.push_back(output_shape_hint[i]);
        }
        num_dims = info.dims.size();
        output_shape_hints_.emplace(output_idx, std::move(info));
      }

      // Initialize the tensors used to slice the output
      output_reshape_info_.begins.emplace_back();
      ReinitializeTensor(
          &output_reshape_info_.begins.back(),
          {num_dims},
          at::dtype<int32_t>().device(CPU));
      output_reshape_info_.ends.emplace_back();
      ReinitializeTensor(
          &output_reshape_info_.ends.back(),
          {num_dims},
          at::dtype<int32_t>().device(CPU));
      output_reshape_info_.fast_path.push_back(false);
      ++output_idx;
    }

    // Get output resizing hints
    adjust_output_batch_ =
        this->template GetSingleArgument<int>("adjust_output_batch", 0);

    // Encode arguments starting with "custom_" to the backend
    std::vector<uint64_t> property_pointers;
    std::vector<int64_t> int_args;
    std::vector<float> float_args;
    buildPropertyList(operator_def, &property_pointers, &int_args, &float_args);

    // Initialize the backend if it has not been created already. When we
    // initialize the backend, we get the weights (initializers) from the
    // workspace and offload them onto the backend. This should be done only
    // once. Subsequent calls of this function with the same model id will find
    // a cached backend, so there is no need to repeat the process.
    buildBackendAndGraph(ws, property_pointers, onnx_model_str);
  }

  ~OnnxifiOp() {
    backend_graph_shared_ptr_.reset();
    backend_graph_map_ptr_->remove(op_id_string_);
#ifdef ONNXIFI_ENABLE_EXT
    traces_.reset();
#endif
  }

  bool RunOnDevice() override;

  void setEnableTracing(bool b) {
    enable_tracing_ = b;
  }

#ifdef ONNXIFI_ENABLE_EXT
  std::shared_ptr<onnxTraceEventList> traces() const {
    return traces_;
  }
#endif

 private:
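  // Appends the hinted dims for `output_idx` (if a hint exists) to `dims` and
  // returns the hinted ONNXIFI datatype, defaulting to ONNXIFI_DATATYPE_FLOAT32.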
  uint64_t SetOutputShapeAndType(int output_idx, std::vector<size_t>* dims) {
    uint64_t type = ONNXIFI_DATATYPE_FLOAT32;
    const auto it = output_shape_hints_.find(output_idx);
    if (it != output_shape_hints_.end()) {
      std::copy(
          it->second.dims.begin(),
          it->second.dims.end(),
          std::back_inserter(*dims));
      type = it->second.onnxifi_type;
    }
    return type;
  }

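  // Builds the property list handed to onnxInitBackend. This base
  // implementation adds no backend-specific properties, only the terminating
  // ONNXIFI_BACKEND_PROPERTY_NONE entry; the int/float argument lists are
  // unused here.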
  void buildPropertyList(
      const OperatorDef& /* unused */,
      std::vector<uint64_t>* property_list,
      std::vector<int64_t>* /* unused */,
      std::vector<float>* /* unused */) {
    property_list->push_back(ONNXIFI_BACKEND_PROPERTY_NONE);
  }

  void buildBackendAndGraph(
      Workspace* ws,
      const std::vector<uint64_t>& property_pointers,
      const std::string& onnx_model_str) {
    op_id_string_ =
        this->template GetSingleArgument<std::string>("model_id", "") + ":" +
        this->template GetSingleArgument<std::string>("net_pos", "");

    auto initializers =
        this->template GetRepeatedArgument<std::string>("initializers");
    // Build the Onnxifi engine
    auto backend_index = this->template GetSingleArgument<int>("backend_id", 0);
    auto creator = [this,
                    ws,
                    property_pointers,
                    backend_index,
                    &onnx_model_str,
                    &initializers]() {
      std::vector<onnxBackendID> backend_ids;
      size_t num_backends{0};
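      // Passing nullptr first only queries the number of available backends;
      // per the ONNXIFI spec this "buffer too small" case is reported as
      // ONNXIFI_STATUS_FALLBACK rather than SUCCESS, hence the check below.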
      CAFFE_ENFORCE_EQ(
          lib_->onnxGetBackendIDs(nullptr, &num_backends),
          ONNXIFI_STATUS_FALLBACK);
      CAFFE_ENFORCE_GT(
          num_backends, 0, "At least 1 onnxifi backend should be available");
      CAFFE_ENFORCE_LT(
          backend_index,
          num_backends,
          "Backend idx out of bound: ",
          backend_index,
          ", #backends: ",
          num_backends);
      backend_ids.resize(num_backends);
      CAFFE_ENFORCE_EQ(
          lib_->onnxGetBackendIDs(backend_ids.data(), &num_backends),
          ONNXIFI_STATUS_SUCCESS);

      onnxBackendID backend_id = backend_ids[backend_index];
      onnxBackend backend{nullptr};

      CAFFE_ENFORCE_EQ(
          lib_->onnxInitBackend(backend_id, property_pointers.data(), &backend),
          ONNXIFI_STATUS_SUCCESS);

      // Release unused backend ids.
      for (size_t i = 0; i < num_backends; ++i) {
        if (i == backend_index) {
          continue;
        }
        lib_->onnxReleaseBackendID(backend_ids[i]);
      }

      // Get weights
      std::vector<std::string> weight_names;
      std::vector<std::vector<uint64_t>> weight_shapes;
      auto weight_descs = buildInitializationList(
          ws,
          initializers,
          &weight_names,
          &weight_shapes,
          &all_scales_,
          &all_offsets_);

      // Extra weight shapes
      std::unordered_map<std::string, ShapeInfo> weight_shape_info;
      for (size_t i = 0; i < weight_names.size(); ++i) {
        TensorShape shape;
        const auto& shape0 = weight_shapes[i];
        for (const auto d : shape0) {
          shape.add_dims(d);
        }
        weight_shape_info[weight_names[i]] =
            ShapeInfo(ShapeInfo::DimType::CONSTANT, std::move(shape));
      }

      onnxGraph graph{nullptr};
      CAFFE_ENFORCE_EQ(
          lib_->onnxInitGraph(
              backend,
              nullptr,
              onnx_model_str.size(),
              (const void*)(onnx_model_str.c_str()),
              weight_descs.size(),
              weight_descs.data(),
              &graph),
          ONNXIFI_STATUS_SUCCESS);

      return std::make_shared<onnx::BackendGraphInfo>(
          backend_id, backend, graph, lib_, std::move(weight_shape_info));
    };
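    // insert() returns the entry already cached under op_id_string_ if one
    // exists; otherwise it invokes `creator` above to initialize the backend
    // and compile the graph (see the note in the constructor).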
    backend_graph_shared_ptr_ =
        backend_graph_map_ptr_->insert(op_id_string_, creator);

    backend_id_ = backend_graph_shared_ptr_->backend_id;
    backend_ = backend_graph_shared_ptr_->backend;
    graph_ = backend_graph_shared_ptr_->graph;
    input_shape_info_ = backend_graph_shared_ptr_->weight_shape_info;

    getExtFunctionPointers();
  }

  /// Set up function pointers if onnxifi_ext is enabled
  void getExtFunctionPointers() {
#ifdef ONNXIFI_ENABLE_EXT
    onnxExtensionFunctionPointer p;
    if (lib_->onnxGetExtensionFunctionAddress(
            backend_id_, "onnxSetIOAndRunGraphFunction", &p) !=
        ONNXIFI_STATUS_SUCCESS) {
      onnxSetIOAndRunGraphPointer_ = nullptr;
    } else {
      onnxSetIOAndRunGraphPointer_ =
          reinterpret_cast<decltype(onnxSetIOAndRunGraphPointer_)>(p);
    }
    if (lib_->onnxGetExtensionFunctionAddress(
            backend_id_, "onnxReleaseTraceEventsFunction", &p) !=
        ONNXIFI_STATUS_SUCCESS) {
      onnxReleaseTraceEventsPointer_ = nullptr;
    } else {
      onnxReleaseTraceEventsPointer_ =
          reinterpret_cast<decltype(onnxReleaseTraceEventsPointer_)>(p);
    }
#endif
  }

  void extractOutputBatchSizes();

  // If needed, adjust output tensor shapes based on the real input batch size.
  // If an output shape is conditioned on the first dim (batch size), we have a
  // fast path that shrinks the tensor shape by just manipulating its metadata.
  // Otherwise, we have to slice it in the middle of a dimension, which requires
  // a copy. This is the slow path and we don't expect it to happen very often.
  // This step can be skipped entirely by setting adjust_output_batch_ to false.
  void maybeAdjustOutputBatchSizes();

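  // Builds ONNXIFI tensor descriptors for the weights listed in `initializers`,
  // reading them from the workspace. Weight names, shapes and per-group
  // quantization scales/offsets are written into the op-owned out-parameters.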
  std::vector<onnxTensorDescriptorV1> buildInitializationList(
      Workspace* ws,
      const std::vector<std::string>& initializers,
      std::vector<std::string>* weight_names,
      std::vector<std::vector<uint64_t>>* weight_shapes,
      std::vector<std::vector<float>>* all_scales,
      std::vector<std::vector<int32_t>>* all_offsets) const;

  // Pointer to the loaded onnxifi library
  onnxifi_library* lib_{nullptr};
  onnx::OnnxBackendGraphMap* backend_graph_map_ptr_;
  std::string op_id_string_;

  onnxBackendID backend_id_{nullptr};
  onnxBackend backend_{nullptr};
  onnxGraph graph_{nullptr};
  onnx::SharedPtrBackendGraphInfo backend_graph_shared_ptr_;

  // Input/output descriptors
  std::vector<onnxTensorDescriptorV1> input_desc_;
  std::vector<onnxTensorDescriptorV1> output_desc_;

  // Output reshape info
  OutputReshapeInfo output_reshape_info_;

#ifdef ONNXIFI_ENABLE_EXT
  // onnxifi extension mode function pointers
  onnxStatus (*onnxSetIOAndRunGraphPointer_)(
      onnxGraph,
      uint32_t,
      const onnxTensorDescriptorV1*,
      uint32_t,
      const onnxTensorDescriptorV1*,
      onnxMemoryFenceV1*,
      onnxTraceEventList*);

  onnxStatus (*onnxReleaseTraceEventsPointer_)(onnxTraceEventList*);

  std::shared_ptr<onnxTraceEventList> traces_{nullptr};
#endif

  // Whether the model is in ONNX format (as opposed to a Caffe2 NetDef)
  bool use_onnx_{false};

  // max batch size
  int max_batch_size_;

  // max sequence lookup size
  int max_seq_size_;

  // Index of the input whose first dimension represents the batch size
  int nominal_batch_idx_{0};

  // We bind the op input/output by position while ONNXIFI binds input/output
  // by names. In addition, op input/output names can be rewritten by, for
  // example, memonger. We cache the original ONNX input/output names here and
  // bind them by position.
  std::vector<std::string> input_names_;
  std::vector<std::string> output_names_;

  // NetDef of the onnxifi subgraph for shape inference
  NetDef netdef_;

  std::vector<c10::SmallVector<uint64_t, 4>> input_shapes_;
  std::vector<c10::SmallVector<uint64_t, 4>> output_shapes_;

  // A cache vector to avoid repeated reallocation. Its existence is not ideal
  // and is purely due to the fact that we use int64_t for c2::Tensor dims but
  // uint64_t for onnxTensorDescriptor dims. Maybe we should just use int64_t.
  c10::SmallVector<int64_t, 4> tensor_dims_int64_;

  // Multi-group quantization info
  std::vector<std::vector<float>> all_scales_;
  std::vector<std::vector<int32_t>> all_offsets_;

  // Output shape hints
  std::unordered_map<int, TensorInfo> output_shape_hints_;

  // Input shape info. Used by shape inference when inputs are not at
  // max_batch_size
  std::unordered_map<std::string, ShapeInfo> input_shape_info_;

  // Whether we need to resize outputs or not
  bool adjust_output_batch_{false};

  // Whether we enable tracing in one run of inference
  bool enable_tracing_{false};
};

} // namespace caffe2