// ~zhangmeng/libgowrapper/reid.git

#ifndef CAFFE2_CORE_NET_ASYNC_BASE_H_
#define CAFFE2_CORE_NET_ASYNC_BASE_H_
 
#include "c10/core/thread_pool.h"
#include "c10/util/Registry.h"
#include "caffe2/core/common.h"
#include "caffe2/core/net.h"
#include "caffe2/core/net_dag_utils.h"
#include "caffe2/core/prof_dag_counters.h"
#include "caffe2/core/stats.h"
#include "caffe2/core/timer.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2_pb.h"
#include "caffe2/proto/prof_dag.pb.h"
#include "caffe2/utils/proto_utils.h"
#include <ATen/core/Tensor.h>
 
// Flag declarations (via the C10_DECLARE_* macros) for the async net
// executors. Only declarations appear here; the matching definitions are
// expected to live in a source file — TODO confirm location.
// Within this header, FLAGS_caffe2_net_async_thread_pool_size is read by
// GetAsyncNetThreadPool as the pool size to use when the caller passes a
// non-positive size and the flag itself is positive.
C10_DECLARE_int(caffe2_streams_per_gpu);
C10_DECLARE_int(caffe2_net_async_max_gpus);
C10_DECLARE_int(caffe2_net_async_max_numa_nodes);
C10_DECLARE_int(caffe2_net_async_thread_pool_size);
C10_DECLARE_bool(caffe2_net_async_check_stream_status);
C10_DECLARE_bool(caffe2_net_async_use_single_pool);
C10_DECLARE_bool(caffe2_net_async_use_per_net_pools);
C10_DECLARE_bool(caffe2_net_async_run_root_tasks_inline);
C10_DECLARE_bool(caffe2_net_async_profile_operators);
 
namespace caffe2 {
 
// Defined further down in this header; forward-declared so that AsyncNetBase
// can hold a unique_ptr to it and declare it a friend.
class AsyncNetExecutorHelper;

// Forward declaration only: AsyncNetBase stores a shared_ptr<tracing::Tracer>
// and declares tracing::Tracer a friend.
namespace tracing {
class Tracer;
}
 
struct ExecutionOptions {
  explicit ExecutionOptions(const std::shared_ptr<const NetDef>& net_def);
 
  // number of gpu streams per gpu per cpu thread
  int streams_per_gpu_ = 1;
  // ops synchronization options
  bool finish_chain_ = false;
  bool always_schedule_child_ = false;
  // try to pick gpu stream that is not busy
  bool check_stream_status_ = false;
  // use single thread pool for all devices
  bool use_single_pool_ = false;
  // use per net instances thread pools instead of global ones
  bool use_per_net_pools_ = false;
  // whether RunAsync is blocking
  bool is_blocking_ = false;
  // prof_dag counters reporting
  bool report_stats_ = false;
  // immediately run children tasks inline whenever possible
  bool use_dfs_scheduling_ = false;
  // run net's root tasks in RunAsync thread instead of in thread pool
  bool run_root_tasks_inline_ = false;
};
 
// Base class for asynchronous net executors, derived from NetBase.
//
// It declares the shared bookkeeping for such executors: the operator list,
// chains of operators and the parent/child graph between chains, thread
// pools keyed by device id and pool size, a success flag, a tracer and
// profiling counters. Apart from the three small inline accessors, every
// member function is only declared here; definitions live outside this
// header.
//
// NOTE(review): the `task_id` / `chain_id` parameters below appear to index
// chains_ / chain_nodes_ — confirm against the implementation.
class CAFFE2_API AsyncNetBase : public NetBase {
 public:
  AsyncNetBase(const std::shared_ptr<const NetDef>& net_def, Workspace* ws);
  ~AsyncNetBase() override;

  // Always reports that asynchronous execution is supported.
  bool SupportsAsync() override {
    return true;
  }

  // Returns a copy of the operator pointer list stored in operators_.
  vector<OperatorBase*> GetOperators() const override {
    return operators_;
  }

  bool RunAsync() override;

  // Read-only access to execution_chains_ (a member kept for testing).
  const dag_utils::ExecutionChains& TEST_execution_chains() const {
    return execution_chains_;
  }

  // Profiling accessors; presumably backed by counters_ — TODO confirm.
  ProfDAGProtos GetOperatorStats() const;
  ProfDAGProtos GetPerOperatorCost() const;
  ProfDAGReport GetProfReport() const;

 protected:
  // Scheduling predicates. `status` and `parent_failed` are optional
  // (default nullptr). NOTE(review): `parent_failed` looks like an
  // out-parameter and `status` like a precomputed list of event statuses —
  // confirm against the implementation.
  bool canSchedule(
      int chain_id,
      const std::vector<EventStatus>* status = nullptr,
      bool* parent_failed = nullptr);
  bool canSchedule(int parent_id, int child_id);

  // Task graph queries and per-task scheduling state.
  int tasksNum() const;
  Event& event(int task_id) const;
  EventStatus query(int task_id) const;
  const std::vector<int>& children(int task_id) const;
  const std::vector<int>& parents(int task_id) const;
  int updateParentCount(int child_id);
  int getParentCount(int child_id);
  bool testAndSetScheduled(int task_id);
  int numOps(int task_id) const;

  // Access to the first/last operator of a task, by operator id or by
  // pointer (const and non-const overloads).
  int firstTaskOpId(int task_id) const;
  int lastTaskOpId(int task_id) const;
  const OperatorBase* firstTaskOp(int task_id) const;
  const OperatorBase* lastTaskOp(int task_id) const;
  OperatorBase* firstTaskOp(int task_id);
  OperatorBase* lastTaskOp(int task_id);

  // Task execution helpers. run() is noexcept and reports its outcome through
  // the bool return value.
  void asyncWait(
      int task_id,
      int stream_id,
      const std::vector<int>& wait_task_ids) const;
  bool run(int task_id, int stream_id) noexcept;
  int stream(int task_id);
  // Thread pool lookup; the overload taking a DeviceOption is the one called
  // by AsyncNetExecutorHelper::GetPool.
  TaskThreadPoolBase* pool(const DeviceOption& device_option);
  TaskThreadPoolBase* pool();

  void finishTasks(const std::unordered_set<int>& task_ids);
  void finalizeEvents();

  bool isStreamFree(int task_id, int stream_id) const;

  // Virtual so derived executors can extend it.
  virtual void reset();

  bool handleRunError() override;

  // Operator/task graph
  std::vector<OperatorBase*> operators_;
  std::vector<dag_utils::OperatorNode> operator_nodes_;
  std::vector<std::vector<int>> chains_;
  std::vector<dag_utils::OpGraphNode> chain_nodes_; // chains' parents/children
  dag_utils::ExecutionChains execution_chains_; // for testing

  // Pools and streams
  // NOTE(review): pools_mutex_ presumably guards cpu_pools_ / gpu_pools_ —
  // confirm in the implementation.
  std::mutex pools_mutex_;
  // first int key - device id, second - pool size, one pool per (device, size)
  typedef std::unordered_map<
      int,
      std::unordered_map<int, std::shared_ptr<TaskThreadPoolBase>>>
      PoolsMap;
  PoolsMap cpu_pools_;
  PoolsMap gpu_pools_;
  // Static accessor returning a mutable reference to a vector of stream
  // counters; storage and scope are defined in the implementation.
  static std::vector<int>& getStreamCounters();
  int num_workers_;

  // Exception/error handling
  // noexcept: error reporting itself must not throw. `save_exception`
  // defaults to false.
  void handleChainError(
      int task_id,
      OperatorBase* op,
      const char* err_msg,
      bool save_exception = false) noexcept;
  std::atomic<bool> success_;

  // Tracing
  std::shared_ptr<tracing::Tracer> tracer_;

  // execution mode flags
  ExecutionOptions options_;

  ProfDAGCounters counters_;

  // Disables copying of the net (per the macro name).
  C10_DISABLE_COPY_AND_ASSIGN(AsyncNetBase);

 private:
  // Internal helper behind the pool lookup; takes the map to search plus the
  // device type, device id and pool size.
  TaskThreadPoolBase*
  poolGetter(PoolsMap& pools, int device_type, int device_id, int pool_size);

  // Owned helper object; AsyncNetExecutorHelper is only forward-declared at
  // this point and fully defined later in this header.
  std::unique_ptr<AsyncNetExecutorHelper> helper_;

  // Friends need access to protected members, e.g. AsyncNetExecutorHelper
  // calls pool(const DeviceOption&).
  friend class AsyncNetExecutorHelper;
  friend class tracing::Tracer;
};
 
class AsyncNetExecutorHelper : public ExecutorHelper {
 public:
  explicit AsyncNetExecutorHelper(AsyncNetBase* net) : net_(net) {}
  TaskThreadPoolBase* GetPool(const DeviceOption& option) const override {
    return net_->pool(option);
  }
 
 private:
  AsyncNetBase* net_;
};
 
template <class TaskThreadPoolImpl, int device_type>
std::shared_ptr<TaskThreadPoolBase>
GetAsyncNetThreadPool(int device_id, int pool_size, bool create_new) {
  static std::unordered_map<
      int,
      std::unordered_map<int, std::weak_ptr<TaskThreadPoolBase>>>
      pools;
  static std::mutex pool_mutex;
 
  const auto& device_type_name = DeviceTypeName(device_type);
 
  if (pool_size <= 0) {
    if (FLAGS_caffe2_net_async_thread_pool_size > 0) {
      pool_size = FLAGS_caffe2_net_async_thread_pool_size;
      LOG(INFO) << "Using default " << device_type_name
                << " pool size: " << pool_size << "; device id: " << device_id;
    } else {
      auto num_cores = std::thread::hardware_concurrency();
      CAFFE_ENFORCE(num_cores > 0, "Failed to get number of CPU cores");
      LOG(INFO) << "Using estimated " << device_type_name
                << " pool size: " << num_cores << "; device id: " << device_id;
      pool_size = num_cores;
    }
  } else {
    LOG(INFO) << "Using specified " << device_type_name
              << " pool size: " << pool_size << "; device id: " << device_id;
  }
 
  if (create_new) {
    LOG(INFO) << "Created new " << device_type_name
              << " pool, size: " << pool_size << "; device id: " << device_id;
    return std::make_shared<TaskThreadPoolImpl>(pool_size, device_id);
  } else {
    std::lock_guard<std::mutex> lock(pool_mutex);
 
    auto shared_pool = pools[device_id][pool_size].lock();
    if (!shared_pool) {
      LOG(INFO) << "Created shared " << device_type_name
                << " pool, size: " << pool_size << "; device id: " << device_id;
      shared_pool = std::make_shared<TaskThreadPoolImpl>(pool_size, device_id);
      pools[device_id][pool_size] = shared_pool;
    }
    return shared_pool;
  }
}
 
} // namespace caffe2
 
#endif // CAFFE2_CORE_NET_ASYNC_BASE_H_