#pragma once

#ifdef USE_CUDA
#include <c10/core/Allocator.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/Logging.h>
#include <cuda_runtime_api.h>

#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>
#include <string>
#include <vector>
namespace torch {

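// Collects "sent" CUDA IPC blocks whose shared reference counter has dropped
// to zero. It is exposed here so the caching allocator can trigger it through
// CudaIPCCollectCallback (declared at the bottom of this header).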
bool CudaIPCCollect();

struct CudaIPCReceivedData final {
  explicit CudaIPCReceivedData(std::shared_ptr<void> shared_ptr)
      : shared_ptr_(std::move(shared_ptr)) {}
  std::shared_ptr<void> shared_ptr_;
};

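// Bookkeeping for a CUDA allocation that has been sent to another process:
// the shared-memory handle and offset of its reference counter, the original
// DataPtr that must stay alive, and a CUDA event used to synchronize before
// the memory is finally released.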
struct CudaIPCSentData final {
  std::string handle_;
  int64_t offset_;
  int64_t* counter_ptr_; // Reference counter shared memory block
  at::DataPtr original_ptr_; // Original memory allocation
  cudaEvent_t event_; // Synchronized and destroyed (cudaEventDestroy) in the destructor
  bool event_sync_required_;
  at::Device device_;

  CudaIPCSentData(
      std::string handle,
      int64_t offset,
      int64_t* counter_ptr,
      at::Device device);
  ~CudaIPCSentData();

  int64_t counter_value();
  std::string handle() {
    return handle_;
  }
  int64_t offset() {
    return offset_;
  }
  void set_original_ptr(at::DataPtr data_ptr) {
    original_ptr_ = std::move(data_ptr);
  }
};

at::DataPtr GetNewRefCountedSentData(void* data, at::Device device);
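// Illustrative sketch only (not part of this header's API; names below are
// hypothetical placeholders): the sending side wraps a CUDA allocation in the
// ref-counted DataPtr returned by GetNewRefCountedSentData before handing the
// IPC handle to another process.
//
//   void* dev_ptr = /* CUDA allocation to be shared over IPC */;
//   at::Device dev(at::kCUDA, 0);
//   at::DataPtr sent = torch::GetNewRefCountedSentData(dev_ptr, dev);
//   // ... serialize the IPC handle / ref-counter offset for the receiver ...
//   // When `sent` is released while still referenced by the receiver, the
//   // block is parked in CudaIPCSentDataLimbo (below) until its counter
//   // drops to zero.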

namespace {

constexpr int64_t CUDA_IPC_REF_COUNTER_FILE_SIZE = 10000;
constexpr int64_t CUDA_IPC_WARN_AFTER_X_BLOCKS_IN_LIMBO = 1000;
// It was determined empirically that CUDA (v10.1 and below) has a limit of
// roughly ~22,000 recorded blocking interprocess events. To leave plenty of
// headroom we cap usage at 1000, which is still enough events to share
// tensors effectively.
constexpr int64_t CUDA_IPC_MAXIMUM_EVENTS_TO_USE = 1000;

// Data blocks that have been "deleted" on the sending side but still have a
// non-zero reference counter end up here until they can actually be freed.
struct CudaIPCSentDataLimbo final {
  ~CudaIPCSentDataLimbo();
  bool collect();
  void add(std::unique_ptr<CudaIPCSentData> shared_block);
  uint64_t size() {
    return shared_blocks_.size();
  }

 private:
  // TODO: Could be changed to a FIFO queue to avoid a full traversal on every
  // collect().
  std::vector<std::unique_ptr<CudaIPCSentData>> shared_blocks_;
  std::mutex limbo_mutex_;
};

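// Manages a shared-memory file of reference counters. Each sent block is
// assigned one int64_t slot (an offset into the file); rotate_offset() hands
// out the next slot and return_offset() records that a slot is no longer in
// use (slots are not reused, hence its unused parameter).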
struct CudaIPCRefCountersFile final {
  CudaIPCRefCountersFile(std::string handle, uint64_t size, at::DataPtr data_ptr)
      : next_offset_(0),
        size_(size),
        used_slots_(0),
        handle_(std::move(handle)),
        refcounted_shared_mem_(std::move(data_ptr)) {}

  int64_t* counter_ptr() {
    return static_cast<int64_t*>(refcounted_shared_mem_.get()) + next_offset_;
  }

  void set_counter(uint64_t value) {
    *counter_ptr() = value;
  }

  bool have_offsets() {
    return next_offset_ < size_;
  }

  bool offsets_in_use() {
    return used_slots_ > 0;
  }

  int64_t get_offset() {
    return next_offset_;
  }

  void rotate_offset() {
    next_offset_++;
    used_slots_++;
  }

  void return_offset(uint64_t offset /* unused */) {
    used_slots_--;
  }

  std::string handle() {
    return handle_;
  }

 private:
  uint64_t next_offset_;
  uint64_t size_;
  uint64_t used_slots_;
  std::string handle_;
  at::DataPtr refcounted_shared_mem_;
};

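// Slot lifecycle sketch (illustrative only; the real call sites live in the
// matching .cpp, and `ref_file` is a hypothetical placeholder):
//
//   if (ref_file.have_offsets()) {
//     int64_t offset = ref_file.get_offset(); // slot for the next sent block
//     ref_file.set_counter(1);                // receiver starts at one reference
//     ref_file.rotate_offset();               // advance to the next free slot
//     // ...
//     ref_file.return_offset(offset);         // once the block is finally freed
//   }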

} // namespace
} // namespace torch

namespace c10 {
namespace {
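// FreeMemoryCallback hook through which the CUDA caching allocator can
// trigger torch::CudaIPCCollect() when it needs to reclaim memory.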
class CudaIPCCollectCallback : public FreeMemoryCallback {
 public:
  ~CudaIPCCollectCallback() override = default;
  bool Execute() override {
    return torch::CudaIPCCollect();
  }
};
} // namespace

} // namespace c10

#endif // USE_CUDA