#ifndef CAFFE2_CORE_CONTEXT_GPU_H_
#define CAFFE2_CORE_CONTEXT_GPU_H_

#include <ctime>
#include <mutex>
#include <unordered_map>
#include <vector>

#include "caffe2/core/common.h"
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context.h"
#include "caffe2/core/context_base.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/numa.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2_pb.h"

// Since we are using the macro CAFFE2_USE_CUDNN, we need to include this
// file after common.h is included.
#ifdef CAFFE2_USE_CUDNN
#include "caffe2/core/common_cudnn.h"
#endif // CAFFE2_USE_CUDNN

#include <c10/core/Device.h>
#include <c10/core/Stream.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/cuda/CUDAGuard.h>

namespace caffe2 {

enum class CudaMemoryPoolType {
  NONE = 0,
  CUB = 1,
  THC = 2,
};

/**
 * Gets the current memory pool type used by Caffe2.
 *
 * The memory pool is set up during caffe2's global initialization time.
 */
CAFFE2_CUDA_API CudaMemoryPoolType GetCudaMemoryPoolType();
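
// A minimal usage sketch: branch on the configured pool type. The log
// message is illustrative, not part of the API.
//
//   if (GetCudaMemoryPoolType() == CudaMemoryPoolType::NONE) {
//     LOG(INFO) << "No CUDA memory pool; allocations hit cudaMalloc directly.";
//   }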

/**
 * A struct to host thread-local cuda objects.
 *
 * In Caffe2, each thread has its own non-default cuda stream as well as
 * related objects such as cublas and curand handles. This is achieved by
 * having the ThreadLocalCUDAObjects wrapper that takes care of allocating
 * and deallocating these objects at the thread scope. This class is solely
 * used inside CUDAContext and should not be used externally.
 *
 * This class manages the mapping from logical stream IDs (the int stream_id
 * passed around in Caffe2) to CUDAStream objects. We intend to eventually
 * deprecate the logical stream ID interface, but not for now.
 */
class CAFFE2_CUDA_API ThreadLocalCUDAObjects {
  friend class CUDAContext;

 private:
  ThreadLocalCUDAObjects() {
    for (DeviceIndex i = 0; i < C10_COMPILE_TIME_MAX_GPUS; ++i) {
      cuda_streams_[i] = vector<c10::cuda::CUDAStream>();
    }
  }

  // Record the current stream id for the current thread.
  // This is the new API we're trying to migrate use cases to, in order to
  // get rid of explicit stream id passing. For now it's invoked in
  // CUDAContext::SwitchToDevice.
  void SetCurrentStreamId(DeviceIndex gpu, StreamId stream_id) {
    // TODO: use the current device id from thread-local state instead of
    // passing gpu in.
    c10::cuda::setCurrentCUDAStream(GetCUDAStream(gpu, stream_id));
  }

  // Retrieves the CUDAStream corresponding to a logical stream ID,
  // allocating it in cuda_streams_ if it does not exist yet.
  c10::cuda::CUDAStream GetCUDAStream(DeviceIndex gpu, StreamId stream_id) {
    vector<c10::cuda::CUDAStream>& gpu_streams = cuda_streams_[gpu];
    while (gpu_streams.size() <= static_cast<size_t>(stream_id)) {
      // NB: These streams are not guaranteed to be unique; we'll
      // wrap around once we run out of streams in the pool.
      gpu_streams.emplace_back(
          c10::cuda::getStreamFromPool(/* high priority */ false, gpu));
    }
    return gpu_streams[stream_id];
  }
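
  // Note: c10's stream pool is finite (assuming the usual pool size of 32
  // streams per device), so e.g. logical stream IDs 0 and 32 would resolve
  // to the same underlying CUDA stream once the pool wraps around.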

  // Uses the logical stream id from the thread local to pick the stream.
  // We're going to migrate all usages to this API instead of passing the
  // stream id directly.
  cudaStream_t GetStream(DeviceIndex gpu) {
    return c10::cuda::getCurrentCUDAStream(gpu).stream();
  }

  cudaStream_t GetStream(DeviceIndex gpu, StreamId stream_id) {
    return GetCUDAStream(gpu, stream_id).stream();
  }

  // Uses the logical stream id from the thread local to pick the stream.
  // We're going to migrate all usages to this API instead of passing the
  // stream id directly.
  cublasHandle_t GetHandle(DeviceIndex gpu) {
    return GetHandle(c10::cuda::getCurrentCUDAStream(gpu));
  }

  cublasHandle_t GetHandle(c10::cuda::CUDAStream cuda_stream) {
    CUDAGuard guard(cuda_stream.device_index());
    // Default construct in the map if it doesn't exist, and return a mutable
    // reference to it.
    auto& r = cublas_handles_[cuda_stream];
    if (r == nullptr) {
      CUBLAS_ENFORCE(cublasCreate(&r));
      // The default is CUBLAS_POINTER_MODE_HOST. You can override
      // it after obtaining the cublas handle, but do that with
      // caution.
      CUBLAS_ENFORCE(cublasSetPointerMode(r, CUBLAS_POINTER_MODE_HOST));
      CUBLAS_ENFORCE(cublasSetStream(r, cuda_stream));
    }
    return r;
  }

#ifdef CAFFE2_USE_CUDNN
  // Uses the logical stream id from the thread local to pick the stream.
  // We're going to migrate all usages to this API instead of passing the
  // stream id directly.
  cudnnHandle_t GetCudnnHandle(DeviceIndex gpu) {
    return GetCudnnHandle(c10::cuda::getCurrentCUDAStream(gpu));
  }

  cudnnHandle_t GetCudnnHandle(c10::cuda::CUDAStream cuda_stream) {
    CUDAGuard guard(cuda_stream.device_index());
    auto& r = cudnn_handles_[cuda_stream];
    if (r == nullptr) {
      CUDNN_ENFORCE(cudnnCreate(&r));
      CUDNN_ENFORCE(cudnnSetStream(r, cuda_stream));
    }
    return r;
  }
#endif // CAFFE2_USE_CUDNN

  ~ThreadLocalCUDAObjects() noexcept {
    for (auto& element : cublas_handles_) {
      if (element.second) {
        CUBLAS_CHECK(cublasDestroy(element.second));
      }
    }
#ifdef CAFFE2_USE_CUDNN
    for (auto& element : cudnn_handles_) {
      if (element.second) {
        CUDNN_CHECK(cudnnDestroy(element.second));
      }
    }
#endif // CAFFE2_USE_CUDNN
  }

  // WARNING: the mapping from logical stream ID to c10::cuda::CUDAStream
  // is NOT bijective; multiple logical stream IDs may map to the
  // same underlying stream ID.
  vector<c10::cuda::CUDAStream> cuda_streams_[C10_COMPILE_TIME_MAX_GPUS];
  std::unordered_map<c10::cuda::CUDAStream, cublasHandle_t> cublas_handles_;
#ifdef CAFFE2_USE_CUDNN
  std::unordered_map<c10::cuda::CUDAStream, cudnnHandle_t> cudnn_handles_;
#endif // CAFFE2_USE_CUDNN
};

class CAFFE2_CUDA_API CUDAContext final : public BaseContext {
 public:
  // The default cuda context constructor.
  explicit CUDAContext(DeviceIndex gpu_id = -1);
  explicit CUDAContext(const DeviceOption& option);
  explicit CUDAContext(Device device)
      : CUDAContext(DeviceToOption(device)) {}

  ~CUDAContext() override {
    if (curand_generator_) {
      CURAND_CHECK(curandDestroyGenerator(curand_generator_));
    }
    // CUDAContext is used in 2 cases now:
    // - a long-lived instance inside OperatorBase, in which case what
    //   happens in the destructor doesn't really matter
    // - short-lived on-the-fly instances that are utilized as CUDAGuard - in
    //   this case there's only one stream id (passed to SwitchToDevice) and
    //   it's preferable to synchronize in the destructor
    FinishDeviceComputation();
  }

  inline void SwitchToDevice(StreamId stream_id) override {
    getCudaObjects().SetCurrentStreamId(gpu_id_, stream_id);
    CaffeCudaSetDevice(gpu_id_);
  }

  // void SwitchToDevice()
  using BaseContext::SwitchToDevice;

  inline void WaitEvent(const Event& ev) override {
    ev.Wait(CUDA, this);
  }

  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
    CAFFE_ENFORCE(ev, "Event must not be null.");
    ev->Record(CUDA, this, err_msg);
  }

  // Note on current use cases:
  // FinishDeviceComputation must be called on the same cpu thread as
  // SwitchToDevice()
  void FinishDeviceComputation() override {
    CUDA_ENFORCE(cudaStreamSynchronize(getCudaObjects().GetStream(gpu_id_)));
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
      CAFFE_THROW("Encountered CUDA error: ", cudaGetErrorString(error));
    }
  }

  inline int device_id() const {
    return gpu_id_;
  }

  inline cudaStream_t cuda_stream() const {
    return getCudaObjects().GetStream(gpu_id_);
  }

  static cudaStream_t cuda_stream(DeviceIndex gpu_id, StreamId stream_id) {
    return getCudaObjects().GetStream(gpu_id, stream_id);
  }

  cublasHandle_t cublas_handle() {
    return getCudaObjects().GetHandle(gpu_id_);
  }
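
  // A minimal sketch of using the cached handle; context, d_a, d_b, and d_c
  // are hypothetical (device pointers to column-major n x n float matrices):
  //
  //   const float alpha = 1.0f, beta = 0.0f;
  //   CUBLAS_ENFORCE(cublasSgemm(
  //       context.cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N,
  //       n, n, n, &alpha, d_a, n, d_b, n, &beta, d_c, n));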

#ifdef CAFFE2_USE_CUDNN
  cudnnHandle_t cudnn_handle() {
    return getCudaObjects().GetCudnnHandle(gpu_id_);
  }
#endif // CAFFE2_USE_CUDNN

  curandGenerator_t& curand_generator() {
    if (!curand_generator_) {
      CUDAGuard guard(gpu_id_);
      CURAND_ENFORCE(
          curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
      CURAND_ENFORCE(
          curandSetPseudoRandomGeneratorSeed(curand_generator_, random_seed_));
      CHECK_NOTNULL(curand_generator_);
    }
    CURAND_ENFORCE(curandSetStream(curand_generator_, cuda_stream()));
    return curand_generator_;
  }
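
  // A minimal sketch of drawing uniform floats on the context's stream;
  // context and d_data (a device pointer with room for n floats) are
  // hypothetical:
  //
  //   CURAND_ENFORCE(curandGenerateUniform(
  //       context.curand_generator(), d_data, n));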

  inline static at::DataPtr New(size_t nbytes) {
    return GetAllocator(CUDA)->allocate(nbytes);
  }
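
  // A minimal sketch: allocate device memory through the registered CUDA
  // allocator. The DataPtr releases the memory when it goes out of scope.
  //
  //   at::DataPtr buf = CUDAContext::New(1024 * sizeof(float));
  //   float* d_buf = static_cast<float*>(buf.get());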

  // Get a mutex to lock out cudaMalloc / cudaFree calls when
  // NCCL kernels are being launched. This should remove the threat of
  // deadlocks.
  static std::mutex& mutex();
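
  // A minimal sketch of the intended locking pattern around a raw CUDA
  // allocation (ptr and nbytes are hypothetical):
  //
  //   {
  //     std::lock_guard<std::mutex> lock(CUDAContext::mutex());
  //     CUDA_ENFORCE(cudaMalloc(&ptr, nbytes));
  //   }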

  // Functions to query memory stats. Only available if flag
  // --caffe2_gpu_memory_tracking is enabled.
  static std::vector<long> TotalMemoryByGpu();
  static std::vector<long> MaxMemoryByGpu();

  template <class SrcContext, class DstContext>
  inline void CopyBytes(size_t nbytes, const void* src, void* dst) {
    CUDA_ENFORCE(cudaMemcpyAsync(
        dst,
        src,
        nbytes,
        cudaMemcpyDefault,
        getCudaObjects().GetStream(gpu_id_)));
  }

  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override {
    CopyBytes<CUDAContext, CUDAContext>(nbytes, src, dst);
  }

  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
    CopyBytes<CUDAContext, CPUContext>(nbytes, src, dst);
  }

  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
    CopyBytes<CPUContext, CUDAContext>(nbytes, src, dst);
  }

  template <typename T, class SrcContext, class DstContext>
  inline void Copy(int n, const T* src, T* dst) {
    CopyBytes<SrcContext, DstContext>(
        n * sizeof(T),
        static_cast<const void*>(src),
        static_cast<void*>(dst));
  }
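
  // A minimal sketch: copy n floats from a hypothetical host buffer to a
  // hypothetical device buffer. The copy is enqueued asynchronously on the
  // context's stream, so synchronize (e.g. FinishDeviceComputation()) before
  // relying on its completion.
  //
  //   context.Copy<float, CPUContext, CUDAContext>(n, host_src, device_dst);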

  template <class SrcContext, class DstContext>
  inline void
  CopyItems(const TypeMeta& meta, size_t n, const void* src, void* dst) {
    CAFFE_ENFORCE(!meta.copy(), "CUDAContext requires fundamental types.");
    CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
  }

  static void CopyBytesAsync(
      size_t nbytes,
      const void* src,
      Device src_device,
      void* dst,
      Device dst_device);
  static void CopyBytesSync(
      size_t nbytes,
      const void* src,
      Device src_device,
      void* dst,
      Device dst_device);

  // By default CUDA operators have async device parts.
  static bool HasAsyncPartDefault() {
    return true;
  }

  static bool SupportsAsyncScheduling() {
    return true;
  }

  static bool IsStreamFree(const DeviceOption& option, StreamId stream_id) {
    auto stream = CUDAContext::cuda_stream(option.device_id(), stream_id);
    return cudaStreamQuery(stream) == cudaSuccess;
  }
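
  // A minimal sketch: poll whether all work queued on a logical stream has
  // completed (option is a hypothetical DeviceOption for some GPU):
  //
  //   if (CUDAContext::IsStreamFree(option, /* stream_id */ 0)) {
  //     // Safe to schedule dependent host-side work.
  //   }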

  at::Device device() const override {
    return at::Device(CUDA, gpu_id_);
  }

  DeviceType device_type() const override {
    return CUDA;
  }

  static constexpr DeviceType GetDeviceType() {
    return CUDA;
  }

 protected:
  int gpu_id_;
  int random_seed_;
  curandGenerator_t curand_generator_{nullptr};
  static ThreadLocalCUDAObjects& getCudaObjects();
};
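
// A minimal end-to-end sketch of the short-lived usage pattern described in
// the destructor comment (gpu id, buffers, and sizes are hypothetical):
//
//   CUDAContext context(0);
//   context.SwitchToDevice(0);
//   context.CopyBytesFromCPU(nbytes, host_src, device_dst);
//   // ... enqueue kernels on context.cuda_stream() ...
//   context.FinishDeviceComputation();  // same thread as SwitchToDevice()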

using TensorCUDA = Tensor;

} // namespace caffe2

#endif // CAFFE2_CORE_CONTEXT_GPU_H_