#ifndef CAFFE2_CORE_CONTEXT_GPU_H_
#define CAFFE2_CORE_CONTEXT_GPU_H_

#include <ctime>
#include <mutex>

#include "caffe2/core/common.h"
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context.h"
#include "caffe2/core/context_base.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/numa.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2_pb.h"

// Since we are using the macro CAFFE2_USE_CUDNN, we will need to include this
// file after common.h is included.
#ifdef CAFFE2_USE_CUDNN
#include "caffe2/core/common_cudnn.h"
#endif // CAFFE2_USE_CUDNN

#include <c10/core/Device.h>
#include <c10/core/Stream.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/cuda/CUDAGuard.h>

namespace caffe2 {

enum class CudaMemoryPoolType {
  NONE = 0,
  CUB = 1,
  THC = 2,
};

/**
 * Gets the current memory pool type used by Caffe2.
 *
 * The memory pool is set up during caffe2's global initialization time.
 */
CAFFE2_CUDA_API CudaMemoryPoolType GetCudaMemoryPoolType();

/**
 * A struct to host thread-local cuda objects.
 *
 * In Caffe2, each thread has its own non-default cuda stream as well as
 * related objects such as cublas and curand handles. This is achieved by
 * having the ThreadLocalCUDAObjects wrapper that takes care of allocating
 * and deallocating these objects at the thread scope. This class is solely
 * used inside CUDAContext and should not be used externally.
 *
 * This class manages the mapping from the logical stream ID (the int
 * stream_id passed around in Caffe2) to CUDAStream objects. We intend to
 * eventually deprecate the logical stream ID interface, but not for now.
 */
class CAFFE2_CUDA_API ThreadLocalCUDAObjects {
  friend class CUDAContext;

 private:
  ThreadLocalCUDAObjects() {
    for (DeviceIndex i = 0; i < C10_COMPILE_TIME_MAX_GPUS; ++i) {
      cuda_streams_[i] = vector<c10::cuda::CUDAStream>();
    }
  }

  // Records the current stream id for the current thread.
  // This is the new API we're trying to migrate use cases to, so that we can
  // get rid of explicit stream id passing. For now it's invoked in
  // CUDAContext::SwitchToDevice.
  void SetCurrentStreamId(DeviceIndex gpu, StreamId stream_id) {
    // TODO: use current device id from thread local instead of passing gpu in
    c10::cuda::setCurrentCUDAStream(GetCUDAStream(gpu, stream_id));
  }

  // Retrieves the CUDAStream corresponding to a logical stream ID, ensuring
  // that it exists in cuda_streams_ if it has not been allocated yet.
  c10::cuda::CUDAStream GetCUDAStream(DeviceIndex gpu, StreamId stream_id) {
    vector<c10::cuda::CUDAStream>& gpu_streams = cuda_streams_[gpu];
    while (gpu_streams.size() <= static_cast<size_t>(stream_id)) {
      // NB: These streams are not guaranteed to be unique; we'll
      // wrap around once we run out of streams in the pool.
      gpu_streams.emplace_back(
          c10::cuda::getStreamFromPool(/* high priority */ false, gpu));
    }
    return gpu_streams[stream_id];
  }
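
  // Illustrative sketch (editorial addition, not in the original source):
  // c10's stream pool holds a fixed number of pre-allocated streams per
  // device, so distinct logical stream IDs can resolve to the same pool
  // stream once the pool is exhausted. Assuming a pool of 32 low-priority
  // streams (c10's current default), hypothetically:
  //
  //   ThreadLocalCUDAObjects objs;
  //   auto s0  = objs.GetCUDAStream(0, /* stream_id = */ 0);
  //   auto s32 = objs.GetCUDAStream(0, /* stream_id = */ 32);
  //   // s0 and s32 may share the same underlying cudaStream_t; never
  //   // assume that distinct logical IDs imply distinct streams.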
  // Uses the logical stream id from the thread local to pick the stream.
  // We're going to migrate all usages to this API instead of passing the
  // stream id directly.
  cudaStream_t GetStream(DeviceIndex gpu) {
    return c10::cuda::getCurrentCUDAStream(gpu).stream();
  }

  cudaStream_t GetStream(DeviceIndex gpu, StreamId stream_id) {
    return GetCUDAStream(gpu, stream_id).stream();
  }

  // Uses the logical stream id from the thread local to pick the stream.
  // We're going to migrate all usages to this API instead of passing the
  // stream id directly.
  cublasHandle_t GetHandle(DeviceIndex gpu) {
    return GetHandle(c10::cuda::getCurrentCUDAStream(gpu));
  }

  cublasHandle_t GetHandle(c10::cuda::CUDAStream cuda_stream) {
    CUDAGuard guard(cuda_stream.device_index());
    // Default construct in the map if it doesn't exist, and return a mutable
    // reference to it.
    auto& r = cublas_handles_[cuda_stream];
    if (r == nullptr) {
      CUBLAS_ENFORCE(cublasCreate(&r));
      // The default is CUBLAS_POINTER_MODE_HOST. You can override
      // it after obtaining the cublas handle, but do that with
      // caution.
      CUBLAS_ENFORCE(cublasSetPointerMode(r, CUBLAS_POINTER_MODE_HOST));
      CUBLAS_ENFORCE(cublasSetStream(r, cuda_stream));
    }
    return r;
  }

#ifdef CAFFE2_USE_CUDNN
  // Uses the logical stream id from the thread local to pick the stream.
  // We're going to migrate all usages to this API instead of passing the
  // stream id directly.
  cudnnHandle_t GetCudnnHandle(DeviceIndex gpu) {
    return GetCudnnHandle(c10::cuda::getCurrentCUDAStream(gpu));
  }

  cudnnHandle_t GetCudnnHandle(c10::cuda::CUDAStream cuda_stream) {
    CUDAGuard guard(cuda_stream.device_index());
    auto& r = cudnn_handles_[cuda_stream];
    if (r == nullptr) {
      CUDNN_ENFORCE(cudnnCreate(&r));
      CUDNN_ENFORCE(cudnnSetStream(r, cuda_stream));
    }
    return r;
  }
#endif // CAFFE2_USE_CUDNN

  ~ThreadLocalCUDAObjects() noexcept {
    for (auto element : cublas_handles_) {
      if (element.second) {
        CUBLAS_CHECK(cublasDestroy(element.second));
      }
    }
#ifdef CAFFE2_USE_CUDNN
    for (auto element : cudnn_handles_) {
      if (element.second) {
        CUDNN_CHECK(cudnnDestroy(element.second));
      }
    }
#endif // CAFFE2_USE_CUDNN
  }

  // WARNING: the mapping from logical stream ID to c10::cuda::CUDAStream
  // is NOT bijective; multiple logical stream IDs may map to the same
  // underlying stream.
  vector<c10::cuda::CUDAStream> cuda_streams_[C10_COMPILE_TIME_MAX_GPUS];
  std::unordered_map<c10::cuda::CUDAStream, cublasHandle_t> cublas_handles_;
#ifdef CAFFE2_USE_CUDNN
  std::unordered_map<c10::cuda::CUDAStream, cudnnHandle_t> cudnn_handles_;
#endif // CAFFE2_USE_CUDNN
};

class CAFFE2_CUDA_API CUDAContext final : public BaseContext {
 public:
  // The default cuda context constructor.
  explicit CUDAContext(DeviceIndex gpu_id = -1);
  explicit CUDAContext(const DeviceOption& option);
  explicit CUDAContext(Device device)
      : CUDAContext(DeviceToOption(device)) {}

  ~CUDAContext() override {
    if (curand_generator_) {
      CURAND_CHECK(curandDestroyGenerator(curand_generator_));
    }
    // CUDAContext is used in two cases now:
    // - a long-lived instance inside OperatorBase, in which case what happens
    //   in the destructor doesn't really matter;
    // - short-lived on-the-fly instances that are utilized as CUDAGuard - in
    //   this case there's only one stream id (passed to SwitchToDevice) and
    //   it's preferable to synchronize in the destructor.
    FinishDeviceComputation();
  }

  inline void SwitchToDevice(StreamId stream_id) override {
    getCudaObjects().SetCurrentStreamId(gpu_id_, stream_id);
    CaffeCudaSetDevice(gpu_id_);
  }

  // void SwitchToDevice()
  using BaseContext::SwitchToDevice;

  inline void WaitEvent(const Event& ev) override {
    ev.Wait(CUDA, this);
  }

  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
    CAFFE_ENFORCE(ev, "Event must not be null.");
    ev->Record(CUDA, this, err_msg);
  }

  // Note on current use cases:
  // FinishDeviceComputation must be called on the same cpu thread as
  // SwitchToDevice()
  void FinishDeviceComputation() override {
    CUDA_ENFORCE(cudaStreamSynchronize(getCudaObjects().GetStream(gpu_id_)));
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
      CAFFE_THROW("Encountered CUDA error: ", cudaGetErrorString(error));
    }
  }

  inline int device_id() const {
    return gpu_id_;
  }

  inline cudaStream_t cuda_stream() const {
    return getCudaObjects().GetStream(gpu_id_);
  }

  static cudaStream_t cuda_stream(DeviceIndex gpu_id, StreamId stream_id) {
    return getCudaObjects().GetStream(gpu_id, stream_id);
  }

  cublasHandle_t cublas_handle() {
    return getCudaObjects().GetHandle(gpu_id_);
  }
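
  // Usage sketch (editorial addition, not in the original source): the
  // handle returned above is created lazily, cached per stream in
  // ThreadLocalCUDAObjects, and already bound to this thread's current
  // stream via cublasSetStream, so a caller could hypothetically do:
  //
  //   CUDAContext context(/* gpu_id = */ 0);
  //   context.SwitchToDevice(0);  // logical stream id 0
  //   cublasHandle_t handle = context.cublas_handle();
  //   // cublasSgemm(handle, ...) would launch on the context's stream.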
#ifdef CAFFE2_USE_CUDNN
  cudnnHandle_t cudnn_handle() {
    return getCudaObjects().GetCudnnHandle(gpu_id_);
  }
#endif // CAFFE2_USE_CUDNN

  curandGenerator_t& curand_generator() {
    if (!curand_generator_) {
      CUDAGuard guard(gpu_id_);
      CURAND_ENFORCE(curandCreateGenerator(
          &curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
      CURAND_ENFORCE(curandSetPseudoRandomGeneratorSeed(
          curand_generator_, random_seed_));
      CHECK_NOTNULL(curand_generator_);
    }
    CURAND_ENFORCE(curandSetStream(curand_generator_, cuda_stream()));
    return curand_generator_;
  }

  inline static at::DataPtr New(size_t nbytes) {
    return GetAllocator(CUDA)->allocate(nbytes);
  }

  // Gets a mutex to lock out cudaMalloc / cudaFree calls while NCCL kernels
  // are being launched. This should remove the risk of deadlocks.
  static std::mutex& mutex();

  // Functions to query memory stats. Only available if the flag
  // --caffe2_gpu_memory_tracking is enabled.
  static std::vector<long> TotalMemoryByGpu();
  static std::vector<long> MaxMemoryByGpu();

  template <class SrcContext, class DstContext>
  inline void CopyBytes(size_t nbytes, const void* src, void* dst) {
    CUDA_ENFORCE(cudaMemcpyAsync(
        dst,
        src,
        nbytes,
        cudaMemcpyDefault,
        getCudaObjects().GetStream(gpu_id_)));
  }

  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst)
      override {
    CopyBytes<CUDAContext, CUDAContext>(nbytes, src, dst);
  }

  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
    CopyBytes<CUDAContext, CPUContext>(nbytes, src, dst);
  }

  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
    CopyBytes<CPUContext, CUDAContext>(nbytes, src, dst);
  }

  template <typename T, class SrcContext, class DstContext>
  inline void Copy(int n, const T* src, T* dst) {
    CopyBytes<SrcContext, DstContext>(
        n * sizeof(T),
        static_cast<const void*>(src),
        static_cast<void*>(dst));
  }

  template <class SrcContext, class DstContext>
  inline void
  CopyItems(const TypeMeta& meta, size_t n, const void* src, void* dst) {
    CAFFE_ENFORCE(!meta.copy(), "CUDAContext requires fundamental types.");
    CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
  }

  static void CopyBytesAsync(
      size_t nbytes,
      const void* src,
      Device src_device,
      void* dst,
      Device dst_device);
  static void CopyBytesSync(
      size_t nbytes,
      const void* src,
      Device src_device,
      void* dst,
      Device dst_device);

  // By default CUDA operators have async device parts.
  static bool HasAsyncPartDefault() {
    return true;
  }

  static bool SupportsAsyncScheduling() {
    return true;
  }

  static bool IsStreamFree(const DeviceOption& option, StreamId stream_id) {
    auto stream = CUDAContext::cuda_stream(option.device_id(), stream_id);
    return cudaStreamQuery(stream) == cudaSuccess;
  }

  at::Device device() const override {
    return at::Device(CUDA, gpu_id_);
  }

  DeviceType device_type() const override {
    return CUDA;
  }

  static constexpr DeviceType GetDeviceType() {
    return CUDA;
  }

 protected:
  int gpu_id_;
  int random_seed_;
  curandGenerator_t curand_generator_{nullptr};
  static ThreadLocalCUDAObjects& getCudaObjects();
};

using TensorCUDA = Tensor;

} // namespace caffe2

#endif // CAFFE2_CORE_CONTEXT_GPU_H_
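
// Usage sketch (editorial addition, not in the original source): a typical
// lifecycle for a short-lived context; nbytes, cpu_ptr, and gpu_ptr are
// hypothetical names.
//
//   caffe2::CUDAContext context(0);                      // bind to GPU 0
//   context.SwitchToDevice(0);                           // logical stream 0
//   context.CopyBytesFromCPU(nbytes, cpu_ptr, gpu_ptr);  // async H2D copy
//   context.FinishDeviceComputation();  // sync stream and surface errors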