#pragma once

#include <iostream>
#include <mutex>
#include <memory>
#include <vector>
#include <cstdint>
#include <string>
#include <sstream>
#include <forward_list>
#include <tuple>
#include <ATen/ATen.h>
#include <torch/csrc/WindowsTorchApiMacro.h>
#ifndef _WIN32
#include <ctime>
#endif

#include <torch/csrc/autograd/record_function.h>

typedef struct CUevent_st* CUDAEventStub;

namespace torch { namespace autograd {

struct Node;

namespace profiler {

struct TORCH_API CUDAStubs {
  virtual void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) {
    fail();
  }
  virtual float elapsed(CUDAEventStub event, CUDAEventStub event2) {
    fail();
    return 0.f;
  }
  virtual void nvtxMarkA(const char* name) {
    fail();
  }
  virtual void nvtxRangePushA(const char* name) {
    fail();
  }
  virtual void nvtxRangePop() {
    fail();
  }
  virtual bool enabled() {
    return false;
  }
  virtual void onEachDevice(std::function<void(int)> op) {
    fail();
  }
  virtual void synchronize() {
    fail();
  }
  virtual ~CUDAStubs();

private:
  void fail() {
    AT_ERROR("CUDA used in profiler but not enabled.");
  }
};

TORCH_API void registerCUDAMethods(CUDAStubs* stubs);

// Rounds a up to the nearest multiple of b, e.g. ceilToMultiple(10, 8) == 16.
constexpr inline size_t ceilToMultiple(size_t a, size_t b) {
  return ((a + b - 1) / b) * b;
}

#if defined(__MACH__) && !defined(CLOCK_REALTIME)
#include <sys/time.h>
// clock_gettime is not implemented on older versions of OS X (< 10.12).
// If implemented, CLOCK_REALTIME will have already been defined.
#endif

// Returns a monotonic timestamp in nanoseconds.
inline int64_t getTime() {
#ifdef _WIN32
  using namespace std::chrono;
  using clock = std::conditional<
      high_resolution_clock::is_steady,
      high_resolution_clock,
      steady_clock>::type;
  return duration_cast<nanoseconds>(clock::now().time_since_epoch()).count();
#elif defined(__MACH__) && !defined(CLOCK_REALTIME)
  struct timeval now;
  gettimeofday(&now, NULL);
  return static_cast<int64_t>(now.tv_sec) * 1000000000 +
      static_cast<int64_t>(now.tv_usec) * 1000;
#else
  // clock_gettime is *much* faster than the std::chrono implementation on Linux
  struct timespec t{};
  clock_gettime(CLOCK_MONOTONIC, &t);
  return static_cast<int64_t>(t.tv_sec) * 1000000000 +
      static_cast<int64_t>(t.tv_nsec);
#endif
}

// Old GCC versions generate warnings incorrectly;
// see https://stackoverflow.com/questions/2463113/g-c0x-enum-class-compiler-warnings
#ifndef _MSC_VER
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wattributes"
#endif

enum class TORCH_API ProfilerState {
  Disabled,
  CPU,  // CPU-only profiling
  CUDA, // CPU + CUDA events
  NVTX, // only emit NVTX markers
};

struct TORCH_API ProfilerConfig {
  ProfilerConfig(ProfilerState state, bool report_input_shapes)
      : state(state), report_input_shapes(report_input_shapes) {}
  ~ProfilerConfig();
  ProfilerState state;
  bool report_input_shapes;
};

enum class TORCH_API EventKind : uint16_t {
  Mark,
  PushRange,
  PopRange
};

#ifndef _MSC_VER
# pragma GCC diagnostic pop
#endif

struct TORCH_API Event final {
  Event(
      EventKind kind,
      StringView name,
      uint16_t thread_id,
      bool record_cuda,
      std::vector<std::vector<int64_t>>&& shapes = {})
      : name_(std::move(name)),
        kind_(kind),
        thread_id_(thread_id),
        shapes_(std::move(shapes)) {
    record(record_cuda);
  }

  void record(bool record_cuda);

  std::string kind() const {
    switch (kind_) {
      case EventKind::Mark: return "mark";
      case EventKind::PushRange: return "push";
      case EventKind::PopRange: return "pop";
    }
    throw std::runtime_error("unknown EventKind");
  }

  const char* name() const {
    return name_.str();
  }

  uint16_t thread_id() const {
    return thread_id_;
  }

  std::vector<std::vector<int64_t>> shapes() const {
    return shapes_;
  }

  double cpu_elapsed_us(const Event& e) {
    return (e.cpu_ns_ - cpu_ns_) / (1000.0);
  }

  double cuda_elapsed_us(const Event& e);

  bool has_cuda() const {
    return event != nullptr;
  }

  int device() const {
    return device_;
  }

private:
  // signed to allow for negative intervals, initialized for safety.
  int64_t cpu_ns_ = 0;
  StringView name_;
  EventKind kind_;
  uint16_t thread_id_;
  std::vector<std::vector<int64_t>> shapes_;
  int device_ = -1;
  struct CUevent_st* event = nullptr;
};

// A linked list of fixed-size vectors, used to avoid a std::vector resize
// from taking a large amount of time inside a profiling event.
struct RangeEventList {
  constexpr static size_t MB = 1024 * 1024;
  constexpr static size_t event_block_size = 16 * MB;
  constexpr static size_t num_block_elements =
      event_block_size / ceilToMultiple(sizeof(Event), alignof(Event));
  static_assert(
      sizeof(Event[num_block_elements]) <= event_block_size,
      "num_block_elements is calculated incorrectly");
  using block_type = std::vector<Event>;

  void allocBlock() {
    blocks.emplace_front();
    auto& new_block = blocks.front();
    new_block.reserve(num_block_elements);
    // Materialize all pages in the new block to reduce jitter when
    // recording events.
    const char* const end_ptr =
        reinterpret_cast<const char*>(new_block.data() + num_block_elements);
    for (volatile const char* ptr =
             reinterpret_cast<const char*>(new_block.data());
         ptr < end_ptr;
         ptr += 4 * 1024) {
      (*ptr);
    }
  }

  template <typename... Args>
  void record(Args&&... args) {
    if (blocks.empty() || blocks.front().size() == num_block_elements) {
      allocBlock();
    }
    blocks.front().emplace_back(std::forward<Args>(args)...);
  }

  std::vector<Event> consolidate() {
    std::vector<Event> result;
    for (auto& block : blocks) {
      // blocks are stored newest-first, so inserting each block at the
      // front restores chronological order.
      result.insert(
          result.begin(),
          std::make_move_iterator(block.begin()),
          std::make_move_iterator(block.end()));
    }
    blocks.clear();
    return result;
  }

  std::forward_list<block_type> blocks;
};
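// A minimal sketch of how RangeEventList is consumed (illustrative only; in
// the profiler this happens inside mark/pushRange/popRange). Each block holds
// num_block_elements = event_block_size / ceilToMultiple(sizeof(Event),
// alignof(Event)) events, so record() only allocates once per 16 MB block.
// StringView's const char* constructor is assumed from record_function.h:
//
//   RangeEventList list;
//   list.record(EventKind::PushRange, StringView("my_op"),
//               /*thread_id=*/0, /*record_cuda=*/false);
//   list.record(EventKind::PopRange, StringView("my_op"),
//               /*thread_id=*/0, /*record_cuda=*/false);
//   std::vector<Event> events = list.consolidate(); // chronological order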
TORCH_API RangeEventList& getEventList();
TORCH_API void mark(std::string name, bool include_cuda = true);
TORCH_API void pushRange(std::string name);
TORCH_API void popRange();

using thread_event_lists = std::vector<std::vector<Event>>;
// NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure
// that no autograd functions are being executed when these functions are
// used.
TORCH_API void enableProfiler(ProfilerConfig);
TORCH_API thread_event_lists disableProfiler();

// Usage:
//   {
//     RecordProfile guard("filename.trace");
//     // code you want to profile
//   }
// Then open filename.trace in chrome://tracing
struct TORCH_API RecordProfile {
  RecordProfile(std::ostream& out);
  RecordProfile(const std::string& filename);

  ~RecordProfile();

private:
  void init();
  std::unique_ptr<std::ofstream> file_;
  std::ostream& out_;
  void processEvents(const std::vector<Event*>& events);
};

} // namespace profiler
}} // namespace torch::autograd
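// A minimal end-to-end sketch (illustrative only; `run_model()` stands in
// for any workload). Profile a region on the CPU, then walk the per-thread
// event lists returned by disableProfiler():
//
//   using namespace torch::autograd::profiler;
//   enableProfiler(ProfilerConfig(
//       ProfilerState::CPU, /*report_input_shapes=*/false));
//   run_model();
//   thread_event_lists lists = disableProfiler();
//   for (auto& per_thread : lists) {
//     for (Event& evt : per_thread) {
//       // e.g. matching "push my_op" / "pop my_op" pairs per thread
//       std::cout << evt.kind() << " " << evt.name() << std::endl;
//     }
//   }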