#ifndef CAFFE2_CORE_TENSOR_H_
#define CAFFE2_CORE_TENSOR_H_

#include "caffe2/core/storage.h"
#include "caffe2/core/tensor_impl.h"

#include <ATen/core/UndefinedTensorImpl.h>
#include <c10/util/intrusive_ptr.h>
#if !defined(CAFFE2_IS_XPLAT_BUILD)
#include "ATen/core/Tensor.h"
#endif
#include <sstream>
#include <vector>

namespace caffe2 {

using at::UndefinedTensorImpl;

/**
 * @brief Tensor class holds a shared pointer to the implementation TensorImpl
 * and redirects API calls to TensorImpl; copying a Tensor shares the same
 * underlying implementation object.
 *
 * NB: See TensorImpl for documentation on these methods.
 */
class CAFFE2_API Tensor final {
 private:
  enum Unsafe { IDoWantAliasing };
  Tensor(const Tensor& other, Unsafe _) : impl_(other.getIntrusivePtr()) {}

 protected:
  using TensorImplPtr = c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>;
  TensorImplPtr impl_;

  void enforce_invariants();

 public:
  Tensor() : impl_() {}

  // caffe2::Tensor is explicitly marked as moveable-only because before
  // the refactoring the class used to be a value type and a lot of user code
  // is written this way. With PyTorch unification, caffe2::Tensor actually
  // has semantics of a shared_ptr now (via intrusive_ptr). However, to prevent
  // accidental mistakes when changing legacy code we keep caffe2::Tensor
  // movable-only.
  //
  // If you need to get a pointer to the same Tensor instance (not to be
  // confused with shared storage), `UnsafeSharedInstance` can be used. It has
  // the same behavior as `at::Tensor a = b`.
  Tensor(const Tensor&) = delete;
  Tensor& operator=(const Tensor&) = delete;
  Tensor(Tensor&&) = default;
  Tensor& operator=(Tensor&&) = default;

  operator bool() const {
    return impl_.defined();
  }

  TensorImpl* unsafeGetTensorImpl() const {
    return impl_.get();
  }

  Tensor UnsafeSharedInstance() const {
    return Tensor(*this, IDoWantAliasing);
  }

  /**
   * @brief Creates a tensor of the given device type.
   *
   * Note that the actual data allocation is not going to be carried out until
   * you resize the tensor and then call mutable_data().
   */
  explicit Tensor(at::Device device)
      : impl_(c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(
            Storage::create_legacy(device, TypeMeta()),
            c10::computeTensorTypeId(
                at::device(device).layout(at::kStrided)))) {}

  /**
   * @brief Creates a tensor of the given dimensions.
   *
   * Note that the actual data allocation is not going to be carried out until
   * the first time mutable_data() is called.
   */
  explicit Tensor(at::IntArrayRef dims, DeviceType type) : Tensor(type) {
    // TODO: here, we create a Storage and immediately discard it in Resize(),
    // since reset_tensor will be true and FreeMemory will be called;
    // we might want to avoid creating the Storage twice?
    Resize(dims);
  }

  // we want to preserve index information
  explicit Tensor(at::IntArrayRef dims, at::Device device) : Tensor(device) {
    Resize(dims);
  }
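  // Usage sketch for the constructors above (illustrative only; assumes a CPU
  // build, and the dimension values are arbitrary):
  //
  //   caffe2::Tensor a(CPU);                // device only, no sizes yet
  //   a.Resize(4, 5);                       // still no allocation
  //   float* pa = a.mutable_data<float>();  // allocation happens here
  //
  //   caffe2::Tensor b(std::vector<int64_t>{2, 3}, CPU);
  //   caffe2::Tensor c = std::move(b);             // plain copy is deleted
  //   caffe2::Tensor d = c.UnsafeSharedInstance(); // aliases the same impl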
  // TODO: remove?
  explicit Tensor(const vector<int>& dims, DeviceType type) : Tensor(type) {
    Resize(dims);
  }

  /**
   * @brief Creates a Tensor of at::DeviceType `type` and initializes it with
   * the contents of the src Tensor.
   */
  Tensor(const Tensor& src, DeviceType type) : Tensor(type) {
    CopyFrom(src);
  }

  /**
   * @brief Mutual conversion with at::Tensor
   *
   * The tensor will share the same instance (data, strides, sizes, etc) but
   * a different subset of APIs would be available
   */
#if !defined(CAFFE2_IS_XPLAT_BUILD)
  explicit Tensor(at::Tensor tensor) : impl_(std::move(tensor.impl_)) {
    enforce_invariants();
  }

  explicit operator at::Tensor() const& {
    return at::Tensor::wrap_tensor_impl(impl_);
  }

  explicit operator at::Tensor() && {
    return at::Tensor::wrap_tensor_impl(std::move(impl_));
  }
#endif

  bool is_same(const Tensor& other) const noexcept {
    return impl_ == other.impl_;
  }

  Tensor Clone() const {
    Tensor x(GetDevice());
    x.CopyFrom(*this);
    return x;
  }

  /**
   * Clone self as a Tensor that shares the same Storage, that is, both
   * Tensors are views on the same Storage. If we change the sizes or strides
   * of one Tensor, it does not affect the other Tensor that it shares Storage
   * with.
   *
   * A similar yet different usage is `Tensor x = y;`: this makes x and y
   * point to the same Tensor, and resizing one of them resizes the other as
   * well.
   *
   * TODO: Deduplicate this with THTensor_(newWithTensor)
   * (exposed in ATen as at::alias but not otherwise available)
   */
  Tensor Alias() const {
    Tensor x(sizes(), GetDevice());
    if (!dtype_initialized()) {
      C10_LOG_EVERY_MS(WARNING, 1000)
          << "Cloning a tensor that doesn't have a data type "
          << "(did you call mutable_data on the tensor?)";
    }
    AT_ASSERTM(
        storage_initialized(),
        "Cloning a tensor that has no content and has size > 0");
    // set_storage already sets data_type_ of TensorImpl
    x.impl_->set_storage(storage());
    x.impl_->set_storage_offset(impl_->storage_offset());
    x.impl_->set_sizes_and_strides(sizes(), strides());
    return x;
  }

  DeviceType GetDeviceType() const {
    return impl_->device_type();
  }

  at::Device GetDevice() const {
    return impl_.get()->device();
  }
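  // Usage sketch contrasting Clone(), Alias(), and UnsafeSharedInstance()
  // (illustrative only; assumes a CPU tensor whose dtype has been set):
  //
  //   caffe2::Tensor y(std::vector<int64_t>{2, 3}, CPU);
  //   y.mutable_data<float>();             // initialize dtype and storage
  //
  //   caffe2::Tensor c = y.Clone();        // deep copy: new Storage
  //   caffe2::Tensor a = y.Alias();        // new TensorImpl, same Storage;
  //                                        // resizing a does not resize y
  //   caffe2::Tensor s = y.UnsafeSharedInstance();  // same TensorImpl;
  //                                        // metadata changes are shared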
  /**
   * @brief Copies the data from a source tensor, with a context provided to
   * carry out the underlying memcpy operation. This method respects
   * caffe2_keep_on_shrink.
   *
   * After CopyFrom, this function guarantees that the destination tensor will
   * have the same initialization state and dtype as src. This function
   * preserves the DeviceType of this tensor (so, e.g., if you allocate
   * a tensor on CPU and then CopyFrom a CUDA tensor, this will result in a
   * CUDA-to-CPU transfer).
   *
   * The 'async' parameter triggers an asynchronous copy for CUDA tensors.
   */
  void CopyFrom(const Tensor& src, bool async = false) {
    // TODO: only check `!impl_->requires_grad()` after Variable and Tensor
    // are merged
    AT_ASSERT(
        !impl_->is_variable() ||
        !(impl_->requires_grad() && at::GradMode::is_enabled()));
    AT_ASSERTM(
        src.impl_->is_contiguous(),
        "Right now only copy of contiguous source Tensor is supported.");
    AT_ASSERTM(
        src.impl_->storage_initialized(),
        "Cannot copy from an uninitialized Tensor");

    if (src.impl_.get() == impl_.get()) {
      return;
    }

    // Test if we need to allocate a new storage.
    // Uninitialized storages are guaranteed to be uniquely owned,
    // so we don't need to swap in the dst case.
    // If the dtype changed, we need to reallocate storage.
    if (impl_->dtype() != src.impl_->dtype()) {
      // NB: copy preserves device_type
      // This storage will get initialized by the mutable_data call below.
      impl_->set_storage(at::Storage::create_legacy(
          impl_->device_type(), src.impl_->dtype()));
    }
    impl_->Resize(src.impl_->sizes());

    if (impl_->numel() > 0) {
      if (impl_->dtype().copy()) {
        AT_ASSERTM(
            impl_->device_type() == ::at::DeviceType::CPU,
            "In CopyFrom source and dest tensors must both be CPU for "
            "non-POD copy, but dest tensor was ",
            impl_->device_type());
        AT_ASSERTM(
            src.impl_->device_type() == ::at::DeviceType::CPU,
            "In CopyFrom source and dest tensors must both be CPU for "
            "non-POD copy, but src tensor was ",
            src.impl_->device_type());
        impl_->dtype().copy()(
            src.impl_->data(),
            impl_->raw_mutable_data(impl_->dtype()),
            impl_->numel());
      } else {
        // The following copy uses the current (thread local) stream for
        // copying and also takes the GPU id from the device() field passed in.
        //
        // TODO: Potentially more enforcements are necessary to avoid
        // accidental switch to sync copy if the currently set device is wrong.
        //
        // Specifically, we might need to switch to a different context device
        // here explicitly to avoid relying on user synchronizing things
        // properly.
        //
        // note: raw_mutable_data initializes device here
        void* new_data = impl_->raw_mutable_data(impl_->dtype());
        at::CopyBytes(
            impl_->numel() * impl_->itemsize(),
            src.impl_->data(),
            src.impl_->device(),
            new_data,
            impl_->device(),
            async);
      }
    }
  }

  /**
   * @brief Extends the outer-most dimension of this tensor so that it has
   * size `num`.
   */
  void ExtendTo(int64_t num, float growthPct) const {
    CAFFE_ENFORCE_GE_WITH_CALLER(impl_->dim(), 1);
    CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0);
    Extend(num - impl_->size(0), growthPct);
  }

  void Extend(int64_t num, float growthPct) const {
    impl_.get()->Extend(num, growthPct);
  }

  /**
   * @brief Shrinks the outer-most dimension to the given size, keeping the
   * data.
   *
   * This method guarantees that no re-allocations are carried out, which means
   * that the extra capacity after the end of the shrunk tensor is maintained.
   * Notably, this function does NOT respect caffe2_keep_on_shrink.
   */
  void ShrinkTo(int64_t outer_dim) const {
    CAFFE_ENFORCE_WITH_CALLER(
        impl_->is_contiguous(),
        "Right now ShrinkTo is only supported on contiguous Tensor.");
    CAFFE_ENFORCE_WITH_CALLER(impl_->dim() >= 1, "Tensor must be at least 1D");
    CAFFE_ENFORCE_WITH_CALLER(
        outer_dim <= impl_->size(0),
        "New outer dimension must be smaller than current.");
    CAFFE_ENFORCE(
        impl_->storage().unique(),
        "Can't call ShrinkTo on shared storage, please call Resize instead.");
    impl_.get()->set_size(0, outer_dim);
  }

  template <class T>
  void ReserveSpace(const T& outer_dim) const {
    impl_.get()->ReserveSpace(outer_dim);
  }

  template <typename... Ts>
  void Resize(Ts... dim_source) const {
    impl_.get()->Resize(dim_source...);
  }

  /**
   * Resizes the tensor like the source tensor. Note that this is just a
   * sugar wrapper that essentially calls Resize(src_tensor.dims()).
   * This method respects caffe2_keep_on_shrink.
   */
  inline void ResizeLike(const Tensor& src_tensor) const {
    CAFFE_ENFORCE_WITH_CALLER(
        src_tensor.is_contiguous(),
        "Right now ResizeLike is only supported for contiguous Tensor.");
    if (impl_ != src_tensor.impl_) {
      impl_.get()->Resize(src_tensor.sizes());
    }
  }

  inline void Reshape(const vector<int64_t>& dims) const {
    impl_.get()->Reshape(dims);
  }

  inline void Reshape(const vector<int>& dims) const {
    impl_.get()->Reshape(ToVectorint64_t(dims));
  }

  inline void FreeMemory() const {
    impl_.get()->FreeMemory();
  }
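  // Usage sketch for the resizing APIs above (illustrative only; sizes are
  // arbitrary and assume a CPU tensor whose dtype has been set):
  //
  //   caffe2::Tensor t(std::vector<int64_t>{8, 16}, CPU);
  //   t.mutable_data<float>();
  //   t.ExtendTo(12, /*growthPct=*/50);  // outer dim 8 -> 12, may reallocate
  //   t.ShrinkTo(10);                    // outer dim 12 -> 10, no reallocation
  //
  //   caffe2::Tensor dst(CPU);
  //   dst.CopyFrom(t);  // dst stays on CPU and adopts t's dtype and sizes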
  /**
   * A utility function to print the debug string for the tensor. Note that
   * this is very slow since it involves quite a few string operations, so do
   * not use it in your performance-critical code.
   */
  string DebugString() const {
    std::stringstream ss;
    ss << "A Tensor of item size " << impl_->storage().itemsize()
       << " and type " << impl_->dtype().name() << " and dimension (";
    for (int d : impl_->sizes()) {
      ss << d << ",";
    }
    ss << ").";
    return ss.str();
  }

  // To be deprecated
  void ShareData(const Tensor& src) const {
    impl_.get()->ShareData(*src.impl_.get());
  }

  /**
   * @brief Shares the data with an externally managed pointer.
   *
   * This is similar to ShareData() but the source is a pointer with an
   * advanced deleter option. By default, no deletion takes place, and one
   * needs to make sure that the external memory is deallocated only after
   * the tensor finishes using it. If a Deleter object is passed in, when
   * this tensor is reallocated or freed, the deleter function is going to be
   * called.
   */
  template <typename T>
  void ShareExternalPointer(
      T* src,
      size_t capacity = 0,
      MemoryDeleter d = nullptr) const {
    ShareExternalPointer((void*)src, caffe2::TypeMeta::Make<T>(), capacity, d);
  }

  template <typename T>
  void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) const {
    ShareExternalPointer(
        std::move(data_ptr), caffe2::TypeMeta::Make<T>(), capacity);
  }

  void ShareExternalPointer(
      void* src,
      const TypeMeta& data_type,
      size_t capacity = 0,
      MemoryDeleter d = nullptr) const {
    CAFFE_ENFORCE_WITH_CALLER(
        impl_->is_contiguous(),
        "Right now ShareExternalPointer is only supported for contiguous Tensor.");
    CAFFE_ENFORCE_WITH_CALLER(
        data_type.id() != caffe2::TypeIdentifier::uninitialized(),
        "To share with a raw external pointer you need to pass in an "
        "initialized data_type (TypeMeta).");
    impl_.get()->ShareExternalPointer(
        at::DataPtr(src, src, d, impl_->device_type()), data_type, capacity);
  }

  void ShareExternalPointer(
      at::DataPtr&& data_ptr,
      const TypeMeta& data_type,
      size_t capacity) {
    impl_.get()->ShareExternalPointer(std::move(data_ptr), data_type, capacity);
  }

  const c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>& getIntrusivePtr()
      const {
    return impl_;
  }

  bool defined() const {
    return impl_;
  }

  /**
   * Returns a raw void* pointer of the underlying storage. mutable_data()
   * or raw_mutable_data() must have been called prior to this function call.
   */
  inline void* raw_data() const {
    return impl_->data();
  }

  template <typename T>
  inline T* data() const {
    return impl_.get()->data<T>();
  }

  inline void* raw_mutable_data(const TypeMeta& meta) const {
    return impl_.get()->raw_mutable_data(meta);
  }

  /**
   * Returns a mutable raw pointer of the underlying storage. This can only be
   * used when you know for sure that the underlying storage of the tensor is
   * already created via an earlier raw_mutable_data(meta) call or a
   * mutable_data() call.
   *
   * If the existing data does not match the desired type, it will be deleted
   * and a new storage will be created.
   */
  inline void* raw_mutable_data() const {
    const auto& data_type = impl_->dtype();
    CAFFE_ENFORCE_WITH_CALLER(
        data_type.id() != caffe2::TypeIdentifier::uninitialized(),
        "Calling raw_mutable_data() without meta, but the current meta is "
        "of unknown type.");
    return raw_mutable_data(data_type);
  }

  template <typename T>
  inline T* mutable_data() const {
    return impl_.get()->mutable_data<T>();
  }

  /**
   * Returns the number of dimensions of the data.
   */
  inline int dim() const {
    return impl_->dim();
  }

  /**
   * (To be deprecated) Returns the number of dimensions of the data.
   */
  inline int ndim() const {
    return impl_->dim();
  }

  /**
   * (To be deprecated) Returns the size (i.e. the number of items) of the
   * tensor.
   */
  inline int64_t size() const {
    return impl_->numel();
  }
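  // Usage sketch for the data access / sharing APIs above (illustrative only;
  // the buffer and its size are made up for the example):
  //
  //   static float buffer[6];
  //   caffe2::Tensor t(std::vector<int64_t>{2, 3}, CPU);
  //   t.ShareExternalPointer(buffer);   // no copy, ownership is not taken
  //   float* p = t.mutable_data<float>();  // p == buffer
  //   void* q = t.raw_data();              // q == buffer as well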
  /**
   * Returns the number of items of the tensor.
   */
  inline int64_t numel() const {
    return impl_->numel();
  }

  /**
   * Returns the number of bytes each item takes in the tensor.
   */
  inline size_t itemsize() const {
    return impl_->storage().itemsize();
  }

  /**
   * Returns the total number of bytes of the storage.
   *
   * This is equivalent to calling size() * itemsize().
   */
  inline size_t nbytes() const {
    return impl_->numel() * itemsize();
  }

  inline at::IntArrayRef sizes() const {
    return impl_.get()->sizes();
  }

  inline int64_t size_from_dim(int k) const {
    return size_from_dim_(k, impl_->sizes());
  }

  inline int64_t size_to_dim(int k) const {
    return size_to_dim_(k, impl_->sizes());
  }

  inline int64_t size_between_dim(int k, int l) const {
    return size_between_dim_(k, l, impl_->sizes());
  }

  /**
   * Returns the 'canonical' version of a (usually) user-specified axis,
   * allowing for negative indexing (e.g., -1 for the last axis).
   *
   * @param axis_index the axis index.
   * If 0 <= index < dim(), return index.
   * If -dim() <= index <= -1, return (dim() - (-index)),
   * e.g., the last axis index (dim() - 1) if index == -1,
   * the second to last if index == -2, etc.
   * Dies on out of range index.
   */
  inline int canonical_axis_index(int axis_index) const {
    return canonical_axis_index_(axis_index, impl_->dim());
  }

  inline int64_t stride(int64_t dim) const {
    return impl_.get()->stride(dim);
  }

  inline at::IntArrayRef strides() const {
    return impl_.get()->strides();
  }

  inline bool is_contiguous(
      at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) const {
    return impl_.get()->is_contiguous(memory_format);
  }

  /**
   * Checks if the tensor content is of the given data type.
   */
  template <typename T>
  inline bool IsType() const {
    return impl_->storage().IsType<T>();
  }

  /**
   * Returns the TypeMeta object associated with the current data type.
   */
  inline const TypeMeta& dtype() const {
    return impl_->dtype();
  }

  /**
   * (To be deprecated) Returns the TypeMeta object associated with the
   * current data type.
   */
  inline const TypeMeta& meta() const {
    return impl_->dtype();
  }
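  // Worked example (illustrative) for the shape helpers above, for a tensor
  // t of sizes {2, 3, 4}:
  //
  //   t.numel()                  == 24
  //   t.size_from_dim(1)         == 3 * 4 == 12
  //   t.size_to_dim(2)           == 2 * 3 == 6
  //   t.canonical_axis_index(-1) == 2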
  /**
   * Returns the i-th dimension of the tensor in int.
   *
   * This function returns an int value instead of int64_t, which depending on
   * the typedef could be int64. If you want int64 dim values, make sure you
   * call dim() instead.
   */
  inline int dim32(const int i) const {
#ifndef NDEBUG
    CAFFE_ENFORCE_LT_WITH_CALLER(
        i, static_cast<int>(impl_->dim()), "Exceeding ndim limit");
    CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index");
#endif
    auto s = impl_->size(i);
    CAFFE_ENFORCE_LT_WITH_CALLER(s, std::numeric_limits<int>::max());
    return static_cast<int>(s);
  }

  inline int64_t size(const int i) const {
    return impl_->size(i);
  }

  // To be deprecated
  inline int64_t dim(const int i) const {
    return impl_->size(i);
  }

  const Storage& storage() {
    return impl_->storage();
  }

  const Storage& storage() const {
    return impl_->storage();
  }

  bool storage_initialized() const {
    return impl_->storage_initialized();
  }

  bool dtype_initialized() const {
    return impl_->dtype_initialized();
  }
};

/**
 * Reinitializes a Tensor to the given dims and options if necessary.
 * Note that this will not do anything if the Tensor already has the correct
 * size and data type.
 */
CAFFE2_API void ReinitializeTensor(
    Tensor* t,
    at::IntArrayRef dims,
    at::TensorOptions options);

CAFFE2_API void ReinitializeAndCopyFrom(
    Tensor* t,
    at::TensorOptions options,
    const Tensor& src,
    bool async = false);

CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(12, Tensor)

using TensorCPU = Tensor;

constexpr int k_limit_default_ = 1000;

// TODO: the following logic can be merged into regular Tensor class methods
// after MKLMemory starts to implement Tensor interface

// Type call registry
typedef TypeMeta (*TypeCall)(const void*);
TypeCall GetTypeCallFunction(TypeIdentifier id);
void RegisterTypeCallFunction(TypeIdentifier id, TypeCall c);

// Shape call registry
typedef vector<int64_t> (*TensorInfoCall)(
    const void*,
    size_t* capacity,
    DeviceOption* device);
TensorInfoCall GetTensorInfoFunction(TypeIdentifier id);
void RegisterTensorInfoFunction(TypeIdentifier id, TensorInfoCall c);

// resize helper function
void TensorVectorResize(
    std::vector<Tensor>& tensors,
    int size,
    DeviceType type);

// Tensor factory function
CAFFE2_API Tensor empty(at::IntArrayRef dims, at::TensorOptions options);

/**
 * @brief Creates a CPU tensor, and fills its contents with the given values.
 * Values are copied in.
 */
// TODO: can be unified with at::from_blob when Tensor is merged and string
// types are supported
template <typename T>
Tensor TensorCPUFromValues(at::IntArrayRef dims, at::ArrayRef<T> values) {
  Tensor r = empty(dims, at::device(CPU).dtype<T>());
  CAFFE_ENFORCE_EQ(values.size(), r.numel());
  CPUContext context;
  context.CopyItemsFromCPU(
      r.dtype(), values.size(), values.data(), r.mutable_data<T>());
  return r;
}

vector<int64_t>
GetTensorInfo(const void* c, size_t* capacity, DeviceOption* device);

class CAFFE2_API TensorPrinter {
 public:
  explicit TensorPrinter(
      const std::string& tensor_name = "",
      const std::string& file_name = "",
      int limit = k_limit_default_);
  ~TensorPrinter();

  template <class T>
  void Print(const Tensor& tensor);

  void PrintMeta(const Tensor& tensor);

  string MetaStr(const Tensor& tensor);

 private:
  bool to_file_;
  int limit_;
  std::unique_ptr<std::ofstream> log_file_;
  std::string tensor_name_;
};
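// Usage sketch for the factory and printer above (illustrative only; the
// tensor name and values are made up):
//
//   auto t = caffe2::TensorCPUFromValues<float>(
//       {2, 2}, {1.0f, 2.0f, 3.0f, 4.0f});
//   caffe2::TensorPrinter printer("my_tensor");  // file_name left empty here,
//                                                // so output goes to the log
//   printer.Print<float>(t);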
template <class T>
void TensorPrinter::Print(const Tensor& tensor) {
  std::stringstream values_stream;
  // One most likely doesn't want to print an int64-number of items for visual
  // inspection, so we cast down to int here.
  int total_count =
      static_cast<int>(std::min(tensor.numel(), int64_t(limit_)));

  const T* tensor_data = tensor.template data<T>();
  for (int i = 0; i < total_count - 1; ++i) {
    values_stream << tensor_data[i] << ",";
  }
  if (total_count) {
    // We do not add a comma after the last item.
    values_stream << tensor_data[total_count - 1];
  }

  if (to_file_) {
    (*log_file_) << MetaStr(tensor) << values_stream.str() << std::endl;
  } else {
    // Log to console.
    LOG(INFO) << MetaStr(tensor) << values_stream.str();
  }
}

} // namespace caffe2

#endif // CAFFE2_CORE_TENSOR_H_