#ifndef CAFFE2_OPERATORS_UTILITY_OPS_H_ #define CAFFE2_OPERATORS_UTILITY_OPS_H_ #include #include #include #include "caffe2/core/common_omp.h" #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/core/types.h" #include "caffe2/operators/gather_op.h" #include "caffe2/utils/conversions.h" #include "caffe2/utils/math.h" namespace caffe2 { template class NanCheckOp final : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; template explicit NanCheckOp(Args&&... args) : Operator(std::forward(args)...) {} bool RunOnDevice() override; private: TensorPrinter tensorPrinter_; Tensor scratch_; }; struct GetNanCheckGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; std::vector GetGradientDefs() override { return {CreateOperatorDef( "NanCheck", "", std::vector{GO(0)}, std::vector{GI(0)})}; } }; template class IsNanOp final : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; IsNanOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws) {} bool RunOnDevice() override { return DispatchHelper>::call(this, Input(0)); } template bool DoRunWithType() { auto& X = Input(0); auto* Y = Output(0, X.sizes(), at::dtype()); const auto* X_data = X.template data(); uint8_t* Y_data = Y->template mutable_data(); for (size_t i = 0; i < X.numel(); i++) { Y_data[i] = (uint8_t)(std::isnan(X_data[i])); } return true; } }; template class WallClockTimeOp final : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; template explicit WallClockTimeOp(Args&&... args) : Operator(std::forward(args)...) {} bool RunOnDevice() override { int64_t nanoseconds = static_cast( std::chrono::duration_cast( std::chrono::high_resolution_clock::now().time_since_epoch()) .count()); TensorCPU* output = Output(0); output->Resize(); *output->template mutable_data() = nanoseconds; return true; } }; const char kPrintFileExtension[] = ".log"; template class PrintOp final : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_DISPATCH_HELPER; explicit PrintOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), tensor_printer_( operator_def.input(0), this->template GetSingleArgument("to_file", 0) ? ws->RootFolder() + "/" + operator_def.input(0) + kPrintFileExtension : "", this->template GetSingleArgument("limit", 0)), every_n_(this->template GetSingleArgument("every_n", 1)) { CAFFE_ENFORCE_GE(every_n_, 1); } bool RunOnDevice() override { if (++occurrences_mod_n_ > every_n_) { occurrences_mod_n_ -= every_n_; } if (occurrences_mod_n_ != 1) { return true; } if (!this->InputIsTensorType(0, Context::GetDeviceType()) && !this->InputIsTensorType(0, CPU)) { LOG(INFO) << "Blob of type: " << OperatorBase::Inputs().at(0)->meta().name(); return true; } // special-case empty tensors since they may have no meta() if (Input(0).numel() == 0) { tensor_printer_.PrintMeta(Input(0)); return true; } using Types = TensorTypes< float, double, int, long, bool, char, unsigned char, std::string>; if (this->InputIsTensorType(0, CPU)) { return DispatchHelper::call( this, this->template Input(0, CPU)); } else { return DispatchHelper::call(this, Input(0)); } } private: template bool DoRunWithType() { // A simple strategy to copy tensor if needed, and have the tensor pointer // pointing to the right instantiation. Note that tensor_copy_if_needed // will handle memory deallocation itself so no smart pointer is needed. const TensorCPU* tensor; Tensor tensor_copy_if_needed(CPU); if (this->InputIsTensorType(0, CPU)) { tensor = &this->template Input(0, CPU); } else { // sync copy tensor_copy_if_needed.CopyFrom(Input(0)); tensor = &tensor_copy_if_needed; } tensor_printer_.Print(*tensor); return true; } private: TensorPrinter tensor_printer_; int every_n_; int occurrences_mod_n_{0}; }; /** * @brief Alias op makes the output and the input share the same underlying * storage. * * WARNING: in general, in caffe2's operator interface different tensors should * have different underlying storage, which is the assumption made by * components such as the dependency engine and memory optimization. Thus, in * normal situations you should not use the AliasOp, especially in a normal * forward-backward pass. * * The Alias op is provided so one can achieve true asynchrony, such as * Hogwild, in a graph. But make sure you understand all the implications * similar to multi-thread computation before you use it explicitly. */ template class AliasOp final : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(AliasOp); bool RunOnDevice() override { auto& input = Input(0); CAFFE_ENFORCE_GE(input.numel(), 0, "Tensor is not initialized"); OutputTensorAlias(0, input); return true; } }; /** * @brief Pass inputs to outputs. * Input: * DATA - dense tensor. * Output: * DATA - same tensor as input. */ template class EnsureDenseOp final : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(EnsureDenseOp) bool RunOnDevice() override { const auto& input = Input(0); auto* output = Output(0); CAFFE_ENFORCE_GT(input.dim(), 0, "Input has to be at least a vector."); // it is allowed to have the output inplace overwrite the input but also // allow the output to be copied from the input if (&input != output) { output->ResizeLike(input); output->CopyFrom(input, true /*async*/); } return true; } }; template class FlattenToVecOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(FlattenToVecOp); bool RunOnDevice() override { auto& input = Input(0); auto* output = Output(0); CAFFE_ENFORCE_GE( input.dim(), 1, "The rank of the tensor must be >= 1."); output->Resize(input.numel()); context_.CopyItemsSameDevice( input.dtype(), input.numel(), input.raw_data(), output->raw_mutable_data(input.dtype())); return true; } }; // Output gets the data of input(0), but reshapes it like input(1). template class ResizeLikeOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(ResizeLikeOp); bool RunOnDevice() override { auto& input0 = Input(0); auto& input1 = Input(1); auto* output = Output(0); CAFFE_ENFORCE_EQ(input0.numel(), input1.numel()); output->ResizeLike(Input(1)); context_.CopyItemsSameDevice( input0.dtype(), input0.numel(), input0.raw_data(), output->raw_mutable_data(input0.dtype())); return true; } }; template class SumOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(SumOp); template bool DoRunWithType() { auto& input0 = Input(0); if (InputSize() == 1) { // TODO: better TensorOptions argument passing(e.g. default argument) OutputTensorCopyFrom( 0, // I'll change the order of argument in another diff, so that we don't // need to write this at::dtype(input0.dtype()), input0, true /*async*/); return true; } auto* output = Output(0, input0.sizes(), at::dtype()); T* output_data = output->template mutable_data(); // Dimension checking for (int i = 1; i < InputSize(); ++i) { if (output->sizes() != Input(i).sizes()) { CAFFE_THROW( "Check failed: output->sizes() == Input(i).sizes().", "Description: Input #", i, ", input dimension:", Input(i).sizes(), " should match output dimension: ", output->sizes()); } } // Add the first two - works if in-place or not. math::Add( output->numel(), input0.template data(), Input(1).template data(), output_data, &context_); // Add remaining. for (int i = 2; i < InputSize(); ++i) { math::Add( output->numel(), output_data, Input(i).template data(), output_data, &context_); } return true; } bool RunOnDevice() override { if (Input(0).template IsType()) { return DoRunWithType(); } else if (Input(0).template IsType()) { return DoRunWithType(); } else { CAFFE_THROW( "Sum operator only supports 32-bit float and ints, but", " input was of type ", Input(0).dtype().name()); } } }; inline OpSchema::Cost CostInferenceForSum( const OperatorDef& def, const std::vector& in) { struct OpSchema::Cost cost = PointwiseCostInference<1>(def, in); cost.flops *= (in.size() - 1); cost.params_bytes = 0; return cost; } // WeightedSumOp computes the weighted sum of several tensors. The input should // be in the form X_0, weight_0, X_1, weight_1, ... where X_i all have the same // shape, and weight_i are size 1 tensors that specifies the weight of each // vector. Note that if one wants to do in-place computation, it could only be // done with X_0 also as the output, but not other X_i. template class WeightedSumOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(WeightedSumOp); bool RunOnDevice() override; template bool DoRunWithType() { // the code is written this way because of 10.1 + gcc 7.3.1 compiler bug // as discussed at https://devtalk.nvidia.com/default/topic/1048037/linux/cuda-10-1-nvidia-you-re-now-quot-fixing-quot-gcc-bugs-that-gcc-doesn-t-even-have/ const int input_size = (*this).InputSize(); CAFFE_ENFORCE_EQ(input_size % 2, 0); const auto& X0 = Input(0); const auto& weight0 = Input(1); CAFFE_ENFORCE_GT(X0.numel(), 0); CAFFE_ENFORCE_EQ(weight0.numel(), 1); const int size = X0.numel(); // Note: removed Aliasing check, since Output already has // caching capability auto* Y = Output(0, X0.sizes(), at::dtype()); T* Y_data = Y->template mutable_data(); if (input_size == 2) { math::Scale( size, weight0.template data(), X0.template data(), Y_data, &context_); return true; } const auto& X1 = Input(2); CAFFE_ENFORCE( !IsInputOutputAlias(2, 0), "Input #2 is the same as output. If you want to do in-place updates, " "put the output as input #0."); const auto& weight1 = Input(3); CAFFE_ENFORCE_EQ(X1.numel(), size); CAFFE_ENFORCE_EQ(weight1.numel(), 1); if (!IsInputOutputAlias(0, 0)) { context_.template CopySameDevice(size, X0.template data(), Y_data); } math::Axpby( size, weight1.template data(), X1.template data(), weight0.template data(), Y_data, &context_); for (int i = 4; i < input_size; i += 2) { const auto& Xi = Input(i); // Do a check: if the input is the same as output, we have a problem - // in-place update should always only happen with the zeroth input. const std::string err_msg = "Input #" + to_string(i) + " is the same as output. If you want to do in-place updates, " "put the output as input #0."; CAFFE_ENFORCE(!IsInputOutputAlias(i, 0), err_msg); const auto& weighti = Input(i + 1); CAFFE_ENFORCE_EQ(Xi.numel(), size); CAFFE_ENFORCE_EQ(weighti.numel(), 1); math::Axpy( size, weighti.template data(), Xi.template data(), Y_data, &context_); } return true; } }; template class WeightedSumGradientOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; template explicit WeightedSumGradientOp(Args&&... args) : Operator(std::forward(args)...), grad_on_w_(this->template GetSingleArgument("grad_on_w", false)) { } template bool DoRunWithType() { CAFFE_ENFORCE_EQ(InputSize() % 2, 1); auto output_size = grad_on_w_ ? InputSize() - 1 : InputSize() / 2; CAFFE_ENFORCE_EQ(OutputSize(), output_size); auto& dY = Input(0); const auto* dY_data = dY.template data(); int size = dY.numel(); // The input size should be the input size of the forward op plus 1 for (int i = 0; i < InputSize() / 2; i++) { auto& cur_w = Input(2 * i + 2); CAFFE_ENFORCE_EQ(cur_w.numel(), 1); auto* cur_dX = Output(i, dY.sizes(), at::dtype()); math::Scale( size, cur_w.template data(), dY_data, cur_dX->template mutable_data(), &context_); if (grad_on_w_) { auto& cur_X = Input(2 * i + 1); CAFFE_ENFORCE_EQ(cur_X.numel(), size); auto* cur_dw = Output(i + output_size / 2); cur_dw->Resize(1); math::Dot( size, dY_data, cur_X.template data(), cur_dw->template mutable_data(), &context_); } } return true; } bool RunOnDevice() override; private: bool grad_on_w_; }; /** * @brief Update slices of the tensor in-place with weighted sum. * * ScatterWeightedSumOp is similar to WeightedSum and computes the weighted sum * of several tensors. The first tensor has to be in-place and only slices of it * on the first dimension as indexed by INDICES will be updated. * * Input: * X_0 - tensor to be updated * weight_0 - scalar weight for X_0, applied only to slices affected, * INDICES - 1-D list of indices on the first dimension of X_0 that need to be * updated * X_1 - update slices, has to have shape of len(INDICES) + shape(X_0)[1:] * weight_1 - scalar weight for X_1 update * X_2, weight_2, ... * * Output: * X_0 - has to be exactly the same tensor as the input 0 * * Note: The op pretty much ignores the exact shapes of the input arguments and * cares only about sizes. It's done for performance consideration to avoid * unnecessary reshapes. Only first dimension of X_0 is important, let's call it * N. If M is the total size of X_0 and K is the size of INDICES then X_i is * assumed to be of shape K x (M / N) regardless of the real shape. * * Note: Each update in INDICES is applied independently which means that if * duplicated elements are present in INDICES the corresponding slice of X_0 * will be scaled multiple times. Manual collapsing of INDICES is required * beforehand if necessary. * * Note: Updates are applied sequentially by inputs which might have undesired * consequences if the input tensor is accessed concurrently by different op * (e.g. when doing Hogwild). Other threads might see intermediate results even * on individual slice level, e.g. X_0 scaled by weight_0 but without any * updates applied. * * For now really works only on CPU because of INDICES access */ template class ScatterWeightedSumOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(ScatterWeightedSumOp); USE_DISPATCH_HELPER; bool RunOnDevice() override { return DispatchHelper>::call(this, Input(2)); } private: template bool DoRunWithType() { int64_t block_size = Input(0).size_from_dim(1); return DispatchHelper, Index>::call(this, block_size); } template bool DoRunWithValue() { CAFFE_ENFORCE_EQ(InputSize() % 2, 1); auto& X0 = Input(0); auto& weight0 = Input(1); auto& indices = Input(2); auto* output = Output(0); CAFFE_ENFORCE_EQ(&X0, output, "In place operation is required"); CAFFE_ENFORCE_GT(X0.numel(), 0); CAFFE_ENFORCE_GT(X0.dim(), 0, "X0 has to be at least the vector"); CAFFE_ENFORCE_EQ(weight0.numel(), 1); int64_t M = X0.numel(); int64_t N = X0.size(0); int64_t K = indices.numel(); int64_t block_size = M / N; T* data = output->template mutable_data(); const Index* idxs = indices.template data(); T w0 = *weight0.template data(); // It's most likely a constant so exact comparison is fine if (w0 != 1.0) { for (int i = 0; i < K; ++i) { Index idx = idxs[i]; CAFFE_ENFORCE( 0 <= idx && idx < N, "Index out of bounds: ", idx, ", range 0 to ", N); math::ScaleFixedSize( block_size, w0, data + block_size * idx, data + block_size * idx, &context_); } } for (int inp = 3; inp < InputSize(); inp += 2) { auto& X = Input(inp); auto& weight = Input(inp + 1); CAFFE_ENFORCE_EQ(X.numel(), block_size * K); CAFFE_ENFORCE_EQ(weight.numel(), 1); const T* x_data = X.template data(); T w = *weight.template data(); for (int i = 0; i < K; ++i) { Index idx = idxs[i]; // double-checking the indices, but it's fine as it's DCHECK only DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx << ", range 0 to " << N; math::AxpyFixedSize( block_size, w, x_data + block_size * i, data + block_size * idx, &context_); } } return true; } Tensor x_data_host_; Tensor weights_host_; Tensor x_data_device_; Tensor weights_device_; }; /** * @brief Update slices of the tensor in-place by overriding. * * Input: * DATA - tensor to be updated * INDICES - 1-D list of indices on the first dimension of X_0 that need to be * updated * SLICES - update slices, has to have shape of len(INDICES) + shape(X_0)[1:] * * Output: * DATA - has to be exactly the same tensor as the input 0 * * Note: The op pretty much ignores the exact shapes of the input arguments and * cares only about sizes. It's done for performance consideration to avoid * unnecessary reshapes. Only first dimension of X_0 is important, let's call it * N. If M is the total size of X_0 and K is the size of INDICES then X_i is * assumed to be of shape K x (M / N) regardless of the real shape. * * Note: Each update in INDICES is applied independently which means that if * duplicated elements are present in INDICES arbitrary one will win. * * For now really works only on CPU because of INDICES access */ template class ScatterAssignOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; virtual ~ScatterAssignOp() {} template explicit ScatterAssignOp(Args&&... args) : Operator(std::forward(args)...), runners_({{{TensorProto_DataType_INT32, TensorProto_DataType_FLOAT}, &ScatterAssignOp::DoRun}, {{TensorProto_DataType_INT32, TensorProto_DataType_FLOAT16}, &ScatterAssignOp::DoRun}, {{TensorProto_DataType_INT32, TensorProto_DataType_UINT8}, &ScatterAssignOp::DoRun}, {{TensorProto_DataType_INT32, TensorProto_DataType_INT32}, &ScatterAssignOp::DoRun}, {{TensorProto_DataType_INT32, TensorProto_DataType_INT64}, &ScatterAssignOp::DoRun}, {{TensorProto_DataType_INT64, TensorProto_DataType_FLOAT}, &ScatterAssignOp::DoRun}, {{TensorProto_DataType_INT64, TensorProto_DataType_FLOAT16}, &ScatterAssignOp::DoRun}, {{TensorProto_DataType_INT64, TensorProto_DataType_UINT8}, &ScatterAssignOp::DoRun}, {{TensorProto_DataType_INT64, TensorProto_DataType_INT32}, &ScatterAssignOp::DoRun}, {{TensorProto_DataType_INT64, TensorProto_DataType_INT64}, &ScatterAssignOp::DoRun}}) {} bool RunOnDevice() override { const auto& data = Input(DATA); const auto& slices = Input(SLICES); auto& indices = Input(INDICES); const auto dataType = TypeMetaToDataType(data.dtype()); const auto slicesType = TypeMetaToDataType(slices.dtype()); const auto indicesType = TypeMetaToDataType(indices.dtype()); auto* output = Output(0); auto runner = GetRunner(dataType, slicesType, indicesType); (this->*runner)(); return true; } private: typedef void (ScatterAssignOp::*RunnerType)(); typedef std:: map, RunnerType> RunnerMap; RunnerMap runners_; RunnerType GetRunner( const TensorProto_DataType dataType, const TensorProto_DataType slicesType, const TensorProto_DataType indicesType) { CAFFE_ENFORCE_EQ(dataType, slicesType, "Data and slice types must match"); auto it = runners_.find({indicesType, dataType}); CAFFE_ENFORCE( it != runners_.end(), "Could not find the runner corresponding to indicesType, dataType = ", indicesType, " ", dataType); return it->second; } template void DoRun() { auto& input = Input(DATA); auto& indices = Input(INDICES); auto& slices = Input(SLICES); auto* output = Output(0); CAFFE_ENFORCE_EQ(&input, output, "In place operation is required"); CAFFE_ENFORCE_GT(input.dim(), 0, "X0 has to be at least the vector"); int64_t M = input.numel(); int64_t N = input.size(0); int64_t K = indices.numel(); int64_t block_size = M / N; CAFFE_ENFORCE_EQ(slices.numel(), block_size * K); // TODO(dzhulgakov): it can be made to work with arbitrary data type by // using raw_mutable_data T* data = output->template mutable_data(); const Index* idxs = indices.template data(); const T* slicesData = slices.template data(); DoScatterAssign(data, idxs, slicesData, N, K, block_size); } template void DoScatterAssign( T* data, const Index* idxs, const T* slicesData, int64_t N, int64_t K, int64_t block_size) { for (int i = 0; i < K; ++i) { Index idx = idxs[i]; // double-checking the indices, but it's fine as it's DCHECK only DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx << ", range 0 to " << N; context_.template CopySameDevice( block_size, slicesData + block_size * i, data + block_size * idx); } } INPUT_TAGS(DATA, INDICES, SLICES); }; template class ScatterOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; template explicit ScatterOp(Args&&... args) : Operator(std::forward(args)...), OP_SINGLE_ARG(int, "axis", axis_, 1) { } virtual ~ScatterOp() noexcept override {} bool RunOnDevice() override { TORCH_CHECK(Context::GetDeviceType() == kCPU, "ScatterOp currently only supports CPU.") return DispatchHelper>::call( this, this->template Input(INDICES, CPU)); } template bool DoRunWithType() { const Tensor& data = Input(DATA); const Tensor& indices = Input(INDICES); const Tensor& updates = Input(UPDATES); const TypeMeta dataType = data.dtype(); size_t item_bytesize = dataType.itemsize(); // ONNX allows negative axis to index from the back, valid range: [-r, r]. axis_ = data.canonical_axis_index(axis_); CAFFE_ENFORCE_GE(data.dim(), axis_ + 1, "DATA should be at least [axis+1]-D"); CAFFE_ENFORCE_GE(axis_, 0, "Axis should be non-negative"); CAFFE_ENFORCE_LT(axis_, data.dim(), "Axis out of range"); Tensor* output = Output(0, data.sizes().vec(), at::dtype(dataType)); output->CopyFrom(data); char* out = static_cast(output->raw_mutable_data(dataType)); // Succeed if size of output is zero, which can happen for empty batch which // would have data dimension size of 0. // This *must* be done AFTER output->raw_mutable_data() above as that has // important allocation side effect that we must see. if (output->numel() == 0) { return true; } const IndexType* idxs = indices.template data(); const char* src_base = static_cast(updates.raw_data()); const int64_t outer_dims_product = indices.size_to_dim(axis_); const int64_t dst_indexing_axis_dim = data.size(axis_); const int64_t idxs_block_size = indices.size_from_dim(axis_ + 1); const int64_t src_block_size = updates.size_from_dim(axis_ + 1); const int64_t dst_block_size = data.size_from_dim(axis_ + 1); const int64_t idxs_batch_size = indices.size_from_dim(axis_); const int64_t src_batch_size = updates.size_from_dim(axis_); const int64_t dst_batch_size = data.size_from_dim(axis_); const int64_t N = indices.size(axis_); check_indexarray_range(idxs, N, dst_indexing_axis_dim); // For a 3-D tensor, dst is updated as: // dst[i][idxs[i][j][k]][k] = src[i][j][k] # if dim == 1 // where i, j, k are iterating over their corresponding axis I, J, K. // For a given i, j, k tuple. // idxs offset can be computed as i * J_src * K + j * K + k. // src offset can be computed as i * J_src * K + j * K + k. // dst offset can be computed as i * J_dst * K + idxs[idxs_offset] * K + K // Note that idxs and src should have the same rank and shape. // dst should have the same rank as idxs and src, but the dimension of dim axis can be different. // That is why in the above equation, there is the difference of J_src and J_dst. for (int64_t outer_batch = 0; outer_batch < outer_dims_product; ++outer_batch) { for (int64_t i = 0; i < N; ++i) { for (int64_t inner_batch = 0; inner_batch < idxs_block_size; ++inner_batch) { auto idxs_elem_idx = outer_batch * idxs_batch_size + i * idxs_block_size + inner_batch; auto src_elem_idx = outer_batch * src_batch_size + i * src_block_size + inner_batch; auto dst_elem_idx = outer_batch * dst_batch_size + idxs[idxs_elem_idx] * dst_block_size + inner_batch; auto src = src_base + src_elem_idx * item_bytesize; auto dst = out + dst_elem_idx * item_bytesize; context_.CopyItemsSameDevice(dataType, 1, src, dst); } } } return true; } INPUT_TAGS(DATA, INDICES, UPDATES); // Check that indices fall within dimension array size with CAFFE_ENFORCE. template static void check_indexarray_range( const IndexType* indices, int64_t n, IndexType indexing_axis_dim) { for (auto i = 0; i < n; ++i) { auto idx = indices[i]; CAFFE_ENFORCE( 0 <= idx && idx < indexing_axis_dim, "INDICES element is out of DATA bounds, id=", idx, " axis_dim=", indexing_axis_dim); } } protected: int axis_; }; template class LengthsToSegmentIdsOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(LengthsToSegmentIdsOp); bool RunOnDevice() override { auto& input = Input(0); auto* output = Output(0); auto* input_data = input.template data(); CAFFE_ENFORCE(input.sizes().size() == 1, "Input must be a vector."); auto total_length = std::accumulate(input_data, input_data + input.numel(), 0); output->Resize(total_length); auto* output_data = output->template mutable_data(); for (int i = 0; i < input.numel(); ++i) { auto len = input_data[i]; std::fill(output_data, output_data + len, i); output_data += len; } return true; } }; template class LengthsToRangesOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(LengthsToRangesOp); bool RunOnDevice() override { auto& input = Input(0); auto* output = Output(0); auto* input_data = input.template data(); CAFFE_ENFORCE(input.sizes().size() == 1, "Input must be a vector."); auto size = input.numel(); output->Resize(size, 2); auto* output_data = output->template mutable_data(); int32_t offset = 0; for (int i = 0; i < size; ++i) { auto len = input_data[i]; output_data[i * 2] = offset; output_data[i * 2 + 1] = len; offset += len; } return true; } }; template class SegmentIdsToLengthsOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(SegmentIdsToLengthsOp); bool RunOnDevice() override { return DispatchHelper>::call(this, Input(0)); } template bool DoRunWithType() { auto& input = Input(0); if (input.dim() == 2) { CAFFE_ENFORCE( input.dim32(0) == 1 || input.dim32(1) == 1, "Input must be a vector."); } else { CAFFE_ENFORCE_EQ(input.dim(), 1, "Input must be a vector."); } auto* input_data = input.template data(); auto input_size = input.numel(); auto* output = Output(0); // segment id starts from 0 auto num_segments = input_size ? input_data[input_size - 1] + 1 : 0; if (InputSize() > 1) { CAFFE_ENFORCE_GE(Input(1).dim(), 1); CAFFE_ENFORCE_LE( num_segments, Input(1).size(0), "The number of segments inferred should *NOT* be larger " "than the size of Input(1)'s first dimension"); num_segments = Input(1).size(0); } CAFFE_ENFORCE(0 <= num_segments, "Indices must be in 0..K-1 range"); output->Resize(num_segments); auto* output_data = output->template mutable_data(); if (num_segments == 0) { return true; } std::fill(output_data, output_data + num_segments, 0); Index prev = 0; // Assume that segment_id >= 0. for (int64_t i = 0; i < input_size; i++) { CAFFE_ENFORCE( prev <= input_data[i], "Segment ids must be sorted: ", prev, " vs ", input_data[i]); prev = input_data[i]; output_data[input_data[i]] += 1; } return true; } }; template class SegmentIdsToRangesOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(SegmentIdsToRangesOp); bool RunOnDevice() override { return DispatchHelper>::call(this, Input(0)); } template bool DoRunWithType() { auto& input = Input(0); CAFFE_ENFORCE(input.sizes().size() == 1, "Input must be a vector."); auto* input_data = input.template data(); auto input_size = input.numel(); auto* output = Output(0); // segment id starts from 0 auto num_segments = input_size ? input_data[input_size - 1] + 1 : 0; if (InputSize() > 1) { CAFFE_ENFORCE_GE(Input(1).dim(), 1); CAFFE_ENFORCE_LE( num_segments, Input(1).size(0), "The number of segments inferred should *NOT* be larger " "than the size of Input(1)'s first dimension"); num_segments = Input(1).size(0); } CAFFE_ENFORCE(0 <= num_segments, "Indices must be in 0..K-1 range"); output->Resize(num_segments, 2); auto* output_data = output->template mutable_data(); if (num_segments == 0) { return true; } std::fill(output_data, output_data + num_segments * 2, 0); Index prev = input_data[0]; for (int64_t i = 0; i < input_size; i++) { CAFFE_ENFORCE( prev <= input_data[i], "Segment ids must be sorted: ", prev, " vs ", input_data[i]); while (prev != input_data[i]) { ++prev; output_data[prev * 2] = i; } output_data[input_data[i] * 2 + 1] += 1; } return true; } }; template class LengthsToWeightsOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; template explicit LengthsToWeightsOp(Args&&... args) : Operator(std::forward(args)...), power_(this->template GetSingleArgument("power", 0.5)) {} bool RunOnDevice() override { return DispatchHelper>::call(this, Input(0)); } template bool DoRunWithType() { auto& input = Input(0); CAFFE_ENFORCE(input.sizes().size() == 1, "Input must be a vector."); auto* input_data = input.template data(); auto input_size = input.numel(); auto* output = Output(0); int64_t output_size = 0; for (auto i = 0; i < input_size; i++) { CAFFE_ENFORCE_GE(input_data[i], 0, "unexpected negative length value"); output_size += input_data[i]; } std::function getWeight; if (power_ == 0.5) { getWeight = [](const int64_t& length, const float& /*power*/) { return 1.0 / std::sqrt(length); }; } else if (power_ == 1) { getWeight = [](const int64_t& length, const float& /*power*/) { return 1.0 / length; }; } else { getWeight = [](const int64_t& length, const float& power) { return 1.0 / std::pow(length, power); }; } output->Resize(output_size); auto* output_data = output->template mutable_data(); int64_t cnt = 0; for (auto i = 0; i < input_size; i++) { auto len = input_data[i]; if (len == 0) { continue; } CAFFE_ENFORCE_LE(cnt + len, output_size, "unexpected lengths value"); float weight_value = getWeight(len, power_); std::fill(output_data + cnt, output_data + cnt + len, weight_value); cnt += len; } return true; } private: float power_; }; template class HasElementsOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(HasElementsOp); bool RunOnDevice() override { auto& input = Input(0); auto* output = Output(0); output->Resize(std::vector{}); *output->template mutable_data() = input.numel() > 0; return true; } }; // Return the size of a tensor template class SizeOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(SizeOp); bool RunOnDevice() override { auto& input = Input(0); auto* output = Output(0, vector(), at::dtype()); auto* output_data = output->template mutable_data(); auto size = input.numel(); math::Set( 1, static_cast(size), output_data, &context_); return true; } }; // returns a shape to be passed to Reshape template class LengthsToShapeOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(LengthsToShapeOp); bool RunOnDevice() override { auto& input = Input(0); CAFFE_ENFORCE(input.sizes().size() == 1, "Input must be a vector."); auto* output = Output(0); auto* input_data = input.template data(); auto size = input.numel(); auto first = input_data[0]; for (int i = 1; i < size; i++) { CAFFE_ENFORCE( input_data[i] == first, "All elements of input must be same "); } output->Resize(2); auto* output_data = output->template mutable_data(); output_data[0] = size; output_data[1] = first; return true; } }; template class GatherRangesOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(GatherRangesOp); bool RunOnDevice() override { return DispatchHelper>::call( this, this->template Input(RANGES, CPU)); } template bool DoRunWithType() { auto& data = Input(DATA); auto& ranges = Input(RANGES); auto* outputData = Output(0); auto* outputLengths = Output(1); auto batchSize = ranges.size(0); CAFFE_ENFORCE(data.dim() == 1, "Data has to be 1-D"); CAFFE_ENFORCE(ranges.dim() == 3, "Ranges must be 3-D"); CAFFE_ENFORCE(ranges.size(1) > 0, "There has to be at least one range"); CAFFE_ENFORCE_EQ( ranges.size(2), 2, "Ranges last dimention should be of size 2"); auto* rawData = static_cast(data.raw_data()); auto* rangesData = ranges.template data(); outputLengths->Resize(batchSize); auto* outputLengthsPtr = outputLengths->template mutable_data(); size_t start = 0; size_t blockSize = ranges.size_from_dim(1); for (size_t i = 0; i < batchSize; ++i) { auto end = start + blockSize; outputLengthsPtr[i] = accumulate(rangesData, start, end); start = end; } size_t outputSize = accumulate(rangesData, 0, ranges.numel()); outputData->Resize(outputSize); auto outputRawData = static_cast(outputData->raw_mutable_data(data.dtype())); VLOG(1) << "Copying data"; size_t outputOffsetBytes = 0; auto itemsize = data.dtype().itemsize(); for (int i = 0; i < ranges.numel(); i += 2) { auto rangeStart = rangesData[i]; auto rangeLength = rangesData[i + 1]; if (!rangeLength) { continue; } auto rangeSizeBytes = rangeLength * itemsize; CAFFE_ENFORCE(outputOffsetBytes < outputSize * itemsize); CAFFE_ENFORCE(rangeStart + rangeLength <= data.numel()); context_.CopyItemsSameDevice( data.dtype(), rangeLength, rawData + rangeStart * itemsize, outputRawData + outputOffsetBytes); outputOffsetBytes += rangeSizeBytes; } CAFFE_ENFORCE(outputOffsetBytes == outputSize * itemsize); return true; } INPUT_TAGS(DATA, RANGES, LENGTHS); private: template size_t accumulate(Index* ranges, size_t start, size_t end) { size_t result = 0; for (size_t i = start + 1; i < end; i += 2) { result += ranges[i]; } return result; } }; template class LengthsGatherOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(LengthsGatherOp); bool RunOnDevice() override { return DispatchHelper>::call( this, this->template Input(INDICES, CPU)); } template bool DoRunWithType() { auto& items = Input(ITEMS); auto& lengths = Input(LENGTHS); auto& indices = Input(INDICES); auto* output = Output(0); CAFFE_ENFORCE_GE(items.dim(), 1, "ITEMS should be at least 1-D"); CAFFE_ENFORCE_EQ(lengths.dim(), 1, "LENGTHS should be 1-D"); CAFFE_ENFORCE_EQ(indices.dim(), 1, "INDICES should be 1-D"); const auto* lengths_data = lengths.template data(); const auto* indices_data = indices.template data(); int64_t total_length = 0; for (size_t i = 0; i < indices.numel(); ++i) { auto idx = indices_data[i]; CAFFE_ENFORCE_LT(idx, lengths.numel()); total_length += lengths_data[idx]; } auto shape = items.sizes().vec(); shape[0] = total_length; output->Resize(shape); offsets_.clear(); int64_t running_offset = 0; offsets_.reserve(lengths.numel()); for (size_t i = 0; i < lengths.numel(); ++i) { offsets_.push_back(running_offset); running_offset += lengths_data[i]; } CAFFE_ENFORCE_EQ( items.size(0), running_offset, "LENGTHS must match the first dimension of ITEMS"); auto src_base = static_cast(items.raw_data()); auto block_size = items.size_from_dim(1); auto block_bytesize = block_size * items.itemsize(); auto out = static_cast(output->raw_mutable_data(items.dtype())); for (size_t i = 0; i < indices.numel(); ++i) { auto idx = indices_data[i]; auto length = lengths_data[idx]; context_.CopyItemsSameDevice( items.dtype(), length * block_size, src_base + offsets_[idx] * block_bytesize, out); out += length * block_bytesize; } return true; } std::vector offsets_; INPUT_TAGS(ITEMS, LENGTHS, INDICES); }; template class AccumulateHistogramOp : public Operator { public: template explicit AccumulateHistogramOp(Args&&... args) : Operator(std::forward(args)...), lower_bound_( this->template GetSingleArgument("lower_bound", 0.0)), upper_bound_( this->template GetSingleArgument("upper_bound", 1.0)), num_buckets_(this->template GetSingleArgument("num_buckets", 1)) { CAFFE_ENFORCE_GT(num_buckets_, 0); // 2 more for histograms < lower_bound, >= upper_bound respectively num_output_buckets_ = num_buckets_ + 2; accumulate_hist_ = std::vector(num_output_buckets_, 0); } USE_OPERATOR_CONTEXT_FUNCTIONS; bool RunOnDevice() override { auto& X = Input(X_IN); auto* X_data = X.template data(); int N = X.numel(); auto* cur_hist = Output(CUR_HIST); auto* acc_hist = Output(ACC_HIST); cur_hist->Resize(num_output_buckets_); acc_hist->Resize(num_output_buckets_); auto* cur_hist_data = cur_hist->template mutable_data(); auto* acc_hist_data = acc_hist->template mutable_data(); auto segment = (upper_bound_ - lower_bound_) / num_buckets_; math::Set( num_output_buckets_, 0, cur_hist_data, &context_); for (int i = 0; i < N; i++) { int bucket_index = -1; if (X_data[i] < lower_bound_) { bucket_index = 0; } else if (X_data[i] >= upper_bound_) { bucket_index = num_buckets_ + 1; } else { bucket_index = (int)((X_data[i] - lower_bound_) / segment) + 1; } cur_hist_data[bucket_index] += 1; accumulate_hist_[bucket_index] += 1; } for (int i = 0; i < num_output_buckets_; i++) { acc_hist_data[i] = accumulate_hist_[i]; } return true; } private: float lower_bound_; float upper_bound_; int num_buckets_; int num_output_buckets_; std::vector accumulate_hist_; INPUT_TAGS(X_IN); OUTPUT_TAGS(CUR_HIST, ACC_HIST); }; template class RangeOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(RangeOp) bool RunOnDevice() override { return DispatchHelper>::call( this, Input(0)); } template T readScalarInput(const int index) { if (std::is_same::value) { return Input(index).template data()[0]; } else { local_.CopyFrom(Input(index)); return local_.template data()[0]; } } template bool DoRunWithType() { T stop = 0; T start = 0; T step = 1; for (int i = 0; i < InputSize(); ++i) { CAFFE_ENFORCE_EQ(Input(i).numel(), 1, "All inputs must be scalar/1D tensor."); } switch (InputSize()) { case 1: stop = readScalarInput(0); break; case 2: start = readScalarInput(0); stop = readScalarInput(1); break; case 3: step = readScalarInput(2); start = readScalarInput(0); stop = readScalarInput(1); break; } CAFFE_ENFORCE_NE(step, 0, "Step size cannot be 0."); int length; auto diff = stop - start; if (std::is_integral::value) { // Avoid casting to and from floats in case it introduces rounding and // avoid mod because the compiler doesn't strip unused code until later. length = diff / step; if (length * step < diff) { length += 1; } } else { length = static_cast(ceil(diff / step)); } // Match numpy's behavior here. if (length <= 0) { Output(0, {0}, at::dtype()); return true; } else { auto* output = Output(0, {length}, at::dtype()); return DoRunOnDevice(start, step, output); } } template bool DoRunOnDevice(const T& start, const T& step, Tensor* output); private: // local CPU tensor for copying constants. Tensor local_{CPU}; }; class ThrowExceptionOp : public Operator { public: template explicit ThrowExceptionOp(Args&&... args) : Operator(std::forward(args)...), message_(GetSingleArgument( "message", "Exception from ThrowExceptionOp")) {} bool RunOnDevice() override { CAFFE_THROW(message_); } private: const std::string message_; }; class ThrowChildThreadExceptionOp : public Operator { public: template explicit ThrowChildThreadExceptionOp(Args&&... args) : Operator(std::forward(args)...), message_(GetSingleArgument( "message", "Exception from ThrowChildThreadExceptionOp")) {} bool RunOnDevice() override { std::thread t([this]() { CAFFE_THROW(this->message_); }); t.join(); return true; } private: const std::string message_; }; class LogFatalOp : public Operator { public: template explicit LogFatalOp(Args&&... args) : Operator(std::forward(args)...), message_(GetSingleArgument( "message", "Logging from LogFatalOp")) {} bool RunOnDevice() override { LOG(FATAL) << message_; return true; } private: const std::string message_; }; class FailOp : public Operator { public: template explicit FailOp(Args&&... args) : Operator(std::forward(args)...) {} bool RunOnDevice() override { return false; } }; } // namespace caffe2 #endif // CAFFE2_OPERATORS_UTILITY_OPS_H_