#ifndef CAFFE2_OPERATORS_REDUCER_FUNCTORS_H_
#define CAFFE2_OPERATORS_REDUCER_FUNCTORS_H_

#include <algorithm>
#include <array>
#include <cmath>
#include <cstring>
#include <limits>

#include "caffe2/core/context.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
#include "caffe2/utils/proto_utils.h"

namespace caffe2 {

////////////////////////////////////////////////////////////////////////////////
// Range reducers: can exploit the fact that the input segment is contiguous
// and provide a specialized implementation
////////////////////////////////////////////////////////////////////////////////

// Put forward and backward in the same template?
template <typename T, class Context>
class SumRangeReducer;
template <typename T, class Context>
class SumRangeReducerGradient;

template <typename T>
class SumRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    // do we need to go through the wrapper in math.h?
    EigenVectorMap<T> out_vec(out, block_size);
    out_vec = ConstEigenMatrixMap<T>(in, block_size, blocks).rowwise().sum();
  }
};
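
// Usage sketch (illustrative only, not part of the library): reduce a
// contiguous run of `blocks` rows, each holding `block_size` elements, into
// one row. The buffers and sizes below are made-up example values.
//
//   std::vector<float> in = {1, 2, 3,   // block 0
//                            4, 5, 6};  // block 1
//   std::vector<float> out(3);
//   CPUContext context;
//   SumRangeReducer<float, CPUContext> reducer;
//   reducer(/*block_size=*/3, /*blocks=*/2, in.data(), out.data(), &context);
//   // out == {5, 7, 9}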

template <typename T, class Context>
class SumRangeReducerGradient {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* segment_grad,
      T* data_grad,
      const T* /*data_in*/, // unused
      const T* /*data_out*/, // unused
      Context* context) {
    // is there an op that does this smartly with a minimal number of memcpys?
    for (int64_t i = 0; i < blocks; ++i) {
      context->template CopySameDevice<T>(
          block_size, segment_grad, data_grad + block_size * i);
    }
  }
};

struct SumRangeReducerDef {
  template <typename T, class Context>
  using Reducer = SumRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = SumRangeReducerGradient<T, Context>;
  static constexpr const char* name = "Sum";
  static constexpr const char* doc =
      "Summation is done element-wise across slices of the input tensor and "
      "doesn't change the shape of the individual blocks.";
};

// Put forward and backward in the same template?
template <typename T, class Context>
class LogSumExpRangeReducer;
template <typename T, class Context>
class LogSumExpRangeReducerGradient;

template <typename T>
class LogSumExpRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T max_value = std::numeric_limits<T>::lowest();
      for (int i = 0; i < blocks; ++i) {
        max_value = std::max(max_value, in[i * block_size + j]);
      }
      T scaled_exp_sum = 0;
      for (int i = 0; i < blocks; ++i) {
        scaled_exp_sum += std::exp(in[i * block_size + j] - max_value);
      }
      *(out++) = std::log(scaled_exp_sum) + max_value;
    }
  }
  T r{1};
};
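
// The loop above uses the standard max-shift identity to avoid overflow in
// exp(): for m = max_i(x_i),
//
//   log(sum_i exp(x_i)) = m + log(sum_i exp(x_i - m))
//
// e.g. for x = {1000, 1001} the direct sum overflows in float, while the
// shifted form gives 1001 + log(1 + exp(-1)) ~= 1001.313.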

template <typename T, class Context>
class LogSumExpRangeReducerGradient {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* data_in, // I
      const T* data_out, // O
      Context* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      const T offset = *(data_out++);
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        data_grad[idx] = out_grad * std::exp(data_in[idx] - offset);
      }
    }
  }
};
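
// Gradient of LogSumExp: with y = log(sum_i exp(x_i)), dy/dx_i = exp(x_i - y),
// which is exactly what the loop above computes; the forward output
// (data_out) supplies y, so no separate normalization pass is needed.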

struct LogSumExpRangeReducerDef {
  template <typename T, class Context>
  using Reducer = LogSumExpRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = LogSumExpRangeReducerGradient<T, Context>;
  static constexpr const char* name = "LogSumExp";
  static constexpr const char* doc =
      "LogSumExp computes the element-wise log of the sum of exponentials of "
      "input slices. Operation doesn't change the shape of individual blocks.";
};

template <typename T, class Context>
class LogMeanExpRangeReducer;
template <typename T, class Context>
class LogMeanExpRangeReducerGradient;

template <typename T>
class LogMeanExpRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T max_value = std::numeric_limits<T>::lowest();
      for (int i = 0; i < blocks; ++i) {
        max_value = std::max(max_value, in[i * block_size + j]);
      }
      T scaled_exp_sum = 0;
      for (int i = 0; i < blocks; ++i) {
        scaled_exp_sum += std::exp(in[i * block_size + j] - max_value);
      }
      scaled_exp_sum /= blocks;
      *(out++) = std::log(scaled_exp_sum) + max_value;
    }
  }
};

template <typename T, class Context>
class LogMeanExpRangeReducerGradient {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* data_in, // I
      const T* data_out, // O
      Context* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      const T offset = *(data_out++);
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        data_grad[idx] = out_grad * std::exp(data_in[idx] - offset) / blocks;
      }
    }
  }
};

struct LogMeanExpRangeReducerDef {
  template <typename T, class Context>
  using Reducer = LogMeanExpRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = LogMeanExpRangeReducerGradient<T, Context>;
  static constexpr const char* name = "LogMeanExp";
  static constexpr const char* doc =
      "LogMeanExp computes the element-wise log of the mean of exponentials of "
      "input slices. Operation doesn't change the shape of individual blocks.";
};

template <typename T, class Context>
class MeanRangeReducer;
template <typename T, class Context>
class MeanRangeReducerGradient;

template <typename T>
class MeanRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T avg_value = 0;
      for (int i = 0; i < blocks; ++i) {
        avg_value += in[i * block_size + j] / blocks;
      }
      *(out++) = avg_value;
    }
  }
};

template <typename T, class Context>
class MeanRangeReducerGradient {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* /*data_in*/, // I
      const T* /*data_out*/, // O
      Context* /*context*/) {
    const auto in_grad = 1.0 / blocks;
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        data_grad[idx] = out_grad * in_grad;
      }
    }
  }
};

struct MeanRangeReducerDef {
  template <typename T, class Context>
  using Reducer = MeanRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MeanRangeReducerGradient<T, Context>;
  static constexpr const char* name = "Mean";
  static constexpr const char* doc =
      "Mean computation is done element-wise, so that each element of the "
      "output slice corresponds to the average value of the respective "
      "elements in the input slices. Operation doesn't change the shape of "
      "individual blocks.";
};

template <typename T, class Context>
class MaxRangeReducer;
template <typename T, class Context>
class MaxRangeReducerGradient;

template <typename T>
class MaxRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T max_value = std::numeric_limits<T>::lowest();
      for (int i = 0; i < blocks; ++i) {
        max_value = std::max(max_value, in[i * block_size + j]);
      }
      *(out++) = max_value;
    }
  }
};

template <typename T, class Context>
class MaxRangeReducerGradient {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* data_in, // I
      const T* data_out, // O
      Context* /*context*/) {
    std::memset(
        static_cast<void*>(data_grad), 0, blocks * block_size * sizeof(T));
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      const T out = data_out[j];
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        if (out == data_in[idx]) {
          data_grad[idx] = out_grad;
        }
      }
    }
  }
};

struct MaxRangeReducerDef {
  template <typename T, class Context>
  using Reducer = MaxRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MaxRangeReducerGradient<T, Context>;
  static constexpr const char* name = "Max";
  static constexpr const char* doc =
      "Max computation is done element-wise, so that each element of the "
      "output slice corresponds to the max value of the respective "
      "elements in the input slices. Operation doesn't change the shape of "
      "individual blocks. This implementation imitates torch nn.Max operator. "
      "If the maximum value occurs more than once, the operator will return "
      "the first occurrence of the value. When computing the gradient using "
      "backward propagation, the gradient input corresponding to the first "
      "occurrence of the maximum value will be used.";
};

////////////////////////////////////////////////////////////////////////////////
// Incremental reducers: consume elements one by one
////////////////////////////////////////////////////////////////////////////////

// Base implementation; everything can be overridden
class BaseReducer {
 public:
  static constexpr int kInputCount = 1;

  struct Meta {
    int64_t block_size;
    vector<int64_t> block_shape;
    bool first_dim;

    explicit Meta(bool first = true) : first_dim(first) {}

    void computeMeta(at::IntArrayRef dims, size_t skip_dims) {
      first_dim ? block_shape.assign(dims.begin() + skip_dims, dims.end())
                : block_shape.assign(dims.begin(), dims.end() - skip_dims);
      block_size = first_dim ? size_from_dim_(skip_dims, dims)
                             : size_from_dim_(dims.size() - skip_dims, dims);
    }

    void observeInput(int input, const Tensor& value, int skip_dims) {
      DCHECK_EQ(0, input);
      auto dims = value.sizes();
      computeMeta(dims, skip_dims);
    }

    void appendOutputShape(vector<int64_t>* output_shape) {
      output_shape->insert(
          output_shape->end(), block_shape.begin(), block_shape.end());
    }

    vector<int64_t> getOutputShape(const TensorShape& in, int skip_dims) {
      vector<int64_t> dims(in.dims().begin(), in.dims().end());
      computeMeta(dims, skip_dims);
      return block_shape;
    }
  };

  template <int FixedSize>
  void finish(const Meta& /*meta*/, CPUContext* /*context*/) {}
};
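
// The segment/lengths operators defined elsewhere in Caffe2 drive these
// incremental reducers roughly as follows (a simplified sketch, not the exact
// call sites):
//   1. Meta meta; meta.observeInput(i, Input(i), skip_dims) for each input;
//   2. construct the reducer with (meta, output_pointer, context);
//   3. call process<FixedSize>(meta, input_block, offset, context) once per
//      element of the segment, with FixedSize chosen via FixedDispatch;
//   4. call finish<FixedSize>(meta, context) to finalize (e.g. divide by the
//      element count for MeanReducer).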

class BaseReducerGradient {
 public:
  // which of the original inputs are required for gradient computation
  static constexpr std::array<int, 0> originalInputs() {
    return std::array<int, 0>();
  }

  static constexpr bool computeLength() {
    return false;
  }

  static int numAuxInputsWithGrads(const OperatorDef& /*def*/) {
    return 0;
  }

  static bool requiresDataInput(const OperatorDef& /*def*/) {
    return false;
  }

  // True if the backward op requires the output of the forward op.
  static bool requiresForwardOutput() {
    return false;
  }

  struct Meta {
    int64_t block_size;
    vector<int64_t> block_shape;
    bool first_dim;

    Meta(const Tensor& out_grad, int skip_dims, bool first_dim = true)
        : first_dim(first_dim) {
      auto dims = out_grad.sizes();
      first_dim ? block_shape.assign(dims.begin() + skip_dims, dims.end())
                : block_shape.assign(dims.begin(), dims.end() - skip_dims);
      block_size = first_dim
          ? out_grad.size_from_dim(skip_dims)
          : out_grad.size_from_dim(out_grad.dim() - skip_dims);
    }

    void observeOriginalInput(
        int /*original_input*/,
        const Tensor& /*value*/,
        Tensor* /*input_grad*/, // optional grad to populate
        int /*skip_dims*/) {}

    void appendGradShape(vector<int64_t>* output_shape) {
      output_shape->insert(
          output_shape->end(), block_shape.begin(), block_shape.end());
    }
  };
};

// Put forward and backward in the same template?
template <typename T, class Context>
class SumReducer;
template <typename T, class Context>
class SumReducerGradient;

template <typename T>
class SumReducer<T, CPUContext> : public BaseReducer {
 public:
  using FixedDispatch = FixedValues<1>;

  SumReducer(const Meta& meta, T* out, CPUContext* /*context*/)
      : current_size_(0), out_(out) {
    // add a wrapper in Context for it
    if (meta.first_dim) {
      memset(out, 0, sizeof(T) * meta.block_size);
    }
  }
  template <int FixedSize>
  void process(
      const Meta& meta,
      const T* in,
      int64_t /*offset*/,
      CPUContext* context) {
    if (meta.first_dim) {
      math::AxpyFixedSize<T, CPUContext, FixedSize>(
          meta.block_size, 1, in, out_, context);
    } else {
      math::Sum<T, CPUContext>(
          meta.block_size, in, out_ + current_size_++, context);
    }
  }

 private:
  int current_size_;
  T* out_;
};
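
// Usage sketch for SumReducer (illustrative only; buffers, sizes, and the
// directly populated Meta fields are made-up example values, and FixedSize 0
// is assumed to select the generic, non-fixed-size math path):
//
//   std::vector<float> data = {1, 2,   // element 0
//                              3, 4,   // element 1
//                              5, 6};  // element 2
//   std::vector<float> out(2);
//   CPUContext context;
//   BaseReducer::Meta meta;     // first_dim == true
//   meta.block_size = 2;        // normally filled in by observeInput()
//   meta.block_shape = {2};
//   SumReducer<float, CPUContext> reducer(meta, out.data(), &context);
//   for (int64_t i = 0; i < 3; ++i) {
//     reducer.process<0>(meta, data.data() + i * 2, i, &context);
//   }
//   // out == {9, 12}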

template <typename T, class Context>
class SumReducerGradient : public BaseReducerGradient {
 public:
  using FixedDispatch = FixedValues<1>;

  SumReducerGradient(
      const Meta& /*meta*/,
      const T* s_grad,
      CPUContext* /*context*/)
      : s_grad_(s_grad) {}

  template <int FixedSize>
  void fillGrad(
      const Meta& meta,
      T* data_grad,
      int64_t offset,
      Context* context,
      const int length) {
    if (FixedSize == 1) { // static if
      *data_grad = *s_grad_;
    } else if (meta.first_dim) {
      context->template CopySameDevice<T>(meta.block_size, s_grad_, data_grad);
    } else {
      math::Set<T, Context>(length, s_grad_[offset], data_grad, context);
    }
  }

 private:
  const T* s_grad_;
};

struct SumReducerDef {
  template <typename T, class Context>
  using Reducer = SumReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = SumReducerGradient<T, Context>;
  static constexpr const char* name = "Sum";
  static constexpr const char* doc =
      "Summation is done element-wise across slices of the input tensor and "
      "doesn't change the shape of the individual blocks.";
  static void PopulateSchema(OpSchema& /*schema*/) {}
};

// Put forward and backward in the same template?
template <typename T, class Context>
class WeightedSumReducer;
template <typename T, class Context>
class WeightedSumReducerGradient;

template <typename T>
class WeightedSumReducer<T, CPUContext> : public BaseReducer {
 public:
  static constexpr int kInputCount = 2;

  using FixedDispatch = FixedValues<1>;

  struct Meta : BaseReducer::Meta {
    const T* scalars;

    bool first_dim;

    explicit Meta(bool first = true) : first_dim(first) {}

    void observeInput(int input, const Tensor& value, int skip_dims) {
      if (input == 1) {
        CAFFE_ENFORCE_EQ(
            skip_dims, value.dim(), "SCALARS mustn't have extra dimensions");
        scalars = value.data<T>();
        return;
      }
      BaseReducer::Meta::observeInput(input, value, skip_dims);
    }
  };

  WeightedSumReducer(const Meta& meta, T* out, CPUContext* /*context*/)
      : out_(out) {
    // do we have a wrapper for it?
    memset(out, 0, sizeof(T) * meta.block_size);
  }
  template <int FixedSize>
  void
  process(const Meta& meta, const T* in, int64_t offset, CPUContext* context) {
    CAFFE_ENFORCE(
        meta.first_dim,
        "WeightedSumReducer implemented only for "
        "front dimensions reduction");
    math::AxpyFixedSize<T, CPUContext, FixedSize>(
        meta.block_size, meta.scalars[offset], in, out_, context);
  }

 private:
  T* out_;
};
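
// WeightedSumReducer is driven exactly like the SumReducer sketch above,
// except that Meta additionally observes the SCALARS tensor as input 1 and
// process() scales the slice at `offset` before accumulating, i.e.
// out += scalars[offset] * in.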

template <typename T, class Context>
class WeightedSumReducerGradient : public BaseReducerGradient {
 public:
  // which of the original inputs are required for gradient computation
  static constexpr std::array<int, 1> originalInputs() {
    return {{1}};
  }

  static int numAuxInputsWithGrads(const OperatorDef& def) {
    return GetFlagArgument(def, "grad_on_weights");
  }

  static bool requiresDataInput(const OperatorDef& def) {
    return numAuxInputsWithGrads(def) > 0;
  }

  using FixedDispatch = FixedValues<1>;

  struct Meta : public BaseReducerGradient::Meta {
    const T* scalars;
    T* scalars_grad;

    using BaseReducerGradient::Meta::Meta;

    void observeOriginalInput(
        int original_input,
        const Tensor& value,
        Tensor* input_grad, // optional grad to populate
        int /*skip_dims*/) {
      CAFFE_ENFORCE_EQ(1, original_input);
      scalars = value.data<T>();
      if (input_grad) {
        input_grad->ResizeLike(value);
        scalars_grad = input_grad->template mutable_data<T>();
      }
    }
  };

  WeightedSumReducerGradient(
      const Meta& /*meta*/,
      const T* s_grad,
      CPUContext* /*context*/)
      : s_grad_(s_grad) {}

  template <int FixedSize>
  void fillGrad(
      const Meta& meta,
      T* data_grad,
      int64_t offset,
      Context* context,
      const int /*length*/) {
    math::ScaleFixedSize<T, CPUContext, FixedSize>(
        meta.block_size, meta.scalars[offset], s_grad_, data_grad, context);
  }

  // Special version which is called with the main input too, used only if
  // the additional input grad is requested
  template <int FixedSize>
  void fillGradWithMainInput(
      const Meta& meta,
      const T* data,
      T* data_grad,
      int64_t offset,
      Context* context,
      const int /*length*/) {
    math::ScaleFixedSize<T, CPUContext, FixedSize>(
        meta.block_size, meta.scalars[offset], s_grad_, data_grad, context);
    math::Dot(
        meta.block_size, s_grad_, data, meta.scalars_grad + offset, context);
  }

 private:
  const T* s_grad_;
};

struct WeightedSumReducerDef {
  template <typename T, class Context>
  using Reducer = WeightedSumReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = WeightedSumReducerGradient<T, Context>;
  static constexpr const char* name = "WeightedSum";
  static constexpr const char* doc =
      "Input slices are first scaled by SCALARS and then summed element-wise. "
      "It doesn't change the shape of the individual blocks.";
  static void PopulateSchema(OpSchema& schema) {
    schema.Input(0, "DATA", "Input tensor for the summation");
    schema.Input(
        1,
        "SCALARS",
        "Scalar multipliers for the input slices. Must be a vector with the "
        "length matching the number of slices");
    schema.Arg(
        "grad_on_weights",
        "Also produce the gradient for `weights`. For now it's only supported "
        "in `Lengths`-based operators");
  }
};

template <typename T, class Context>
class MeanReducer;
template <typename T, class Context>
class MeanReducerGradient;

template <typename T>
class MeanReducer<T, CPUContext> : public BaseReducer {
 public:
  using FixedDispatch = FixedValues<1>;

  MeanReducer(const Meta& meta, T* out, CPUContext* /*context*/)
      : out_(out), current_size_(0) {
    if (meta.first_dim) {
      memset(out, 0, sizeof(T) * meta.block_size);
    }
  }

  template <int FixedSize>
  void process(
      const Meta& meta,
      const T* in,
      int64_t /*offset*/,
      CPUContext* context) {
    if (meta.first_dim) {
      math::AxpyFixedSize<T, CPUContext, FixedSize>(
          meta.block_size, 1, in, out_, context);
    } else {
      math::Sum<T, CPUContext>(
          meta.block_size, in, out_ + current_size_, context);
    }
    current_size_++;
  }

  template <int FixedSize>
  void finish(const Meta& meta, CPUContext* context) {
    if (meta.first_dim) {
      if (current_size_ > 0) {
        math::ScaleFixedSize<T, CPUContext, FixedSize>(
            meta.block_size, 1.0 / current_size_, out_, out_, context);
      }
    } else {
      math::ScaleFixedSize<T, CPUContext, FixedSize>(
          current_size_, 1.0 / meta.block_size, out_, out_, context);
    }
  }

 private:
  T* out_;
  int current_size_;
};
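
// MeanReducer accumulates a running sum in process() and divides by the
// element count in finish(); e.g. with first_dim == true, processing blocks
// {1, 2}, {3, 4}, and {5, 6} leaves out_ == {9, 12} until finish() rescales
// it to {3, 4}.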

template <typename T, class Context>
class MeanReducerGradient : public BaseReducerGradient {
 public:
  static constexpr bool computeLength() {
    return true;
  }

  using FixedDispatch = FixedValues<1>;

  MeanReducerGradient(
      const Meta& /*meta*/,
      const T* s_grad,
      CPUContext* /*context*/)
      : s_grad_(s_grad) {}

  template <int FixedSize>
  void fillGrad(
      const Meta& meta,
      T* data_grad,
      int64_t offset,
      Context* context,
      const int length) {
    CAFFE_ENFORCE_GT(length, 0, "Segment length must be > 0");
    if (meta.first_dim) {
      math::ScaleFixedSize<T, CPUContext, FixedSize>(
          meta.block_size, 1.0 / length, s_grad_, data_grad, context);
    } else {
      math::Set<T, CPUContext>(
          length, s_grad_[offset] * 1.0f / length, data_grad, context);
    }
  }

 private:
  const T* s_grad_;
};

struct MeanReducerDef {
  template <typename T, class Context>
  using Reducer = MeanReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MeanReducerGradient<T, Context>;
  static constexpr const char* name = "Mean";
  static constexpr const char* doc =
      "Mean computes the element-wise mean of the input slices. "
      "Operation doesn't change the shape of the individual blocks.";
  static void PopulateSchema(OpSchema& /*schema*/) {}
};

template <typename T, class Context>
class MaxReducer;
template <typename T, class Context>
class MaxReducerGradient;

template <typename T>
class MaxReducer<T, CPUContext> : public BaseReducer {
 public:
  using FixedDispatch = FixedValues<1>;

  MaxReducer(const Meta& meta, T* out, CPUContext* /*context*/)
      : out_(out), current_size_(0) {
    // add a wrapper in Context for it
    memset(out, 0, sizeof(T) * meta.block_size);
  }

  template <int FixedSize>
  void process(
      const Meta& meta,
      const T* in,
      int64_t /*offset*/,
      CPUContext* context) {
    CAFFE_ENFORCE(
        meta.first_dim,
        "MaxReducer implemented only for front dimensions reduction");
    if (current_size_ > 0) {
      EigenVectorMap<T> output_vec(out_, meta.block_size);
      output_vec =
          output_vec.cwiseMax(ConstEigenVectorMap<T>(in, meta.block_size));
    } else {
      memcpy(out_, in, sizeof(T) * meta.block_size);
    }
    ++current_size_;
  }

 private:
  T* out_;
  int current_size_;
};
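
// MaxReducer keeps a running element-wise maximum: the first processed block
// is copied into out_, and every later block is folded in with Eigen's
// cwiseMax. E.g. blocks {1, 5}, {4, 2}, {3, 3} produce out_ == {4, 5}.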

template <typename T, class Context>
class MaxReducerGradient : public BaseReducerGradient {
 public:
  static bool requiresDataInput(const OperatorDef& /*def*/) {
    return true;
  }

  static bool requiresForwardOutput() {
    return true;
  }

  using FixedDispatch = FixedValues<1>;

  MaxReducerGradient(
      const Meta& /*meta*/,
      const T* s_grad,
      CPUContext* /*context*/)
      : s_grad_(s_grad) {}

  template <int FixedSize>
  void fillGradWithMainInputAndForwardOutput(
      const Meta& meta,
      const T* data,
      T* data_grad,
      const T* forward_output,
      int64_t /*offset*/,
      Context* /*context*/,
      const int /*length*/) {
    for (int64_t i = 0; i < meta.block_size; ++i) {
      data_grad[i] = data[i] == forward_output[i] ? s_grad_[i] : 0;
    }
  }

 private:
  const T* s_grad_;
};

struct MaxReducerDef {
  template <typename T, class Context>
  using Reducer = MaxReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MaxReducerGradient<T, Context>;
  static constexpr const char* name = "Max";
  static constexpr const char* doc =
      "Max computes the element-wise max of the input slices. "
      "Operation doesn't change the shape of the individual blocks.";
  static void PopulateSchema(OpSchema& /*schema*/) {}
};

} // namespace caffe2

#endif // CAFFE2_OPERATORS_REDUCER_FUNCTORS_H_