// conv_op_impl.h is the templated implementation of the conv_op.h file.
#ifndef CAFFE2_OPERATORS_CONV_OP_IMPL_H_
#define CAFFE2_OPERATORS_CONV_OP_IMPL_H_

#include "caffe2/operators/conv_op.h"

#include <array>
#include <vector>

#include "caffe2/core/context.h"
#include "caffe2/core/flags.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"

namespace caffe2 {

template <typename T, class Context>
bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
  const auto& X = Input(INPUT);
  const auto& filter = Input(FILTER);
  auto* Y = Output(0);
  const int N = X.dim32(0);
  const int C = X.dim32(1);
  const int G = group_;
  CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE_EQ(
      C,
      filter.dim32(1) * G,
      "Convolution op: input channels does not match: # of input channels ",
      C,
      " is not equal to kernel channels * group: ",
      filter.dim32(1),
      "*",
      G);
  CAFFE_ENFORCE_EQ(
      M % G, 0, "The number of output channels is not divisible by group.");

  int kernel_size = 1;
  for (std::size_t i = 0; i < kernel_.size(); ++i) {
    CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
    kernel_size *= kernel_[i];
  }
  ConvPoolOpBase<Context>::SetOutputSize(X, Y, M);

  if (N == 0) {
    Y->template mutable_data<T>();
    return true;
  }

  const vector<int> X_dims = GetDims(X);
  const vector<int> Y_dims = GetDims(*Y);
  const int X_HxW = X.numel() / (N * C);
  const int Y_HxW = Y->numel() / (N * M);
  const vector<int> img_shape(X.sizes().cbegin() + 1, X.sizes().cend());
  vector<int> buffer_shape(Y_dims.size() + 1);
  buffer_shape[0] = C * kernel_size;
  std::copy(Y_dims.cbegin(), Y_dims.cend(), buffer_shape.begin() + 1);

  const int buffer_size = C * kernel_size * Y_HxW;
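  // For one image the col buffer is a (C * kernel_size) x Y_HxW matrix whose
  // columns are the flattened receptive fields of the output locations. As an
  // illustrative example (numbers are hypothetical): C = 3, a 3x3 kernel and a
  // 32x32 output give a 27 x 1024 buffer, and the convolution becomes the GEMM
  // Y[M, 1024] = filter[M, 27] * col_buffer[27, 1024] (grouped below if G > 1).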

  // The dimension of each kernel
  const int kernel_dim = C / G * kernel_size;
  const int X_stride = C * X_HxW;
  const int Y_stride = M * Y_HxW;
  const int filter_stride = filter.numel() / G;

  // The col buffer is stored in CHW order as well - kernel_dim, and the height
  // and width.
  const T* X_data = X.template data<T>();
  const T* filter_data = filter.template data<T>();
  const T* bias_data = nullptr;
  if (InputSize() == 3) {
    const auto& bias = Input(BIAS);
    CAFFE_ENFORCE_EQ(bias.dim(), 1);
    CAFFE_ENFORCE_EQ(bias.dim32(0), M);
    bias_data = bias.template data<T>();
    ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
        Y_HxW, &bias_multiplier_);
  }
  T* Y_data = Y->template mutable_data<T>();

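  // A 1x1 kernel with no padding and unit stride needs no im2col: X is already
  // laid out exactly like the col buffer (C x X_HxW per image), so the
  // convolution collapses to a single GEMM per image; see
  // Run1x1ConvOnDeviceWithOrderNCHW below.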
  // Shortcut for 1x1 conv.
  if (kernel_size == 1 && !HasPad() && !HasStride()) {
    return Run1x1ConvOnDeviceWithOrderNCHW(
        N, C, X_HxW, M, X_data, filter_data, bias_data, Y_data);
  }

  const auto func = [&](Tensor* col_buffer) {
    col_buffer->Resize(buffer_shape);
    T* col_buffer_data = col_buffer->template mutable_data<T>();
    // Im2Col, followed by gemm.
    for (int image_id = 0; image_id < N; ++image_id) {
      if (kernel_.size() == 2) {
        math::Im2Col<T, Context, StorageOrder::NCHW>(
            C,
            X_dims[0],
            X_dims[1],
            kernel_h(),
            kernel_w(),
            dilation_h(),
            dilation_w(),
            pad_t(),
            pad_l(),
            pad_b(),
            pad_r(),
            stride_h(),
            stride_w(),
            X_data,
            col_buffer_data,
            &context_);
      } else {
        math::Im2ColNd<T, Context, StorageOrder::NCHW>(
            kernel_.size(),
            C * X_HxW,
            buffer_size,
            img_shape.data(),
            buffer_shape.data(),
            kernel_.data(),
            stride_.data(),
            dilation_.data(),
            pads_.data(),
            X_data,
            col_buffer_data,
            &context_);
      }
      // Weight term
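      // With group_ == 1 this is one GEMM per image:
      //   Y[M, Y_HxW] = filter[M, kernel_dim] * col_buffer[kernel_dim, Y_HxW].
      // With group_ > 1 a strided batched GEMM runs the G per-group problems,
      // each of size (M / G) x Y_HxW x kernel_dim.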
      if (G == 1) {
        math::Gemm<T, Context>(
            CblasNoTrans,
            CblasNoTrans,
            M,
            Y_HxW,
            kernel_dim,
            1.0f,
            filter_data,
            col_buffer_data,
            0.0f,
            Y_data,
            &context_);
      } else {
        math::GemmStridedBatched<T, Context>(
            CblasNoTrans,
            CblasNoTrans,
            G,
            M / G,
            Y_HxW,
            kernel_dim,
            1.0f,
            filter_data,
            filter_stride,
            col_buffer_data,
            buffer_size / G,
            0.0f,
            Y_data,
            Y_stride / G,
            &context_);
      }
      if (bias_data != nullptr) {
        // Bias term can be carried out outside the group definition
        // to be efficient.
        math::Gemm<T, Context>(
            CblasNoTrans,
            CblasNoTrans,
            M,
            Y_HxW,
            1,
            1.0f,
            bias_data,
            bias_multiplier_.template data<T>(),
            1.0f,
            Y_data,
            &context_);
      }
      X_data += X_stride;
      Y_data += Y_stride;
    }
  };
  if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
    runWithSharedBuffer<Context>(ws_, func);
  } else {
    func(&col_buffer_);
  }
  return true;
}

// The implementations.
template <typename T, class Context>
bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
  CAFFE_ENFORCE_LE(
      kernel_.size(),
      3,
      "Only 1-3d convolution is supported for NHWC storage type");
  const Tensor& X = Input(INPUT);
  const auto& filter = Input(FILTER);
  Tensor* Y = Output(0);
  const int N = X.dim32(0), C = X.dim32(X.dim() - 1);
  const int G = group_;
  CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE_EQ(
      C,
      filter.dim32(filter.dim() - 1) * G,
      "Convolution op: input channels does not match: # of input channels ",
      C,
      " is not equal to kernel channels * group: ",
      filter.dim32(filter.dim() - 1),
      "*",
      G);
  CAFFE_ENFORCE_EQ(
      M % G, 0, "The number of output channels is not divisible by group.");

  int kernel_size = 1;
  for (std::size_t i = 0; i < kernel_.size(); ++i) {
    CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
    kernel_size *= kernel_[i];
  }
  ConvPoolOpBase<Context>::SetOutputSize(X, Y, M);

  if (N == 0) {
    Y->template mutable_data<T>();
    return true;
  }

  const vector<int> Y_dims = GetDims(*Y);
  const int X_HxW = X.numel() / (N * C);
  const int Y_HxW = Y->numel() / (N * M);
  const vector<int> img_shape(X.sizes().cbegin() + 1, X.sizes().cend());
  vector<int> buffer_shape(Y_dims.size() + 1);
  std::copy(Y_dims.cbegin(), Y_dims.cend(), buffer_shape.begin());
  buffer_shape.back() = C * kernel_size;

  const int buffer_size = C * kernel_size * Y_HxW;
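  // Unlike the NCHW path, the col buffer here keeps the spatial output
  // dimensions first and the flattened receptive field (C * kernel_size) last,
  // mirroring the channels-last layout of X and Y.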

  // The dimension of each kernel
  const int kernel_dim = C / G * kernel_size;
  // The offset corresponding to a single input image, and a single output
  // image.
  const int input_offset = X_HxW * C;
  const int output_offset = Y->numel() / Y->dim32(0);

  // The output image size is the spatial size of the output.
  // The col buffer is stored in HWC order as well - the height and width, and
  // kernel_dim.
  const T* X_data = X.template data<T>();
  const T* filter_data = filter.template data<T>();
  const T* bias_data = nullptr;
  if (InputSize() == 3) {
    const auto& bias = Input(BIAS);
    CAFFE_ENFORCE_EQ(bias.dim(), 1);
    CAFFE_ENFORCE_EQ(bias.dim32(0), M);
    bias_data = bias.template data<T>();
  }
  T* Y_data = Y->template mutable_data<T>();

  // Specialized path for 1 by 1 convolution with stride 1, pad 0 - we
  // can skip im2col.
  if (kernel_dim == (C / group_) && !HasPad() && !HasStride()) {
    if (bias_data != nullptr) {
      // For this specialized path, we need a bigger bias_multiplier_ because
      // we're doing just 1 big GEMM.
      ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
          N * X_HxW, &bias_multiplier_);
    }
    return Run1x1ConvOnDeviceWithOrderNHWC(
        N, C, X_HxW, M, X_data, filter_data, bias_data, Y_data);
  }

  if (bias_data != nullptr) {
    ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
        Y_HxW, &bias_multiplier_);
  }
  auto f = [&](Tensor* col_buffer) {
    col_buffer->Resize(buffer_shape);
    T* col_buffer_data = col_buffer->template mutable_data<T>();
    // Im2Col, followed by gemm.
    for (int image_id = 0; image_id < N; ++image_id) {
      if (kernel_.size() <= 2) {
        math::Im2Col<T, Context, StorageOrder::NHWC>(
            C,
            X.dim32(1),
            kernel_.size() == 2 ? X.dim32(2) : 1,
            kernel_h(),
            kernel_.size() == 2 ? kernel_w() : 1,
            dilation_h(),
            kernel_.size() == 2 ? dilation_w() : 1,
            pad_t(),
            kernel_.size() == 2 ? pad_l() : 0,
            kernel_.size() == 2 ? pad_b() : pad_l(),
            kernel_.size() == 2 ? pad_r() : 0,
            stride_h(),
            kernel_.size() == 2 ? stride_w() : 1,
            X_data,
            col_buffer_data,
            &context_,
            group_);
      } else {
        math::Im2ColNd<T, Context, StorageOrder::NHWC>(
            kernel_.size(),
            C * X_HxW,
            buffer_size,
            img_shape.data(),
            buffer_shape.data(),
            kernel_.data(),
            stride_.data(),
            dilation_.data(),
            pads_.data(),
            X_data,
            col_buffer_data,
            &context_,
            group_);
      }
      // Weight term
      for (int group_id = 0; group_id < group_; ++group_id) {
        // col_buffer_data in G (H W) (R S C/G) layout
        // filter_data in G K/G (R S C/G) layout
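        // Each group multiplies its kernel_dim-wide slice of the col buffer
        // (leading dimension group_ * kernel_dim) by the transposed filter
        // slice, filling an (M / group_)-wide column block of Y:
        //   Y_g[Y_HxW, M / G] = col_g[Y_HxW, kernel_dim] * W_g^T.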
        math::GemmEx<T, Context>(
            CblasNoTrans,
            CblasTrans,
            Y_HxW,
            M / group_,
            kernel_dim,
            1,
            col_buffer_data + group_id * kernel_dim,
            group_ * kernel_dim,
            filter_data + group_id * (M / group_) * kernel_dim,
            kernel_dim,
            0,
            Y_data + group_id * (M / group_),
            M,
            &context_);
      }
      if (bias_data != nullptr) {
        // Bias term
        math::Gemm<T, Context>(
            CblasNoTrans,
            CblasNoTrans,
            Y_HxW,
            M,
            1,
            1,
            bias_multiplier_.template data<T>(),
            bias_data,
            1,
            Y_data,
            &context_);
      }
      X_data += input_offset;
      Y_data += output_offset;
    }
  };
  if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
    runWithSharedBuffer<Context>(ws_, f);
  } else {
    f(&col_buffer_);
  }
  return true;
}

template <typename T, class Context>
bool ConvOp<T, Context>::Run1x1ConvOnDeviceWithOrderNCHW(
    const int N,
    const int C,
    const int HxW,
    const int M,
    const T* X,
    const T* filter,
    const T* bias,
    T* Y) {
  const int G = group_;
  if (G == 1) {
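    // One strided batched GEMM over the N images, reusing the same weights for
    // every image (filter stride 0): Y_i[M, HxW] = filter[M, C] * X_i[C, HxW].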
    math::GemmStridedBatched<T, Context>(
        CblasNoTrans,
        CblasNoTrans,
        N,
        M,
        HxW,
        C,
        1.0f,
        filter,
        0,
        X,
        C * HxW,
        0.0f,
        Y,
        M * HxW,
        &context_);
  } else {
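    // With groups, build per-(image, group) pointer arrays and run the N * G
    // small problems in one batched GEMM:
    //   Y_{i,g}[D_Y, HxW] = W_g[D_Y, D_X] * X_{i,g}[D_X, HxW].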
    const int batch_size = N * G;
    const int D_X = C / G;
    const int D_Y = M / G;
    const int X_stride = D_X * HxW;
    const int W_stride = D_Y * D_X;
    const int Y_stride = D_Y * HxW;
    std::vector<const T*> X_ptr(N * G);
    std::vector<const T*> W_ptr(N * G);
    std::vector<T*> Y_ptr(N * G);
    for (int i = 0; i < N; ++i) {
      for (int j = 0; j < G; ++j) {
        const int index = i * G + j;
        X_ptr[index] = X + index * X_stride;
        W_ptr[index] = filter + j * W_stride;
        Y_ptr[index] = Y + index * Y_stride;
      }
    }
    math::GemmBatched<T, Context>(
        CblasNoTrans,
        CblasNoTrans,
        batch_size,
        D_Y,
        HxW,
        D_X,
        1.0f,
        W_ptr.data(),
        X_ptr.data(),
        0.0f,
        Y_ptr.data(),
        &context_);
  }
  if (bias != nullptr) {
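    // Broadcast the bias over all spatial locations with a rank-1 update per
    // image: Y_i[M, HxW] += bias[M, 1] * bias_multiplier[1, HxW].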
    const T* bias_multiplier_data = bias_multiplier_.template data<T>();
    math::GemmStridedBatched<T, Context>(
        CblasNoTrans,
        CblasNoTrans,
        N,
        M,
        HxW,
        1,
        1.0f,
        bias,
        0,
        bias_multiplier_data,
        0,
        1.0f,
        Y,
        M * HxW,
        &context_);
  }
  return true;
}

template <typename T, class Context>
bool ConvOp<T, Context>::Run1x1ConvOnDeviceWithOrderNHWC(
    const int N,
    const int C,
    const int HxW,
    const int M,
    const T* X,
    const T* filter,
    const T* bias,
    T* Y) {
  const int G = group_;
  const int kernel_dim = C / G;
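  // The whole batch is treated as a single (N * HxW) x C channels-last matrix;
  // each group multiplies its kernel_dim-wide slice of the input columns by the
  // transposed filter slice to fill an (M / G)-wide slice of the output columns.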
  for (int group_id = 0; group_id < group_; ++group_id) {
    math::GemmEx<T, Context>(
        CblasNoTrans,
        CblasTrans,
        N * HxW,
        M / group_,
        kernel_dim,
        1.0f,
        X + group_id * kernel_dim,
        C,
        filter + group_id * (M / group_) * kernel_dim,
        kernel_dim,
        0.0f,
        Y + group_id * (M / group_),
        M,
        &context_);
  }
  if (bias != nullptr) {
    const T* bias_multiplier_data = bias_multiplier_.template data<T>();
    math::Gemm<T, Context>(
        CblasNoTrans,
        CblasNoTrans,
        N * HxW,
        M,
        1,
        1.0f,
        bias_multiplier_data,
        bias,
        1.0f,
        Y,
        &context_);
  }
  return true;
}

template <typename T, class Context>
bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& dY = Input(OUTPUT_GRAD);

  const int N = X.dim32(0), C = X.dim32(1);

  const vector<int> input_dims = this->GetDims(X);
  const int input_image_size = this->GetDimsSize(X);

  const vector<int> output_dims = this->GetDims(dY);
  // The output image size is the spatial size of the output.
  const int output_image_size = this->GetDimsSize(dY);

  ConvPoolOpBase<Context>::ComputePads(input_dims);
  CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE_EQ(C, filter.dim32(1) * group_);

  int kernel_dims_size = 1;
  for (std::size_t i = 0; i < kernel_.size(); ++i) {
    CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
    kernel_dims_size *= kernel_[i];
  }

  CAFFE_ENFORCE_EQ(M % group_, 0);
  auto* dfilter = Output(FILTER_GRAD, filter.sizes(), at::dtype<T>());
  // The dimension of each kernel
  const int kernel_dim = C / group_ * kernel_dims_size;
  // The col buffer is stored in CHW order as well - kernel_dim, and the height
  // and width.
  vector<int> img_shape;
  img_shape.assign(X.sizes().begin() + 1, X.sizes().end());
  vector<int> col_buffer_shape;
  col_buffer_shape.push_back(C / group_ * kernel_dims_size);
  col_buffer_shape.insert(
      col_buffer_shape.end(), output_dims.begin(), output_dims.end());
  vector<int64_t> col_buffer_shape_64;
  std::copy(
      col_buffer_shape.cbegin(),
      col_buffer_shape.cend(),
      std::back_inserter(col_buffer_shape_64));
  ReinitializeTensor(
      &col_buffer_,
      col_buffer_shape_64,
      at::dtype<T>().device(Context::GetDeviceType()));

  if (kernel_.size() != 2) {
    // TODO: SetDeviceTensor accept vector<int64_t>
    SetDeviceTensor(img_shape, &img_shape_device_);
    SetDeviceTensor(col_buffer_shape, &col_buffer_shape_device_);
  }

  const int col_buffer_size =
      (C / group_) * kernel_dims_size * output_image_size;
  const T* Xdata = X.template data<T>();
  const T* filter_data = filter.template data<T>();
  const T* dYdata = dY.template data<T>();
  T* col_buffer_data = col_buffer_.template mutable_data<T>();
  T* dfilter_data = dfilter->template mutable_data<T>();

  // Pre-setting the gradients to zero.
  math::Set<T, Context>(dfilter->numel(), 0, dfilter_data, &context_);

  T* dbias_data = nullptr;
  if (!no_bias_) {
    auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<T>());
    // Removed the check for whether bias_multiplier_ has correct size or not
    ReinitializeTensor(
        &bias_multiplier_,
        vector<int64_t>(1, output_image_size),
        at::dtype<T>().device(Context::GetDeviceType()));
    math::Set<T, Context>(
        output_image_size,
        static_cast<T>(1),
        bias_multiplier_.template mutable_data<T>(),
        &context_);
    dbias_data = dbias->template mutable_data<T>();
    math::Set<T, Context>(dbias->numel(), 0, dbias_data, &context_);
  }

  if (N == 0) {
    if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
      auto* dX = Output(
          no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD,
          X.sizes(),
          at::dtype<T>());
      dX->template mutable_data<T>();
    }
    return true;
  }

  // The offset corresponding to a single input image, and a single output
  // image.
  const int input_offset = C / group_ * input_image_size;
  const int output_offset = dY.numel() / dY.dim32(0) / group_;
  const int filter_offset = filter.numel() / group_;
  for (int image_id = 0; image_id < N; ++image_id) {
    for (int group_id = 0; group_id < group_; ++group_id) {
      // When we compute the gradient with respect to the filters, we need to
      // do im2col to allow gemm-type computation.
      if (kernel_.size() == 2) {
        math::Im2Col<T, Context, StorageOrder::NCHW>(
            C / group_,
            input_dims[0],
            input_dims[1],
            kernel_h(),
            kernel_w(),
            dilation_h(),
            dilation_w(),
            pad_t(),
            pad_l(),
            pad_b(),
            pad_r(),
            stride_h(),
            stride_w(),
            Xdata + group_id * input_offset,
            col_buffer_data,
            &context_);
      } else {
        math::Im2ColNd<T, Context, StorageOrder::NCHW>(
            kernel_.size(),
            input_offset,
            col_buffer_size,
            img_shape.data(),
            col_buffer_shape.data(),
            kernel_.data(),
            stride_.data(),
            dilation_.data(),
            pads_.data(),
            Xdata + group_id * input_offset,
            col_buffer_data,
            &context_);
      }
      // Gradient with respect to filter.
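      // Accumulated over all images (beta = 1):
      //   dW_g[M / G, kernel_dim] += dY_g[M / G, out_size] * col_buffer^T.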
      math::Gemm<T, Context>(
          CblasNoTrans,
          CblasTrans,
          M / group_,
          kernel_dim,
          output_image_size,
          1,
          dYdata + group_id * output_offset,
          col_buffer_data,
          1,
          dfilter_data + group_id * filter_offset,
          &context_);
    }
    if (!no_bias_) {
      // Gradient with respect to bias can be computed independent from group.
      math::Gemv<T, Context>(
          CblasNoTrans,
          M,
          output_image_size,
          1,
          dYdata,
          bias_multiplier_.template data<T>(),
          1,
          dbias_data,
          &context_);
    }
    Xdata += input_offset * group_;
    dYdata += output_offset * group_;
  }
  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
    // Compute the gradient w.r.t. the input.

    auto* dX = Output(
        no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>());
    T* dXdata = dX->template mutable_data<T>();
    dYdata = dY.template data<T>();
    for (int image_id = 0; image_id < N; ++image_id) {
      for (int group_id = 0; group_id < group_; ++group_id) {
        // Compute gradient into col_buffer.
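        //   col_buffer[kernel_dim, out_size] = W_g^T * dY_g[M / G, out_size],
        // which Col2Im then scatters (accumulating overlaps) back into this
        // group's slice of dX.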
        math::Gemm<T, Context>(
            CblasTrans,
            CblasNoTrans,
            kernel_dim,
            output_image_size,
            M / group_,
            1,
            filter_data + group_id * filter_offset,
            dYdata,
            0,
            col_buffer_data,
            &context_);
        if (kernel_.size() == 2) {
          math::Col2Im<T, Context, StorageOrder::NCHW>(
              C / group_,
              input_dims[0],
              input_dims[1],
              kernel_h(),
              kernel_w(),
              dilation_h(),
              dilation_w(),
              pad_t(),
              pad_l(),
              pad_b(),
              pad_r(),
              stride_h(),
              stride_w(),
              col_buffer_data,
              dXdata,
              &context_);
        } else {
          math::Col2ImNd<T, Context, StorageOrder::NCHW>(
              kernel_.size(),
              input_offset,
              col_buffer_size,
              img_shape.data(),
              col_buffer_shape.data(),
              kernel_.data(),
              stride_.data(),
              dilation_.data(),
              pads_.data(),
              col_buffer_data,
              dXdata,
              &context_);
        }
        dXdata += input_offset;
        dYdata += output_offset;
      }
    }
  }
  return true;
}

template <typename T, class Context>
bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& dY = Input(OUTPUT_GRAD);

  const int N = X.dim32(0), C = X.dim32(X.dim() - 1);

  const vector<int> input_dims = this->GetDims(X);
  const int input_image_size = this->GetDimsSize(X);

  const vector<int> output_dims = this->GetDims(dY);
  // The output image size is the spatial size of the output.
  const int output_image_size = this->GetDimsSize(dY);

  ConvPoolOpBase<Context>::ComputePads(input_dims);
  CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE_EQ(C, filter.dim32(filter.dim() - 1) * group_);

  int kernel_dims_size = 1;
  for (size_t i = 0; i < kernel_.size(); ++i) {
    CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
    kernel_dims_size *= kernel_[i];
  }

  CAFFE_ENFORCE_EQ(M % group_, 0);
  auto* dfilter = Output(FILTER_GRAD, filter.sizes(), at::dtype<T>());
  // The dimension of each kernel
  const int kernel_dim = C / group_ * kernel_dims_size;

  // The col buffer is stored in HWC order as well - the height and width, and
  // kernel_dim.
  vector<int> img_shape(X.sizes().cbegin() + 1, X.sizes().cend());
  vector<int> col_buffer_shape(output_dims.size() + 1);
  std::copy(output_dims.cbegin(), output_dims.cend(), col_buffer_shape.begin());
  col_buffer_shape.back() = C * kernel_dims_size;
  vector<int64_t> col_buffer_shape_64;
  std::copy(
      col_buffer_shape.cbegin(),
      col_buffer_shape.cend(),
      std::back_inserter(col_buffer_shape_64));
  ReinitializeTensor(
      &col_buffer_,
      col_buffer_shape_64,
      at::dtype<T>().device(Context::GetDeviceType()));

  if (kernel_.size() != 2) {
    SetDeviceTensor(img_shape, &img_shape_device_);
    SetDeviceTensor(col_buffer_shape, &col_buffer_shape_device_);
  }

  const int col_buffer_size = C * kernel_dims_size * output_image_size;
  const T* Xdata = X.template data<T>();
  const T* const filter_data = filter.template data<T>();
  const T* const dYdata = dY.template data<T>();
  T* col_buffer_data = col_buffer_.template mutable_data<T>();
  T* dfilter_data = dfilter->template mutable_data<T>();

  // Pre-setting the gradients to zero.
  math::Set<T, Context>(dfilter->numel(), 0, dfilter_data, &context_);

  T* dbias_data = nullptr;
  if (!no_bias_) {
    auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<T>());
    dbias_data = dbias->template mutable_data<T>();
    math::Set<T, Context>(dbias->numel(), 0, dbias_data, &context_);
    // Removed the check for whether bias_multiplier_ has correct size or not
    ReinitializeTensor(
        &bias_multiplier_,
        vector<int64_t>(1, output_image_size),
        at::dtype<T>().device(Context::GetDeviceType()));
    math::Set<T, Context>(
        output_image_size,
        static_cast<T>(1),
        bias_multiplier_.template mutable_data<T>(),
        &context_);
  }

  if (N == 0) {
    if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
      auto* dX = Output(
          no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD,
          X.sizes(),
          at::dtype<T>());
      dX->template mutable_data<T>();
    }
    return true;
  }

  // The offset corresponding to a single input image, and a single output
  // image.
  const size_t input_offset = C * input_image_size;
  const size_t output_offset = dY.numel() / dY.dim32(0);
  for (int image_id = 0; image_id < N; ++image_id) {
    // When we compute the gradient with respect to the filters, we need to do
    // im2col to allow gemm-type computation.
    if (kernel_.size() <= 2) {
      math::Im2Col<T, Context, StorageOrder::NHWC>(
          C,
          X.size(1),
          kernel_.size() == 2 ? X.dim32(2) : 1,
          kernel_h(),
          kernel_.size() == 2 ? kernel_w() : 1,
          dilation_h(),
          kernel_.size() == 2 ? dilation_w() : 1,
          pad_t(),
          kernel_.size() == 2 ? pad_l() : 0,
          kernel_.size() == 2 ? pad_b() : pad_l(),
          kernel_.size() == 2 ? pad_r() : 0,
          stride_h(),
          kernel_.size() == 2 ? stride_w() : 1,
          Xdata,
          col_buffer_data,
          &context_,
          group_);
    } else {
      math::Im2ColNd<T, Context, StorageOrder::NHWC>(
          kernel_.size(),
          C * input_image_size,
          col_buffer_size,
          img_shape.data(),
          col_buffer_shape.data(),
          kernel_.data(),
          stride_.data(),
          dilation_.data(),
          pads_.data(),
          Xdata,
          col_buffer_data,
          &context_,
          group_);
    }
    // Gradient with respect to filter.
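    // Per group, accumulated over images (beta = 1):
    //   dW_g[M / G, kernel_dim] += dY_g^T * col_g,
    // where dY_g and col_g are this group's column slices of dY and the col
    // buffer for the current image.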
    for (int group_id = 0; group_id < group_; ++group_id) {
      math::GemmEx<T, Context>(
          CblasTrans,
          CblasNoTrans,
          M / group_,
          kernel_dim,
          output_image_size,
          1,
          dYdata + output_offset * image_id + group_id * (M / group_),
          M,
          col_buffer_data + group_id * kernel_dim,
          group_ * kernel_dim,
          1,
          dfilter_data + group_id * (M / group_) * kernel_dim,
          kernel_dim,
          &context_);
    }
    if (!no_bias_) {
      // Gradient with respect to bias
      math::Gemv<T, Context>(
          CblasTrans,
          output_image_size,
          M,
          1,
          dYdata + output_offset * image_id,
          bias_multiplier_.template data<T>(),
          1,
          dbias_data,
          &context_);
    }
    Xdata += input_offset;
  } // for each image

  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
    // Compute the gradient w.r.t. the input.

    auto* dX = Output(
        no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>());
    T* dXdata = dX->template mutable_data<T>();
    for (int image_id = 0; image_id < N; ++image_id) {
      // Compute gradient into col_buffer.
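      //   col_g[out_size, kernel_dim] = dY_g[out_size, M / G] * W_g[M / G, kernel_dim],
      // after which Col2Im scatters the col buffer back into the dX image.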
      for (int group_id = 0; group_id < group_; ++group_id) {
        math::GemmEx<T, Context>(
            CblasNoTrans,
            CblasNoTrans,
            output_image_size,
            kernel_dim,
            M / group_,
            1,
            dYdata + output_offset * image_id + group_id * (M / group_),
            M,
            filter_data + group_id * (M / group_) * kernel_dim,
            kernel_dim,
            0,
            col_buffer_data + group_id * kernel_dim,
            group_ * kernel_dim,
            &context_);
      }
      if (kernel_.size() <= 2) {
        math::Col2Im<T, Context, StorageOrder::NHWC>(
            C,
            X.size(1),
            kernel_.size() == 2 ? X.dim32(2) : 1,
            kernel_h(),
            kernel_.size() == 2 ? kernel_w() : 1,
            dilation_h(),
            kernel_.size() == 2 ? dilation_w() : 1,
            pad_t(),
            kernel_.size() == 2 ? pad_l() : 0,
            kernel_.size() == 2 ? pad_b() : pad_l(),
            kernel_.size() == 2 ? pad_r() : 0,
            stride_h(),
            kernel_.size() == 2 ? stride_w() : 1,
            col_buffer_data,
            dXdata,
            &context_,
            group_);
      } else {
        math::Col2ImNd<T, Context, StorageOrder::NHWC>(
            kernel_.size(),
            C * input_image_size,
            col_buffer_size,
            img_shape.data(),
            col_buffer_shape.data(),
            kernel_.data(),
            stride_.data(),
            dilation_.data(),
            pads_.data(),
            col_buffer_data,
            dXdata,
            &context_,
            group_);
      }
      dXdata += input_offset;
    } // for each image
  }
  return true;
}
} // namespace caffe2

#endif // CAFFE2_OPERATORS_CONV_OP_IMPL_H_