// YellowFin: An automatic tuner for momentum SGD
// (https://arxiv.org/abs/1706.03471)
// The YellowFinOp tunes the learning rate and momentum and performs momentum
// SGD steps. A separate learning rate and momentum are maintained for each
// parameter tensor (e.g. each weight matrix or bias vector).

#pragma once

#include <cmath>
#include <cstring>
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"

namespace caffe2 {

template <typename T, class Context>
class YellowFinOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
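  // Arguments:
  //   curv_win_width - width of the sliding window used to estimate the
  //                    curvature range [h_min, h_max] (default 20)
  //   nesterov       - whether to take Nesterov-style momentum steps
  //   zero_debias    - whether to zero-debias the moving averages, as in Adam
  //   epsilon        - small constant guarding divisions and the curvature
  //                    estimates against degenerate values
  //   beta           - decay rate shared by all moving averages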
  YellowFinOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        curv_win_width_(
            this->template GetSingleArgument<int>("curv_win_width", 20)),
        nesterov_(this->template GetSingleArgument<bool>("nesterov", false)),
        zero_debias_(
            this->template GetSingleArgument<bool>("zero_debias", true)),
        epsilon_(this->template GetSingleArgument<T>("epsilon", 1e-6f)),
        beta_(this->template GetSingleArgument<T>("beta", 0.999f)) {}

 protected:
  // GetLrMu and MomentumSgdUpdate have different implementations for GPU and
  // CPU. All other methods are generic.
  void GetLrMu();
  void MomentumSgdUpdate();
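  // For reference, a sketch of the rules the two methods implement, following
  // the paper: with D = distance to the optimum, C = gradient variance and
  // [h_min, h_max] = curvature range (all estimated in AfterApply below),
  //
  //   MomentumSgdUpdate, heavy-ball form:
  //     moment_out = mu * moment - lr * grad
  //     param_out  = param + moment_out
  //
  //   GetLrMu minimizes  x^2 * D^2 + (1 - x)^4 * C / h_min^2  over
  //   x = sqrt(mu) in (0, 1); the stationary point is a root of the cubic
  //   y^3 + p * y + p = 0 with y = x - 1 and p = D^2 * h_min^2 / (2 * C),
  //   after which
  //     mu = max(x^2, ((sqrt(h_max/h_min) - 1) / (sqrt(h_max/h_min) + 1))^2)
  //     lr = (1 - sqrt(mu))^2 / h_min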

  void AfterApply() {
    // Moving average of the gradient
    MovingAverage(D_, grad_, g_avg_, g_avg_out_, g_deb_);
    // Moving average of the squared gradient
    math::Mul(D_, grad_, grad_, aux_vector_, &context_);
    MovingAverage(D_, aux_vector_, g2_avg_, g2_avg_out_, g2_deb_);
    // Moving average of the squared gradient norm
    math::Dot(D_, grad_, grad_, g_norm2_, &context_);
    math::Maximum(1, epsilon_, g_norm2_, g_norm2_, &context_);
    MovingAverage(1, g_norm2_, g_norm2_avg_, g_norm2_avg_out_, g_norm2_deb_);
    // Moving average of the gradient norm
    math::Sqrt(1, g_norm2_, g_norm_, &context_);
    MovingAverage(1, g_norm_, g_norm_avg_, g_norm_avg_out_, g_norm_deb_);
    math::Maximum(1, epsilon_, g_norm_deb_, g_norm_deb_, &context_);
    // Curvature range [g_norm2_min, g_norm2_max]: extrema of log(||g||^2)
    // over a sliding window of the last curv_win_width_ iterations
    math::CopyVector(curv_win_width_, curv_win_, curv_win_out_, &context_);
    T* curv_win_cell = curv_win_out_ + (iter_ - 1) % curv_win_width_;
    math::Log(1, g_norm2_, curv_win_cell, &context_);
    int valid_end = std::min(curv_win_width_, iter_);
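    // e.g. with curv_win_width_ = 20: at iter_ = 7 only cells 0..6 have been
    // written, so the extrema below scan valid_end = 7 entries; from iter_ = 20
    // onward the window is full and its cells are overwritten cyclically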
    math::ReduceMin(
        valid_end, curv_win_out_, g_norm2_min_, &scratch_tensor_, &context_);
    math::ReduceMax(
        valid_end, curv_win_out_, g_norm2_max_, &scratch_tensor_, &context_);
    MovingAverage(
        1,
        g_norm2_min_,
        g_norm2_min_avg_,
        g_norm2_min_avg_out_,
        g_norm2_min_deb_);
    MovingAverage(
        1,
        g_norm2_max_,
        g_norm2_max_avg_,
        g_norm2_max_avg_out_,
        g_norm2_max_deb_);
    // The window stores logarithms, so map the averaged extrema back
    math::Exp(1, g_norm2_min_deb_, g_norm2_min_deb_, &context_);
    math::Exp(1, g_norm2_max_deb_, g_norm2_max_deb_, &context_);
    math::Maximum(1, epsilon_, g_norm2_min_deb_, g_norm2_min_deb_, &context_);
    math::Maximum(1, epsilon_, g_norm2_max_deb_, g_norm2_max_deb_, &context_);
    // Gradient variance: C = E[||g||^2] - ||E[g]||^2
    math::Dot(D_, g_deb_, g_deb_, aux_scalar_, &context_);
    math::Sub(1, g_norm2_deb_, aux_scalar_, variance_, &context_);
    math::Maximum(1, epsilon_, variance_, variance_, &context_);
    // Distance to the optimum: D = E[||g||] / E[||g||^2]
    math::Div(1, g_norm_avg_out_, g_norm2_avg_out_, distance_, &context_);
    MovingAverage(
        1, distance_, distance_avg_, distance_avg_out_, distance_deb_);
    // lr and mu are tuned starting from the second iteration
    if (iter_ > 1) {
      GetLrMu();
    }
  }

  // One step of the exponential moving average
  //   new_avg = beta * avg + (1 - beta) * elt
  // followed by writing out the zero-debiased value
  //   debias_avg = debias_factor * new_avg
  void MovingAverage(
      const int N,
      const T* elt,
      const T* avg,
      T* new_avg,
      T* debias_avg) {
    const T one = 1;
    math::Scale(N, beta_, avg, new_avg, &context_);
    math::Axpy(N, one - beta_, elt, new_avg, &context_);
    math::Scale(N, debias_factor_, new_avg, debias_avg, &context_);
  }
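
  // A worked example of the correction below: with beta = 0.999 and
  // iter = 100, a zero-initialized average carries only
  // 1 - 0.999^100 ~= 0.095 of the true mean's mass, so ZeroDebiasFactor()
  // returns 1 / (1 - 0.999^100) ~= 10.5 to rescale it.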
  T ZeroDebiasFactor() {
    if (zero_debias_) {
      const T one = 1;
      return one / (one - std::pow(beta_, iter_));
    } else {
      return 1;
    }
  }

 public:
  bool RunOnDevice() override {
    // Iter lives on the CPU

#define CAFFE2_YF_READ_INPUT(INPUT_NAME, VAR_NAME)   \
  const auto& VAR_NAME##_tensor = Input(INPUT_NAME); \
  VAR_NAME##_ = VAR_NAME##_tensor.template data<T>();

    CAFFE2_YF_READ_INPUT(PARAM, param)
    CAFFE2_YF_READ_INPUT(MOMENT, moment)
    CAFFE2_YF_READ_INPUT(LR_AVG, lr_avg)
    CAFFE2_YF_READ_INPUT(MU_AVG, mu_avg)
    CAFFE2_YF_READ_INPUT(CURV_WIN, curv_win)
    CAFFE2_YF_READ_INPUT(G_AVG, g_avg)
    CAFFE2_YF_READ_INPUT(G2_AVG, g2_avg)
    CAFFE2_YF_READ_INPUT(SCALARS_MEMORY, scalars_memory)
    CAFFE2_YF_READ_INPUT(GRAD, grad)
#undef CAFFE2_YF_READ_INPUT

    CAFFE_ENFORCE(OperatorBase::InputIsTensorType(ITER, CPU));
    CAFFE_ENFORCE_EQ(lr_avg_tensor.numel(), 1);
    CAFFE_ENFORCE_EQ(mu_avg_tensor.numel(), 1);
    CAFFE_ENFORCE_EQ(param_tensor.dim(), moment_tensor.dim());
    CAFFE_ENFORCE_EQ(param_tensor.dim(), g_avg_tensor.dim());
    CAFFE_ENFORCE_EQ(param_tensor.dim(), g2_avg_tensor.dim());
    CAFFE_ENFORCE_EQ(param_tensor.dim(), grad_tensor.dim());
    for (int i = 0; i < param_tensor.dim(); ++i) {
      CAFFE_ENFORCE_EQ(param_tensor.dim32(i), moment_tensor.dim32(i));
      CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g_avg_tensor.dim32(i));
      CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g2_avg_tensor.dim32(i));
      CAFFE_ENFORCE_EQ(param_tensor.dim32(i), grad_tensor.dim32(i));
    }

    iter_ = OperatorBase::Input<Tensor>(ITER, CPU).template data<int64_t>()[0];

    D_ = param_tensor.numel();

    // Input data - persistent memory for internal scalars.
    // Note: Memory for these scalars is allocated during network
    // initialization. If you add or remove a scalar here, adjust the
    // allocated size there accordingly.
    const T* memory_it = scalars_memory_;
    g_norm_avg_ = memory_it++;
    g_norm2_avg_ = memory_it++;
    g_norm2_min_avg_ = memory_it++;
    g_norm2_max_avg_ = memory_it++;
    distance_avg_ = memory_it;

    // Output data

#define CAFFE2_YF_READ_OUTPUT(OUTPUT_NAME, VAR_NAME)                           \
  auto VAR_NAME##_out_tensor =                                                 \
      Output(OUTPUT_##OUTPUT_NAME, VAR_NAME##_tensor.sizes(), at::dtype<T>()); \
  VAR_NAME##_out_ = VAR_NAME##_out_tensor->template mutable_data<T>();

    CAFFE2_YF_READ_OUTPUT(PARAM, param)
    CAFFE2_YF_READ_OUTPUT(MOMENT, moment)
    CAFFE2_YF_READ_OUTPUT(LR_AVG, lr_avg)
    CAFFE2_YF_READ_OUTPUT(MU_AVG, mu_avg)
    CAFFE2_YF_READ_OUTPUT(CURV_WIN, curv_win)
    CAFFE2_YF_READ_OUTPUT(G_AVG, g_avg)
    CAFFE2_YF_READ_OUTPUT(G2_AVG, g2_avg)
    CAFFE2_YF_READ_OUTPUT(SCALARS_MEMORY, scalars_memory)
#undef CAFFE2_YF_READ_OUTPUT

    T* out_memory_it = scalars_memory_out_;
    g_norm_avg_out_ = out_memory_it++;
    g_norm2_avg_out_ = out_memory_it++;
    g_norm2_min_avg_out_ = out_memory_it++;
    g_norm2_max_avg_out_ = out_memory_it++;
    distance_avg_out_ = out_memory_it;

#define CAFFE2_YF_INIT_VECTOR(NAME)                                                           \
  ReinitializeTensor(&NAME##_tensor_, {D_}, at::dtype<T>().device(Context::GetDeviceType())); \
  NAME##_ = NAME##_tensor_.template mutable_data<T>();

    CAFFE2_YF_INIT_VECTOR(aux_vector)
    CAFFE2_YF_INIT_VECTOR(g_deb)
    CAFFE2_YF_INIT_VECTOR(g2_deb)
    CAFFE2_YF_INIT_VECTOR(g_deb2)
#undef CAFFE2_YF_INIT_VECTOR

#define CAFFE2_YF_INIT_SCALAR(NAME)                                                          \
  ReinitializeTensor(&NAME##_tensor_, {1}, at::dtype<T>().device(Context::GetDeviceType())); \
  NAME##_ = NAME##_tensor_.template mutable_data<T>();

    CAFFE2_YF_INIT_SCALAR(aux_scalar)
    CAFFE2_YF_INIT_SCALAR(distance)
    CAFFE2_YF_INIT_SCALAR(distance_deb)
    CAFFE2_YF_INIT_SCALAR(g_norm)
    CAFFE2_YF_INIT_SCALAR(g_norm_deb)
    CAFFE2_YF_INIT_SCALAR(g_norm2)
    CAFFE2_YF_INIT_SCALAR(g_norm2_max)
    CAFFE2_YF_INIT_SCALAR(g_norm2_max_deb)
    CAFFE2_YF_INIT_SCALAR(g_norm2_min)
    CAFFE2_YF_INIT_SCALAR(g_norm2_min_deb)
    CAFFE2_YF_INIT_SCALAR(g_norm2_deb)
    CAFFE2_YF_INIT_SCALAR(lr)
    CAFFE2_YF_INIT_SCALAR(lr_deb)
    CAFFE2_YF_INIT_SCALAR(mu_deb)
    CAFFE2_YF_INIT_SCALAR(mu)
    CAFFE2_YF_INIT_SCALAR(variance)
#undef CAFFE2_YF_INIT_SCALAR

    debias_factor_ = ZeroDebiasFactor();
    // Perform the momentum SGD step, then update the statistics and re-tune
    // lr and mu for the next step.
    MomentumSgdUpdate();
    AfterApply();
    return true;
  }
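
  // A minimal wiring sketch (blob names hypothetical). Every state blob is
  // passed both as an input and as an in-place output so that it persists
  // across steps; "iter" must be an int64 tensor living on the CPU:
  //
  //   op {
  //     type: "YellowFin"
  //     input: "param"   input: "moment"  input: "lr_avg"   input: "mu_avg"
  //     input: "curv_win"  input: "g_avg"  input: "g2_avg"
  //     input: "scalars_memory"  input: "grad"  input: "iter"
  //     output: "param"  output: "moment"  output: "lr_avg"  output: "mu_avg"
  //     output: "curv_win"  output: "g_avg"  output: "g2_avg"
  //     output: "scalars_memory"
  //   }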

 protected:
  int curv_win_width_;
  bool nesterov_;
  bool zero_debias_;

  T epsilon_;
  T beta_;
  T debias_factor_;

  int D_;

  // Temporary device memory for all variables used in the calculations
#define CAFFE2_YF_DEFINE_TENSOR(NAME) \
  Tensor NAME##_tensor_;              \
  T* NAME##_;

  CAFFE2_YF_DEFINE_TENSOR(aux_vector)
  CAFFE2_YF_DEFINE_TENSOR(g_deb)
  CAFFE2_YF_DEFINE_TENSOR(g2_deb)
  CAFFE2_YF_DEFINE_TENSOR(g_deb2)

  CAFFE2_YF_DEFINE_TENSOR(aux_scalar)
  CAFFE2_YF_DEFINE_TENSOR(distance)
  CAFFE2_YF_DEFINE_TENSOR(distance_deb)
  CAFFE2_YF_DEFINE_TENSOR(g_norm)
  CAFFE2_YF_DEFINE_TENSOR(g_norm_deb)
  CAFFE2_YF_DEFINE_TENSOR(g_norm2)
  CAFFE2_YF_DEFINE_TENSOR(g_norm2_deb)
  CAFFE2_YF_DEFINE_TENSOR(g_norm2_max)
  CAFFE2_YF_DEFINE_TENSOR(g_norm2_max_deb)
  CAFFE2_YF_DEFINE_TENSOR(g_norm2_min)
  CAFFE2_YF_DEFINE_TENSOR(g_norm2_min_deb)
  CAFFE2_YF_DEFINE_TENSOR(lr)
  CAFFE2_YF_DEFINE_TENSOR(lr_deb)
  CAFFE2_YF_DEFINE_TENSOR(mu)
  CAFFE2_YF_DEFINE_TENSOR(mu_deb)
  CAFFE2_YF_DEFINE_TENSOR(variance)

  Tensor scratch_tensor_{Context::GetDeviceType()};

#undef CAFFE2_YF_DEFINE_TENSOR

  // Input tensors' data
  const T* param_;
  const T* moment_;
  const T* lr_avg_;
  const T* mu_avg_;
  const T* curv_win_;
  const T* g_avg_;
  const T* g2_avg_;
  const T* scalars_memory_;
  const T* grad_;
  int iter_;

  // Scalar data from the scalars_memory_ input tensor
  const T* g_norm_avg_;
  const T* g_norm2_avg_;
  const T* g_norm2_min_avg_;
  const T* g_norm2_max_avg_;
  const T* distance_avg_;

  // Output tensors' data
  T* param_out_;
  T* moment_out_;
  T* lr_avg_out_;
  T* mu_avg_out_;
  T* curv_win_out_;
  T* g_avg_out_;
  T* g2_avg_out_;
  T* scalars_memory_out_;

  // Scalar data from the scalars_memory_ output tensor
  T* g_norm_avg_out_;
  T* g_norm2_avg_out_;
  T* g_norm2_min_avg_out_;
  T* g_norm2_max_avg_out_;
  T* distance_avg_out_;

  INPUT_TAGS(
      PARAM,
      MOMENT,
      LR_AVG,
      MU_AVG,
      CURV_WIN,
      G_AVG,
      G2_AVG,
      SCALARS_MEMORY,
      GRAD,
      ITER);
  OUTPUT_TAGS(
      OUTPUT_PARAM,
      OUTPUT_MOMENT,
      OUTPUT_LR_AVG,
      OUTPUT_MU_AVG,
      OUTPUT_CURV_WIN,
      OUTPUT_G_AVG,
      OUTPUT_G2_AVG,
      OUTPUT_SCALARS_MEMORY);
};

} // namespace caffe2