// YellowFin: An automatic tuner for momentum SGD
// (https://arxiv.org/abs/1706.03471)
// The YellowFinOp tunes the learning rate and momentum and performs momentum
// SGD steps. A separate learning rate and momentum are maintained for each
// parameter tensor.

#pragma once

#include <algorithm>
#include <cmath>
#include <string>

#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"

namespace caffe2 {

template <typename T, class Context>
class YellowFinOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  YellowFinOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        curv_win_width_(
            this->template GetSingleArgument<int>("curv_win_width", 20)),
        nesterov_(this->template GetSingleArgument<bool>("nesterov", false)),
        zero_debias_(
            this->template GetSingleArgument<bool>("zero_debias", true)),
        epsilon_(this->template GetSingleArgument<T>("epsilon", 1e-6f)),
        beta_(this->template GetSingleArgument<T>("beta", 0.999f)) {}

 protected:
  // GetLrMu and MomentumSgdUpdate have different implementations for GPU and
  // CPU. All other methods are generic. GetLrMu computes the new learning
  // rate and momentum from the statistics gathered in AfterApply (see the
  // note on the tuning rule after AfterApply below).
  void GetLrMu();
  void MomentumSgdUpdate();

  // Updates the moving averages of the gradient statistics used by the
  // tuner: the per-element gradient and squared gradient, the squared
  // gradient norm and its extremes over a sliding window (the curvature
  // range), the gradient variance, and the estimated distance to the
  // optimum. Calls GetLrMu at the end to retune lr and mu.
  void AfterApply() {
    // g
    MovingAverage(D_, grad_, g_avg_, g_avg_out_, g_deb_);

    // g2
    math::Mul(D_, grad_, grad_, aux_vector_, &context_);
    MovingAverage(D_, aux_vector_, g2_avg_, g2_avg_out_, g2_deb_);

    // g_norm2
    math::Dot(D_, grad_, grad_, g_norm2_, &context_);
    math::Maximum(1, epsilon_, g_norm2_, g_norm2_, &context_);
    MovingAverage(1, g_norm2_, g_norm2_avg_, g_norm2_avg_out_, g_norm2_deb_);

    // g_norm
    math::Sqrt(1, g_norm2_, g_norm_, &context_);
    MovingAverage(1, g_norm_, g_norm_avg_, g_norm_avg_out_, g_norm_deb_);
    math::Maximum(1, epsilon_, g_norm_deb_, g_norm_deb_, &context_);

    // Curvature range: g_norm2_min, g_norm2_max. The window stores
    // log(g_norm2); min and max are taken over the valid part of the window,
    // moving-averaged, and exponentiated back after debiasing.
    math::CopyVector(curv_win_width_, curv_win_, curv_win_out_, &context_);
    T* curv_win_cell = curv_win_out_ + (iter_ - 1) % curv_win_width_;
    math::Log(1, g_norm2_, curv_win_cell, &context_);
    int valid_end = std::min(curv_win_width_, iter_);
    math::ReduceMin(
        valid_end, curv_win_out_, g_norm2_min_, &scratch_tensor_, &context_);
    math::ReduceMax(
        valid_end, curv_win_out_, g_norm2_max_, &scratch_tensor_, &context_);
    MovingAverage(
        1,
        g_norm2_min_,
        g_norm2_min_avg_,
        g_norm2_min_avg_out_,
        g_norm2_min_deb_);
    MovingAverage(
        1,
        g_norm2_max_,
        g_norm2_max_avg_,
        g_norm2_max_avg_out_,
        g_norm2_max_deb_);
    math::Exp(1, g_norm2_min_deb_, g_norm2_min_deb_, &context_);
    math::Exp(1, g_norm2_max_deb_, g_norm2_max_deb_, &context_);
    math::Maximum(1, epsilon_, g_norm2_min_deb_, g_norm2_min_deb_, &context_);
    math::Maximum(1, epsilon_, g_norm2_max_deb_, g_norm2_max_deb_, &context_);

    // Gradient variance: E[||g||^2] - ||E[g]||^2, clipped at epsilon.
    math::Dot(D_, g_deb_, g_deb_, aux_scalar_, &context_);
    math::Sub(1, g_norm2_deb_, aux_scalar_, variance_, &context_);
    math::Maximum(1, epsilon_, variance_, variance_, &context_);

    // Distance to opt: ratio of the averaged gradient norm to the averaged
    // squared norm.
    math::Div(1, g_norm_avg_out_, g_norm2_avg_out_, distance_, &context_);
    MovingAverage(
        1, distance_, distance_avg_, distance_avg_out_, distance_deb_);
    if (iter_ > 1) {
      GetLrMu();
    }
  }
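  // Note on the tuning rule (a summary of "SingleStep" from the YellowFin
  // paper linked above, not a guarantee about the device-specific
  // implementations): given the estimated curvature range [h_min, h_max],
  // gradient variance C, and distance to the optimum D, the momentum is
  //
  //   mu = max(x^2, ((sqrt(h_max / h_min) - 1) / (sqrt(h_max / h_min) + 1))^2),
  //
  // where x is the root in (0, 1) of the cubic
  //
  //   x * D^2 * h_min^2 / (2 * C) = (1 - x)^3,
  //
  // and the learning rate is lr = (1 - sqrt(mu))^2 / h_min.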
  // Exponential moving average with zero-debiasing. Writes both the raw
  // average (new_avg) and the debiased average (debias_avg).
  void MovingAverage(
      const int N,
      const T* elt,
      const T* avg,
      T* new_avg,
      T* debias_avg) {
    const T one = 1;
    math::Scale(N, beta_, avg, new_avg, &context_);
    math::Axpy(N, one - beta_, elt, new_avg, &context_);
    math::Scale(N, debias_factor_, new_avg, debias_avg, &context_);
  }

  T ZeroDebiasFactor() {
    if (zero_debias_) {
      const T one = 1;
      return one / (one - std::pow(beta_, iter_));
    } else {
      return 1;
    }
  }
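  // Worked example of the zero-debiasing above (illustrative only, with
  // hypothetical plain-float names; not part of the operator):
  //
  //   float new_avg = beta * avg + (1 - beta) * x;   // biased toward the
  //                                                  // zero initialization
  //   float debiased = new_avg / (1 - std::pow(beta, t));
  //
  // At t = 1 this yields new_avg = (1 - beta) * x and debiased = x, so the
  // bias toward the zero initialization is removed exactly (as in Adam).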
 public:
  bool RunOnDevice() override {
    // Iter lives on the CPU.

#define CAFFE2_YF_READ_INPUT(INPUT_NAME, VAR_NAME)   \
  const auto& VAR_NAME##_tensor = Input(INPUT_NAME); \
  VAR_NAME##_ = VAR_NAME##_tensor.template data<T>();

    CAFFE2_YF_READ_INPUT(PARAM, param)
    CAFFE2_YF_READ_INPUT(MOMENT, moment)
    CAFFE2_YF_READ_INPUT(LR_AVG, lr_avg)
    CAFFE2_YF_READ_INPUT(MU_AVG, mu_avg)
    CAFFE2_YF_READ_INPUT(CURV_WIN, curv_win)
    CAFFE2_YF_READ_INPUT(G_AVG, g_avg)
    CAFFE2_YF_READ_INPUT(G2_AVG, g2_avg)
    CAFFE2_YF_READ_INPUT(SCALARS_MEMORY, scalars_memory)
    CAFFE2_YF_READ_INPUT(GRAD, grad)
#undef CAFFE2_YF_READ_INPUT

    CAFFE_ENFORCE(OperatorBase::InputIsTensorType(ITER, CPU));
    CAFFE_ENFORCE_EQ(lr_avg_tensor.numel(), 1);
    CAFFE_ENFORCE_EQ(mu_avg_tensor.numel(), 1);
    CAFFE_ENFORCE_EQ(param_tensor.dim(), moment_tensor.dim());
    CAFFE_ENFORCE_EQ(param_tensor.dim(), g_avg_tensor.dim());
    CAFFE_ENFORCE_EQ(param_tensor.dim(), g2_avg_tensor.dim());
    CAFFE_ENFORCE_EQ(param_tensor.dim(), grad_tensor.dim());
    for (int i = 0; i < param_tensor.dim(); ++i) {
      CAFFE_ENFORCE_EQ(param_tensor.dim32(i), moment_tensor.dim32(i));
      CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g_avg_tensor.dim32(i));
      CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g2_avg_tensor.dim32(i));
      CAFFE_ENFORCE_EQ(param_tensor.dim32(i), grad_tensor.dim32(i));
    }

    iter_ = OperatorBase::Input<Tensor>(ITER, CPU).template data<int64_t>()[0];

    D_ = param_tensor.numel();

    // Input data - persistent memory for internal scalars.
    // Note: memory for these scalars is allocated during initialization of
    // the network. If you add or remove a scalar, adjust the allocated
    // memory size there accordingly.
    const T* memory_it = scalars_memory_ - 1;
    g_norm_avg_ = ++memory_it;
    g_norm2_avg_ = ++memory_it;
    g_norm2_min_avg_ = ++memory_it;
    g_norm2_max_avg_ = ++memory_it;
    distance_avg_ = ++memory_it;

    // Output data

#define CAFFE2_YF_READ_OUTPUT(OUTPUT_NAME, VAR_NAME)                           \
  auto VAR_NAME##_out_tensor =                                                 \
      Output(OUTPUT_##OUTPUT_NAME, VAR_NAME##_tensor.sizes(), at::dtype<T>()); \
  VAR_NAME##_out_ = VAR_NAME##_out_tensor->template mutable_data<T>();

    CAFFE2_YF_READ_OUTPUT(PARAM, param)
    CAFFE2_YF_READ_OUTPUT(MOMENT, moment)
    CAFFE2_YF_READ_OUTPUT(LR_AVG, lr_avg)
    CAFFE2_YF_READ_OUTPUT(MU_AVG, mu_avg)
    CAFFE2_YF_READ_OUTPUT(CURV_WIN, curv_win)
    CAFFE2_YF_READ_OUTPUT(G_AVG, g_avg)
    CAFFE2_YF_READ_OUTPUT(G2_AVG, g2_avg)
    CAFFE2_YF_READ_OUTPUT(SCALARS_MEMORY, scalars_memory)
#undef CAFFE2_YF_READ_OUTPUT

    T* out_memory_it = scalars_memory_out_ - 1;
    g_norm_avg_out_ = ++out_memory_it;
    g_norm2_avg_out_ = ++out_memory_it;
    g_norm2_min_avg_out_ = ++out_memory_it;
    g_norm2_max_avg_out_ = ++out_memory_it;
    distance_avg_out_ = ++out_memory_it;

#define CAFFE2_YF_INIT_VECTOR(NAME)                     \
  ReinitializeTensor(                                   \
      &NAME##_tensor_,                                  \
      {D_},                                             \
      at::dtype<T>().device(Context::GetDeviceType())); \
  NAME##_ = NAME##_tensor_.template mutable_data<T>();

    CAFFE2_YF_INIT_VECTOR(aux_vector)
    CAFFE2_YF_INIT_VECTOR(g_deb)
    CAFFE2_YF_INIT_VECTOR(g2_deb)
    CAFFE2_YF_INIT_VECTOR(g_deb2)
#undef CAFFE2_YF_INIT_VECTOR

#define CAFFE2_YF_INIT_SCALAR(NAME)                     \
  ReinitializeTensor(                                   \
      &NAME##_tensor_,                                  \
      {1},                                              \
      at::dtype<T>().device(Context::GetDeviceType())); \
  NAME##_ = NAME##_tensor_.template mutable_data<T>();

    CAFFE2_YF_INIT_SCALAR(aux_scalar)
    CAFFE2_YF_INIT_SCALAR(distance)
    CAFFE2_YF_INIT_SCALAR(distance_deb)
    CAFFE2_YF_INIT_SCALAR(g_norm)
    CAFFE2_YF_INIT_SCALAR(g_norm_deb)
    CAFFE2_YF_INIT_SCALAR(g_norm2)
    CAFFE2_YF_INIT_SCALAR(g_norm2_max)
    CAFFE2_YF_INIT_SCALAR(g_norm2_max_deb)
    CAFFE2_YF_INIT_SCALAR(g_norm2_min)
    CAFFE2_YF_INIT_SCALAR(g_norm2_min_deb)
    CAFFE2_YF_INIT_SCALAR(g_norm2_deb)
    CAFFE2_YF_INIT_SCALAR(lr)
    CAFFE2_YF_INIT_SCALAR(lr_deb)
    CAFFE2_YF_INIT_SCALAR(mu_deb)
    CAFFE2_YF_INIT_SCALAR(mu)
    CAFFE2_YF_INIT_SCALAR(variance)
#undef CAFFE2_YF_INIT_SCALAR

    debias_factor_ = ZeroDebiasFactor();
    // Apply the step with the lr/mu tuned on the previous iteration, then
    // refresh the statistics (and retune lr/mu) for the next step.
    MomentumSgdUpdate();
    AfterApply();
    return true;
  }

 protected:
  int curv_win_width_;
  bool nesterov_;
  bool zero_debias_;

  T epsilon_;
  T beta_;
  T debias_factor_;

  int D_;

  // Temporary device memory for all variables used in the calculations.
#define CAFFE2_YF_DEFINE_TENSOR(NAME) \
  Tensor NAME##_tensor_;              \
  T* NAME##_;

  CAFFE2_YF_DEFINE_TENSOR(aux_vector)
  CAFFE2_YF_DEFINE_TENSOR(g_deb)
  CAFFE2_YF_DEFINE_TENSOR(g2_deb)
  CAFFE2_YF_DEFINE_TENSOR(g_deb2)

  CAFFE2_YF_DEFINE_TENSOR(aux_scalar)
  CAFFE2_YF_DEFINE_TENSOR(distance)
  CAFFE2_YF_DEFINE_TENSOR(distance_deb)
  CAFFE2_YF_DEFINE_TENSOR(g_norm)
  CAFFE2_YF_DEFINE_TENSOR(g_norm_deb)
  CAFFE2_YF_DEFINE_TENSOR(g_norm2)
  CAFFE2_YF_DEFINE_TENSOR(g_norm2_deb)
  CAFFE2_YF_DEFINE_TENSOR(g_norm2_max)
  CAFFE2_YF_DEFINE_TENSOR(g_norm2_max_deb)
  CAFFE2_YF_DEFINE_TENSOR(g_norm2_min)
  CAFFE2_YF_DEFINE_TENSOR(g_norm2_min_deb)
  CAFFE2_YF_DEFINE_TENSOR(lr)
  CAFFE2_YF_DEFINE_TENSOR(lr_deb)
  CAFFE2_YF_DEFINE_TENSOR(mu)
  CAFFE2_YF_DEFINE_TENSOR(mu_deb)
  CAFFE2_YF_DEFINE_TENSOR(variance)

  Tensor scratch_tensor_{Context::GetDeviceType()};

#undef CAFFE2_YF_DEFINE_TENSOR

  // Input tensors' data
  const T* param_;
  const T* moment_;
  const T* lr_avg_;
  const T* mu_avg_;
  const T* curv_win_;
  const T* g_avg_;
  const T* g2_avg_;
  const T* scalars_memory_;
  const T* grad_;
  int iter_;

  // Scalar data from the scalars_memory_ input tensor
  const T* g_norm_avg_;
  const T* g_norm2_avg_;
  const T* g_norm2_min_avg_;
  const T* g_norm2_max_avg_;
  const T* distance_avg_;

  // Output tensors' data
  T* param_out_;
  T* moment_out_;
  T* lr_avg_out_;
  T* mu_avg_out_;
  T* curv_win_out_;
  T* g_avg_out_;
  T* g2_avg_out_;
  T* scalars_memory_out_;

  // Scalar data from the scalars_memory_ output tensor
  T* g_norm_avg_out_;
  T* g_norm2_avg_out_;
  T* g_norm2_min_avg_out_;
  T* g_norm2_max_avg_out_;
  T* distance_avg_out_;

  INPUT_TAGS(
      PARAM,
      MOMENT,
      LR_AVG,
      MU_AVG,
      CURV_WIN,
      G_AVG,
      G2_AVG,
      SCALARS_MEMORY,
      GRAD,
      ITER);
  OUTPUT_TAGS(
      OUTPUT_PARAM,
      OUTPUT_MOMENT,
      OUTPUT_LR_AVG,
      OUTPUT_MU_AVG,
      OUTPUT_CURV_WIN,
      OUTPUT_G_AVG,
      OUTPUT_G2_AVG,
      OUTPUT_SCALARS_MEMORY);
};

} // namespace caffe2
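// Wiring note (a summary of the tags above, not additional API surface):
// the op takes ten inputs (PARAM, MOMENT, LR_AVG, MU_AVG, CURV_WIN, G_AVG,
// G2_AVG, SCALARS_MEMORY, GRAD, ITER) and produces eight outputs with the
// same shapes as the first eight inputs, which they update; GRAD and ITER
// are only read. SCALARS_MEMORY must be preallocated with (at least) five
// elements, holding in order the moving averages of g_norm, g_norm2,
// g_norm2_min, g_norm2_max, and the distance estimate, matching the pointer
// carve-up in RunOnDevice.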