#ifndef CAFFE2_SGD_LEARNING_RATE_FUNCTORS_H_
#define CAFFE2_SGD_LEARNING_RATE_FUNCTORS_H_

#include <algorithm>
#include <cmath>
#include <list>
#include <map>
#include <memory>

#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"

namespace caffe2 {

// LearningRateFunctor is a functor that, when fed an iteration number,
// produces the learning rate for that iteration.
template <typename T>
class LearningRateFunctor {
 public:
  virtual ~LearningRateFunctor() {}
  virtual T operator()(const int64_t iter) const = 0;
};

// Fixed: not changing the learning rate at all.
template <typename T>
class FixedLearningRate : public LearningRateFunctor<T> {
 public:
  T operator()(const int64_t /*iter*/) const override {
    return 1.;
  }
};

// Alternate: alternate the learning rate between active_period and
// inactive_period: update for a duration of active_period iterations, then
// stop for a duration of inactive_period iterations if active_first, and
// vice versa.
template <typename T>
class AlternateLearningRate : public LearningRateFunctor<T> {
 public:
  AlternateLearningRate(
      const int64_t active_period,
      const int64_t inactive_period,
      const bool active_first)
      : active_period_(active_period),
        inactive_period_(inactive_period),
        active_first_(active_first) {}
  T operator()(const int64_t iter) const override {
    if (iter % (active_period_ + inactive_period_) <
        (active_first_ ? active_period_ : inactive_period_)) {
      return active_first_ ? 1. : 0.;
    } else {
      return active_first_ ? 0. : 1.;
    }
  }

  int64_t active_period_;
  int64_t inactive_period_;
  bool active_first_;
};
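
// Usage sketch (illustrative values, not from the original source): with
// active_period = 100, inactive_period = 50, active_first = true, every
// 150-iteration cycle is active for its first 100 iterations:
//   AlternateLearningRate<float> lr(100, 50, true);
//   lr(30);   // -> 1.0 (30 % 150 = 30 < 100: active)
//   lr(120);  // -> 0.0 (120 % 150 = 120 >= 100: inactive)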

// Step: return gamma ^ floor(iter / stepsize)
template <typename T>
class StepLearningRate : public LearningRateFunctor<T> {
 public:
  StepLearningRate(const int stepsize, const T gamma)
      : stepsize_(stepsize), gamma_(gamma) {}
  T operator()(const int64_t iter) const override {
    // Integer division floors iter / stepsize for non-negative iter.
    return std::pow(gamma_, static_cast<T>(iter / stepsize_));
  }

  int stepsize_;
  T gamma_;
};
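
// Usage sketch (illustrative values, not from the original source):
//   StepLearningRate<float> lr(/*stepsize=*/100, /*gamma=*/0.1f);
//   lr(250);  // -> 0.1 ^ floor(250 / 100) = 0.1 ^ 2 = 0.01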

// Exp: return gamma ^ iter
template <typename T>
class ExpLearningRate : public LearningRateFunctor<T> {
 public:
  explicit ExpLearningRate(const T gamma) : gamma_(gamma) {}
  T operator()(const int64_t iter) const override {
    return std::pow(gamma_, static_cast<T>(iter));
  }

  T gamma_;
};
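
// Usage sketch (illustrative value, not from the original source):
//   ExpLearningRate<float> lr(/*gamma=*/0.99f);
//   lr(100);  // -> 0.99 ^ 100 ~= 0.366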

// Gate: return multiplier_1 if before num_iter, else multiplier_2
template <typename T>
class GateLearningRate : public LearningRateFunctor<T> {
 public:
  GateLearningRate(
      const T multiplier_1,
      const T multiplier_2,
      const int64_t num_iter)
      : multiplier_1_(multiplier_1),
        multiplier_2_(multiplier_2),
        num_iter_(num_iter) {}
  T operator()(const int64_t iter) const override {
    if (iter >= int64_t(num_iter_)) {
      return T(multiplier_2_);
    }
    return T(multiplier_1_);
  }
  T multiplier_1_;
  T multiplier_2_;
  uint64_t num_iter_;
};
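
// Usage sketch (illustrative values, not from the original source):
//   GateLearningRate<float> lr(/*multiplier_1=*/0.5f, /*multiplier_2=*/1.0f,
//                              /*num_iter=*/1000);
//   lr(500);   // -> 0.5 (before the gate)
//   lr(1500);  // -> 1.0 (at or after the gate)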

// Inv: return (1 + gamma * iter) ^ (-power)
template <typename T>
class InvLearningRate : public LearningRateFunctor<T> {
 public:
  InvLearningRate(const T gamma, const T power)
      : gamma_(gamma), power_(power) {}
  T operator()(const int64_t iter) const override {
    return std::pow(T(1) + gamma_ * iter, -power_);
  }
  T gamma_;
  T power_;
};
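
// Usage sketch (illustrative values, not from the original source):
//   InvLearningRate<float> lr(/*gamma=*/1e-4f, /*power=*/0.75f);
//   lr(10000);  // -> (1 + 1e-4 * 10000) ^ -0.75 = 2 ^ -0.75 ~= 0.595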

// Poly: return (1 - iter/max_iter) ^ power
template <typename T>
class PolyLearningRate : public LearningRateFunctor<T> {
 public:
  PolyLearningRate(const T power, const int64_t max_iter)
      : power_(power), max_iter_(max_iter) {}
  T operator()(const int64_t iter) const override {
    return std::pow(1 - T(iter) / T(max_iter_), power_);
  }
  T power_;
  uint64_t max_iter_;
};
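
// Usage sketch (illustrative values, not from the original source):
//   PolyLearningRate<float> lr(/*power=*/2.0f, /*max_iter=*/1000);
//   lr(500);  // -> (1 - 500/1000) ^ 2 = 0.25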

// LinearWarmup: ramp linearly from start_multiplier to 1 over the first
// num_iter iterations, then return 1
template <typename T>
class LinearWarmupLearningRate : public LearningRateFunctor<T> {
 public:
  LinearWarmupLearningRate(const T start_multiplier, const int64_t num_iter)
      : start_multiplier_(start_multiplier), num_iter_(num_iter) {}
  T operator()(const int64_t iter) const override {
    if (iter >= int64_t(num_iter_)) {
      return 1.;
    }
    return start_multiplier_ +
        (1. - start_multiplier_) * T(iter) / T(num_iter_);
  }
  T start_multiplier_;
  uint64_t num_iter_;
};
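
// Usage sketch (illustrative values, not from the original source):
//   LinearWarmupLearningRate<float> lr(/*start_multiplier=*/0.1f,
//                                      /*num_iter=*/100);
//   lr(50);   // -> 0.1 + 0.9 * 50/100 = 0.55
//   lr(100);  // -> 1.0 (warmup finished)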

// ConstantWarmup: return multiplier when iter < num_iter, and 1 otherwise
template <typename T>
class ConstantWarmupLearningRate : public LearningRateFunctor<T> {
 public:
  ConstantWarmupLearningRate(const T multiplier, const int64_t num_iter)
      : multiplier_(multiplier), num_iter_(num_iter) {}
  T operator()(const int64_t iter) const override {
    if (iter >= int64_t(num_iter_)) {
      return 1.;
    }
    return T(multiplier_);
  }
  T multiplier_;
  uint64_t num_iter_;
};
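
// Usage sketch (illustrative values, not from the original source):
//   ConstantWarmupLearningRate<float> lr(/*multiplier=*/0.1f,
//                                        /*num_iter=*/100);
//   lr(50);   // -> 0.1
//   lr(100);  // -> 1.0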

// PieceWarmup: return m1 when iter < n1, m2 when n1 <= iter < n2, and m3
// otherwise
template <typename T>
class PieceWarmupLearningRate : public LearningRateFunctor<T> {
 public:
  PieceWarmupLearningRate(
      const T m1,
      const int64_t n1,
      const T m2,
      const int64_t n2,
      const T m3)
      : m1_(m1), m2_(m2), m3_(m3), n1_(n1), n2_(n2) {}

  T operator()(const int64_t iter) const override {
    if (iter < int64_t(n1_)) {
      return m1_;
    } else if (iter < int64_t(n2_)) {
      return m2_;
    }
    return m3_;
  }

  T m1_, m2_, m3_;
  uint64_t n1_, n2_;
};
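
// Usage sketch (illustrative values, not from the original source):
//   PieceWarmupLearningRate<float> lr(/*m1=*/0.1f, /*n1=*/100,
//                                     /*m2=*/0.5f, /*n2=*/200, /*m3=*/1.0f);
//   lr(50);   // -> 0.1 (iter < n1)
//   lr(150);  // -> 0.5 (n1 <= iter < n2)
//   lr(250);  // -> 1.0 (iter >= n2)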

// Hill: the learning rate changes according to the following three stages:
// 1) linear warmup (increasing) from start_multiplier over the first num_iter
//    steps
// 2) inverse shrink (decreasing) afterwards, controlled by (gamma, power)
// 3) lower bounded by end_multiplier
template <typename T>
class HillLearningRate : public LearningRateFunctor<T> {
 public:
  HillLearningRate(
      const int64_t num_iter,
      const T start_multiplier,
      const T gamma,
      const T power,
      const T end_multiplier)
      : linear_warmup_lr_(start_multiplier, num_iter),
        inv_lr_(gamma, power),
        num_iter_(num_iter),
        end_multiplier_(end_multiplier) {}
  T operator()(const int64_t iter) const override {
    if (iter < num_iter_) {
      return linear_warmup_lr_(iter);
    } else {
      return std::max(end_multiplier_, inv_lr_(iter - num_iter_));
    }
  }
  LinearWarmupLearningRate<T> linear_warmup_lr_;
  InvLearningRate<T> inv_lr_;
  int64_t num_iter_;
  T end_multiplier_;
};
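
// Usage sketch (illustrative values, not from the original source): warm up
// for 100 iterations, then decay with Inv, never dropping below 0.01:
//   HillLearningRate<float> lr(/*num_iter=*/100, /*start_multiplier=*/0.1f,
//                              /*gamma=*/1e-4f, /*power=*/0.75f,
//                              /*end_multiplier=*/0.01f);
//   lr(50);     // warmup: 0.1 + 0.9 * 50/100 = 0.55
//   lr(10100);  // decay: max(0.01, (1 + 1e-4 * 10000) ^ -0.75) ~= 0.595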

template <typename T>
class CompositeLearningRateItem {
 public:
  CompositeLearningRateItem(
      int64_t num_iter,
      float lr_scale,
      LearningRateFunctor<T>* policy)
      : num_iter_(num_iter), lr_scale_(lr_scale), policy_(policy) {}
  int64_t num_iter_;
  float lr_scale_;
  LearningRateFunctor<T>* policy_;
};

// Composite: the learning rate policy changes according to the current
// iteration number
template <typename T>
class CompositeLearningRate : public LearningRateFunctor<T> {
 public:
  explicit CompositeLearningRate(
      const std::list<CompositeLearningRateItem<T>>& sub_policies) {
    DCHECK_GT(sub_policies.size(), 0);
    int64_t num_iter_start = 1;
    for (auto it = sub_policies.begin(); it != sub_policies.end(); ++it) {
      DCHECK_GT(it->num_iter_, 0);
      // Takes ownership of the raw policy pointer.
      sub_policies_[num_iter_start].reset(it->policy_);
      sub_policy_lr_scales_[num_iter_start] = it->lr_scale_;
      num_iter_start += it->num_iter_;
    }
  }
  T operator()(const int64_t iter) const override {
    auto sub_policy = sub_policies_.upper_bound(iter);
    DCHECK(sub_policy != sub_policies_.begin());
    --sub_policy;
    auto sub_policy_lr_scale = sub_policy_lr_scales_.upper_bound(iter);
    DCHECK(sub_policy_lr_scale != sub_policy_lr_scales_.begin());
    --sub_policy_lr_scale;
    return ((*sub_policy->second)(iter)) * (sub_policy_lr_scale->second);
  }

 private:
  std::map<int64_t, std::unique_ptr<LearningRateFunctor<T>>> sub_policies_;
  std::map<int64_t, float> sub_policy_lr_scales_;
};
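
// Usage sketch (illustrative values, not from the original source). The
// composite takes ownership of the raw policy pointers, and each sub-policy
// is evaluated at the global iter, not at an offset within its segment:
//   std::list<CompositeLearningRateItem<float>> items;
//   items.emplace_back(100, 0.5f, new FixedLearningRate<float>());
//   items.emplace_back(900, 1.0f, new ExpLearningRate<float>(0.999f));
//   CompositeLearningRate<float> lr(items);
//   lr(50);   // -> 1.0 * 0.5 (Fixed policy, scaled by 0.5)
//   lr(500);  // -> 0.999 ^ 500 * 1.0 (Exp policy)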

// Cyclical: return a learning rate with period 2 * stepsize and
// lower bound base_lr, upper bound max_lr.
// See https://arxiv.org/pdf/1506.01186.pdf
template <typename T>
class CyclicalLearningRate : public LearningRateFunctor<T> {
 public:
  CyclicalLearningRate(
      const T base_lr,
      const T max_lr,
      const int stepsize,
      const T decay)
      : base_lr_(base_lr),
        max_lr_(max_lr),
        stepsize_(stepsize),
        decay_(decay) {}
  T operator()(const int64_t iter) const override {
    int64_t cycle = (iter / (2 * stepsize_)) + 1;
    T x = std::abs(static_cast<T>(iter) / stepsize_ - 2 * cycle + 1);
    return 1 +
        (std::abs(max_lr_) / std::abs(base_lr_) - 1) *
        std::max(T(0.0), (1 - x)) *
        std::pow(decay_, static_cast<int>(iter / (2 * stepsize_)));
  }
  T base_lr_;
  T max_lr_;
  int stepsize_;
  T decay_;
};
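
// Usage sketch (illustrative values, not from the original source). The
// returned value is a multiplier on base_lr, peaking at max_lr / base_lr in
// the middle of each cycle:
//   CyclicalLearningRate<float> lr(/*base_lr=*/0.001f, /*max_lr=*/0.006f,
//                                  /*stepsize=*/1000, /*decay=*/1.0f);
//   lr(500);   // -> 1 + (6 - 1) * 0.5 = 3.5
//   lr(1000);  // -> 6.0 (peak of the first cycle)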

// ConstantThenLinearWarmup: first use a constant multiplier
// and then ramp up to the global lr
template <typename T>
class ConstantThenLinearWarmupLearningRate : public LearningRateFunctor<T> {
 public:
  ConstantThenLinearWarmupLearningRate(
      const T start_warmup_multiplier,
      const int64_t constant_warmup_num_iter,
      const int64_t linear_warmup_num_iter)
      : constant_warmup_num_iter_(constant_warmup_num_iter),
        linear_warmup_num_iter_(linear_warmup_num_iter),
        constant_warmup_lr_(start_warmup_multiplier, constant_warmup_num_iter),
        linear_warmup_lr_(start_warmup_multiplier, linear_warmup_num_iter) {}

  T operator()(const int64_t iter) const override {
    if (iter < constant_warmup_num_iter_) {
      return constant_warmup_lr_(iter);
    } else if (iter < constant_warmup_num_iter_ + linear_warmup_num_iter_) {
      return linear_warmup_lr_(iter - constant_warmup_num_iter_);
    } else {
      return 1.0;
    }
  }
  int64_t constant_warmup_num_iter_;
  int64_t linear_warmup_num_iter_;
  ConstantWarmupLearningRate<T> constant_warmup_lr_;
  LinearWarmupLearningRate<T> linear_warmup_lr_;
};
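
// Usage sketch (illustrative values, not from the original source):
//   ConstantThenLinearWarmupLearningRate<float> lr(
//       /*start_warmup_multiplier=*/0.1f,
//       /*constant_warmup_num_iter=*/100,
//       /*linear_warmup_num_iter=*/100);
//   lr(50);   // constant phase: 0.1
//   lr(150);  // linear phase: 0.1 + 0.9 * 50/100 = 0.55
//   lr(250);  // warmup done: 1.0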

// CompositeCyclicalLearningRate: first use a constant multiplier
// and then ramp up to the global lr, and then use a cyclical learning rate
template <typename T>
class CompositeCyclicalLearningRate : public LearningRateFunctor<T> {
 public:
  CompositeCyclicalLearningRate(
      const T base_lr,
      const T start_warmup_multiplier,
      const int64_t constant_warmup_num_iter,
      const int64_t linear_warmup_num_iter,
      const T cyclical_max_lr,
      const int cyclical_step_size,
      const T cyclical_decay)
      : constant_warmup_num_iter_(constant_warmup_num_iter),
        linear_warmup_num_iter_(linear_warmup_num_iter),
        constant_then_linear_warmup_lr_(
            start_warmup_multiplier,
            constant_warmup_num_iter,
            linear_warmup_num_iter),
        cyclical_lr_(
            base_lr,
            cyclical_max_lr,
            cyclical_step_size,
            cyclical_decay) {}

  T operator()(const int64_t iter) const override {
    if (iter < constant_warmup_num_iter_ + linear_warmup_num_iter_) {
      return constant_then_linear_warmup_lr_(iter);
    }
    return cyclical_lr_(
        iter - constant_warmup_num_iter_ - linear_warmup_num_iter_);
  }

  int64_t constant_warmup_num_iter_;
  int64_t linear_warmup_num_iter_;
  ConstantThenLinearWarmupLearningRate<T> constant_then_linear_warmup_lr_;
  CyclicalLearningRate<T> cyclical_lr_;
};
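
// Usage sketch (illustrative values, not from the original source): 200
// iterations of warmup, then cycling between base_lr and cyclical_max_lr:
//   CompositeCyclicalLearningRate<float> lr(
//       /*base_lr=*/0.001f, /*start_warmup_multiplier=*/0.1f,
//       /*constant_warmup_num_iter=*/100, /*linear_warmup_num_iter=*/100,
//       /*cyclical_max_lr=*/0.006f, /*cyclical_step_size=*/1000,
//       /*cyclical_decay=*/1.0f);
//   lr(150);   // still warming up: 0.1 + 0.9 * 50/100 = 0.55
//   lr(1200);  // cyclical phase, evaluated at iter - 200 = 1000 -> 6.0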

} // namespace caffe2

#endif // CAFFE2_SGD_LEARNING_RATE_FUNCTORS_H_