1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
| #ifndef CAFFE2_OPERATORS_LARS_OP_H_
| #define CAFFE2_OPERATORS_LARS_OP_H_
|
| #include "caffe2/core/context.h"
| #include "caffe2/core/logging.h"
| #include "caffe2/core/operator.h"
| #include "caffe2/utils/math.h"
|
| namespace caffe2 {
|
| template <typename T, class Context>
| class LarsOp final : public Operator<Context> {
| public:
| USE_OPERATOR_CONTEXT_FUNCTIONS;
| LarsOp(const OperatorDef& operator_def, Workspace* ws)
| : Operator<Context>(operator_def, ws),
| offset_(this->template GetSingleArgument<float>("offset", 0.5)),
| lr_min_(this->template GetSingleArgument<float>("lr_min", 0.02)) {}
|
| bool RunOnDevice() override {
| auto& X = Input(0);
| auto& dX = Input(1);
| CAFFE_ENFORCE(
| dX.numel() == X.numel(), "Gradient size doesn't match parameter size.");
| CAFFE_ENFORCE_GE(offset_, 0);
| CAFFE_ENFORCE_GE(lr_min_, 0);
|
| auto& wd = Input(2);
| auto& trust = Input(3);
| auto& lr_max = Input(4);
|
| auto* lr_rescaled = Output(0, vector<int64_t>{1}, at::dtype<T>());
|
| ReinitializeTensor(&X_norm_tensor_, {1}, at::dtype<T>().device(Context::GetDeviceType()));
| T* X_norm_ = X_norm_tensor_.template mutable_data<T>();
|
| ReinitializeTensor(&dX_norm_tensor_, {1}, at::dtype<T>().device(Context::GetDeviceType()));
| T* dX_norm_ = dX_norm_tensor_.template mutable_data<T>();
|
| ComputeNorms(
| dX.numel(),
| X.template data<T>(),
| dX.template data<T>(),
| X_norm_,
| dX_norm_);
|
| ComputeLearningRate(
| wd.template data<T>(),
| trust.template data<T>(),
| lr_max.template data<T>(),
| offset_,
| lr_min_,
| X_norm_,
| dX_norm_,
| lr_rescaled->template mutable_data<T>());
|
| return true;
| }
|
| private:
| // Compute the l2 norm of X_data and dX_data
| void ComputeNorms(
| int64_t N,
| const T* X_data,
| const T* dX_data,
| T* X_norm,
| T* dX_norm) {
| math::SumSqr(N, X_data, X_norm, &context_);
| math::Sqrt(1, X_norm, X_norm, &context_);
| math::SumSqr(N, dX_data, dX_norm, &context_);
| math::Sqrt(1, dX_norm, dX_norm, &context_);
| }
| // Compute the learning rate and apply clipping
| void ComputeLearningRate(
| const T* wd,
| const T* trust,
| const T* lr_max,
| T offset,
| T lr_min,
| T* X_norm,
| T* dX_norm,
| T* lr_rescaled);
|
| T offset_;
| T lr_min_;
|
| Tensor X_norm_tensor_;
| Tensor dX_norm_tensor_;
| };
|
| } // namespace caffe2
|
| #endif // CAFFE2_OPERATORS_LARS_OP_H_
|
|