#ifndef CAFFE2_OPERATORS_LARS_OP_H_
#define CAFFE2_OPERATORS_LARS_OP_H_

#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"

namespace caffe2 {

template <typename T, class Context>
class LarsOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  LarsOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        offset_(this->template GetSingleArgument<float>("offset", 0.5)),
        lr_min_(this->template GetSingleArgument<float>("lr_min", 0.02)) {}

  bool RunOnDevice() override {
    auto& X = Input(0);
    auto& dX = Input(1);
    CAFFE_ENFORCE(
        dX.numel() == X.numel(),
        "Gradient size doesn't match parameter size.");
    CAFFE_ENFORCE_GE(offset_, 0);
    CAFFE_ENFORCE_GE(lr_min_, 0);

    auto& wd = Input(2);
    auto& trust = Input(3);
    auto& lr_max = Input(4);

    auto* lr_rescaled = Output(0, vector<int64_t>{1}, at::dtype<T>());

    ReinitializeTensor(
        &X_norm_tensor_, {1}, at::dtype<T>().device(Context::GetDeviceType()));
    T* X_norm_ = X_norm_tensor_.template mutable_data<T>();

    ReinitializeTensor(
        &dX_norm_tensor_,
        {1},
        at::dtype<T>().device(Context::GetDeviceType()));
    T* dX_norm_ = dX_norm_tensor_.template mutable_data<T>();

    // Compute ||X|| and ||dX||, then rescale the learning rate from them.
    ComputeNorms(
        dX.numel(),
        X.template data<T>(),
        dX.template data<T>(),
        X_norm_,
        dX_norm_);

    ComputeLearningRate(
        wd.template data<T>(),
        trust.template data<T>(),
        lr_max.template data<T>(),
        offset_,
        lr_min_,
        X_norm_,
        dX_norm_,
        lr_rescaled->template mutable_data<T>());

    return true;
  }

 private:
  // Compute the l2 norms of X_data and dX_data.
  void ComputeNorms(
      int64_t N,
      const T* X_data,
      const T* dX_data,
      T* X_norm,
      T* dX_norm) {
    math::SumSqr<T, Context>(N, X_data, X_norm, &context_);
    math::Sqrt<T, Context>(1, X_norm, X_norm, &context_);
    math::SumSqr<T, Context>(N, dX_data, dX_norm, &context_);
    math::Sqrt<T, Context>(1, dX_norm, dX_norm, &context_);
  }

  // Compute the learning rate and apply clipping.
  // Defined per device in the corresponding source files.
  void ComputeLearningRate(
      const T* wd,
      const T* trust,
      const T* lr_max,
      T offset,
      T lr_min,
      T* X_norm,
      T* dX_norm,
      T* lr_rescaled);

  T offset_;
  T lr_min_;

  Tensor X_norm_tensor_;
  Tensor dX_norm_tensor_;
};

} // namespace caffe2

#endif // CAFFE2_OPERATORS_LARS_OP_H_
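
// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the operator above: ComputeLearningRate is
// only declared in this header and defined per device elsewhere. The function
// below (name and body are this sketch's own, hypothetical) shows one plausible
// CPU-style definition, assuming the op rescales the learning rate by a LARS
// trust ratio of the form trust / (||dX|| / ||X|| + wd + offset) and then
// clips the result to [lr_min, *lr_max]; the real device implementations may
// differ in detail.

#ifndef CAFFE2_OPERATORS_LARS_OP_SKETCH_H_
#define CAFFE2_OPERATORS_LARS_OP_SKETCH_H_

#include <algorithm>

namespace caffe2 {

template <typename T>
void LarsComputeLearningRateSketch(
    const T* wd,
    const T* trust,
    const T* lr_max,
    T offset,
    T lr_min,
    const T* X_norm,
    const T* dX_norm,
    T* lr_rescaled) {
  // Default to a neutral scale when the parameter norm is zero.
  T val = T(1);
  if (*X_norm > T(0)) {
    // Trust ratio with weight decay and a stabilizing offset in the
    // denominator.
    val = (*trust) / (*dX_norm / *X_norm + (*wd) + offset);
  }
  // Clip the rescaled learning rate to [lr_min, *lr_max].
  *lr_rescaled = std::max(std::min(val, *lr_max), lr_min);
}

} // namespace caffe2

#endif // CAFFE2_OPERATORS_LARS_OP_SKETCH_H_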