#include "representation_layer.h" #include "utils.h" #include "dark_cuda.h" #include "blas.h" #include #include layer make_implicit_layer(int batch, int index, float mean_init, float std_init, int filters, int atoms) { fprintf(stderr,"implicit Layer: %d x %d \t mean=%.2f, std=%.2f \n", filters, atoms, mean_init, std_init); layer l = { (LAYER_TYPE)0 }; l.type = IMPLICIT; l.batch = batch; l.w = 1; l.h = 1; l.c = 1; l.out_w = 1; l.out_h = atoms; l.out_c = filters; l.outputs = l.out_w*l.out_h*l.out_c; l.inputs = 1; l.index = index; l.nweights = l.out_w * l.out_h * l.out_c; l.weight_updates = (float*)xcalloc(l.nweights, sizeof(float)); l.weights = (float*)xcalloc(l.nweights, sizeof(float)); int i; for (i = 0; i < l.nweights; ++i) l.weights[i] = mean_init + rand_uniform(-std_init, std_init); l.delta = (float*)xcalloc(l.outputs * batch, sizeof(float)); l.output = (float*)xcalloc(l.outputs * batch, sizeof(float)); l.forward = forward_implicit_layer; l.backward = backward_implicit_layer; l.update = update_implicit_layer; #ifdef GPU l.forward_gpu = forward_implicit_layer_gpu; l.backward_gpu = backward_implicit_layer_gpu; l.update_gpu = update_implicit_layer_gpu; l.delta_gpu = cuda_make_array(l.delta, l.outputs*batch); l.output_gpu = cuda_make_array(l.output, l.outputs*batch); l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights); l.weights_gpu = cuda_make_array(l.weights, l.nweights); #endif return l; } void resize_implicit_layer(layer *l, int w, int h) { } void forward_implicit_layer(const layer l, network_state state) { int i; #pragma omp parallel for for (i = 0; i < l.nweights * l.batch; ++i) { l.output[i] = l.weights[i % l.nweights]; } } void backward_implicit_layer(const layer l, network_state state) { int i; for (i = 0; i < l.nweights * l.batch; ++i) { l.weight_updates[i % l.nweights] += l.delta[i]; } } void update_implicit_layer(layer l, int batch, float learning_rate_init, float momentum, float decay) { float learning_rate = learning_rate_init*l.learning_rate_scale; //float momentum = a.momentum; //float decay = a.decay; //int batch = a.batch; axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1); axpy_cpu(l.nweights, learning_rate / batch, l.weight_updates, 1, l.weights, 1); scal_cpu(l.nweights, momentum, l.weight_updates, 1); } #ifdef GPU void forward_implicit_layer_gpu(const layer l, network_state state) { forward_implicit_gpu(l.batch, l.nweights, l.weights_gpu, l.output_gpu); } void backward_implicit_layer_gpu(const layer l, network_state state) { backward_implicit_gpu(l.batch, l.nweights, l.weight_updates_gpu, l.delta_gpu); } void update_implicit_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale) { // Loss scale for Mixed-Precision on Tensor-Cores float learning_rate = learning_rate_init*l.learning_rate_scale / loss_scale; //float momentum = a.momentum; //float decay = a.decay; //int batch = a.batch; reset_nan_and_inf(l.weight_updates_gpu, l.nweights); fix_nan_and_inf(l.weights_gpu, l.nweights); if (l.adam) { //adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t); adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.nweights, batch, l.t); } else { //axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); //axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); //scal_ongpu(l.nweights, momentum, 
l.weight_updates_gpu, 1); axpy_ongpu(l.nweights, -decay*batch*loss_scale, l.weights_gpu, 1, l.weight_updates_gpu, 1); axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1); } if (l.clip) { constrain_ongpu(l.nweights, l.clip, l.weights_gpu, 1); } } void pull_implicit_layer(layer l) { cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights); cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, l.nweights); if (l.adam) { cuda_pull_array_async(l.m_gpu, l.m, l.nweights); cuda_pull_array_async(l.v_gpu, l.v, l.nweights); } CHECK_CUDA(cudaPeekAtLastError()); cudaStreamSynchronize(get_cuda_stream()); } void push_implicit_layer(layer l) { cuda_push_array(l.weights_gpu, l.weights, l.nweights); if (l.train) { cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights); } if (l.adam) { cuda_push_array(l.m_gpu, l.m, l.nweights); cuda_push_array(l.v_gpu, l.v, l.nweights); } CHECK_CUDA(cudaPeekAtLastError()); } #endif
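
/* For reference: forward_implicit_gpu() and backward_implicit_gpu() used in the
   GPU path above are implemented outside this file. The sketch below is only an
   illustration of the behavior they are expected to have, mirroring the CPU
   loops in forward_implicit_layer() / backward_implicit_layer(); the kernel
   names and signatures here are assumptions, not the repository's actual code.

   __global__ void forward_implicit_kernel(int batch, int nweights,
                                           const float *weights, float *output)
   {
       // One thread per output element; broadcast the learned weights
       // across the whole batch.
       int i = blockIdx.x * blockDim.x + threadIdx.x;
       if (i < batch * nweights) output[i] = weights[i % nweights];
   }

   __global__ void backward_implicit_kernel(int batch, int nweights,
                                            float *weight_updates, const float *delta)
   {
       // One thread per weight; sum the deltas contributed by every batch
       // item into that weight's gradient (race-free, unlike one thread
       // per delta element).
       int i = blockIdx.x * blockDim.x + threadIdx.x;
       if (i >= nweights) return;
       int b;
       for (b = 0; b < batch; ++b) {
           weight_updates[i] += delta[b * nweights + i];
       }
   }
*/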