#include "representation_layer.h" #include "utils.h" #include "dark_cuda.h" #include "blas.h" #include #include layer make_implicit_layer(int batch, int index, float mean_init, float std_init, int filters, int atoms) { fprintf(stderr,"implicit Layer: %d x %d \t mean=%.2f, std=%.2f \n", filters, atoms, mean_init, std_init); layer l = { (LAYER_TYPE)0 }; l.type = IMPLICIT; l.batch = batch; l.w = 1; l.h = 1; l.c = 1; l.out_w = 1; l.out_h = atoms; l.out_c = filters; l.outputs = l.out_w*l.out_h*l.out_c; l.inputs = 1; l.index = index; l.nweights = l.out_w * l.out_h * l.out_c; l.weight_updates = (float*)xcalloc(l.nweights, sizeof(float)); l.weights = (float*)xcalloc(l.nweights, sizeof(float)); int i; for (i = 0; i < l.nweights; ++i) l.weights[i] = mean_init + rand_uniform(-std_init, std_init); l.delta = (float*)xcalloc(l.outputs * batch, sizeof(float)); l.output = (float*)xcalloc(l.outputs * batch, sizeof(float)); l.forward = forward_implicit_layer; l.backward = backward_implicit_layer; l.update = update_implicit_layer; #ifdef GPU l.forward_gpu = forward_implicit_layer_gpu; l.backward_gpu = backward_implicit_layer_gpu; l.update_gpu = update_implicit_layer_gpu; l.delta_gpu = cuda_make_array(l.delta, l.outputs*batch); l.output_gpu = cuda_make_array(l.output, l.outputs*batch); l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights); l.weights_gpu = cuda_make_array(l.weights, l.nweights); #endif return l; } void resize_implicit_layer(layer *l, int w, int h) { } void forward_implicit_layer(const layer l, network_state state) { int i; #pragma omp parallel for for (i = 0; i < l.nweights * l.batch; ++i) { l.output[i] = l.weights[i % l.nweights]; } } void backward_implicit_layer(const layer l, network_state state) { int i; for (i = 0; i < l.nweights * l.batch; ++i) { l.weight_updates[i % l.nweights] += l.delta[i]; } } void update_implicit_layer(layer l, int batch, float learning_rate_init, float momentum, float decay) { float learning_rate = learning_rate_init*l.learning_rate_scale; //float momentum = a.momentum; //float decay = a.decay; //int batch = a.batch; axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1); axpy_cpu(l.nweights, learning_rate / batch, l.weight_updates, 1, l.weights, 1); scal_cpu(l.nweights, momentum, l.weight_updates, 1); } #ifdef GPU void forward_implicit_layer_gpu(const layer l, network_state state) { forward_implicit_gpu(l.batch, l.nweights, l.weights_gpu, l.output_gpu); } void backward_implicit_layer_gpu(const layer l, network_state state) { backward_implicit_gpu(l.batch, l.nweights, l.weight_updates_gpu, l.delta_gpu); } void update_implicit_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale) { // Loss scale for Mixed-Precision on Tensor-Cores float learning_rate = learning_rate_init*l.learning_rate_scale / loss_scale; //float momentum = a.momentum; //float decay = a.decay; //int batch = a.batch; reset_nan_and_inf(l.weight_updates_gpu, l.nweights); fix_nan_and_inf(l.weights_gpu, l.nweights); if (l.adam) { //adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t); adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.nweights, batch, l.t); } else { //axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); //axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); //scal_ongpu(l.nweights, momentum, 
l.weight_updates_gpu, 1); axpy_ongpu(l.nweights, -decay*batch*loss_scale, l.weights_gpu, 1, l.weight_updates_gpu, 1); axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1); } if (l.clip) { constrain_ongpu(l.nweights, l.clip, l.weights_gpu, 1); } } void pull_implicit_layer(layer l) { cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights); cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, l.nweights); if (l.adam) { cuda_pull_array_async(l.m_gpu, l.m, l.nweights); cuda_pull_array_async(l.v_gpu, l.v, l.nweights); } CHECK_CUDA(cudaPeekAtLastError()); cudaStreamSynchronize(get_cuda_stream()); } void push_implicit_layer(layer l) { cuda_push_array(l.weights_gpu, l.weights, l.nweights); if (l.train) { cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights); } if (l.adam) { cuda_push_array(l.m_gpu, l.m, l.nweights); cuda_push_array(l.v_gpu, l.v, l.nweights); } CHECK_CUDA(cudaPeekAtLastError()); } #endif
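
/* For reference: forward_implicit_gpu() and backward_implicit_gpu() used in the
   GPU path above are implemented outside this file. The sketch below is only an
   illustration of the behavior they are expected to have, mirroring the CPU
   loops in forward_implicit_layer() / backward_implicit_layer(); the kernel
   names and signatures here are assumptions, not the repository's actual code.

   __global__ void forward_implicit_kernel(int batch, int nweights,
                                           const float *weights, float *output)
   {
       // One thread per output element; broadcast the learned weights
       // across the whole batch.
       int i = blockIdx.x * blockDim.x + threadIdx.x;
       if (i < batch * nweights) output[i] = weights[i % nweights];
   }

   __global__ void backward_implicit_kernel(int batch, int nweights,
                                            float *weight_updates, const float *delta)
   {
       // One thread per weight; sum the deltas contributed by every batch
       // item into that weight's gradient (race-free, unlike one thread
       // per delta element).
       int i = blockIdx.x * blockDim.x + threadIdx.x;
       if (i >= nweights) return;
       int b;
       for (b = 0; b < batch; ++b) {
           weight_updates[i] += delta[b * nweights + i];
       }
   }
*/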