From 168af40fe9a3cc81c6ee16b3e81f154780c36bdb Mon Sep 17 00:00:00 2001
From: Scheaven <xuepengqiang>
Date: Thu, 03 Jun 2021 15:03:27 +0800
Subject: [PATCH] up new v4

---
 lib/detecter_tools/darknet/convolutional_kernels.cu | 2798 ++++++++++++++++++++++++++++++----------------------------
 1 file changed, 1436 insertions(+), 1362 deletions(-)

diff --git a/lib/detecter_tools/darknet/convolutional_kernels.cu b/lib/detecter_tools/darknet/convolutional_kernels.cu
index 20ae7b5..ddd140c 100644
--- a/lib/detecter_tools/darknet/convolutional_kernels.cu
+++ b/lib/detecter_tools/darknet/convolutional_kernels.cu
@@ -1,1362 +1,1436 @@
-#include <cuda_runtime.h> -#include <curand.h> -#include <cublas_v2.h> - -#include "convolutional_layer.h" -#include "batchnorm_layer.h" -#include "gemm.h" -#include "blas.h" -#include "im2col.h" -#include "col2im.h" -#include "utils.h" -#include "dark_cuda.h" -#include "box.h" - - -__global__ void binarize_kernel(float *x, int n, float *binary) -{ - int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; - if (i >= n) return; - binary[i] = (x[i] >= 0) ? 1 : -1; -} - -void binarize_gpu(float *x, int n, float *binary) -{ - binarize_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>>(x, n, binary); - CHECK_CUDA(cudaPeekAtLastError()); -} - -__global__ void binarize_input_kernel(float *input, int n, int size, float *binary) -{ - int s = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; - if (s >= size) return; - int i = 0; - float mean = 0; - for(i = 0; i < n; ++i){ - mean += fabs(input[i*size + s]); - } - mean = mean / n; - for(i = 0; i < n; ++i){ - binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean; - } -} - -void binarize_input_gpu(float *input, int n, int size, float *binary) -{ - binarize_input_kernel<<<cuda_gridsize(size), BLOCK, 0, get_cuda_stream() >>>(input, n, size, binary); - CHECK_CUDA(cudaPeekAtLastError()); -} - -__global__ void binarize_weights_kernel(float *weights, int n, int size, float *binary) -{ - int f = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; - if (f >= n) return; - int i = 0; - float mean = 0; - for (i = 0; i < size; ++i) { - mean += fabs(weights[f*size + i]); - } - mean = mean / size; - for (i = 0; i < size; ++i) { - binary[f*size + i] = (weights[f*size + i] > 0) ?
mean : -mean; - //binary[f*size + i] = weights[f*size + i]; - } -} - -void binarize_weights_gpu(float *weights, int n, int size, float *binary) -{ - binarize_weights_kernel << <cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >> >(weights, n, size, binary); - CHECK_CUDA(cudaPeekAtLastError()); -} - - -__global__ void set_zero_kernel(float *src, int size) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) src[i] = 0; -} - -__inline__ __device__ -float warpAllReduceSum(float val) { - for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2) -#if CUDART_VERSION >= 9000 - val += __shfl_xor_sync(0xffffffff, val, mask); -#else - val += __shfl_xor(val, mask); -#endif - return val; -} - -// only if (size % 32 == 0) -__global__ void reduce_kernel(float *weights, int n, int size, float *mean_arr_gpu) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - int f = i / size; - if (f >= n) return; - float warp_mean = warpAllReduceSum(fabs(weights[i])); - if(i % 32 == 0) - atomicAdd(&mean_arr_gpu[f], warp_mean / size); -} - -__global__ void binarize_weights_mean_kernel(float *weights, int n, int size, float *binary, float *mean_arr_gpu) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - int f = i / size; - if (f >= n) return; - float mean = mean_arr_gpu[f]; - binary[i] = (weights[i] > 0) ? mean : -mean; -} - -void fast_binarize_weights_gpu(float *weights, int n, int size, float *binary, float *mean_arr_gpu) -{ - if (size % 32 == 0) { - size_t gridsize = n * size; - const int num_blocks = get_number_of_blocks(gridsize, BLOCK);// gridsize / BLOCK + 1; - - set_zero_kernel << <(n/BLOCK + 1), BLOCK, 0, get_cuda_stream() >> > (mean_arr_gpu, n); - reduce_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (weights, n, size, mean_arr_gpu); - binarize_weights_mean_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (weights, n, size, binary, mean_arr_gpu); - CHECK_CUDA(cudaPeekAtLastError()); - } - else { - binarize_weights_gpu(weights, n, size, binary); - } -} - - -__global__ void cuda_f32_to_f16(float* input_f32, size_t size, half *output_f16) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) output_f16[idx] = __float2half(input_f32[idx]); - //if (idx < size) output_f16[idx] = __float2half_rn(input_f32[idx]); // can't be compiled on Linux without casting - // __float2half_ru, __float2half_rd, __float2half_rz, __float2half_rn - //if (idx < size) *((unsigned short *)output_f16 + idx) = __float2half(input_f32[idx]); -} - -void cuda_convert_f32_to_f16(float* input_f32, size_t size, float *output_f16) { - cuda_f32_to_f16 <<< get_number_of_blocks(size, BLOCK), BLOCK, 0, get_cuda_stream() >>> (input_f32, size, (half *)output_f16); - CHECK_CUDA(cudaPeekAtLastError()); -} - -__global__ void cuda_f16_to_f32(half* input_f16, size_t size, float *output_f32) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) output_f32[idx] = __half2float(input_f16[idx]); - //if (idx < size) output_f32[idx] = __half2float(*((unsigned short *)input_f16 + idx)); -} - -void cuda_convert_f16_to_f32(float* input_f16, size_t size, float *output_f32) { - cuda_f16_to_f32 <<< get_number_of_blocks(size, BLOCK), BLOCK, 0, get_cuda_stream() >>> ((half *)input_f16, size, output_f32); - CHECK_CUDA(cudaPeekAtLastError()); -} - -half *cuda_make_f16_from_f32_array(float *src, size_t n) -{ - half *dst16; - size_t size = sizeof(half)*n; - CHECK_CUDA(cudaMalloc((void **)&dst16, size)); - if (src) { - assert(n > 0); - cuda_convert_f32_to_f16(src, n, (float *)dst16); - } - if (!dst16) error("Cuda 
malloc failed\n"); - return dst16; -} - -void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) -{ - //fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); - if(l.binary){ - binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu); - swap_binary(&l); - } - - if(l.xnor){ - if (!l.align_bit_weights_gpu || state.train) { - //binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu); - - fast_binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu, l.mean_arr_gpu); - } - - if (l.align_bit_weights_gpu && !state.train && l.c >= 32 && l.stride_x == l.stride_y) - { - //return; - //cudaError_t status = cudaSuccess; - //int input_size = l.c*l.h*l.w*l.batch; - - int m = l.n / l.groups; - int k = l.size*l.size*l.c / l.groups; - int n = l.out_w*l.out_h; - //float * a = l.weights_gpu; - - // int i, j; - // for(i = 0; i < l.batch; ++i){ - // for (j = 0; j < l.groups; ++j) { - - int ldb_align = l.lda_align; - size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8; - //size_t t_intput_size = new_ldb * n; - //size_t t_bit_input_size = t_intput_size / 8;// +1; - - if (l.c % 32 == 0) - { - //printf("\n\n l.index = %d, l.w = %d, l.c = %d, l.n = %d, l.stride = %d, l.pad = %d - new XNOR \n", l.index, l.w, l.c, l.n, l.stride, l.pad); - //printf("l.align_workspace_size = %d, (l.c * l.w * l.h) = %d \n", l.align_workspace_size, (l.c * l.w * l.h)); - - //float *intput_cpu = (float *)calloc(l.inputs, sizeof(float)); - // state.input - //cudaMemcpy(intput_cpu, state.input, l.inputs * sizeof(float), cudaMemcpyDefault); - - int ldb_align = l.lda_align; - size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8; - //size_t t_intput_size = new_ldb * l.bit_align;// n; - //size_t t_bit_input_size = t_intput_size / 8;// +1; - - const int new_c = l.c / 32; - - //float *re_packed_input = (float *)calloc(l.c * l.w * l.h, sizeof(float)); - //uint32_t *bin_re_packed_input = (uint32_t *)calloc(new_c * l.w * l.h + 1, sizeof(uint32_t)); - - // float32x4 by channel (as in cuDNN) - //repack_input(intput_cpu, re_packed_input, l.w, l.h, l.c); - - - // 32 x floats -> 1 x uint32_t - //float_to_bit(re_packed_input, (uint8_t *)bin_re_packed_input, l.c * l.w * l.h); - - //cudaDeviceSynchronize(); - //start_timer(); - - repack_input_gpu_bin(state.input, (uint32_t *)l.align_workspace_gpu, l.w, l.h, l.c); - - //repack_input_gpu(state.input, state.workspace, l.w, l.h, l.c); - - // 32 x floats -> 1 x uint32_t - //float_to_bit_gpu(state.workspace, (unsigned char *)l.align_workspace_gpu, l.c * l.w * l.h);// l.align_workspace_size); - - //cudaDeviceSynchronize(); - //stop_timer_and_show_name("repack_input_gpu + float_to_bit_gpu"); - - //free(re_packed_input); - - // slow - convolution the packed inputs and weights: float x 32 by channel (as in cuDNN) - //convolution_repacked((uint32_t *)bin_re_packed_input, (uint32_t *)l.align_bit_weights, l.output, - // l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr); - - // // then exit from if() - - //float *b = state.workspace; - //float *b = (float *)calloc(100 * 1024 * 1024, sizeof(float)); - //float *c = l.output; - //memset(c, 0, l.outputs * sizeof(float)); - - - //im2col_cpu_custom((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b); - - //cudaMemcpy(l.align_workspace_gpu, bin_re_packed_input, (new_c * l.w * l.h + 1) * sizeof(uint32_t), cudaMemcpyDefault); - - //start_timer(); - im2col_ongpu(l.align_workspace_gpu, 
new_c, l.h, l.w, l.size, l.stride, l.pad, state.workspace); - //cudaDeviceSynchronize(); - //stop_timer_and_show_name("im2col_ongpu"); - - //free(bin_re_packed_input); - - int new_k = l.size*l.size*l.c / 32; - - // good for (l.c == 64) - //gemm_nn_bin_32bit_packed(m, n, new_k, 1, - // l.align_bit_weights, l.new_lda/32, - // b, n, - // c, n, l.mean_arr); - - // // then exit from if() - - - //size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8; - //size_t t_intput_size = new_ldb * l.bit_align;// n; - //size_t t_bit_input_size = t_intput_size / 8;// +1; - - //char *t_bit_input = (char *)calloc(t_bit_input_size, sizeof(char)); - //transpose_uint32((uint32_t *)b, (uint32_t *)t_bit_input, new_k, n, n, new_ldb); - //cudaMemcpy(l.transposed_align_workspace_gpu, t_bit_input, t_bit_input_size * sizeof(char), cudaMemcpyDefault); - - //cudaMemcpy(state.workspace, b, t_bit_input_size * sizeof(char), cudaMemcpyDefault); - //printf("\n n = %d, n % 32 = %d, new_ldb = %d, new_ldb % 32 = %d \n", n, n % 32, new_ldb, new_ldb % 32); - - //start_timer(); - transpose_uint32_gpu((uint32_t *)state.workspace, (uint32_t *)l.transposed_align_workspace_gpu, new_k, n, n, new_ldb); - //cudaDeviceSynchronize(); - //stop_timer_and_show_name("transpose_uint32_gpu"); - - //cudaDeviceSynchronize(); - //stop_timer_and_show_name("repack_input_gpu_bin + im2col_ongpu + transpose_uint32_gpu_2"); - - //start_timer(); - gemm_nn_custom_bin_mean_transposed_gpu(m, n, k, - (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu, - new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY, - l.bin_conv_shortcut_in_gpu, l.bin_conv_shortcut_out_gpu); - //cudaDeviceSynchronize(); - //stop_timer_and_show_name("gemm_nn_custom_bin_mean_transposed_gpu"); - - - // the main GEMM function - //gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (uint8_t *)l.align_bit_weights, new_ldb, (uint8_t *)t_bit_input, new_ldb, c, n, l.mean_arr); - - //add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w); - - //cudaMemcpy(l.output_gpu, l.output, l.outputs * sizeof(float), cudaMemcpyDefault); - - - // // alternative GEMM - //gemm_nn_bin_transposed_32bit_packed(m, n, new_k, 1, - // l.align_bit_weights, l.new_lda/32, - // t_bit_input, new_ldb / 32, - // c, n, l.mean_arr); - - //free(t_bit_input); - - //free(b); - } - else - { - //printf("\n\n l.index = %d, l.w = %d, l.c = %d, l.n = %d, l.stride = %d, l.pad = %d - old XNOR \n", l.index, l.w, l.c, l.n, l.stride, l.pad); - //cudaDeviceSynchronize(); - - int i = 0; - /* - // if (l.stride == 1 && l.c >= 256 && l.size > 1) - if (l.stride == 1 && l.c >= 1024 && l.size > 1 && 0)// && l.w >= 13) // disabled - { - // stride=1 only - //start_timer(); - im2col_align_bin_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align); - //cudaDeviceSynchronize(); - //stop_timer_and_show_name("im2col_align_bin_ongpu"); - } - else*/ - { - //start_timer(); - im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, l.align_workspace_gpu, l.bit_align); - //cudaDeviceSynchronize(); - //stop_timer_and_show_name("im2col_align_ongpu"); - //getchar(); - - // should be optimized - //start_timer(); - float_to_bit_gpu(l.align_workspace_gpu, (unsigned char *)state.workspace, l.align_workspace_size); - //cudaDeviceSynchronize(); - //stop_timer_and_show_name("float_to_bit_gpu"); - } - //start_timer(); - transpose_bin_gpu((unsigned char *)state.workspace, (unsigned char 
*)l.transposed_align_workspace_gpu, k, n, l.bit_align, new_ldb, 8); - //cudaDeviceSynchronize(); - //stop_timer_and_show_name("transpose_bin_gpu"); - - //cudaDeviceSynchronize(); - //stop_timer_and_show_name("im2col_align_ongpu + float_to_bit_gpu + transpose_bin_gpu"); - - // should be optimized - //if(0) {//if (k > 1000) { // sequentially input-shared - BAD - // gemm_nn_custom_bin_mean_transposed_sequentially_gpu(m, n, k, - // (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu, new_ldb, l.output_gpu, n, l.mean_arr_gpu); - //} - //else { // coalescing & weights-shared-memory - GOOD - //start_timer(); - gemm_nn_custom_bin_mean_transposed_gpu(m, n, k, - (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu, - new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY, - l.bin_conv_shortcut_in_gpu, l.bin_conv_shortcut_out_gpu); - //cudaDeviceSynchronize(); - //stop_timer_and_show_name("gemm_nn_custom_bin_mean_transposed_gpu"); - //} - //cudaDeviceSynchronize(); - //check_error(status); - //getchar(); - } - - - /* - { - float_to_bit_gpu(state.input, (unsigned char *)l.align_workspace_gpu, input_size); - convolve_bin_gpu(l.align_workspace_gpu, (float *)l.align_bit_weights_gpu, l.output_gpu, l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr_gpu); - - //convolve_gpu(state.input, l.weights_gpu, l.output_gpu, l.w, l.h, l.c, l.n, l.size, l.pad); - - //cudaDeviceSynchronize(); - //check_error(status); - - add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); - } - */ - - //add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); - if (l.activation == SWISH) activate_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu); - else if (l.activation == MISH) activate_array_mish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu); - else if (l.activation == NORM_CHAN) activate_array_normalize_channels_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu); - else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu, 0); - else if (l.activation == NORM_CHAN_SOFTMAX_MAXVAL) activate_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu, 1); - else if (l.activation != LINEAR && l.activation != LEAKY) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); - //if(l.activation != LINEAR && l.activation != LEAKY) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); - //if (l.binary || l.xnor) swap_binary(&l); - //cudaDeviceSynchronize(); - return; - } - } - - if (l.xnor) { - swap_binary(&l); - binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu); - state.input = l.binary_input_gpu; - } - - //fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); - -#ifdef CUDNN - //float one = 1; // alpha[0], beta[0] is float for HALF and FLOAT - float alpha = 1, beta = 0; - -//#ifdef CUDNN_HALF - //if (state.use_mixed_precision) { - int iteration_num = get_current_iteration(state.net); // (*state.net.seen) / (state.net.batch*state.net.subdivisions); - if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || (iteration_num > 3 * state.net.burn_in) && state.net.loss_scale != 1) && - (l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && l.groups <= 1 && l.size 
> 1) - { - //printf("\n CUDNN_HALF!!! state.index = %d \n", state.index); - - // Note: For improved performance it is advised to use beta[0] = 0.0. - // For Tensor Core: cudnnSetConvolutionMathType() where cudnnMathType_t mathType = CUDNN_TENSOR_OP_MATH; - // 1. or CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM and use CUDNN_DATA_HALF - // 2. or CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED - // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops - - const size_t input16_size = l.batch*l.c*l.w*l.h; - const size_t output16_size = l.batch*l.out_c*l.out_h*l.out_w; - - if (*state.net.max_input16_size < input16_size) { - //printf("\n input16_size: cur = %zu \t max = %zu \n", input16_size, *state.net.max_input16_size); - *state.net.max_input16_size = input16_size; - if (*state.net.input16_gpu) cuda_free(*state.net.input16_gpu); - assert(*state.net.max_input16_size > 0); - *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size); - } - float *input16 = *state.net.input16_gpu; - - if (*state.net.max_output16_size < output16_size) { - *state.net.max_output16_size = output16_size; - if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu); - assert(*state.net.max_output16_size > 0); - *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size); - } - float *output16 = *state.net.output16_gpu; - - assert(input16_size > 0); - cuda_convert_f32_to_f16(state.input, input16_size, input16); - - //fill_ongpu(output16_size / 2, 0, (float *)output16, 1); - CHECK_CUDNN(cudnnConvolutionForward(cudnn_handle(), - &alpha, - l.srcTensorDesc16, - input16, - l.weightDesc16, - l.weights_gpu16, - l.convDesc, - l.fw_algo16, - state.workspace, - l.workspace_size, - &beta, - l.dstTensorDesc16, - output16)); - - - if (l.batch_normalize) - { - if (state.train && !state.net.adversarial) // Training - { - simple_copy_ongpu(l.outputs*l.batch / 2, output16, l.x_gpu); - //copy_ongpu(l.outputs*l.batch / 2, output16, 1, l.x_gpu, 1); - //cudaMemcpyAsync(l.x_gpu, output16, l.outputs*l.batch*sizeof(half), cudaMemcpyDefault, get_cuda_stream()); - float one = 1.0f; - float zero = 0.0f; - // Batch-normalization can still take FP16 inputs and outputs, saving half the bandwidth - // compared to FP32, it's just that the statistics and value adjustment should be done in FP32. 
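/* A minimal illustrative sketch (hypothetical kernel, not part of the patched
   file) of the data-type split described in the comment above: activations stay
   in FP16 while the statistics and the scale/bias arithmetic are carried out in
   FP32. It assumes already-computed FP32 statistics and an NCHW layout. */
#include <cuda_fp16.h>

__global__ void bn_f16_io_f32_stats(const half *x, half *y,
                                    const float *mean, const float *var,
                                    const float *scale, const float *bias,
                                    int spatial, int channels, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;      // index over N*C*H*W elements
    if (i >= n) return;
    int c = (i / spatial) % channels;                   // channel of this element
    float xf = __half2float(x[i]);                      // promote the FP16 activation to FP32
    float norm = (xf - mean[c]) * rsqrtf(var[c] + .00001f);
    y[i] = __float2half(norm * scale[c] + bias[c]);     // only the store goes back to FP16
}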
- CHECK_CUDNN(cudnnBatchNormalizationForwardTraining(cudnn_handle(), - CUDNN_BATCHNORM_SPATIAL, - &one, - &zero, - l.normDstTensorDescF16, - l.x_gpu, // input - l.normDstTensorDescF16, - output16, // output - l.normTensorDesc, - l.scales_gpu, // input - l.biases_gpu, // input - .01, - l.rolling_mean_gpu, // input/output (should be FP32) - l.rolling_variance_gpu, // input/output (should be FP32) - .00001, - l.mean_gpu, // output (should be FP32) - optional cache to speedup cudnnBatchNormalizationBackward() - l.variance_gpu)); // output (should be FP32) - optional cache to speedup cudnnBatchNormalizationBackward() - - cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu); - //forward_batchnorm_layer_gpu(l, state); - } - else // Detection - { - cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu); - normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w); - scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w); - add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h); - } - } - else // BIAS only - { - cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu); - add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); - } - } - else { - - //#else - /* - int input_nan_inf = is_nan_or_inf(state.input, l.inputs * l.batch); - printf("\n is_nan_or_inf(state.input) = %d \n", input_nan_inf); - if (input_nan_inf) getchar(); - - int weights_nan_inf = is_nan_or_inf(l.weights_gpu, l.nweights); - printf("\n is_nan_or_inf(l.weights_gpu) = %d \n", weights_nan_inf); - if (weights_nan_inf) getchar(); - */ - - CHECK_CUDNN(cudnnConvolutionForward(cudnn_handle(), - &alpha, //&one, - l.srcTensorDesc, - state.input, - l.weightDesc, - l.weights_gpu, - l.convDesc, - l.fw_algo, - state.workspace, - l.workspace_size, - &beta, //&one, - l.dstTensorDesc, - l.output_gpu)); - - //cudaDeviceSynchronize(); - if (l.batch_normalize) { - forward_batchnorm_layer_gpu(l, state); - } - else { - add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); - } - //#endif // CUDNN_HALF - } - - -#else - fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); - - int i, j; - int m = l.n / l.groups; - int k = l.size*l.size*l.c / l.groups; - int n = l.out_w*l.out_h; - for(i = 0; i < l.batch; ++i){ - for (j = 0; j < l.groups; ++j) { - //float *im = state.input + i*l.c*l.h*l.w; - float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w; - float *a = l.weights_gpu + j*l.nweights / l.groups; - float *b = state.workspace; - float *c = l.output_gpu + (i*l.groups + j)*n*m; - if (l.size == 1) { - b = im; - } - else { - //im2col_ongpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, state.workspace); - - im2col_gpu_ext(im, // input - l.c / l.groups, // input channels - l.h, l.w, // input size (h, w) - l.size, l.size, // kernel size (h, w) - l.pad * l.dilation, l.pad * l.dilation, // padding (h, w) - l.stride_y, l.stride_x, // stride (h, w) - l.dilation, l.dilation, // dilation (h, w) - state.workspace); // output - - } - //gemm_ongpu(0, 0, m, n, k, 1., a, k, b, n, 1., c + i*m*n, n); - gemm_ongpu(0, 0, m, n, k, 1, a, k, b, n, 1, c, n); - } - } - - if (l.batch_normalize) { - forward_batchnorm_layer_gpu(l, state); - } - else { - add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); - } -#endif - -//#ifndef CUDNN_HALF -//#endif // no CUDNN_HALF - - if (l.activation == SWISH) activate_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu); - 
else if (l.activation == MISH) activate_array_mish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu); - else if (l.activation == NORM_CHAN) activate_array_normalize_channels_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu); - else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu, 0); - else if (l.activation == NORM_CHAN_SOFTMAX_MAXVAL) activate_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu, 1); - else if (l.activation != LINEAR) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); - //if(l.dot > 0) dot_error_gpu(l); - if(l.binary || l.xnor) swap_binary(&l); - //cudaDeviceSynchronize(); // for correct profiling of performance - - if (state.net.try_fix_nan) { - fix_nan_and_inf(l.output_gpu, l.outputs*l.batch); - } - - if(l.assisted_excitation && state.train) assisted_excitation_forward_gpu(l, state); - - if (l.antialiasing) { - network_state s = { 0 }; - s.train = state.train; - s.workspace = state.workspace; - s.net = state.net; - if (!state.train) s.index = state.index; // don't use TC for training (especially without cuda_convert_f32_to_f16() ) - s.input = l.output_gpu; - forward_convolutional_layer_gpu(*(l.input_layer), s); - simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.input_antialiasing_gpu); - simple_copy_ongpu(l.input_layer->outputs*l.input_layer->batch, l.input_layer->output_gpu, l.output_gpu); - } -} - -void backward_convolutional_layer_gpu(convolutional_layer l, network_state state) -{ - if (l.antialiasing) { - network_state s = { 0 }; - s.train = state.train; - s.workspace = state.workspace; - s.net = state.net; - s.delta = l.delta_gpu; // s.delta will be returned to l.delta_gpu - s.input = l.input_antialiasing_gpu; - //if (!state.train) s.index = state.index; // don't use TC for training (especially without cuda_convert_f32_to_f16() ) - simple_copy_ongpu(l.input_layer->outputs*l.input_layer->batch, l.delta_gpu, l.input_layer->delta_gpu); - backward_convolutional_layer_gpu(*(l.input_layer), s); - - simple_copy_ongpu(l.outputs*l.batch, l.input_antialiasing_gpu, l.output_gpu); - } - - if(state.net.try_fix_nan) constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1); - - if (l.activation == SWISH) gradient_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.delta_gpu); - else if (l.activation == MISH) gradient_array_mish_ongpu(l.outputs*l.batch, l.activation_input_gpu, l.delta_gpu); - else if (l.activation == NORM_CHAN_SOFTMAX || l.activation == NORM_CHAN_SOFTMAX_MAXVAL) gradient_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu); - else if (l.activation == NORM_CHAN) gradient_array_normalize_channels_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu); - else gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); - - if (!l.batch_normalize) - backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); - -//#ifndef CUDNN_HALF - //if(l.batch_normalize){ - // backward_batchnorm_layer_gpu(l, state); - //} else { - // //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); - //} -//#endif // no CUDNN_HALF - float *original_input = state.input; - - if(l.xnor) state.input = l.binary_input_gpu; -#ifdef CUDNN - 
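/* A minimal sketch (hypothetical helper, not part of the original sources)
   that restates, for readability, the gate used below and in the forward pass
   for the cuDNN FP16 / Tensor-Core branch. */
static int use_cudnn_half_path(convolutional_layer l, network_state state, int iteration_num)
{
    int warmed_up = !state.train ||
        (iteration_num > 3 * state.net.burn_in && state.net.loss_scale != 1);
    return state.index != 0 && state.net.cudnn_half && !l.xnor && warmed_up &&
           (l.c / l.groups) % 8 == 0 && l.n % 8 == 0 &&   // channels and filters must be multiples of 8
           l.groups <= 1 && l.size > 1;                   // no grouped and no 1x1 convolutions
}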
float one = 1.f; - float alpha = 1, beta = 0; - -//#ifdef CUDNN_HALF - int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions); - if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || (iteration_num > 3 * state.net.burn_in) && state.net.loss_scale != 1) && - (l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && l.groups <= 1 && l.size > 1) - { - const size_t input16_size = l.batch*l.c*l.w*l.h; - const size_t delta16_size = l.batch*l.n*l.out_w*l.out_h; - - if (*state.net.max_input16_size < input16_size) { - *state.net.max_input16_size = input16_size; - if (*state.net.input16_gpu) cuda_free(*state.net.input16_gpu); - assert(*state.net.max_input16_size > 0); - *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size); - } - float *input16 = *state.net.input16_gpu; - - if (*state.net.max_output16_size < delta16_size) { - *state.net.max_output16_size = delta16_size; - if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu); - assert(*state.net.max_output16_size > 0); - *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size); - } - float *delta16 = *state.net.output16_gpu; - - assert(input16_size > 0); - assert(delta16_size > 0); - cuda_convert_f32_to_f16(state.input, input16_size, input16); - cuda_convert_f32_to_f16(l.delta_gpu, delta16_size, delta16); - - if (l.batch_normalize) { - //if (!state.train) { - // l.mean_gpu = l.rolling_mean_gpu; - // l.variance_gpu = l.rolling_variance_gpu; - //} - float one = 1.0f; - float zero = 0.0f; - CHECK_CUDNN(cudnnBatchNormalizationBackward(cudnn_handle(), - CUDNN_BATCHNORM_SPATIAL, - &one, - &zero, - &one, - &one, - l.normDstTensorDescF16, - l.x_gpu, // input (input in BN-forward-inference) - l.normDstTensorDescF16, - delta16, // input - l.normDstTensorDescF16, - l.output_gpu, //l.x_norm_gpu, // output (new delta) - l.normTensorDesc, - l.scales_gpu, // input (should be FP32) - l.scale_updates_gpu, // output (should be FP32) - l.bias_updates_gpu, // output (should be FP32) - .00001, - l.mean_gpu, // input (should be FP32) - l.variance_gpu)); // input (should be FP32) - - simple_copy_ongpu(l.outputs*l.batch / 2, l.output_gpu, delta16); - //copy_ongpu(l.outputs*l.batch / 2, l.x_norm_gpu, 1, delta16, 1); - //cudaMemcpyAsync(delta16, l.x_norm_gpu, l.outputs*l.batch * sizeof(half), cudaMemcpyDefault, get_cuda_stream()); - } - else - { - //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); - } - - // convert input: state.input (x), l.delta_gpu (y) from fp32 to fp16 - // get output: l.weight_updates_gpu (dw) and convert it to fp32 (ONLY if it is fp16) - - // calculate conv weight updates - // Already: l.weight_updates_gpu = (l.weight_updates_gpu - l.weight*decay*batch*subdivision)*momentum - // so we should copy f32 to f16, or compute: f16=(w_up - w*d*b*s)*m - assert((l.nweights) > 0); - cuda_convert_f32_to_f16(l.weight_updates_gpu, l.nweights, l.weight_updates_gpu16); - - if (!state.net.adversarial && !l.train_only_bn) { - CHECK_CUDNN(cudnnConvolutionBackwardFilter(cudnn_handle(), - &one, - l.srcTensorDesc16, - input16, //state.input, - l.ddstTensorDesc16, - delta16, //l.delta_gpu, - l.convDesc, - l.bf_algo16, - state.workspace, - l.workspace_size, - &one, - l.dweightDesc16, - l.weight_updates_gpu16)); // l.weight_updates_gpu); - - cuda_convert_f16_to_f32(l.weight_updates_gpu16, l.nweights, l.weight_updates_gpu); - } - - if (state.delta) { - if (l.binary || l.xnor) 
swap_binary(&l); - - // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData - // calculate delta for the next layer - // convert input: l.weights_gpu (w), l.delta_gpu (dy) from fp32 to fp16 - // get output: state.delta (dx) and convert it to fp32 (ONLY if it is fp16) - CHECK_CUDNN(cudnnConvolutionBackwardData(cudnn_handle(), - &alpha, - l.weightDesc16, - l.weights_gpu16, //l.weights_gpu, - l.ddstTensorDesc16, - delta16, //l.delta_gpu, - l.convDesc, - l.bd_algo16, - state.workspace, - l.workspace_size, - &beta, - l.dsrcTensorDesc16, - input16)); // state.delta); - - cuda_convert_f16_to_f32(input16, input16_size, state.delta); - - if (l.binary || l.xnor) swap_binary(&l); - if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta); - } - } - else { - //#else // CUDNN_HALF - - if(l.batch_normalize){ - backward_batchnorm_layer_gpu(l, state); - } - - if (!state.net.adversarial && !l.train_only_bn) { - // calculate conv weight updates - // if used: beta=1 then loss decreases faster - CHECK_CUDNN(cudnnConvolutionBackwardFilter(cudnn_handle(), - &one, - l.srcTensorDesc, - state.input, - l.ddstTensorDesc, - l.delta_gpu, - l.convDesc, - l.bf_algo, - state.workspace, - l.workspace_size, - &one, - l.dweightDesc, - l.weight_updates_gpu)); - } - - if (state.delta) { - if (l.binary || l.xnor) swap_binary(&l); - // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData - // calculate delta for the next layer - CHECK_CUDNN(cudnnConvolutionBackwardData(cudnn_handle(), - &one, - l.weightDesc, - l.weights_gpu, - l.ddstTensorDesc, - l.delta_gpu, - l.convDesc, - l.bd_algo, - state.workspace, - l.workspace_size, - &one, - l.dsrcTensorDesc, - state.delta)); - - if (l.binary || l.xnor) swap_binary(&l); - if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta); - } - } - -//#endif // CUDNN_HALF - -#else // CUDNN - if (l.batch_normalize) { - backward_batchnorm_layer_gpu(l, state); - } - - int m = l.n / l.groups; - int n = l.size*l.size*l.c / l.groups; - int k = l.out_w*l.out_h; - - int i, j; - for(i = 0; i < l.batch; ++i){ - for (j = 0; j < l.groups; ++j) { - float * a = l.delta_gpu + (i*l.groups + j)*m*k; - float * b = state.workspace; - float * c = l.weight_updates_gpu + j*l.nweights / l.groups; - - float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w; - - if (!state.net.adversarial && !l.train_only_bn) { - //im2col_ongpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, state.workspace); - im2col_gpu_ext(im, // input - l.c / l.groups, // input channels - l.h, l.w, // input size (h, w) - l.size, l.size, // kernel size (h, w) - l.pad * l.dilation, l.pad * l.dilation, // padding (h, w) - l.stride_y, l.stride_x, // stride (h, w) - l.dilation, l.dilation, // dilation (h, w) - state.workspace); // output - //gemm_ongpu(0, 1, m, n, k, 1, a + i*m*k, k, b, k, 1, c, n); - gemm_ongpu(0, 1, m, n, k, 1, a, k, b, k, 1, c, n); - } - - if (state.delta) { - if (l.binary || l.xnor) swap_binary(&l); - float * a = l.weights_gpu + j*l.nweights / l.groups; - float * b = l.delta_gpu + (i*l.groups + j)*m*k; - float * c = state.workspace; - - //gemm_ongpu(1, 0, n, k, m, 1, a, n, b + i*k*m, k, 0, c, k); - gemm_ongpu(1, 0, n, k, m, 1, a, n, b, k, 0, c, k); - - - float *delta = state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w; - - //col2im_ongpu(state.workspace, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, delta); - col2im_gpu_ext( - 
state.workspace, // input - l.c / l.groups, // input channels - l.h, l.w, // input size (h, w) - l.size, l.size, // kernel size (h, w) - l.pad * l.dilation, l.pad * l.dilation, // padding size (h, w) - l.stride_y, l.stride_x, // stride size (h, w) - l.dilation, l.dilation, // dilation size (h, w) - delta); // output (delta) - - if (l.binary || l.xnor) { - swap_binary(&l); - } - if (l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, state.delta + i*l.c*l.h*l.w); - } - } - } -#endif - if (state.net.try_fix_nan) { - if (state.delta) { - reset_nan_and_inf(state.delta, l.inputs * l.batch); - } - int size = l.nweights; - reset_nan_and_inf(l.weight_updates_gpu, size); - fix_nan_and_inf(l.weights_gpu, size); - } -} - -__global__ void calc_avg_activation_kernel(float *src, float *dst, int size, int channels, int batches) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - int xy = i % size; - int b = i / size; - - if (i < size*batches) { - dst[i] = 0; - for (int c = 0; c < channels; ++c) { - dst[i] += src[xy + size*(c + channels*b)]; - } - dst[i] = dst[i] / channels; - } -} - -void calc_avg_activation_gpu(float *src, float *dst, int size, int channels, int batches) -{ - const int num_blocks = get_number_of_blocks(size*batches, BLOCK); - - calc_avg_activation_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (src, dst, size, channels, batches); -} - - -__global__ void assisted_activation_kernel(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - int xy = i % size; - int b = i / size; - - if (b < batches) { - for (int c = 0; c < channels; ++c) { - output[xy + size*(c + channels*b)] += alpha * gt_gpu[i] * a_avg_gpu[i]; - //output[xy + size*(c + channels*b)] += gt_gpu[i] * a_avg_gpu[i]; - //output[xy + size*(c + channels*b)] += gt_gpu[i] * output[xy + size*(c + channels*b)]; - //output[xy + size*(c + channels*b)] = a_avg_gpu[i]; - } - } -} - -void assisted_activation_gpu(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches) -{ - const int num_blocks = get_number_of_blocks(size*batches, BLOCK); - - assisted_activation_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (alpha, output, gt_gpu, a_avg_gpu, size, channels, batches); -} - - -__global__ void assisted_activation2_kernel(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - int xy = i % size; - int b = i / size; - float beta = 1 - alpha; - - if (b < batches) { - for (int c = 0; c < channels; ++c) { - if(gt_gpu[i] == 0) - output[xy + size*(c + channels*b)] *= beta; - - } - } -} - -void assisted_activation2_gpu(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches) -{ - const int num_blocks = get_number_of_blocks(size*batches, BLOCK); - - assisted_activation2_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (alpha, output, gt_gpu, a_avg_gpu, size, channels, batches); -} - -void assisted_excitation_forward_gpu(convolutional_layer l, network_state state) -{ - const int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions); - - // epoch - //const float epoch = (float)(*state.net.seen) / state.net.train_images_num; - - // calculate alpha - //const float alpha = (1 + cos(3.141592 * iteration_num)) / (2 * state.net.max_batches); - //const float alpha = (1 + 
cos(3.141592 * epoch)) / (2 * state.net.max_batches); - float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches)) / 2; - //float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches)); - - if (l.assisted_excitation == 1) { - if (iteration_num > state.net.max_batches / 2) return; - } - else { - if (iteration_num < state.net.burn_in) return; - else - if (iteration_num > l.assisted_excitation) return; - else - alpha = (1 + cos(3.141592 * iteration_num / (state.net.burn_in + l.assisted_excitation))) / 2; // from 1 to 0 - } - - //printf("\n epoch = %f, alpha = %f, seen = %d, max_batches = %d, train_images_num = %d \n", - // epoch, alpha, (*state.net.seen), state.net.max_batches, state.net.train_images_num); - - //const int size = l.outputs * l.batch; - - float *a_avg = (float *)calloc(l.out_w * l.out_h * l.batch, sizeof(float)); - float *gt = (float *)calloc(l.out_w * l.out_h * l.batch, sizeof(float)); - - int b; - int w, h; - - l.max_boxes = state.net.num_boxes; - l.truths = l.max_boxes*(4 + 1); - - int num_truth = l.batch*l.truths; - float *truth_cpu = (float *)calloc(num_truth, sizeof(float)); - cuda_pull_array(state.truth, truth_cpu, num_truth); - //cudaStreamSynchronize(get_cuda_stream()); - //CHECK_CUDA(cudaPeekAtLastError()); - - for (b = 0; b < l.batch; ++b) - { - // calculate G - int t; - for (t = 0; t < state.net.num_boxes; ++t) { - box truth = float_to_box_stride(truth_cpu + t*(4 + 1) + b*l.truths, 1); - if (!truth.x) break; // continue; - float beta = 0; - //float beta = 1 - alpha; // from 0 to 1 - float dw = (1 - truth.w) * beta; - float dh = (1 - truth.h) * beta; - //printf(" alpha = %f, beta = %f, truth.w = %f, dw = %f, tw+dw = %f, l.out_w = %d \n", alpha, beta, truth.w, dw, truth.w+dw, l.out_w); - - int left = floor((truth.x - (dw + truth.w) / 2) * l.out_w); - int right = ceil((truth.x + (dw + truth.w) / 2) * l.out_w); - int top = floor((truth.y - (dh + truth.h) / 2) * l.out_h); - int bottom = ceil((truth.y + (dh + truth.h) / 2) * l.out_h); - if (left < 0) left = 0; - if (top < 0) top = 0; - if (right > l.out_w) right = l.out_w; - if (bottom > l.out_h) bottom = l.out_h; - - for (w = left; w <= right; w++) { - for (h = top; h < bottom; h++) { - gt[w + l.out_w * h + l.out_w*l.out_h*b] = 1; - } - } - } - } - - cuda_push_array(l.gt_gpu, gt, l.out_w * l.out_h * l.batch); - //cudaStreamSynchronize(get_cuda_stream()); - //CHECK_CUDA(cudaPeekAtLastError()); - - // calc avg_output on GPU - for whole batch - calc_avg_activation_gpu(l.output_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); - //cudaStreamSynchronize(get_cuda_stream()); - //CHECK_CUDA(cudaPeekAtLastError()); - - // calc new output - //assisted_activation2_gpu(1, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); // AE3: gt increases (beta = 1 - alpha = 0) - //assisted_activation2_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); - assisted_activation_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); - //cudaStreamSynchronize(get_cuda_stream()); - //CHECK_CUDA(cudaPeekAtLastError()); - - - - /* - for (b = 0; b < l.batch; ++b) - { - // calculate average A - for (w = 0; w < l.out_w; w++) { - for (h = 0; h < l.out_h; h++) { - for (c = 0; c < l.out_c; c++) { - a_avg[w + l.out_w*(h + l.out_h*b)] += l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))]; - } - a_avg[w + l.out_w*(h + l.out_h*b)] /= l.out_c; // a_avg / d - } - } - } - - // change activation - for (b = 0; b < l.batch; ++b) - { - for (w = 0; 
w < l.out_w; w++) { - for (h = 0; h < l.out_h; h++) { - for (c = 0; c < l.out_c; c++) - { - // a = a + alpha(t) + e(c,i,j) = a + alpha(t) + g(i,j) * avg_a(i,j) / channels - l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))] += - alpha * - g[w + l.out_w*(h + l.out_h*b)] * - a_avg[w + l.out_w*(h + l.out_h*b)]; - - //l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))] = - // alpha * g[w + l.out_w*(h + l.out_h*b)] * a_avg[w + l.out_w*(h + l.out_h*b)]; - } - } - } - } - */ - - if (0) // visualize ground truth - { -#ifdef OPENCV - cuda_pull_array(l.output_gpu, l.output, l.outputs * l.batch); - cudaStreamSynchronize(get_cuda_stream()); - CHECK_CUDA(cudaPeekAtLastError()); - - for (b = 0; b < l.batch; ++b) - { - printf(" Assisted Excitation alpha = %f \n", alpha); - image img = float_to_image(l.out_w, l.out_h, 1, >[l.out_w*l.out_h*b]); - char buff[100]; - sprintf(buff, "a_excitation_gt_%d", b); - show_image_cv(img, buff); - - //image img2 = float_to_image(l.out_w, l.out_h, 1, &l.output[l.out_w*l.out_h*l.out_c*b]); - image img2 = float_to_image_scaled(l.out_w, l.out_h, 1, &l.output[l.out_w*l.out_h*l.out_c*b]); - char buff2[100]; - sprintf(buff2, "a_excitation_output_%d", b); - show_image_cv(img2, buff2); - - /* - int c = l.out_c; - if (c > 4) c = 4; - image img3 = float_to_image(l.out_w, l.out_h, c, &l.output[l.out_w*l.out_h*l.out_c*b]); - image dc = collapse_image_layers(img3, 1); - char buff3[100]; - sprintf(buff3, "a_excitation_act_collapsed_%d", b); - show_image_cv(dc, buff3); - */ - - wait_key_cv(5); - } - wait_until_press_key_cv(); -#endif // OPENCV - } - - free(truth_cpu); - free(gt); - free(a_avg); -} - -void pull_convolutional_layer(convolutional_layer l) -{ - cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights); - cuda_pull_array_async(l.biases_gpu, l.biases, l.n); - cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, l.nweights); - cuda_pull_array_async(l.bias_updates_gpu, l.bias_updates, l.n); - if (l.batch_normalize){ - cuda_pull_array_async(l.scales_gpu, l.scales, l.n); - cuda_pull_array_async(l.rolling_mean_gpu, l.rolling_mean, l.n); - cuda_pull_array_async(l.rolling_variance_gpu, l.rolling_variance, l.n); - } - if (l.adam){ - cuda_pull_array_async(l.m_gpu, l.m, l.nweights); - cuda_pull_array_async(l.v_gpu, l.v, l.nweights); - } - CHECK_CUDA(cudaPeekAtLastError()); - cudaStreamSynchronize(get_cuda_stream()); -} - -void push_convolutional_layer(convolutional_layer l) -{ - cuda_push_array(l.weights_gpu, l.weights, l.nweights); -#ifdef CUDNN_HALF - assert(l.nweights > 0); - cuda_convert_f32_to_f16(l.weights_gpu, l.nweights, l.weights_gpu16); -#endif - cuda_push_array(l.biases_gpu, l.biases, l.n); - if (l.train) { - cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights); - cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n); - } - if (l.batch_normalize){ - cuda_push_array(l.scales_gpu, l.scales, l.n); - cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n); - cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n); - } - if (l.adam){ - cuda_push_array(l.m_gpu, l.m, l.nweights); - cuda_push_array(l.v_gpu, l.v, l.nweights); - } - CHECK_CUDA(cudaPeekAtLastError()); -} - -void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale) -{ - - /* - for (int angle = 0; angle < 360; angle++) { - printf(" angle = %d \n", angle); - smooth_rotate_weights_kernel(l.weights_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, angle, 0); - - cuda_pull_array(l.weight_deform_gpu, l.weights, 
l.nweights); - visualize_convolutional_layer(l, "weights", NULL); - wait_key_cv(10); - } - */ - - if (l.deform) { - - //for (l.angle = 0; l.angle < 360; l.angle += 1) - //{ - //stretch_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, l.angle/180, 1); - //else simple_copy_ongpu(l.nweights, l.weight_updates_gpu, l.weight_deform_gpu); - - if (l.rotate) rotate_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, 1); - else if (l.sway) sway_and_flip_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, l.angle, 1); - else if (l.stretch) stretch_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, 0, 1); - else if (l.stretch_sway) stretch_sway_flip_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, l.angle, 1); - - //simple_copy_ongpu(l.nweights, l.weight_updates_gpu, l.weight_deform_gpu); - - reduce_and_expand_array_gpu(l.weight_deform_gpu, l.weight_updates_gpu, l.nweights, 4); - - //printf(" angle = %f \n", l.angle); - //cuda_pull_array(l.weight_deform_gpu, l.weights, l.nweights); - //visualize_convolutional_layer(l, "weights", NULL); - //wait_key_cv(10); - //} - - } - - - float learning_rate = learning_rate_init*l.learning_rate_scale; - //float momentum = a.momentum; - //float decay = a.decay; - //int batch = a.batch; - - // Loss scale for Mixed-Precision on Tensor-Cores - if (loss_scale != 1.0) { - if (l.weight_updates_gpu && l.nweights > 0) scal_ongpu(l.nweights, 1.0 / loss_scale, l.weight_updates_gpu, 1); - if (l.bias_updates_gpu && l.n > 0) scal_ongpu(l.n, 1.0 / loss_scale, l.bias_updates_gpu, 1); - if (l.scale_updates_gpu && l.n > 0) scal_ongpu(l.n, 1.0 / loss_scale, l.scale_updates_gpu, 1); - } - - reset_nan_and_inf(l.weight_updates_gpu, l.nweights); - fix_nan_and_inf(l.weights_gpu, l.nweights); - - // Gradient Centralization - if (l.grad_centr && l.batch_normalize) { - // weights[filters][channels][height][width] - // for(filters) w[f] = w[f] - mean(w[c][h][w]) - gradient_centralization_gpu(l.size, l.size, l.c / l.groups, l.n, l.weight_updates_gpu); - } - - - if (l.adam) { - //adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t); - adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.nweights, batch, l.t); - - adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t); - if (l.scales_gpu) { - adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t); - } - } - else { - //axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); - //axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); - //scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1); - axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); - axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); - scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1); - - axpy_ongpu(l.n, learning_rate / batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); - scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1); - - if (l.scales_gpu) { - axpy_ongpu(l.n, learning_rate / batch, l.scale_updates_gpu, 1, l.scales_gpu, 1); - scal_ongpu(l.n, momentum, l.scale_updates_gpu, 1); - } - } - - if (l.deform) 
{ - //for (l.angle = 0; l.angle < 360; l.angle += 4) - //{ - expand_array_gpu(l.weights_gpu, l.weight_deform_gpu, l.nweights, 4); - - //simple_copy_ongpu(l.nweights, l.weight_deform_gpu, l.weights_gpu); - - if (l.rotate) rotate_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, 0); - else if (l.sway) sway_and_flip_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, l.angle, 0); - else if (l.stretch) stretch_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, 0, 0); - else if (l.stretch_sway) stretch_sway_flip_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, l.angle, 0); - - //printf(" angle = %f, reverse = %d \n", l.angle, 0); - //cuda_pull_array(l.weights_gpu, l.weights, l.nweights); - //visualize_convolutional_layer(l, "weights", NULL); - //wait_key_cv(10); - //} - } - - if (l.clip) { - constrain_ongpu(l.nweights, l.clip, l.weights_gpu, 1); - } -} - - - -/* -void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay) -{ - int size = layer.size*layer.size*layer.c*layer.n; - axpy_ongpu(layer.n, learning_rate/batch, layer.bias_updates_gpu, 1, layer.biases_gpu, 1); - scal_ongpu(layer.n, momentum, layer.bias_updates_gpu, 1); - - if(layer.scales_gpu){ - axpy_ongpu(layer.n, learning_rate/batch, layer.scale_updates_gpu, 1, layer.scales_gpu, 1); - scal_ongpu(layer.n, momentum, layer.scale_updates_gpu, 1); - } - - if(layer.adam){ - scal_ongpu(size, layer.B1, layer.m_gpu, 1); - scal_ongpu(size, layer.B2, layer.v_gpu, 1); - - axpy_ongpu(size, -decay*batch, layer.weights_gpu, 1, layer.weight_updates_gpu, 1); - - axpy_ongpu(size, -(1-layer.B1), layer.weight_updates_gpu, 1, layer.m_gpu, 1); - mul_ongpu(size, layer.weight_updates_gpu, 1, layer.weight_updates_gpu, 1); - axpy_ongpu(size, (1-layer.B2), layer.weight_updates_gpu, 1, layer.v_gpu, 1); - - adam_gpu(size, layer.weights_gpu, layer.m_gpu, layer.v_gpu, layer.B1, layer.B2, learning_rate/batch, layer.eps, layer.t+1); - fill_ongpu(size, 0, layer.weight_updates_gpu, 1); - }else{ - axpy_ongpu(size, -decay*batch, layer.weights_gpu, 1, layer.weight_updates_gpu, 1); // wu = wu - w*decay*batch - axpy_ongpu(size, learning_rate/batch, layer.weight_updates_gpu, 1, layer.weights_gpu, 1); // w = w + wu*lr/batch - scal_ongpu(size, momentum, layer.weight_updates_gpu, 1); // wu = wu*momentum // wu = (wu - w*decay*batch)*momentum - // w = w + (wu - w*decay*batch)*lr/batch = w + wu*lr/batch - w*decay*lr = w*(1-decay*lr) + wu*lr/batch - //wu_prev = (wu_old - w_old*decay*batch)*momentum - - - //weights_update = weights_update_new + (weights_update_old - weights_old*decay*batch)*momentum - weights_new*decay*batch = - // = weights_update_new + weights_update_old*momentum - weights_old*decay*batch*momentum - weights_new*decay*batch - // = weights_update_new + weights_update_old*momentum - (weights_old*momentum + weights_new)*decay*batch - - //------------- RESULT -------------- - // weights_update = weights_update_new + weights_update_old*momentum - (weights_old*momentum + weights_new)*decay*batch - //----------------------------------- - - // weights_newest = weights_new + (weights_update_new + weights_update_old*momentum - (weights_old*momentum + weights_new)*decay*batch)*lr/batch - // = weights_new + weights_update_new*lr/batch + weights_update_old*momentum*lr/batch - weights_old*momentum*decay*batch*lr/batch - weights_new*decay*batch*lr/batch - // = weights_new + weights_update_new*lr/batch + 
weights_update_old*momentum*lr/batch - weights_old*momentum*decay*lr - weights_new*decay*lr - // = weights_new*(1 - decay*lr) - weights_old*momentum*decay*lr + (weights_update_new + weights_update_old*momentum)*lr/batch - - //------------- RESULT -------------- - // weights_newest = weights_new*(1 - decay*lr) - weights_old*momentum*(decay*lr) + (weights_update_new + weights_update_old*momentum)*lr/batch = - // = weights_new - (weights_new + weights_old*momentum)*decay*lr + (weights_update_new + weights_update_old*momentum)*lr / batch - //----------------------------------- - } -} -*/ +#include <cuda_runtime.h> +#include <curand.h> +#include <cublas_v2.h> + +#include "convolutional_layer.h" +#include "batchnorm_layer.h" +#include "gemm.h" +#include "blas.h" +#include "im2col.h" +#include "col2im.h" +#include "utils.h" +#include "dark_cuda.h" +#include "box.h" + + +__global__ void binarize_kernel(float *x, int n, float *binary) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if (i >= n) return; + binary[i] = (x[i] >= 0) ? 1 : -1; +} + +void binarize_gpu(float *x, int n, float *binary) +{ + binarize_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>>(x, n, binary); + CHECK_CUDA(cudaPeekAtLastError()); +} + +__global__ void binarize_input_kernel(float *input, int n, int size, float *binary) +{ + int s = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if (s >= size) return; + int i = 0; + float mean = 0; + for(i = 0; i < n; ++i){ + mean += fabs(input[i*size + s]); + } + mean = mean / n; + for(i = 0; i < n; ++i){ + binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean; + } +} + +void binarize_input_gpu(float *input, int n, int size, float *binary) +{ + binarize_input_kernel<<<cuda_gridsize(size), BLOCK, 0, get_cuda_stream() >>>(input, n, size, binary); + CHECK_CUDA(cudaPeekAtLastError()); +} + +__global__ void binarize_weights_kernel(float *weights, int n, int size, float *binary) +{ + int f = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if (f >= n) return; + int i = 0; + float mean = 0; + for (i = 0; i < size; ++i) { + mean += fabs(weights[f*size + i]); + } + mean = mean / size; + for (i = 0; i < size; ++i) { + binary[f*size + i] = (weights[f*size + i] > 0) ? 
mean : -mean; + //binary[f*size + i] = weights[f*size + i]; + } +} + +void binarize_weights_gpu(float *weights, int n, int size, float *binary) +{ + binarize_weights_kernel << <cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >> >(weights, n, size, binary); + CHECK_CUDA(cudaPeekAtLastError()); +} + + +__global__ void set_zero_kernel(float *src, int size) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) src[i] = 0; +} + +__inline__ __device__ +float warpAllReduceSum(float val) { + for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2) +#if CUDART_VERSION >= 9000 + val += __shfl_xor_sync(0xffffffff, val, mask); +#else + val += __shfl_xor(val, mask); +#endif + return val; +} + +// only if (size % 32 == 0) +__global__ void reduce_kernel(float *weights, int n, int size, float *mean_arr_gpu) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int f = i / size; + if (f >= n) return; + float warp_mean = warpAllReduceSum(fabs(weights[i])); + if(i % 32 == 0) + atomicAdd(&mean_arr_gpu[f], warp_mean / size); +} + +__global__ void binarize_weights_mean_kernel(float *weights, int n, int size, float *binary, float *mean_arr_gpu) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int f = i / size; + if (f >= n) return; + float mean = mean_arr_gpu[f]; + binary[i] = (weights[i] > 0) ? mean : -mean; +} + +void fast_binarize_weights_gpu(float *weights, int n, int size, float *binary, float *mean_arr_gpu) +{ + if (size % 32 == 0) { + size_t gridsize = n * size; + const int num_blocks = get_number_of_blocks(gridsize, BLOCK);// gridsize / BLOCK + 1; + + set_zero_kernel << <(n/BLOCK + 1), BLOCK, 0, get_cuda_stream() >> > (mean_arr_gpu, n); + reduce_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (weights, n, size, mean_arr_gpu); + binarize_weights_mean_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (weights, n, size, binary, mean_arr_gpu); + CHECK_CUDA(cudaPeekAtLastError()); + } + else { + binarize_weights_gpu(weights, n, size, binary); + } +} + + +__global__ void cuda_f32_to_f16(float* input_f32, size_t size, half *output_f16) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) output_f16[idx] = __float2half(input_f32[idx]); + //if (idx < size) output_f16[idx] = __float2half_rn(input_f32[idx]); // can't be compiled on Linux without casting + // __float2half_ru, __float2half_rd, __float2half_rz, __float2half_rn + //if (idx < size) *((unsigned short *)output_f16 + idx) = __float2half(input_f32[idx]); +} + +void cuda_convert_f32_to_f16(float* input_f32, size_t size, float *output_f16) { + cuda_f32_to_f16 <<< get_number_of_blocks(size, BLOCK), BLOCK, 0, get_cuda_stream() >>> (input_f32, size, (half *)output_f16); + CHECK_CUDA(cudaPeekAtLastError()); +} + +__global__ void cuda_f16_to_f32(half* input_f16, size_t size, float *output_f32) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) output_f32[idx] = __half2float(input_f16[idx]); + //if (idx < size) output_f32[idx] = __half2float(*((unsigned short *)input_f16 + idx)); +} + +void cuda_convert_f16_to_f32(float* input_f16, size_t size, float *output_f32) { + cuda_f16_to_f32 <<< get_number_of_blocks(size, BLOCK), BLOCK, 0, get_cuda_stream() >>> ((half *)input_f16, size, output_f32); + CHECK_CUDA(cudaPeekAtLastError()); +} + +half *cuda_make_f16_from_f32_array(float *src, size_t n) +{ + half *dst16; + size_t size = sizeof(half)*n; + CHECK_CUDA(cudaMalloc((void **)&dst16, size)); + if (src) { + assert(n > 0); + cuda_convert_f32_to_f16(src, n, (float *)dst16); + } + if (!dst16) error("Cuda 
malloc failed\n"); + return dst16; +} + +void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) +{ + if (l.stream >= 0) { + switch_stream(l.stream); + } + + if (l.wait_stream_id >= 0) { + wait_stream(l.wait_stream_id); + } + + //fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); + if(l.binary){ + binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu); + swap_binary(&l); + } + + if(l.xnor){ + if (!l.align_bit_weights_gpu || state.train) { + //binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu); + + fast_binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu, l.mean_arr_gpu); + } + + if (l.align_bit_weights_gpu && !state.train && l.c >= 32 && l.stride_x == l.stride_y) + { + //return; + //cudaError_t status = cudaSuccess; + //int input_size = l.c*l.h*l.w*l.batch; + + int m = l.n / l.groups; + int k = l.size*l.size*l.c / l.groups; + int n = l.out_w*l.out_h; + //float * a = l.weights_gpu; + + // int i, j; + // for(i = 0; i < l.batch; ++i){ + // for (j = 0; j < l.groups; ++j) { + + int ldb_align = l.lda_align; + size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8; + //size_t t_intput_size = new_ldb * n; + //size_t t_bit_input_size = t_intput_size / 8;// +1; + + if (l.c % 32 == 0) + { + //printf("\n\n l.index = %d, l.w = %d, l.c = %d, l.n = %d, l.stride = %d, l.pad = %d - new XNOR \n", l.index, l.w, l.c, l.n, l.stride, l.pad); + //printf("l.align_workspace_size = %d, (l.c * l.w * l.h) = %d \n", l.align_workspace_size, (l.c * l.w * l.h)); + + //float *intput_cpu = (float *)calloc(l.inputs, sizeof(float)); + // state.input + //cudaMemcpy(intput_cpu, state.input, l.inputs * sizeof(float), cudaMemcpyDefault); + + int ldb_align = l.lda_align; + size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8; + //size_t t_intput_size = new_ldb * l.bit_align;// n; + //size_t t_bit_input_size = t_intput_size / 8;// +1; + + const int new_c = l.c / 32; + + //float *re_packed_input = (float *)calloc(l.c * l.w * l.h, sizeof(float)); + //uint32_t *bin_re_packed_input = (uint32_t *)calloc(new_c * l.w * l.h + 1, sizeof(uint32_t)); + + // float32x4 by channel (as in cuDNN) + //repack_input(intput_cpu, re_packed_input, l.w, l.h, l.c); + + + // 32 x floats -> 1 x uint32_t + //float_to_bit(re_packed_input, (uint8_t *)bin_re_packed_input, l.c * l.w * l.h); + + //cudaDeviceSynchronize(); + //start_timer(); + + repack_input_gpu_bin(state.input, (uint32_t *)l.align_workspace_gpu, l.w, l.h, l.c); + + //repack_input_gpu(state.input, state.workspace, l.w, l.h, l.c); + + // 32 x floats -> 1 x uint32_t + //float_to_bit_gpu(state.workspace, (unsigned char *)l.align_workspace_gpu, l.c * l.w * l.h);// l.align_workspace_size); + + //cudaDeviceSynchronize(); + //stop_timer_and_show_name("repack_input_gpu + float_to_bit_gpu"); + + //free(re_packed_input); + + // slow - convolution the packed inputs and weights: float x 32 by channel (as in cuDNN) + //convolution_repacked((uint32_t *)bin_re_packed_input, (uint32_t *)l.align_bit_weights, l.output, + // l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr); + + // // then exit from if() + + //float *b = state.workspace; + //float *b = (float *)calloc(100 * 1024 * 1024, sizeof(float)); + //float *c = l.output; + //memset(c, 0, l.outputs * sizeof(float)); + + + //im2col_cpu_custom((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b); + + //cudaMemcpy(l.align_workspace_gpu, bin_re_packed_input, 
(new_c * l.w * l.h + 1) * sizeof(uint32_t), cudaMemcpyDefault); + + //start_timer(); + im2col_ongpu(l.align_workspace_gpu, new_c, l.h, l.w, l.size, l.stride, l.pad, state.workspace); + //cudaDeviceSynchronize(); + //stop_timer_and_show_name("im2col_ongpu"); + + //free(bin_re_packed_input); + + int new_k = l.size*l.size*l.c / 32; + + // good for (l.c == 64) + //gemm_nn_bin_32bit_packed(m, n, new_k, 1, + // l.align_bit_weights, l.new_lda/32, + // b, n, + // c, n, l.mean_arr); + + // // then exit from if() + + + //size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8; + //size_t t_intput_size = new_ldb * l.bit_align;// n; + //size_t t_bit_input_size = t_intput_size / 8;// +1; + + //char *t_bit_input = (char *)calloc(t_bit_input_size, sizeof(char)); + //transpose_uint32((uint32_t *)b, (uint32_t *)t_bit_input, new_k, n, n, new_ldb); + //cudaMemcpy(l.transposed_align_workspace_gpu, t_bit_input, t_bit_input_size * sizeof(char), cudaMemcpyDefault); + + //cudaMemcpy(state.workspace, b, t_bit_input_size * sizeof(char), cudaMemcpyDefault); + //printf("\n n = %d, n % 32 = %d, new_ldb = %d, new_ldb % 32 = %d \n", n, n % 32, new_ldb, new_ldb % 32); + + //start_timer(); + transpose_uint32_gpu((uint32_t *)state.workspace, (uint32_t *)l.transposed_align_workspace_gpu, new_k, n, n, new_ldb); + //cudaDeviceSynchronize(); + //stop_timer_and_show_name("transpose_uint32_gpu"); + + //cudaDeviceSynchronize(); + //stop_timer_and_show_name("repack_input_gpu_bin + im2col_ongpu + transpose_uint32_gpu_2"); + + //start_timer(); + gemm_nn_custom_bin_mean_transposed_gpu(m, n, k, + (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu, + new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY, + l.bin_conv_shortcut_in_gpu, l.bin_conv_shortcut_out_gpu); + //cudaDeviceSynchronize(); + //stop_timer_and_show_name("gemm_nn_custom_bin_mean_transposed_gpu"); + + + // the main GEMM function + //gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (uint8_t *)l.align_bit_weights, new_ldb, (uint8_t *)t_bit_input, new_ldb, c, n, l.mean_arr); + + //add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w); + + //cudaMemcpy(l.output_gpu, l.output, l.outputs * sizeof(float), cudaMemcpyDefault); + + + // // alternative GEMM + //gemm_nn_bin_transposed_32bit_packed(m, n, new_k, 1, + // l.align_bit_weights, l.new_lda/32, + // t_bit_input, new_ldb / 32, + // c, n, l.mean_arr); + + //free(t_bit_input); + + //free(b); + } + else + { + //printf("\n\n l.index = %d, l.w = %d, l.c = %d, l.n = %d, l.stride = %d, l.pad = %d - old XNOR \n", l.index, l.w, l.c, l.n, l.stride, l.pad); + //cudaDeviceSynchronize(); + + int i = 0; + /* + // if (l.stride == 1 && l.c >= 256 && l.size > 1) + if (l.stride == 1 && l.c >= 1024 && l.size > 1 && 0)// && l.w >= 13) // disabled + { + // stride=1 only + //start_timer(); + im2col_align_bin_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align); + //cudaDeviceSynchronize(); + //stop_timer_and_show_name("im2col_align_bin_ongpu"); + } + else*/ + { + //start_timer(); + im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, l.align_workspace_gpu, l.bit_align); + //cudaDeviceSynchronize(); + //stop_timer_and_show_name("im2col_align_ongpu"); + //getchar(); + + // should be optimized + //start_timer(); + float_to_bit_gpu(l.align_workspace_gpu, (unsigned char *)state.workspace, l.align_workspace_size); + //cudaDeviceSynchronize(); + 
//stop_timer_and_show_name("float_to_bit_gpu"); + } + //start_timer(); + transpose_bin_gpu((unsigned char *)state.workspace, (unsigned char *)l.transposed_align_workspace_gpu, k, n, l.bit_align, new_ldb, 8); + //cudaDeviceSynchronize(); + //stop_timer_and_show_name("transpose_bin_gpu"); + + //cudaDeviceSynchronize(); + //stop_timer_and_show_name("im2col_align_ongpu + float_to_bit_gpu + transpose_bin_gpu"); + + // should be optimized + //if(0) {//if (k > 1000) { // sequentially input-shared - BAD + // gemm_nn_custom_bin_mean_transposed_sequentially_gpu(m, n, k, + // (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu, new_ldb, l.output_gpu, n, l.mean_arr_gpu); + //} + //else { // coalescing & weights-shared-memory - GOOD + //start_timer(); + gemm_nn_custom_bin_mean_transposed_gpu(m, n, k, + (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu, + new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY, + l.bin_conv_shortcut_in_gpu, l.bin_conv_shortcut_out_gpu); + //cudaDeviceSynchronize(); + //stop_timer_and_show_name("gemm_nn_custom_bin_mean_transposed_gpu"); + //} + //cudaDeviceSynchronize(); + //check_error(status); + //getchar(); + } + + + /* + { + float_to_bit_gpu(state.input, (unsigned char *)l.align_workspace_gpu, input_size); + convolve_bin_gpu(l.align_workspace_gpu, (float *)l.align_bit_weights_gpu, l.output_gpu, l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr_gpu); + + //convolve_gpu(state.input, l.weights_gpu, l.output_gpu, l.w, l.h, l.c, l.n, l.size, l.pad); + + //cudaDeviceSynchronize(); + //check_error(status); + + add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); + } + */ + + //add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); + if (l.activation == SWISH) activate_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu); + else if (l.activation == MISH) activate_array_mish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu); + else if (l.activation == HARD_MISH) activate_array_hard_mish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu); + else if (l.activation == NORM_CHAN) activate_array_normalize_channels_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu); + else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu, 0); + else if (l.activation == NORM_CHAN_SOFTMAX_MAXVAL) activate_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu, 1); + else if (l.activation != LINEAR && l.activation != LEAKY) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); + //if(l.activation != LINEAR && l.activation != LEAKY) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); + //if (l.binary || l.xnor) swap_binary(&l); + //cudaDeviceSynchronize(); + return; + } + } + + if (l.xnor) { + swap_binary(&l); + binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu); + state.input = l.binary_input_gpu; + } + + //fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); + +#ifdef CUDNN + //float one = 1; // alpha[0], beta[0] is float for HALF and FLOAT + float alpha = 1, beta = 0; + +//#ifdef CUDNN_HALF + //if (state.use_mixed_precision) { + int iteration_num = get_current_iteration(state.net); // 
(*state.net.seen) / (state.net.batch*state.net.subdivisions); + if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || (iteration_num > 3 * state.net.burn_in) && state.net.loss_scale != 1) && + (l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && l.groups <= 1 && l.size > 1) + { + //printf("\n CUDNN_HALF!!! state.index = %d \n", state.index); + + // Note: For improved performance it is advised to use beta[0] = 0.0. + // For Tensor Core: cudnnSetConvolutionMathType() where cudnnMathType_t mathType = CUDNN_TENSOR_OP_MATH; + // 1. or CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM and use CUDNN_DATA_HALF + // 2. or CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED + // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops + + const size_t input16_size = l.batch*l.c*l.w*l.h; + const size_t output16_size = l.batch*l.out_c*l.out_h*l.out_w; + + if (*state.net.max_input16_size < input16_size) { + //printf("\n input16_size: cur = %zu \t max = %zu \n", input16_size, *state.net.max_input16_size); + *state.net.max_input16_size = input16_size; + if (*state.net.input16_gpu) cuda_free(*state.net.input16_gpu); + assert(*state.net.max_input16_size > 0); + *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size); + } + float *input16 = *state.net.input16_gpu; + + if (*state.net.max_output16_size < output16_size) { + *state.net.max_output16_size = output16_size; + if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu); + assert(*state.net.max_output16_size > 0); + *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size); + } + float *output16 = *state.net.output16_gpu; + + assert(input16_size > 0); + cuda_convert_f32_to_f16(state.input, input16_size, input16); + + //fill_ongpu(output16_size / 2, 0, (float *)output16, 1); + CHECK_CUDNN(cudnnConvolutionForward(cudnn_handle(), + &alpha, + l.srcTensorDesc16, + input16, + l.weightDesc16, + l.weights_gpu16, + l.convDesc, + l.fw_algo16, + state.workspace, + l.workspace_size, + &beta, + l.dstTensorDesc16, + output16)); + + + if (l.batch_normalize) + { + if (state.train && !state.net.adversarial) // Training + { + simple_copy_ongpu(l.outputs*l.batch / 2, output16, l.x_gpu); + //copy_ongpu(l.outputs*l.batch / 2, output16, 1, l.x_gpu, 1); + //cudaMemcpyAsync(l.x_gpu, output16, l.outputs*l.batch*sizeof(half), cudaMemcpyDefault, get_cuda_stream()); + float one = 1.0f; + float zero = 0.0f; + // Batch-normalization can still take FP16 inputs and outputs, saving half the bandwidth + // compared to FP32, it's just that the statistics and value adjustment should be done in FP32. 
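+                // Arguments to cudnnBatchNormalizationForwardTraining() below:
+                //   x = l.x_gpu (FP16 copy of the convolution output), y = output16 (normalized result, FP16),
+                //   scale/bias and the running/saved statistics use the FP32 descriptor l.normTensorDesc,
+                //   exponentialAverageFactor = .01 (rolling = 0.99*rolling + 0.01*batch statistic),
+                //   epsilon = .00001 added to the variance for numerical stability.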
+ CHECK_CUDNN(cudnnBatchNormalizationForwardTraining(cudnn_handle(), + CUDNN_BATCHNORM_SPATIAL, + &one, + &zero, + l.normDstTensorDescF16, + l.x_gpu, // input + l.normDstTensorDescF16, + output16, // output + l.normTensorDesc, + l.scales_gpu, // input + l.biases_gpu, // input + .01, + l.rolling_mean_gpu, // input/output (should be FP32) + l.rolling_variance_gpu, // input/output (should be FP32) + .00001, + l.mean_gpu, // output (should be FP32) - optional cache to speedup cudnnBatchNormalizationBackward() + l.variance_gpu)); // output (should be FP32) - optional cache to speedup cudnnBatchNormalizationBackward() + + cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu); + //forward_batchnorm_layer_gpu(l, state); + } + else // Detection + { + cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu); + normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w); + scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w); + add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h); + } + } + else // BIAS only + { + cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu); + add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); + } + } + else { + + //#else + /* + int input_nan_inf = is_nan_or_inf(state.input, l.inputs * l.batch); + printf("\n is_nan_or_inf(state.input) = %d \n", input_nan_inf); + if (input_nan_inf) getchar(); + + int weights_nan_inf = is_nan_or_inf(l.weights_gpu, l.nweights); + printf("\n is_nan_or_inf(l.weights_gpu) = %d \n", weights_nan_inf); + if (weights_nan_inf) getchar(); + */ + + CHECK_CUDNN(cudnnConvolutionForward(cudnn_handle(), + &alpha, //&one, + l.srcTensorDesc, + state.input, + l.weightDesc, + l.weights_gpu, + l.convDesc, + l.fw_algo, + state.workspace, + l.workspace_size, + &beta, //&one, + l.dstTensorDesc, + l.output_gpu)); + + //cudaDeviceSynchronize(); + if (l.batch_normalize) { + forward_batchnorm_layer_gpu(l, state); + } + else { + add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); + } + //#endif // CUDNN_HALF + } + + +#else + fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); + + int i, j; + int m = l.n / l.groups; + int k = l.size*l.size*l.c / l.groups; + int n = l.out_w*l.out_h; + for(i = 0; i < l.batch; ++i){ + for (j = 0; j < l.groups; ++j) { + //float *im = state.input + i*l.c*l.h*l.w; + float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w; + float *a = l.weights_gpu + j*l.nweights / l.groups; + float *b = state.workspace; + float *c = l.output_gpu + (i*l.groups + j)*n*m; + if (l.size == 1 && l.stride == 1 && l.dilation == 1) { + b = im; + } + else { + //im2col_ongpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, state.workspace); + + im2col_gpu_ext(im, // input + l.c / l.groups, // input channels + l.h, l.w, // input size (h, w) + l.size, l.size, // kernel size (h, w) + l.pad * l.dilation, l.pad * l.dilation, // padding (h, w) + l.stride_y, l.stride_x, // stride (h, w) + l.dilation, l.dilation, // dilation (h, w) + state.workspace); // output + + } + //gemm_ongpu(0, 0, m, n, k, 1., a, k, b, n, 1., c + i*m*n, n); + gemm_ongpu(0, 0, m, n, k, 1, a, k, b, n, 1, c, n); + } + } + + if (l.batch_normalize) { + forward_batchnorm_layer_gpu(l, state); + } + else { + add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); + } +#endif + +//#ifndef CUDNN_HALF +//#endif // no CUDNN_HALF + + if (l.activation == SWISH) activate_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, 
l.activation_input_gpu, l.output_gpu); + else if (l.activation == MISH) activate_array_mish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu); + else if (l.activation == HARD_MISH) activate_array_hard_mish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu); + else if (l.activation == NORM_CHAN) activate_array_normalize_channels_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu); + else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu, 0); + else if (l.activation == NORM_CHAN_SOFTMAX_MAXVAL) activate_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu, 1); + else if (l.activation != LINEAR) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); + //if(l.dot > 0) dot_error_gpu(l); + if(l.binary || l.xnor) swap_binary(&l); + //cudaDeviceSynchronize(); // for correct profiling of performance + + if (state.net.try_fix_nan) { + fix_nan_and_inf(l.output_gpu, l.outputs*l.batch); + } + + if(l.assisted_excitation && state.train) assisted_excitation_forward_gpu(l, state); + + if (l.antialiasing) { + network_state s = { 0 }; + s.train = state.train; + s.workspace = state.workspace; + s.net = state.net; + if (!state.train) s.index = state.index; // don't use TC for training (especially without cuda_convert_f32_to_f16() ) + s.input = l.output_gpu; + forward_convolutional_layer_gpu(*(l.input_layer), s); + simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.input_antialiasing_gpu); + simple_copy_ongpu(l.input_layer->outputs*l.input_layer->batch, l.input_layer->output_gpu, l.output_gpu); + } + + if (l.coordconv) { + coord_conv_gpu(l.output_gpu, l.outputs*l.batch, l.out_w, l.out_h, l.out_c, l.batch, 0); + } +} + +void backward_convolutional_layer_gpu(convolutional_layer l, network_state state) +{ + if (l.coordconv) { + coord_conv_gpu(l.delta_gpu, l.outputs*l.batch, l.out_w, l.out_h, l.out_c, l.batch, 1); + } + + if (l.antialiasing) { + network_state s = { 0 }; + s.train = state.train; + s.workspace = state.workspace; + s.net = state.net; + s.delta = l.delta_gpu; // s.delta will be returned to l.delta_gpu + s.input = l.input_antialiasing_gpu; + //if (!state.train) s.index = state.index; // don't use TC for training (especially without cuda_convert_f32_to_f16() ) + simple_copy_ongpu(l.input_layer->outputs*l.input_layer->batch, l.delta_gpu, l.input_layer->delta_gpu); + backward_convolutional_layer_gpu(*(l.input_layer), s); + + simple_copy_ongpu(l.outputs*l.batch, l.input_antialiasing_gpu, l.output_gpu); + } + + if(state.net.try_fix_nan) constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1); + + if (l.activation == SWISH) gradient_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.delta_gpu); + else if (l.activation == MISH) gradient_array_mish_ongpu(l.outputs*l.batch, l.activation_input_gpu, l.delta_gpu); + else if (l.activation == HARD_MISH) gradient_array_hard_mish_ongpu(l.outputs*l.batch, l.activation_input_gpu, l.delta_gpu); + else if (l.activation == NORM_CHAN_SOFTMAX || l.activation == NORM_CHAN_SOFTMAX_MAXVAL) gradient_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu); + else if (l.activation == NORM_CHAN) gradient_array_normalize_channels_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, 
l.delta_gpu); + else gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); + + if (!l.batch_normalize) + backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); + +//#ifndef CUDNN_HALF + //if(l.batch_normalize){ + // backward_batchnorm_layer_gpu(l, state); + //} else { + // //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); + //} +//#endif // no CUDNN_HALF + float *original_input = state.input; + + if(l.xnor) state.input = l.binary_input_gpu; +#ifdef CUDNN + float one = 1.f; + float alpha = 1, beta = 0; + +//#ifdef CUDNN_HALF + int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions); + if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || (iteration_num > 3 * state.net.burn_in) && state.net.loss_scale != 1) && + (l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && l.groups <= 1 && l.size > 1) + { + const size_t input16_size = l.batch*l.c*l.w*l.h; + const size_t delta16_size = l.batch*l.n*l.out_w*l.out_h; + + if (*state.net.max_input16_size < input16_size) { + *state.net.max_input16_size = input16_size; + if (*state.net.input16_gpu) cuda_free(*state.net.input16_gpu); + assert(*state.net.max_input16_size > 0); + *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size); + } + float *input16 = *state.net.input16_gpu; + + if (*state.net.max_output16_size < delta16_size) { + *state.net.max_output16_size = delta16_size; + if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu); + assert(*state.net.max_output16_size > 0); + *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size); + } + float *delta16 = *state.net.output16_gpu; + + assert(input16_size > 0); + assert(delta16_size > 0); + cuda_convert_f32_to_f16(state.input, input16_size, input16); + cuda_convert_f32_to_f16(l.delta_gpu, delta16_size, delta16); + + if (l.batch_normalize) { + //if (!state.train) { + // l.mean_gpu = l.rolling_mean_gpu; + // l.variance_gpu = l.rolling_variance_gpu; + //} + float one = 1.0f; + float zero = 0.0f; + CHECK_CUDNN(cudnnBatchNormalizationBackward(cudnn_handle(), + CUDNN_BATCHNORM_SPATIAL, + &one, + &zero, + &one, + &one, + l.normDstTensorDescF16, + l.x_gpu, // input (input in BN-forward-inference) + l.normDstTensorDescF16, + delta16, // input + l.normDstTensorDescF16, + l.output_gpu, //l.x_norm_gpu, // output (new delta) + l.normTensorDesc, + l.scales_gpu, // input (should be FP32) + l.scale_updates_gpu, // output (should be FP32) + l.bias_updates_gpu, // output (should be FP32) + .00001, + l.mean_gpu, // input (should be FP32) + l.variance_gpu)); // input (should be FP32) + + simple_copy_ongpu(l.outputs*l.batch / 2, l.output_gpu, delta16); + //copy_ongpu(l.outputs*l.batch / 2, l.x_norm_gpu, 1, delta16, 1); + //cudaMemcpyAsync(delta16, l.x_norm_gpu, l.outputs*l.batch * sizeof(half), cudaMemcpyDefault, get_cuda_stream()); + } + else + { + //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); + } + + // convert input: state.input (x), l.delta_gpu (y) from fp32 to fp16 + // get output: l.weight_updates_gpu (dw) and convert it to fp32 (ONLY if it is fp16) + + // calculate conv weight updates + // Already: l.weight_updates_gpu = (l.weight_updates_gpu - l.weight*decay*batch*subdivision)*momentum + // so we should copy f32 to f16, or compute: f16=(w_up - w*d*b*s)*m + assert((l.nweights) > 0); + 
cuda_convert_f32_to_f16(l.weight_updates_gpu, l.nweights, l.weight_updates_gpu16); + + if (!state.net.adversarial && !l.train_only_bn) { + CHECK_CUDNN(cudnnConvolutionBackwardFilter(cudnn_handle(), + &one, + l.srcTensorDesc16, + input16, //state.input, + l.ddstTensorDesc16, + delta16, //l.delta_gpu, + l.convDesc, + l.bf_algo16, + state.workspace, + l.workspace_size, + &one, + l.dweightDesc16, + l.weight_updates_gpu16)); // l.weight_updates_gpu); + + cuda_convert_f16_to_f32(l.weight_updates_gpu16, l.nweights, l.weight_updates_gpu); + } + + if (state.delta) { + if (l.binary || l.xnor) swap_binary(&l); + + // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData + // calculate delta for the next layer + // convert input: l.weights_gpu (w), l.delta_gpu (dy) from fp32 to fp16 + // get output: state.delta (dx) and convert it to fp32 (ONLY if it is fp16) + CHECK_CUDNN(cudnnConvolutionBackwardData(cudnn_handle(), + &alpha, + l.weightDesc16, + l.weights_gpu16, //l.weights_gpu, + l.ddstTensorDesc16, + delta16, //l.delta_gpu, + l.convDesc, + l.bd_algo16, + state.workspace, + l.workspace_size, + &beta, + l.dsrcTensorDesc16, + input16)); // state.delta); + + cuda_convert_f16_to_f32(input16, input16_size, state.delta); + + if (l.binary || l.xnor) swap_binary(&l); + if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta); + } + } + else { + //#else // CUDNN_HALF + + if(l.batch_normalize){ + backward_batchnorm_layer_gpu(l, state); + } + + if (!state.net.adversarial && !l.train_only_bn) { + + float *old_input = state.input; + + /* + if (l.reverse) { + if (*state.net.max_output16_size < l.inputs*l.batch) { + *state.net.max_output16_size = l.inputs*l.batch; + if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu); + assert(*state.net.max_output16_size > 0); + *state.net.output16_gpu = cuda_make_array(NULL, *state.net.max_output16_size); + } + float clip = 0.0; + float divider = 1.0; + float abs_add = 1.0; + mult_inverse_array_gpu(state.input, *state.net.output16_gpu, l.inputs*l.batch, l.reverse, divider, clip, abs_add); + state.input = *state.net.output16_gpu; + } + */ + + // calculate conv weight updates + // if used: beta=1 then loss decreases faster + CHECK_CUDNN(cudnnConvolutionBackwardFilter(cudnn_handle(), + &one, + l.srcTensorDesc, + state.input, + l.ddstTensorDesc, + l.delta_gpu, + l.convDesc, + l.bf_algo, + state.workspace, + l.workspace_size, + &one, + l.dweightDesc, + l.weight_updates_gpu)); + + state.input = old_input; + } + + + if (state.delta) { + if (l.binary || l.xnor) swap_binary(&l); + + float *old_weights = l.weights_gpu; + + /* + if (l.reverse) { + if (*state.net.max_output16_size < l.nweights) { + *state.net.max_output16_size = l.nweights; + if (*state.net.output16_gpu && *state.net.max_output16_size > 0) cuda_free(*state.net.output16_gpu); + assert(*state.net.max_output16_size > 0); + *state.net.output16_gpu = cuda_make_array(NULL, l.nweights); + } + float clip = 0.0; + float divider = 1.0; + float abs_add = 1.0; + mult_inverse_array_gpu(l.weights_gpu, *state.net.output16_gpu, l.nweights, l.reverse, divider, clip, abs_add); + l.weights_gpu = *state.net.output16_gpu; + } + */ + + // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData + // calculate delta for the next layer + CHECK_CUDNN(cudnnConvolutionBackwardData(cudnn_handle(), + &one, + l.weightDesc, + l.weights_gpu, + l.ddstTensorDesc, + l.delta_gpu, + l.convDesc, + l.bd_algo, + 
state.workspace, + l.workspace_size, + &one, + l.dsrcTensorDesc, + state.delta)); + + l.weights_gpu = old_weights; + + if (l.binary || l.xnor) swap_binary(&l); + if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta); + } + } + +//#endif // CUDNN_HALF + +#else // CUDNN + if (l.batch_normalize) { + backward_batchnorm_layer_gpu(l, state); + } + + int m = l.n / l.groups; + int n = l.size*l.size*l.c / l.groups; + int k = l.out_w*l.out_h; + + int i, j; + for(i = 0; i < l.batch; ++i){ + for (j = 0; j < l.groups; ++j) { + float * a = l.delta_gpu + (i*l.groups + j)*m*k; + float * b = state.workspace; + float * c = l.weight_updates_gpu + j*l.nweights / l.groups; + + float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w; + + if (!state.net.adversarial && !l.train_only_bn) { + //im2col_ongpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, state.workspace); + im2col_gpu_ext(im, // input + l.c / l.groups, // input channels + l.h, l.w, // input size (h, w) + l.size, l.size, // kernel size (h, w) + l.pad * l.dilation, l.pad * l.dilation, // padding (h, w) + l.stride_y, l.stride_x, // stride (h, w) + l.dilation, l.dilation, // dilation (h, w) + state.workspace); // output + //gemm_ongpu(0, 1, m, n, k, 1, a + i*m*k, k, b, k, 1, c, n); + gemm_ongpu(0, 1, m, n, k, 1, a, k, b, k, 1, c, n); + } + + if (state.delta) { + if (l.binary || l.xnor) swap_binary(&l); + float * a = l.weights_gpu + j*l.nweights / l.groups; + float * b = l.delta_gpu + (i*l.groups + j)*m*k; + float * c = state.workspace; + + //gemm_ongpu(1, 0, n, k, m, 1, a, n, b + i*k*m, k, 0, c, k); + gemm_ongpu(1, 0, n, k, m, 1, a, n, b, k, 0, c, k); + + + float *delta = state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w; + + //col2im_ongpu(state.workspace, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, delta); + col2im_gpu_ext( + state.workspace, // input + l.c / l.groups, // input channels + l.h, l.w, // input size (h, w) + l.size, l.size, // kernel size (h, w) + l.pad * l.dilation, l.pad * l.dilation, // padding size (h, w) + l.stride_y, l.stride_x, // stride size (h, w) + l.dilation, l.dilation, // dilation size (h, w) + delta); // output (delta) + + if (l.binary || l.xnor) { + swap_binary(&l); + } + if (l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, state.delta + i*l.c*l.h*l.w); + } + } + } +#endif + if (state.net.try_fix_nan) { + if (state.delta) { + reset_nan_and_inf(state.delta, l.inputs * l.batch); + } + int size = l.nweights; + reset_nan_and_inf(l.weight_updates_gpu, size); + fix_nan_and_inf(l.weights_gpu, size); + } + + +} + +__global__ void calc_avg_activation_kernel(float *src, float *dst, int size, int channels, int batches) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int xy = i % size; + int b = i / size; + + if (i < size*batches) { + dst[i] = 0; + for (int c = 0; c < channels; ++c) { + dst[i] += src[xy + size*(c + channels*b)]; + } + dst[i] = dst[i] / channels; + } +} + +void calc_avg_activation_gpu(float *src, float *dst, int size, int channels, int batches) +{ + const int num_blocks = get_number_of_blocks(size*batches, BLOCK); + + calc_avg_activation_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (src, dst, size, channels, batches); +} + + +__global__ void assisted_activation_kernel(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int xy = i % size; + int b = i / size; + + if (b < batches) { + for (int c = 0; c < 
channels; ++c) { + output[xy + size*(c + channels*b)] += alpha * gt_gpu[i] * a_avg_gpu[i]; + //output[xy + size*(c + channels*b)] += gt_gpu[i] * a_avg_gpu[i]; + //output[xy + size*(c + channels*b)] += gt_gpu[i] * output[xy + size*(c + channels*b)]; + //output[xy + size*(c + channels*b)] = a_avg_gpu[i]; + } + } +} + +void assisted_activation_gpu(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches) +{ + const int num_blocks = get_number_of_blocks(size*batches, BLOCK); + + assisted_activation_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (alpha, output, gt_gpu, a_avg_gpu, size, channels, batches); +} + + +__global__ void assisted_activation2_kernel(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int xy = i % size; + int b = i / size; + float beta = 1 - alpha; + + if (b < batches) { + for (int c = 0; c < channels; ++c) { + if(gt_gpu[i] == 0) + output[xy + size*(c + channels*b)] *= beta; + + } + } +} + +void assisted_activation2_gpu(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches) +{ + const int num_blocks = get_number_of_blocks(size*batches, BLOCK); + + assisted_activation2_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (alpha, output, gt_gpu, a_avg_gpu, size, channels, batches); +} + +void assisted_excitation_forward_gpu(convolutional_layer l, network_state state) +{ + const int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions); + + // epoch + //const float epoch = (float)(*state.net.seen) / state.net.train_images_num; + + // calculate alpha + //const float alpha = (1 + cos(3.141592 * iteration_num)) / (2 * state.net.max_batches); + //const float alpha = (1 + cos(3.141592 * epoch)) / (2 * state.net.max_batches); + float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches)) / 2; + //float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches)); + + if (l.assisted_excitation == 1) { + if (iteration_num > state.net.max_batches / 2) return; + } + else { + if (iteration_num < state.net.burn_in) return; + else + if (iteration_num > l.assisted_excitation) return; + else + alpha = (1 + cos(3.141592 * iteration_num / (state.net.burn_in + l.assisted_excitation))) / 2; // from 1 to 0 + } + + //printf("\n epoch = %f, alpha = %f, seen = %d, max_batches = %d, train_images_num = %d \n", + // epoch, alpha, (*state.net.seen), state.net.max_batches, state.net.train_images_num); + + //const int size = l.outputs * l.batch; + + float *a_avg = (float *)calloc(l.out_w * l.out_h * l.batch, sizeof(float)); + float *gt = (float *)calloc(l.out_w * l.out_h * l.batch, sizeof(float)); + + int b; + int w, h; + + l.max_boxes = state.net.num_boxes; + l.truths = l.max_boxes*(4 + 1); + + int num_truth = l.batch*l.truths; + float *truth_cpu = (float *)calloc(num_truth, sizeof(float)); + cuda_pull_array(state.truth, truth_cpu, num_truth); + //cudaStreamSynchronize(get_cuda_stream()); + //CHECK_CUDA(cudaPeekAtLastError()); + + for (b = 0; b < l.batch; ++b) + { + // calculate G + int t; + for (t = 0; t < state.net.num_boxes; ++t) { + box truth = float_to_box_stride(truth_cpu + t*(4 + 1) + b*l.truths, 1); + if (!truth.x) break; // continue; + float beta = 0; + //float beta = 1 - alpha; // from 0 to 1 + float dw = (1 - truth.w) * beta; + float dh = (1 - truth.h) * beta; + //printf(" alpha = %f, beta = %f, truth.w = %f, dw = %f, 
tw+dw = %f, l.out_w = %d \n", alpha, beta, truth.w, dw, truth.w+dw, l.out_w); + + int left = floorf((truth.x - (dw + truth.w) / 2) * l.out_w); + int right = ceilf((truth.x + (dw + truth.w) / 2) * l.out_w); + int top = floorf((truth.y - (dh + truth.h) / 2) * l.out_h); + int bottom = ceilf((truth.y + (dh + truth.h) / 2) * l.out_h); + if (left < 0) left = 0; + if (top < 0) top = 0; + if (right > l.out_w) right = l.out_w; + if (bottom > l.out_h) bottom = l.out_h; + + for (w = left; w <= right; w++) { + for (h = top; h < bottom; h++) { + gt[w + l.out_w * h + l.out_w*l.out_h*b] = 1; + } + } + } + } + + cuda_push_array(l.gt_gpu, gt, l.out_w * l.out_h * l.batch); + //cudaStreamSynchronize(get_cuda_stream()); + //CHECK_CUDA(cudaPeekAtLastError()); + + // calc avg_output on GPU - for whole batch + calc_avg_activation_gpu(l.output_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); + //cudaStreamSynchronize(get_cuda_stream()); + //CHECK_CUDA(cudaPeekAtLastError()); + + // calc new output + //assisted_activation2_gpu(1, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); // AE3: gt increases (beta = 1 - alpha = 0) + //assisted_activation2_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); + assisted_activation_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); + //cudaStreamSynchronize(get_cuda_stream()); + //CHECK_CUDA(cudaPeekAtLastError()); + + + + /* + for (b = 0; b < l.batch; ++b) + { + // calculate average A + for (w = 0; w < l.out_w; w++) { + for (h = 0; h < l.out_h; h++) { + for (c = 0; c < l.out_c; c++) { + a_avg[w + l.out_w*(h + l.out_h*b)] += l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))]; + } + a_avg[w + l.out_w*(h + l.out_h*b)] /= l.out_c; // a_avg / d + } + } + } + + // change activation + for (b = 0; b < l.batch; ++b) + { + for (w = 0; w < l.out_w; w++) { + for (h = 0; h < l.out_h; h++) { + for (c = 0; c < l.out_c; c++) + { + // a = a + alpha(t) + e(c,i,j) = a + alpha(t) + g(i,j) * avg_a(i,j) / channels + l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))] += + alpha * + g[w + l.out_w*(h + l.out_h*b)] * + a_avg[w + l.out_w*(h + l.out_h*b)]; + + //l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))] = + // alpha * g[w + l.out_w*(h + l.out_h*b)] * a_avg[w + l.out_w*(h + l.out_h*b)]; + } + } + } + } + */ + + if (0) // visualize ground truth + { +#ifdef OPENCV + cuda_pull_array(l.output_gpu, l.output, l.outputs * l.batch); + cudaStreamSynchronize(get_cuda_stream()); + CHECK_CUDA(cudaPeekAtLastError()); + + for (b = 0; b < l.batch; ++b) + { + printf(" Assisted Excitation alpha = %f \n", alpha); + image img = float_to_image(l.out_w, l.out_h, 1, &gt[l.out_w*l.out_h*b]); + char buff[100]; + sprintf(buff, "a_excitation_gt_%d", b); + show_image_cv(img, buff); + + //image img2 = float_to_image(l.out_w, l.out_h, 1, &l.output[l.out_w*l.out_h*l.out_c*b]); + image img2 = float_to_image_scaled(l.out_w, l.out_h, 1, &l.output[l.out_w*l.out_h*l.out_c*b]); + char buff2[100]; + sprintf(buff2, "a_excitation_output_%d", b); + show_image_cv(img2, buff2); + + /* + int c = l.out_c; + if (c > 4) c = 4; + image img3 = float_to_image(l.out_w, l.out_h, c, &l.output[l.out_w*l.out_h*l.out_c*b]); + image dc = collapse_image_layers(img3, 1); + char buff3[100]; + sprintf(buff3, "a_excitation_act_collapsed_%d", b); + show_image_cv(dc, buff3); + */ + + wait_key_cv(5); + } + wait_until_press_key_cv(); +#endif // OPENCV + } + + free(truth_cpu); + free(gt); + free(a_avg); +} + +void pull_convolutional_layer(convolutional_layer 
l) +{ + cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights); + cuda_pull_array_async(l.biases_gpu, l.biases, l.n); + cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, l.nweights); + cuda_pull_array_async(l.bias_updates_gpu, l.bias_updates, l.n); + if (l.batch_normalize){ + cuda_pull_array_async(l.scales_gpu, l.scales, l.n); + cuda_pull_array_async(l.rolling_mean_gpu, l.rolling_mean, l.n); + cuda_pull_array_async(l.rolling_variance_gpu, l.rolling_variance, l.n); + } + if (l.adam){ + cuda_pull_array_async(l.m_gpu, l.m, l.nweights); + cuda_pull_array_async(l.v_gpu, l.v, l.nweights); + } + CHECK_CUDA(cudaPeekAtLastError()); + cudaStreamSynchronize(get_cuda_stream()); +} + +void push_convolutional_layer(convolutional_layer l) +{ + cuda_push_array(l.weights_gpu, l.weights, l.nweights); +#ifdef CUDNN_HALF + assert(l.nweights > 0); + cuda_convert_f32_to_f16(l.weights_gpu, l.nweights, l.weights_gpu16); +#endif + cuda_push_array(l.biases_gpu, l.biases, l.n); + if (l.train) { + cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights); + cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n); + } + if (l.batch_normalize){ + cuda_push_array(l.scales_gpu, l.scales, l.n); + cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n); + cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n); + } + if (l.adam){ + cuda_push_array(l.m_gpu, l.m, l.nweights); + cuda_push_array(l.v_gpu, l.v, l.nweights); + } + CHECK_CUDA(cudaPeekAtLastError()); +} + +void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale) +{ + + /* + for (int angle = 0; angle < 360; angle++) { + printf(" angle = %d \n", angle); + smooth_rotate_weights_kernel(l.weights_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, angle, 0); + + cuda_pull_array(l.weight_deform_gpu, l.weights, l.nweights); + visualize_convolutional_layer(l, "weights", NULL); + wait_key_cv(10); + } + */ + + if (l.deform) { + + //for (l.angle = 0; l.angle < 360; l.angle += 1) + //{ + //stretch_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, l.angle/180, 1); + //else simple_copy_ongpu(l.nweights, l.weight_updates_gpu, l.weight_deform_gpu); + + if (l.rotate) rotate_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, 1); + else if (l.sway) sway_and_flip_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, l.angle, 1); + else if (l.stretch) stretch_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, 0, 1); + else if (l.stretch_sway) stretch_sway_flip_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, l.angle, 1); + + //simple_copy_ongpu(l.nweights, l.weight_updates_gpu, l.weight_deform_gpu); + + reduce_and_expand_array_gpu(l.weight_deform_gpu, l.weight_updates_gpu, l.nweights, 4); + + //printf(" angle = %f \n", l.angle); + //cuda_pull_array(l.weight_deform_gpu, l.weights, l.nweights); + //visualize_convolutional_layer(l, "weights", NULL); + //wait_key_cv(10); + //} + + } + + // Loss scale for Mixed-Precision on Tensor-Cores + float learning_rate = learning_rate_init*l.learning_rate_scale / loss_scale; + //float momentum = a.momentum; + //float decay = a.decay; + //int batch = a.batch; + + + reset_nan_and_inf(l.weight_updates_gpu, l.nweights); + fix_nan_and_inf(l.weights_gpu, l.nweights); + + // Gradient Centralization + if (l.grad_centr && l.batch_normalize) { + // weights[filters][channels][height][width] + // for(filters) w[f] 
= w[f] - mean(w[c][h][w]) + gradient_centralization_gpu(l.size, l.size, l.c / l.groups, l.n, l.weight_updates_gpu); + } + + + if (l.adam) { + //adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t); + adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.nweights, batch, l.t); + + adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t); + if (l.scales_gpu) { + adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t); + } + } + else { + //axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); + //axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); + //scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1); + + float *old_weight_updates_gpu = l.weight_updates_gpu; + + + if (l.reverse) { + float clip = 0.0; + float divider = 1.0; + float abs_add = 1.0; + mult_inverse_array_gpu(l.weight_updates_gpu, l.output_gpu, l.inputs*l.batch, l.reverse, divider, clip, abs_add); + l.weight_updates_gpu = l.output_gpu; + } + + + axpy_ongpu(l.nweights, -decay*batch*loss_scale, l.weights_gpu, 1, l.weight_updates_gpu, 1); + axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); + + l.weight_updates_gpu = old_weight_updates_gpu; + + scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1); + + axpy_ongpu(l.n, learning_rate / batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); + scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1); + + if (l.scales_gpu) { + axpy_ongpu(l.n, learning_rate / batch, l.scale_updates_gpu, 1, l.scales_gpu, 1); + scal_ongpu(l.n, momentum, l.scale_updates_gpu, 1); + } + } + + if (l.deform) { + //for (l.angle = 0; l.angle < 360; l.angle += 4) + //{ + expand_array_gpu(l.weights_gpu, l.weight_deform_gpu, l.nweights, 4); + + //simple_copy_ongpu(l.nweights, l.weight_deform_gpu, l.weights_gpu); + + if (l.rotate) rotate_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, 0); + else if (l.sway) sway_and_flip_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, l.angle, 0); + else if (l.stretch) stretch_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, 0, 0); + else if (l.stretch_sway) stretch_sway_flip_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, l.angle, 0); + + //printf(" angle = %f, reverse = %d \n", l.angle, 0); + //cuda_pull_array(l.weights_gpu, l.weights, l.nweights); + //visualize_convolutional_layer(l, "weights", NULL); + //wait_key_cv(10); + //} + } + + if (l.clip) { + constrain_ongpu(l.nweights, l.clip, l.weights_gpu, 1); + } +} + + + +/* +void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay) +{ + int size = layer.size*layer.size*layer.c*layer.n; + axpy_ongpu(layer.n, learning_rate/batch, layer.bias_updates_gpu, 1, layer.biases_gpu, 1); + scal_ongpu(layer.n, momentum, layer.bias_updates_gpu, 1); + + if(layer.scales_gpu){ + axpy_ongpu(layer.n, learning_rate/batch, layer.scale_updates_gpu, 1, layer.scales_gpu, 1); + scal_ongpu(layer.n, momentum, layer.scale_updates_gpu, 1); + } + + if(layer.adam){ + scal_ongpu(size, layer.B1, layer.m_gpu, 1); + scal_ongpu(size, layer.B2, layer.v_gpu, 1); + + axpy_ongpu(size, -decay*batch, 
layer.weights_gpu, 1, layer.weight_updates_gpu, 1); + + axpy_ongpu(size, -(1-layer.B1), layer.weight_updates_gpu, 1, layer.m_gpu, 1); + mul_ongpu(size, layer.weight_updates_gpu, 1, layer.weight_updates_gpu, 1); + axpy_ongpu(size, (1-layer.B2), layer.weight_updates_gpu, 1, layer.v_gpu, 1); + + adam_gpu(size, layer.weights_gpu, layer.m_gpu, layer.v_gpu, layer.B1, layer.B2, learning_rate/batch, layer.eps, layer.t+1); + fill_ongpu(size, 0, layer.weight_updates_gpu, 1); + }else{ + axpy_ongpu(size, -decay*batch, layer.weights_gpu, 1, layer.weight_updates_gpu, 1); // wu = wu - w*decay*batch + axpy_ongpu(size, learning_rate/batch, layer.weight_updates_gpu, 1, layer.weights_gpu, 1); // w = w + wu*lr/batch + scal_ongpu(size, momentum, layer.weight_updates_gpu, 1); // wu = wu*momentum // wu = (wu - w*decay*batch)*momentum + // w = w + (wu - w*decay*batch)*lr/batch = w + wu*lr/batch - w*decay*lr = w*(1-decay*lr) + wu*lr/batch + //wu_prev = (wu_old - w_old*decay*batch)*momentum + + + //weights_update = weights_update_new + (weights_update_old - weights_old*decay*batch)*momentum - weights_new*decay*batch = + // = weights_update_new + weights_update_old*momentum - weights_old*decay*batch*momentum - weights_new*decay*batch + // = weights_update_new + weights_update_old*momentum - (weights_old*momentum + weights_new)*decay*batch + + //------------- RESULT -------------- + // weights_update = weights_update_new + weights_update_old*momentum - (weights_old*momentum + weights_new)*decay*batch + //----------------------------------- + + // weights_newest = weights_new + (weights_update_new + weights_update_old*momentum - (weights_old*momentum + weights_new)*decay*batch)*lr/batch + // = weights_new + weights_update_new*lr/batch + weights_update_old*momentum*lr/batch - weights_old*momentum*decay*batch*lr/batch - weights_new*decay*batch*lr/batch + // = weights_new + weights_update_new*lr/batch + weights_update_old*momentum*lr/batch - weights_old*momentum*decay*lr - weights_new*decay*lr + // = weights_new*(1 - decay*lr) - weights_old*momentum*decay*lr + (weights_update_new + weights_update_old*momentum)*lr/batch + + //------------- RESULT -------------- + // weights_newest = weights_new*(1 - decay*lr) - weights_old*momentum*(decay*lr) + (weights_update_new + weights_update_old*momentum)*lr/batch = + // = weights_new - (weights_new + weights_old*momentum)*decay*lr + (weights_update_new + weights_update_old*momentum)*lr / batch + //----------------------------------- + } +} +*/ -- Gitblit v1.8.0
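Note on the commented-out derivation at the end of the patch: it reduces to the three BLAS-style steps that update_convolutional_layer_gpu() issues through axpy_ongpu()/scal_ongpu(). A minimal CPU-side sketch of that update order follows (the function name is hypothetical, loss_scale is left out for brevity, and this is an illustration rather than code from the commit):

static void sgd_momentum_decay_step(float *w, float *wu, int n,
                                    float lr, float momentum, float decay, int batch)
{
    for (int i = 0; i < n; ++i) {
        wu[i] -= decay * batch * w[i];   // axpy: wu = wu - w*decay*batch
        w[i]  += (lr / batch) * wu[i];   // axpy: w  = w + wu*lr/batch
        wu[i] *= momentum;               // scal: wu = wu*momentum
    }
}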