From 168af40fe9a3cc81c6ee16b3e81f154780c36bdb Mon Sep 17 00:00:00 2001
From: Scheaven <xuepengqiang>
Date: 星期四, 03 六月 2021 15:03:27 +0800
Subject: [PATCH] up new v4

---
 lib/detecter_tools/darknet/dropout_layer_kernels.cu |  622 ++++++++++++++++++++++++++++----------------------------
 1 files changed, 311 insertions(+), 311 deletions(-)

diff --git a/lib/detecter_tools/darknet/dropout_layer_kernels.cu b/lib/detecter_tools/darknet/dropout_layer_kernels.cu
index 8a79255..05cde59 100644
--- a/lib/detecter_tools/darknet/dropout_layer_kernels.cu
+++ b/lib/detecter_tools/darknet/dropout_layer_kernels.cu
@@ -1,311 +1,311 @@
-#include <cuda_runtime.h>
-#include <curand.h>
-#include <cublas_v2.h>
-#include <cstring>
-
-#include "dropout_layer.h"
-#include "dark_cuda.h"
-#include "utils.h"
-#include "blas.h"
-
-#include "image_opencv.h"
-#include "image.h"
-
-
-__global__ void dropblock_fast_kernel(float *rand, float prob, int w, int h, int spatial, int filters, int batch, int block_size, float *drop_blocks_scale, float *output)
-{
-    const int threads = BLOCK;
-    const int id = threadIdx.x;
-    const int f = blockIdx.x % filters;
-    const int b = blockIdx.x / filters;
-
-    __shared__ int prob_block;
-    __shared__ int index_block;
-
-    if (id == 0) {
-        prob_block = 1.0 * 1000000;
-        index_block = -1;
-    }
-    __syncthreads();
-
-    int i;
-    for (i = id; i < spatial; i += threads) {
-        int index = b*spatial*f + f*spatial + i;
-
-        if (rand[index] < prob) {
-            //Chose with the lowest rand[i]
-            int new_val = rand[index] * 1000000;
-            rand[index] = 1;
-            int old_val = atomicMin(&prob_block, new_val);
-            if (new_val < old_val) {
-                index_block = i;
-                //if (b == 0) printf("\n rand[i] = %f, prob = %f, b = %d, f = %d, i = %d, index_block = %d \n", rand[i], prob, b, f, i, index_block);
-            }
-        }
-
-    }
-    __syncthreads();
-    if (index_block == -1) return;
-
-
-    int b_x = index_block % w;
-    int b_y = index_block / w;
-
-    if (b_x > (w - block_size)) b_x = b_x - (w - block_size);
-    if (b_y > (h - block_size)) b_y = b_y - (h - block_size);
-
-    b_x = max(0, min(b_x, w - block_size));
-    b_y = max(0, min(b_y, h - block_size));
-
-    int block_square_size = block_size * block_size;
-
-    for (i = id; i < block_square_size; i += threads)
-    {
-        int i_x = i % block_size;
-        int i_y = i / block_size;
-
-        int x = b_x + i_x;
-        int y = b_y + i_y;
-
-        if (x >= 0 && x < w && y >= 0 && y < h) {
-            int new_index = b*filters*spatial + f*spatial + y*w + x;
-
-            output[new_index] = 0;
-            rand[new_index] = 0;
-        }
-    }
-
-    //if (id == 0 && b == 0) printf(" f = %d, b = %d \n", f, b);
-
-    if (id == 0 && drop_blocks_scale) {
-        atomicAdd(&drop_blocks_scale[b], block_square_size);
-        //if(b == 0) printf("\n index_block = %d \n", index_block);
-    }
-
-}
-
-__global__ void set_scales_dropblock_kernel(float *drop_blocks_scale, int block_size_w, int block_size_h, int outputs, int batch)
-{
-    const int index = blockIdx.x*blockDim.x + threadIdx.x;
-    if (index >= batch) return;
-
-    //printf(" drop_blocks_scale[index] = %f \n", drop_blocks_scale[index]);
-    const float prob = drop_blocks_scale[index] / (float)outputs;
-    const float scale = 1.0f / (1.0f - prob);
-    drop_blocks_scale[index] = scale;
-}
-
-__global__ void scale_dropblock_kernel(float *output, int size, int outputs, float *drop_blocks_scale)
-{
-    const int index = blockIdx.x*blockDim.x + threadIdx.x;
-    if (index >= size) return;
-
-    const int b = index / outputs;
-    output[index] *= drop_blocks_scale[b];
-}
-
-
-__global__ void backward_dropblock_kernel(float *pass, float *delta, int size)
-{
-    const int index = blockIdx.x*blockDim.x + threadIdx.x;
-    if (index >= size) return;
-
-    if (pass[index] == 0) delta[index] = 0;
-}
-
-
-__global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand, float prob, float scale)
-{
-    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
-    if(id < size) input[id] = (rand[id] < prob) ? 0 : input[id]*scale;
-}
-
-
-void forward_dropout_layer_gpu(dropout_layer l, network_state state)
-{
-    if (!state.train) return;
-    int iteration_num = get_current_iteration(state.net); // (*state.net.seen) / (state.net.batch*state.net.subdivisions);
-    //if (iteration_num < state.net.burn_in) return;
-
-    // We gradually increase the block size and the probability of dropout - during the first half of the training
-    float multiplier = 1.0;
-    if(iteration_num < (state.net.max_batches*0.85))
-        multiplier = (iteration_num / (float)(state.net.max_batches*0.85));
-
-    // dropblock
-    if (l.dropblock) {
-        //l.probability = 1 / keep_prob
-        //const int max_blocks_per_channel = 10;
-        const float cur_prob = l.probability * multiplier;
-        const float cur_scale = 1.f / (1.f - cur_prob);
-
-        int block_width = l.dropblock_size_abs *multiplier;
-        int block_height = l.dropblock_size_abs *multiplier;
-
-        if (l.dropblock_size_rel) {
-            block_width = l.dropblock_size_rel * l.w * multiplier;
-            block_height = l.dropblock_size_rel * l.h * multiplier;
-        }
-
-        block_width = max_val_cmp(1, block_width);
-        block_height = max_val_cmp(1, block_height);
-
-        block_width = min_val_cmp(l.w, block_width);
-        block_height = min_val_cmp(l.h, block_height);
-
-        const int block_size = min_val_cmp(block_width, block_height);
-        const float block_prob = cur_prob / (block_size*block_size);
-        assert(block_size <= l.w && block_size <= l.h);
-
-        const int size = l.inputs*l.batch;
-        cuda_random(l.rand_gpu, size);
-
-        fill_ongpu(l.batch, 0, l.drop_blocks_scale_gpu, 1);
-
-        //fill_ongpu(l.outputs * l.batch, 1, state.input, 1); // remove!!!
-
-        int num_blocks = l.batch * l.c;
-        dropblock_fast_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (l.rand_gpu, block_prob, l.w, l.h, l.w*l.h, l.c, l.batch, block_size, l.drop_blocks_scale_gpu, state.input);
-        CHECK_CUDA(cudaPeekAtLastError());
-
-        num_blocks = get_number_of_blocks(l.batch, BLOCK);
-        set_scales_dropblock_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (l.drop_blocks_scale_gpu, block_size, block_size, l.outputs, l.batch);
-        CHECK_CUDA(cudaPeekAtLastError());
-
-        /*
-        {
-            cuda_pull_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch);
-
-            float avg_scale = 0;
-
-            for (int b = 0; b < l.batch; ++b) {
-                const float scale = l.drop_blocks_scale[b];
-                avg_scale += scale;
-                printf(" %d x %d - block_size = %d, block_size*block_size = %d , ", l.w, l.h, block_size, block_size*block_size);
-                printf(" , l.drop_blocks_scale[b] = %f, scale = %f \t cur_prob = %f, cur_scale = %f \n",
-                    l.drop_blocks_scale[b], scale, cur_prob, cur_scale);
-            }
-            avg_scale = avg_scale / l.batch;
-            printf(" avg_scale = %f \n", avg_scale);
-
-            float *output = (float *)calloc(l.outputs * l.batch, sizeof(float));
-            cuda_pull_array(state.input, output, l.outputs * l.batch);
-
-            printf(" l.w = %d, l.h = %d, l.c = %d \n", l.w, l.h, l.c);
-
-            image img = float_to_image(l.w, l.h, l.c, output);
-            img = collapse_image_layers(img, 1);
-            //normalize_image(img);
-
-            show_image(img, "dropout - forward");
-            wait_key_cv(0);
-            //free_image(img);
-            //free(output);
-        }
-        */
-
-        num_blocks = get_number_of_blocks(l.outputs * l.batch, BLOCK);
-        scale_dropblock_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (state.input, l.outputs * l.batch, l.outputs, l.drop_blocks_scale_gpu);
-        CHECK_CUDA(cudaPeekAtLastError());
-
-    }
-    // dropout
-    else {
-        int size = l.inputs*l.batch;
-        cuda_random(l.rand_gpu, size);
-        /*
-        int i;
-        for(i = 0; i < size; ++i){
-            layer.rand[i] = rand_uniform();
-        }
-        cuda_push_array(layer.rand_gpu, layer.rand, size);
-        */
-
-        yoloswag420blazeit360noscope << <cuda_gridsize(size), BLOCK, 0, get_cuda_stream() >> > (state.input, size, l.rand_gpu, l.probability, l.scale);
-        CHECK_CUDA(cudaPeekAtLastError());
-    }
-}
-
-void backward_dropout_layer_gpu(dropout_layer l, network_state state)
-{
-    if(!state.delta) return;
-    //int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions);
-    //if (iteration_num < state.net.burn_in) return;
-
-    const int size = l.inputs*l.batch;
-
-    // dropblock
-    if (l.dropblock) {
-        int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions);
-        float multiplier = 1.0;
-        if (iteration_num < (state.net.max_batches*0.85))
-            multiplier = (iteration_num / (float)(state.net.max_batches*0.85));
-
-        const float cur_prob = l.probability * multiplier;
-        const float cur_scale = 1.f / (1.f - cur_prob);
-
-        int block_width = l.dropblock_size_abs * multiplier;
-        int block_height = l.dropblock_size_abs * multiplier;
-
-        if (l.dropblock_size_rel) {
-            block_width = l.dropblock_size_rel * l.w * multiplier;
-            block_height = l.dropblock_size_rel * l.h * multiplier;
-        }
-
-        block_width = max_val_cmp(1, block_width);
-        block_height = max_val_cmp(1, block_height);
-
-        block_width = min_val_cmp(l.w, block_width);
-        block_height = min_val_cmp(l.h, block_height);
-
-        const int block_size = min_val_cmp(block_width, block_height);
-        const float block_prob = cur_prob / (block_size*block_size);
-
-        //fill_ongpu(l.outputs * l.batch, 1, state.delta, 1); // remove!!!
-
-        int num_blocks = get_number_of_blocks(l.outputs * l.batch, BLOCK);
-        backward_dropblock_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(l.rand_gpu, state.delta, l.outputs * l.batch);
-        CHECK_CUDA(cudaPeekAtLastError());
-
-        scale_dropblock_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (state.delta, l.outputs * l.batch, l.outputs, l.drop_blocks_scale_gpu);
-        CHECK_CUDA(cudaPeekAtLastError());
-
-        /*
-        {
-            cuda_pull_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch);
-
-            float avg_scale = 0;
-
-            for (int b = 0; b < l.batch; ++b) {
-                const float scale = l.drop_blocks_scale[b];
-                avg_scale += scale;
-                printf(" %d x %d - block_size = %d, block_size*block_size = %d , ", l.w, l.h, block_size, block_size*block_size);
-                printf(" , l.drop_blocks_scale[b] = %f, scale = %f \t cur_prob = %f, cur_scale = %f \n",
-                    l.drop_blocks_scale[b], scale, cur_prob, cur_scale);
-            }
-            avg_scale = avg_scale / l.batch;
-            printf(" avg_scale = %f \n", avg_scale);
-
-            float *output = (float *)calloc(l.outputs * l.batch, sizeof(float));
-            cuda_pull_array(state.delta, output, l.outputs * l.batch);
-
-            printf(" l.w = %d, l.h = %d, l.c = %d \n", l.w, l.h, l.c);
-
-            image img = float_to_image(l.w, l.h, l.c, output);
-            img = collapse_image_layers(img, 1);
-            //normalize_image(img);
-
-            show_image(img, "dropout - delta");
-            wait_key_cv(0);
-            //free_image(img);
-            //free(output);
-        }
-        */
-
-    }
-    // dropout
-    else {
-        yoloswag420blazeit360noscope << <cuda_gridsize(size), BLOCK, 0, get_cuda_stream() >> > (state.delta, size, l.rand_gpu, l.probability, l.scale);
-        CHECK_CUDA(cudaPeekAtLastError());
-    }
-}
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <cublas_v2.h>
+#include <cstring>
+
+#include "dropout_layer.h"
+#include "dark_cuda.h"
+#include "utils.h"
+#include "blas.h"
+
+#include "image_opencv.h"
+#include "image.h"
+
+
+__global__ void dropblock_fast_kernel(float *rand, float prob, int w, int h, int spatial, int filters, int batch, int block_size, float *drop_blocks_scale, float *output)
+{
+    const int threads = BLOCK;
+    const int id = threadIdx.x;
+    const int f = blockIdx.x % filters;
+    const int b = blockIdx.x / filters;
+
+    __shared__ int prob_block;
+    __shared__ int index_block;
+
+    if (id == 0) {
+        prob_block = 1.0 * 1000000;
+        index_block = -1;
+    }
+    __syncthreads();
+
+    int i;
+    for (i = id; i < spatial; i += threads) {
+        int index = b*spatial*f + f*spatial + i;
+
+        if (rand[index] < prob) {
+            //Chose with the lowest rand[i]
+            int new_val = rand[index] * 1000000;
+            rand[index] = 1;
+            int old_val = atomicMin(&prob_block, new_val);
+            if (new_val < old_val) {
+                index_block = i;
+                //if (b == 0) printf("\n rand[i] = %f, prob = %f, b = %d, f = %d, i = %d, index_block = %d \n", rand[i], prob, b, f, i, index_block);
+            }
+        }
+
+    }
+    __syncthreads();
+    if (index_block == -1) return;
+
+
+    int b_x = index_block % w;
+    int b_y = index_block / w;
+
+    if (b_x > (w - block_size)) b_x = b_x - (w - block_size);
+    if (b_y > (h - block_size)) b_y = b_y - (h - block_size);
+
+    b_x = max(0, min(b_x, w - block_size));
+    b_y = max(0, min(b_y, h - block_size));
+
+    int block_square_size = block_size * block_size;
+
+    for (i = id; i < block_square_size; i += threads)
+    {
+        int i_x = i % block_size;
+        int i_y = i / block_size;
+
+        int x = b_x + i_x;
+        int y = b_y + i_y;
+
+        if (x >= 0 && x < w && y >= 0 && y < h) {
+            int new_index = b*filters*spatial + f*spatial + y*w + x;
+
+            output[new_index] = 0;
+            rand[new_index] = 0;
+        }
+    }
+
+    //if (id == 0 && b == 0) printf(" f = %d, b = %d \n", f, b);
+
+    if (id == 0 && drop_blocks_scale) {
+        atomicAdd(&drop_blocks_scale[b], block_square_size);
+        //if(b == 0) printf("\n index_block = %d \n", index_block);
+    }
+
+}
+
+__global__ void set_scales_dropblock_kernel(float *drop_blocks_scale, int block_size_w, int block_size_h, int outputs, int batch)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index >= batch) return;
+
+    //printf(" drop_blocks_scale[index] = %f \n", drop_blocks_scale[index]);
+    const float prob = drop_blocks_scale[index] / (float)outputs;
+    const float scale = 1.0f / (1.0f - prob);
+    drop_blocks_scale[index] = scale;
+}
+
+__global__ void scale_dropblock_kernel(float *output, int size, int outputs, float *drop_blocks_scale)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index >= size) return;
+
+    const int b = index / outputs;
+    output[index] *= drop_blocks_scale[b];
+}
+
+
+__global__ void backward_dropblock_kernel(float *pass, float *delta, int size)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index >= size) return;
+
+    if (pass[index] == 0) delta[index] = 0;
+}
+
+
+__global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand, float prob, float scale)
+{
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(id < size) input[id] = (rand[id] < prob) ? 0 : input[id]*scale;
+}
+
+
+void forward_dropout_layer_gpu(dropout_layer l, network_state state)
+{
+    if (!state.train) return;
+    int iteration_num = get_current_iteration(state.net); // (*state.net.seen) / (state.net.batch*state.net.subdivisions);
+    //if (iteration_num < state.net.burn_in) return;
+
+    // We gradually increase the block size and the probability of dropout - during the first half of the training
+    float multiplier = 1.0;
+    if(iteration_num < (state.net.max_batches*0.85))
+        multiplier = (iteration_num / (float)(state.net.max_batches*0.85));
+
+    // dropblock
+    if (l.dropblock) {
+        //l.probability = 1 / keep_prob
+        //const int max_blocks_per_channel = 10;
+        const float cur_prob = l.probability * multiplier;
+        const float cur_scale = 1.f / (1.f - cur_prob);
+
+        int block_width = l.dropblock_size_abs *multiplier;
+        int block_height = l.dropblock_size_abs *multiplier;
+
+        if (l.dropblock_size_rel) {
+            block_width = l.dropblock_size_rel * l.w * multiplier;
+            block_height = l.dropblock_size_rel * l.h * multiplier;
+        }
+
+        block_width = max_val_cmp(1, block_width);
+        block_height = max_val_cmp(1, block_height);
+
+        block_width = min_val_cmp(l.w, block_width);
+        block_height = min_val_cmp(l.h, block_height);
+
+        const int block_size = min_val_cmp(block_width, block_height);
+        const float block_prob = cur_prob / (block_size*block_size);
+        assert(block_size <= l.w && block_size <= l.h);
+
+        const int size = l.inputs*l.batch;
+        cuda_random(l.rand_gpu, size);
+
+        fill_ongpu(l.batch, 0, l.drop_blocks_scale_gpu, 1);
+
+        //fill_ongpu(l.outputs * l.batch, 1, state.input, 1); // remove!!!
+
+        int num_blocks = l.batch * l.c;
+        dropblock_fast_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (l.rand_gpu, block_prob, l.w, l.h, l.w*l.h, l.c, l.batch, block_size, l.drop_blocks_scale_gpu, state.input);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        num_blocks = get_number_of_blocks(l.batch, BLOCK);
+        set_scales_dropblock_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (l.drop_blocks_scale_gpu, block_size, block_size, l.outputs, l.batch);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        /*
+        {
+            cuda_pull_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch);
+
+            float avg_scale = 0;
+
+            for (int b = 0; b < l.batch; ++b) {
+                const float scale = l.drop_blocks_scale[b];
+                avg_scale += scale;
+                printf(" %d x %d - block_size = %d, block_size*block_size = %d , ", l.w, l.h, block_size, block_size*block_size);
+                printf(" , l.drop_blocks_scale[b] = %f, scale = %f \t cur_prob = %f, cur_scale = %f \n",
+                    l.drop_blocks_scale[b], scale, cur_prob, cur_scale);
+            }
+            avg_scale = avg_scale / l.batch;
+            printf(" avg_scale = %f \n", avg_scale);
+
+            float *output = (float *)calloc(l.outputs * l.batch, sizeof(float));
+            cuda_pull_array(state.input, output, l.outputs * l.batch);
+
+            printf(" l.w = %d, l.h = %d, l.c = %d \n", l.w, l.h, l.c);
+
+            image img = float_to_image(l.w, l.h, l.c, output);
+            img = collapse_image_layers(img, 1);
+            //normalize_image(img);
+
+            show_image(img, "dropout - forward");
+            wait_key_cv(0);
+            //free_image(img);
+            //free(output);
+        }
+        */
+
+        num_blocks = get_number_of_blocks(l.outputs * l.batch, BLOCK);
+        scale_dropblock_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (state.input, l.outputs * l.batch, l.outputs, l.drop_blocks_scale_gpu);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+    }
+    // dropout
+    else {
+        int size = l.inputs*l.batch;
+        cuda_random(l.rand_gpu, size);
+        /*
+        int i;
+        for(i = 0; i < size; ++i){
+            layer.rand[i] = rand_uniform();
+        }
+        cuda_push_array(layer.rand_gpu, layer.rand, size);
+        */
+
+        yoloswag420blazeit360noscope << <cuda_gridsize(size), BLOCK, 0, get_cuda_stream() >> > (state.input, size, l.rand_gpu, l.probability, l.scale);
+        CHECK_CUDA(cudaPeekAtLastError());
+    }
+}
+
+void backward_dropout_layer_gpu(dropout_layer l, network_state state)
+{
+    if(!state.delta) return;
+    //int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions);
+    //if (iteration_num < state.net.burn_in) return;
+
+    const int size = l.inputs*l.batch;
+
+    // dropblock
+    if (l.dropblock) {
+        int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions);
+        float multiplier = 1.0;
+        if (iteration_num < (state.net.max_batches*0.85))
+            multiplier = (iteration_num / (float)(state.net.max_batches*0.85));
+
+        const float cur_prob = l.probability * multiplier;
+        const float cur_scale = 1.f / (1.f - cur_prob);
+
+        int block_width = l.dropblock_size_abs * multiplier;
+        int block_height = l.dropblock_size_abs * multiplier;
+
+        if (l.dropblock_size_rel) {
+            block_width = l.dropblock_size_rel * l.w * multiplier;
+            block_height = l.dropblock_size_rel * l.h * multiplier;
+        }
+
+        block_width = max_val_cmp(1, block_width);
+        block_height = max_val_cmp(1, block_height);
+
+        block_width = min_val_cmp(l.w, block_width);
+        block_height = min_val_cmp(l.h, block_height);
+
+        const int block_size = min_val_cmp(block_width, block_height);
+        const float block_prob = cur_prob / (block_size*block_size);
+
+        //fill_ongpu(l.outputs * l.batch, 1, state.delta, 1); // remove!!!
+
+        int num_blocks = get_number_of_blocks(l.outputs * l.batch, BLOCK);
+        backward_dropblock_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(l.rand_gpu, state.delta, l.outputs * l.batch);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        scale_dropblock_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (state.delta, l.outputs * l.batch, l.outputs, l.drop_blocks_scale_gpu);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        /*
+        {
+            cuda_pull_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch);
+
+            float avg_scale = 0;
+
+            for (int b = 0; b < l.batch; ++b) {
+                const float scale = l.drop_blocks_scale[b];
+                avg_scale += scale;
+                printf(" %d x %d - block_size = %d, block_size*block_size = %d , ", l.w, l.h, block_size, block_size*block_size);
+                printf(" , l.drop_blocks_scale[b] = %f, scale = %f \t cur_prob = %f, cur_scale = %f \n",
+                    l.drop_blocks_scale[b], scale, cur_prob, cur_scale);
+            }
+            avg_scale = avg_scale / l.batch;
+            printf(" avg_scale = %f \n", avg_scale);
+
+            float *output = (float *)calloc(l.outputs * l.batch, sizeof(float));
+            cuda_pull_array(state.delta, output, l.outputs * l.batch);
+
+            printf(" l.w = %d, l.h = %d, l.c = %d \n", l.w, l.h, l.c);
+
+            image img = float_to_image(l.w, l.h, l.c, output);
+            img = collapse_image_layers(img, 1);
+            //normalize_image(img);
+
+            show_image(img, "dropout - delta");
+            wait_key_cv(0);
+            //free_image(img);
+            //free(output);
+        }
+        */
+
+    }
+    // dropout
+    else {
+        yoloswag420blazeit360noscope << <cuda_gridsize(size), BLOCK, 0, get_cuda_stream() >> > (state.delta, size, l.rand_gpu, l.probability, l.scale);
+        CHECK_CUDA(cudaPeekAtLastError());
+    }
+}

--
Gitblit v1.8.0