~suntianyu/baseDetector.git

			@@ -40,7 +40,8 @@
			__device__ float softplus_kernel(float x, float threshold = 20) {
			if (x > threshold) return x; // too large
			else if (x < -threshold) return expf(x); // too small
			return logf(expf(x) + 1);
			return log1pf(expf(x));
			//return logf(expf(x) + 1);
			}
			__device__ float plse_activate_kernel(float x)
			{
			@@ -84,7 +85,7 @@
			__device__ float plse_gradient_kernel(float x){return (x < 0 \|\| x > 1) ? .01f : .125f;}
			__device__ float stair_gradient_kernel(float x)
			{
			if (floor(x) == x) return 0;
			if (floorf(x) == x) return 0;
			return 1;
			}

			@@ -217,9 +218,41 @@
			if (i < n) {
			float x_val = x[i];
			float sigmoid = logistic_activate_kernel(x_val);
			output_sigmoid_gpu[i] = sigmoid;
			if (output_sigmoid_gpu) output_sigmoid_gpu[i] = sigmoid;
			output_gpu[i] = x_val * sigmoid;
			}
			}

			__device__ float mish_njuffa(float x)
			{
			float r;
			float e = expf(x);
			r = 1.0f / fmaf(fmaf(-0.5f, e, -1.0f), e, -1.0f);
			r = fmaf(r, x, x);
			return r;
			}

			__device__ float mish_yashas(float x)
			{
			float e = __expf(x);
			if (x <= -18.0f)
			return x * e;

			float n = e * e + 2 * e;
			if (x <= -5.0f)
			return x * __fdividef(n, n + 2);

			return x - 2 * __fdividef(x, n + 2);
			}

			__device__ float mish_yashas2(float x)
			{
			float e = __expf(x);
			float n = e * e + 2 * e;
			if (x <= -0.6f)
			return x * __fdividef(n, n + 2);

			return x - 2 * __fdividef(x, n + 2);
			}

			// https://github.com/digantamisra98/Mish
			@@ -229,16 +262,37 @@
			if (i < n) {
			const float MISH_THRESHOLD = 20;
			float x_val = x[i];
			activation_input[i] = x_val; // store value before activation
			if (activation_input) activation_input[i] = x_val; // store value before activation
			//output_gpu[i] = x_val * tanh_activate_kernel(logf(1 + expf(x_val)));

			// Pytorch: https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L17-L20
			// TF: https://github.com/tensorflow/addons/blob/093cdfa85d334cbe19a37624c33198f3140109ed/tensorflow_addons/custom_ops/activations/cc/kernels/mish_op.h#L40-L49
			// log1p(x) == log(x + 1)
			output_gpu[i] = x_val * tanh_activate_kernel( softplus_kernel(x_val, MISH_THRESHOLD) );
			//output_gpu[i] = x_val * tanh_activate_kernel( softplus_kernel(x_val, MISH_THRESHOLD) );
			output_gpu[i] = mish_yashas2(x_val);
			//output_gpu[i] = mish_njuffa(x_val);
			}
			}

			__device__ float hard_mish_yashas(float x)
			{
			if (x > 0)
			return x;
			if (x > -2)
			return x * x / 2 + x;
			return 0;
			}

			__global__ void activate_array_hard_mish_kernel(float x, int n, float activation_input, float *output_gpu)
			{
			int i = (blockIdx.x + blockIdx.ygridDim.x) blockDim.x + threadIdx.x;
			if (i < n) {

			float x_val = x[i];
			if (activation_input) activation_input[i] = x_val; // store value before activation
			output_gpu[i] = hard_mish_yashas(x_val);
			}
			}
			__global__ void activate_array_leaky_kernel(float *x, int n)
			{
			int index = blockIdx.x*blockDim.x + threadIdx.x;
			@@ -331,7 +385,8 @@
			// log1p(x) == log(x + 1)
			const float inp = activation_input_gpu[i];
			const float sp = softplus_kernel(inp, MISH_THRESHOLD);
			const float grad_sp = 1 - expf(-sp);
			const float grad_sp = -expm1f(-sp);
			//const float grad_sp = 1 - expf(-sp);
			const float tsp = tanh(sp);
			const float grad_tsp = (1 - tsptsp) grad_sp;
			const float grad = inp * grad_tsp + tsp;
			@@ -345,11 +400,38 @@
			}
			}

			__device__ float hard_mish_yashas_grad(float x)
			{
			if (x > 0)
			return 1;
			if (x > -2)
			return x + 1;
			return 0;
			}

			__global__ void gradient_array_hard_mish_kernel(int n, float activation_input_gpu, float delta)
			{
			int i = (blockIdx.x + blockIdx.ygridDim.x) blockDim.x + threadIdx.x;
			if (i < n) {

			const float x = activation_input_gpu[i];
			delta[i] *= hard_mish_yashas_grad(x);
			}
			}

			__global__ void gradient_array_leaky_kernel(float x, int n, float delta)
			{
			int index = blockIdx.x*blockDim.x + threadIdx.x;
			if (index < n) {
			delta[index] *= leaky_gradient_kernel(x[index]);
			}
			}

			__global__ void gradient_array_revleaky_kernel(float x, int n, float delta)
			{
			int index = blockIdx.x*blockDim.x + threadIdx.x;
			if (index < n) {
			delta[index] /= leaky_gradient_kernel(x[index]);
			}
			}

			@@ -413,7 +495,7 @@
			{
			const int num_blocks = get_number_of_blocks(n, BLOCK);
			if (a == LINEAR) return;
			else if(a == LEAKY) activate_array_leaky_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n);
			else if (a == LEAKY \|\| a == REVLEAKY) activate_array_leaky_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n);
			else if (a == LOGISTIC) activate_array_logistic_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n);
			else if (a == TANH) activate_array_tanh_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n);
			else if (a == HARDTAN) activate_array_hardtan_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n);
			@@ -440,11 +522,19 @@
			CHECK_CUDA(cudaPeekAtLastError());
			}

			extern "C" void activate_array_hard_mish_ongpu(float x, int n, float activation_input_gpu, float *output_gpu)
			{
			const int num_blocks = get_number_of_blocks(n, BLOCK);
			activate_array_hard_mish_kernel << <cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >> >(x, n, activation_input_gpu, output_gpu);
			CHECK_CUDA(cudaPeekAtLastError());
			}

			extern "C" void gradient_array_ongpu(float x, int n, ACTIVATION a, float delta)
			{
			const int num_blocks = get_number_of_blocks(n, BLOCK);
			if (a == LINEAR) return;
			else if (a == LEAKY) gradient_array_leaky_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n, delta);
			else if (a == REVLEAKY) gradient_array_revleaky_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n, delta);
			else if (a == LOGISTIC) gradient_array_logistic_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n, delta);
			else if (a == TANH) gradient_array_tanh_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n, delta);
			else if (a == HARDTAN) gradient_array_hardtan_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n, delta);
			@@ -477,6 +567,13 @@
			CHECK_CUDA(cudaPeekAtLastError());
			}

			extern "C" void gradient_array_hard_mish_ongpu(int n, float activation_input_gpu, float delta)
			{
			const int num_blocks = get_number_of_blocks(n, BLOCK);
			gradient_array_hard_mish_kernel << <cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >> > (n, activation_input_gpu, delta);
			CHECK_CUDA(cudaPeekAtLastError());
			}


			__global__ void activate_array_normalize_channels_kernel(float x, int size, int batch, int channels, int wh_step, float output_gpu)
			{