| | |
| | | __device__ float softplus_kernel(float x, float threshold = 20) {
|
| | | if (x > threshold) return x; // too large
|
| | | else if (x < -threshold) return expf(x); // too small
|
| | | return logf(expf(x) + 1);
|
| | | return log1pf(expf(x)); |
| | | //return logf(expf(x) + 1); |
| | | }
|
| | | __device__ float plse_activate_kernel(float x)
|
| | | {
|
| | |
| | | __device__ float plse_gradient_kernel(float x){return (x < 0 || x > 1) ? .01f : .125f;}
|
| | | __device__ float stair_gradient_kernel(float x)
|
| | | {
|
| | | if (floor(x) == x) return 0;
|
| | | if (floorf(x) == x) return 0; |
| | | return 1;
|
| | | }
|
| | |
|
| | |
| | | if (i < n) {
|
| | | float x_val = x[i];
|
| | | float sigmoid = logistic_activate_kernel(x_val);
|
| | | output_sigmoid_gpu[i] = sigmoid;
|
| | | if (output_sigmoid_gpu) output_sigmoid_gpu[i] = sigmoid; |
| | | output_gpu[i] = x_val * sigmoid;
|
| | | }
|
| | | } |
| | | |
| | | __device__ float mish_njuffa(float x) |
| | | { |
| | | float r; |
| | | float e = expf(x); |
| | | r = 1.0f / fmaf(fmaf(-0.5f, e, -1.0f), e, -1.0f); |
| | | r = fmaf(r, x, x); |
| | | return r; |
| | | } |
| | | |
| | | __device__ float mish_yashas(float x) |
| | | { |
| | | float e = __expf(x); |
| | | if (x <= -18.0f) |
| | | return x * e; |
| | | |
| | | float n = e * e + 2 * e; |
| | | if (x <= -5.0f) |
| | | return x * __fdividef(n, n + 2); |
| | | |
| | | return x - 2 * __fdividef(x, n + 2); |
| | | } |
| | | |
| | | __device__ float mish_yashas2(float x) |
| | | { |
| | | float e = __expf(x); |
| | | float n = e * e + 2 * e; |
| | | if (x <= -0.6f) |
| | | return x * __fdividef(n, n + 2); |
| | | |
| | | return x - 2 * __fdividef(x, n + 2); |
| | | }
|
| | |
|
| | | // https://github.com/digantamisra98/Mish
|
| | |
| | | if (i < n) {
|
| | | const float MISH_THRESHOLD = 20;
|
| | | float x_val = x[i];
|
| | | activation_input[i] = x_val; // store value before activation
|
| | | if (activation_input) activation_input[i] = x_val; // store value before activation |
| | | //output_gpu[i] = x_val * tanh_activate_kernel(logf(1 + expf(x_val)));
|
| | |
|
| | | // Pytorch: https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L17-L20
|
| | | // TF: https://github.com/tensorflow/addons/blob/093cdfa85d334cbe19a37624c33198f3140109ed/tensorflow_addons/custom_ops/activations/cc/kernels/mish_op.h#L40-L49
|
| | | // log1p(x) == log(x + 1)
|
| | | output_gpu[i] = x_val * tanh_activate_kernel( softplus_kernel(x_val, MISH_THRESHOLD) );
|
| | | //output_gpu[i] = x_val * tanh_activate_kernel( softplus_kernel(x_val, MISH_THRESHOLD) ); |
| | | output_gpu[i] = mish_yashas2(x_val); |
| | | //output_gpu[i] = mish_njuffa(x_val); |
| | | }
|
| | | }
|
| | |
|
| | | __device__ float hard_mish_yashas(float x) |
| | | { |
| | | if (x > 0) |
| | | return x; |
| | | if (x > -2) |
| | | return x * x / 2 + x; |
| | | return 0; |
| | | } |
| | | |
| | | __global__ void activate_array_hard_mish_kernel(float *x, int n, float *activation_input, float *output_gpu) |
| | | { |
| | | int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; |
| | | if (i < n) { |
| | | |
| | | float x_val = x[i]; |
| | | if (activation_input) activation_input[i] = x_val; // store value before activation |
| | | output_gpu[i] = hard_mish_yashas(x_val); |
| | | } |
| | | } |
| | | __global__ void activate_array_leaky_kernel(float *x, int n)
|
| | | {
|
| | | int index = blockIdx.x*blockDim.x + threadIdx.x;
|
| | |
| | | // log1p(x) == log(x + 1)
|
| | | const float inp = activation_input_gpu[i];
|
| | | const float sp = softplus_kernel(inp, MISH_THRESHOLD);
|
| | | const float grad_sp = 1 - expf(-sp);
|
| | | const float grad_sp = -expm1f(-sp); |
| | | //const float grad_sp = 1 - expf(-sp); |
| | | const float tsp = tanh(sp);
|
| | | const float grad_tsp = (1 - tsp*tsp) * grad_sp;
|
| | | const float grad = inp * grad_tsp + tsp;
|
| | |
| | | }
|
| | | }
|
| | |
|
| | | __device__ float hard_mish_yashas_grad(float x) |
| | | { |
| | | if (x > 0) |
| | | return 1; |
| | | if (x > -2) |
| | | return x + 1; |
| | | return 0; |
| | | } |
| | | |
| | | __global__ void gradient_array_hard_mish_kernel(int n, float *activation_input_gpu, float *delta) |
| | | { |
| | | int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; |
| | | if (i < n) { |
| | | |
| | | const float x = activation_input_gpu[i]; |
| | | delta[i] *= hard_mish_yashas_grad(x); |
| | | } |
| | | } |
| | | |
| | | __global__ void gradient_array_leaky_kernel(float *x, int n, float *delta)
|
| | | {
|
| | | int index = blockIdx.x*blockDim.x + threadIdx.x;
|
| | | if (index < n) {
|
| | | delta[index] *= leaky_gradient_kernel(x[index]);
|
| | | } |
| | | } |
| | | |
| | | __global__ void gradient_array_revleaky_kernel(float *x, int n, float *delta) |
| | | { |
| | | int index = blockIdx.x*blockDim.x + threadIdx.x; |
| | | if (index < n) { |
| | | delta[index] /= leaky_gradient_kernel(x[index]); |
| | | }
|
| | | }
|
| | |
|
| | |
| | | {
|
| | | const int num_blocks = get_number_of_blocks(n, BLOCK);
|
| | | if (a == LINEAR) return;
|
| | | else if(a == LEAKY) activate_array_leaky_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n);
|
| | | else if (a == LEAKY || a == REVLEAKY) activate_array_leaky_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n); |
| | | else if (a == LOGISTIC) activate_array_logistic_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n);
|
| | | else if (a == TANH) activate_array_tanh_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n);
|
| | | else if (a == HARDTAN) activate_array_hardtan_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n);
|
| | |
| | | CHECK_CUDA(cudaPeekAtLastError());
|
| | | }
|
| | |
|
| | | extern "C" void activate_array_hard_mish_ongpu(float *x, int n, float *activation_input_gpu, float *output_gpu) |
| | | { |
| | | const int num_blocks = get_number_of_blocks(n, BLOCK); |
| | | activate_array_hard_mish_kernel << <cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >> >(x, n, activation_input_gpu, output_gpu); |
| | | CHECK_CUDA(cudaPeekAtLastError()); |
| | | } |
| | | |
| | | extern "C" void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta)
|
| | | {
|
| | | const int num_blocks = get_number_of_blocks(n, BLOCK);
|
| | | if (a == LINEAR) return;
|
| | | else if (a == LEAKY) gradient_array_leaky_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n, delta);
|
| | | else if (a == REVLEAKY) gradient_array_revleaky_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n, delta); |
| | | else if (a == LOGISTIC) gradient_array_logistic_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n, delta);
|
| | | else if (a == TANH) gradient_array_tanh_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n, delta);
|
| | | else if (a == HARDTAN) gradient_array_hardtan_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n, delta);
|
| | |
| | | CHECK_CUDA(cudaPeekAtLastError());
|
| | | }
|
| | |
|
| | | extern "C" void gradient_array_hard_mish_ongpu(int n, float *activation_input_gpu, float *delta) |
| | | { |
| | | const int num_blocks = get_number_of_blocks(n, BLOCK); |
| | | gradient_array_hard_mish_kernel << <cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >> > (n, activation_input_gpu, delta); |
| | | CHECK_CUDA(cudaPeekAtLastError()); |
| | | } |
| | | |
| | |
|
| | | __global__ void activate_array_normalize_channels_kernel(float *x, int size, int batch, int channels, int wh_step, float *output_gpu)
|
| | | {
|