//sys
|
#include <cmath>
|
#include <stdio.h>
|
#include <cassert>
|
#include <iostream>
|
#include <cuda_runtime.h>
|
#include <cuda.h>
|
#include <stdint.h>
|
#include <string.h>
|
//my
|
#include "detect.h"
|
|
#define NV_CUDA_CHECK(status) \
|
{ \
|
if (status != 0) \
|
{ \
|
std::cout << "Cuda failure: " << cudaGetErrorString(status) << " in file " << __FILE__ \
|
<< " at line " << __LINE__ << std::endl; \
|
abort(); \
|
} \
|
}
|
|
namespace nvinfer1
|
{
|
Detect::Detect()
|
{
|
}
|
|
Detect::Detect(const void* data, size_t length)
|
{
|
const char *d = reinterpret_cast<const char*>(data), *a = d;
|
read(d,_n_anchor);
|
read(d,_n_classes);
|
read(d,_n_grid_h);
|
read(d, _n_grid_w);
|
read(d, _n_output_size);
|
//printf("anchor:%d,classes:%d,gh:%d,gw:%d,size:%d\n", _n_anchor, _n_classes, _n_grid_h, _n_grid_w, _n_output_size);
|
assert(d == a + length);
|
}
|
|
Detect::Detect(const uint32_t n_anchor_, const uint32_t n_classes_,
|
const uint32_t n_grid_h_, const uint32_t n_grid_w_/*,
|
const uint32_t &n_stride_h_, const uint32_t &n_stride_w_*/):
|
_n_anchor(n_anchor_),
|
_n_classes(n_classes_),
|
_n_grid_h(n_grid_h_),
|
_n_grid_w(n_grid_w_)
|
{
|
_n_output_size = (5 + _n_classes)*_n_anchor*_n_grid_h*_n_grid_w;
|
}
|
Detect::~Detect()
|
{}
|
|
inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); }
|
|
__global__ void gpu_detect_layer(const float *input_,
|
float* output_,
|
const uint32_t n_grid_h_,
|
const uint32_t n_grid_w_,
|
const uint32_t n_classes_,
|
const uint32_t n_anchor_)
|
{
|
uint32_t x_id = blockIdx.x * blockDim.x + threadIdx.x;
|
uint32_t y_id = blockIdx.y * blockDim.y + threadIdx.y;
|
uint32_t z_id = blockIdx.z * blockDim.z + threadIdx.z;
|
|
if ((x_id >= n_grid_w_) || (y_id >= n_grid_h_) || (z_id >= n_anchor_))
|
{
|
return;
|
}
|
// printf("grid_h:%d,grid_w:%d,class:%d,anchor:%d\n", n_grid_h_, n_grid_w_, n_classes_, n_anchor_);
|
const int numGridCells = n_grid_h_ * n_grid_w_;
|
const int bbindex = y_id * n_grid_w_ + x_id;
|
|
output_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 0)]
|
= 2.f * sigmoidGPU(input_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 0)])-0.5f;
|
|
output_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 1)]
|
= 2.f * sigmoidGPU(input_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 1)])-0.5f;
|
|
float w = 2.f * sigmoidGPU(input_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 2)]);
|
output_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 2)]
|
= w*w;
|
|
float h = 2.f* sigmoidGPU(input_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 3)]);
|
output_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 3)]
|
= h*h;
|
|
output_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 4)]
|
= sigmoidGPU(input_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 4)]);
|
for (uint32_t i = 0; i < n_classes_; ++i)
|
{
|
output_[bbindex + numGridCells * (z_id * (5 + n_classes_) + (5 + i))]
|
= sigmoidGPU(input_[bbindex + numGridCells * (z_id * (5 + n_classes_) + (5 + i))]);
|
}
|
}
|
|
cudaError_t cuda_detect_layer(const void* input_,
|
void* output_,
|
const uint32_t& batch_size_,
|
const uint32_t& grid_h_,
|
const uint32_t& grid_w_,
|
const uint32_t& n_classes_,
|
const uint32_t& n_anchor_,
|
uint64_t n_output_size_,
|
cudaStream_t stream_)
|
{
|
dim3 threads_per_block(16, 16, 4);
|
dim3 number_of_blocks((grid_w_ / threads_per_block.x) + 1,
|
(grid_h_ / threads_per_block.y) + 1,
|
(n_anchor_ / threads_per_block.z) + 1);
|
for (int batch = 0; batch < batch_size_; ++batch)
|
{
|
gpu_detect_layer << <number_of_blocks, threads_per_block, 0, stream_ >> >(
|
reinterpret_cast<const float*>(input_) + (batch * n_output_size_),
|
reinterpret_cast<float*>(output_) + (batch * n_output_size_),
|
grid_h_,
|
grid_w_,
|
n_classes_,
|
n_anchor_);
|
}
|
return cudaGetLastError();
|
}
|
|
int Detect::enqueue(int batchSize,
|
const void* const* inputs,
|
void** outputs,
|
void* workspace,
|
cudaStream_t stream) noexcept
|
{
|
NV_CUDA_CHECK(cuda_detect_layer(inputs[0], outputs[0], batchSize, _n_grid_h, _n_grid_w, _n_classes, _n_anchor, _n_output_size, stream));
|
return 0;
|
}
|
|
|
bool Detect::supportsFormat(DataType type, PluginFormat format) const noexcept
|
{
|
return (type == DataType::kFLOAT && format == PluginFormat::kLINEAR);
|
}
|
|
void Detect::configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) noexcept
|
{
|
|
}
|
|
size_t Detect::getSerializationSize() const noexcept
|
{
|
return sizeof(_n_anchor) + sizeof(_n_classes) + sizeof(_n_grid_h) + sizeof(_n_grid_w)
|
+ sizeof(_n_output_size);
|
}
|
|
void Detect::serialize(void *buffer) const noexcept
|
{
|
char *d = static_cast<char*>(buffer), *a = d;
|
write(d,_n_anchor);
|
write(d, _n_classes);
|
write(d, _n_grid_h);
|
write(d, _n_grid_w);
|
write(d, _n_output_size);
|
assert(d == a + getSerializationSize());
|
}
|
|
void Detect::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput)
|
{
|
|
}
|
IPluginV2* Detect::clone() const noexcept
|
{
|
Detect *p = new Detect(_n_anchor,_n_classes,_n_grid_h,_n_grid_w);
|
p->setPluginNamespace(_s_plugin_namespace.c_str());
|
return p;
|
}
|
|
|
//
|
PluginFieldCollection DetectPluginCreator::_fc{};
|
std::vector<PluginField> DetectPluginCreator::_vec_plugin_attributes;
|
|
DetectPluginCreator::DetectPluginCreator()
|
{
|
_vec_plugin_attributes.clear();
|
_fc.nbFields = _vec_plugin_attributes.size();
|
_fc.fields = _vec_plugin_attributes.data();
|
}
|
|
const char* DetectPluginCreator::getPluginName() const noexcept
|
{
|
return "DETECT_TRT";
|
}
|
|
const char* DetectPluginCreator::getPluginVersion() const noexcept
|
{
|
return "1.0";
|
}
|
|
const PluginFieldCollection* DetectPluginCreator::getFieldNames() noexcept
|
{
|
return &_fc;
|
}
|
|
IPluginV2* DetectPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) noexcept
|
{
|
Detect* obj = new Detect();
|
obj->setPluginNamespace(_s_name_space.c_str());
|
return obj;
|
}
|
|
IPluginV2* DetectPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept
|
{
|
Detect* obj = new Detect(serialData, serialLength);
|
obj->setPluginNamespace(_s_name_space.c_str());
|
return obj;
|
}
|
|
void DetectPluginCreator::setPluginNamespace(const char* libNamespace) noexcept
|
{
|
_s_name_space = libNamespace;
|
}
|
|
const char* DetectPluginCreator::getPluginNamespace() const noexcept
|
{
|
return _s_name_space.c_str();
|
}
|
}//end namespace nvinfer1
|