#include "conv.h"
|
|
#include <cmath>
|
#include <sys/time.h>
|
|
#include <npp.h>
|
#include <helper_cuda.h>
|
#include <helper_string.h>
|
#include "Exceptions.h"
|
|
|
static const int MEMORY_ALGN_DEVICE = 511;
|
static const int HD_MEMORY_ALGN_DEVICE = 511;
|
|
static inline int DivUp(int x, int d)
|
{
|
return (x + d - 1) / d;
|
}
|
|
static int set_data(uint8_t *data, const int width, const int height, unsigned char *mY, unsigned char *mU, unsigned char *mV)
|
{
|
uint8_t* yuv_data = (uint8_t*)data;
|
if (!yuv_data)
|
{
|
return -1;
|
}
|
|
uint32_t i, j;
|
uint32_t off;
|
uint32_t off_yuv;
|
uint32_t half_h;
|
uint32_t half_w;
|
uint32_t u_size;
|
uint8_t* yuv_ptr;
|
uint8_t* u_ptr;
|
uint8_t* v_ptr;
|
|
int w = width;
|
int h = height;
|
|
//从这一句来看,即使是同一种格式,进来也要处理一下。
|
size_t nPitch = (w + HD_MEMORY_ALGN_DEVICE) & ~HD_MEMORY_ALGN_DEVICE;
|
off = 0;
|
off_yuv = 0;
|
for (i = 0; i < (uint32_t)h; i++)
|
{
|
memcpy(mY + off, yuv_data + off_yuv, w);
|
off += nPitch;
|
off_yuv += w;
|
}
|
|
half_w = w >> 1;
|
half_h = h >> 1;
|
u_size = half_w * half_h;
|
nPitch = (half_w + HD_MEMORY_ALGN_DEVICE) & ~HD_MEMORY_ALGN_DEVICE;
|
|
off_yuv = w * h;
|
off = 0;
|
for (i = 0; i < half_h; i++)
|
{
|
yuv_ptr = yuv_data + off_yuv;
|
u_ptr = mU + off;
|
v_ptr = mV + off;
|
for (j = 0; j < (uint32_t)w; j += 2)
|
{
|
*u_ptr++ = *yuv_ptr++;
|
*v_ptr++ = *yuv_ptr++;
|
}
|
off_yuv += w;
|
off += nPitch;
|
}
|
|
return 0;
|
}
|
|
/////////////handle
|
class convertor{
|
public:
|
convertor(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu);
|
~convertor();
|
int yuv2bgr(unsigned char **bgr, int *bgrLen);
|
int resize2bgr(unsigned char *in, unsigned char **data, int *data_len);
|
int resizeyuv(unsigned char *in, unsigned char **data, int *data_len);
|
int fill_yuv(const unsigned char *yuv);
|
private:
|
void init_yuv();
|
void init_resize();
|
void init_resize_bgr();
|
void init_resize_yuv();
|
private:
|
int width;
|
int height;
|
|
unsigned char aSamplingFactors[3];
|
int nMCUBlocksH;
|
int nMCUBlocksV;
|
|
Npp8u *apSrcImage[3];
|
NppiSize aSrcSize[3];
|
Npp32s aSrcImageStep[3];
|
size_t aSrcPitch[3];
|
|
uint8_t *mY;
|
uint8_t *mU;
|
uint8_t *mV;
|
|
///////////////////////////
|
int rWidth;
|
int rHeight;
|
float fx;
|
float fy;
|
|
Npp8u *apDstImage[3] = {0,0,0};
|
Npp32s aDstImageStep[3];
|
NppiSize aDstSize[3];
|
|
/////////////////////////////
|
Npp8u *imgOrigin;
|
size_t pitchOrigin;
|
NppiSize sizeOrigin;
|
|
unsigned char *bgrOrigin;
|
int bgrOriginLen;
|
size_t bgrOriginPitch;
|
|
////////////////////////////
|
Npp8u *imgResize;
|
size_t pitchResize;
|
NppiSize sizeResize;
|
|
unsigned char *bgrScale;
|
int bgrScaleLen;
|
size_t bgrScalePitch;
|
|
// resize only
|
////////////////////////////
|
Npp8u *originBGR;
|
int pitchOriginBGR;
|
Npp8u *resizedBGR;
|
int pitchResizedBGR;
|
unsigned char *hostResizedBGR;
|
|
///////////////////////////
|
unsigned char *nv12;
|
|
bool initialized_yuv, initialized_resize, initialized_resize_bgr, initialized_resize_yuv;
|
int gpu_index;
|
};
|
|
|
convertor::convertor(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu)
|
:width(srcW)
|
,height(srcH)
|
,rWidth(dstW)
|
,rHeight(dstH)
|
,fx(-1)
|
,fy(-1)
|
,mY(NULL)
|
,mU(NULL)
|
,mV(NULL)
|
,imgOrigin(0)
|
,imgResize(0)
|
,bgrOrigin(NULL)
|
,bgrOriginLen(0)
|
,bgrScale(NULL)
|
,bgrScaleLen(0)
|
,originBGR(0)
|
,pitchOriginBGR(0)
|
,resizedBGR(0)
|
,pitchResizedBGR(0)
|
,hostResizedBGR(NULL)
|
,nv12(NULL)
|
,initialized_yuv(false)
|
,initialized_resize(false)
|
,initialized_resize_bgr(false)
|
,initialized_resize_yuv(false)
|
,gpu_index(gpu)
|
{}
|
|
static void setGPUDevice(const int gpu){
|
if (gpu >= 0){
|
cudaSetDevice(gpu);
|
}
|
}
|
|
void convertor::init_yuv(){
|
if (initialized_yuv) return;
|
initialized_yuv = true;
|
|
setGPUDevice(gpu_index);
|
|
for(int i = 0; i < 3; i++){
|
apSrcImage[i] = 0;
|
apDstImage[i] = 0;
|
}
|
|
aSamplingFactors[0] = 34;
|
aSamplingFactors[1] = 17;
|
aSamplingFactors[2] = 17;
|
|
nMCUBlocksH = 0;
|
nMCUBlocksV = 0;
|
|
for (int i = 0; i < 3; ++i)
|
{
|
nMCUBlocksV = std::max(nMCUBlocksV, aSamplingFactors[i] & 0x0f);
|
nMCUBlocksH = std::max(nMCUBlocksH, aSamplingFactors[i] >> 4);
|
}
|
|
for (int i = 0; i < 3; ++i)
|
{
|
NppiSize oBlocks;
|
NppiSize oBlocksPerMCU = { aSamplingFactors[i] >> 4, aSamplingFactors[i] & 0x0f };
|
|
oBlocks.width = (int)ceil((width + 7) / 8 *
|
static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksH);
|
oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;
|
|
oBlocks.height = (int)ceil((height + 7) / 8 *
|
static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksV);
|
oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;
|
|
aSrcSize[i].width = oBlocks.width * 8;
|
aSrcSize[i].height = oBlocks.height * 8;
|
|
// Allocate Memory
|
size_t nPitch;
|
NPP_CHECK_CUDA(cudaMallocPitch((void**)&(apSrcImage[i]), &nPitch, aSrcSize[i].width, aSrcSize[i].height));
|
aSrcPitch[i] = nPitch;
|
aSrcImageStep[i] = static_cast<Npp32s>(nPitch);
|
}
|
|
NPP_CHECK_CUDA(cudaMallocPitch((void**)&imgOrigin, &pitchOrigin, width * 3, height));
|
|
bgrOriginPitch = width * 3;
|
bgrOriginLen = bgrOriginPitch * height;
|
NPP_CHECK_CUDA(cudaHostAlloc((void**)&bgrOrigin, bgrOriginLen, cudaHostAllocDefault));
|
|
sizeOrigin.width = width;
|
sizeOrigin.height = height;
|
|
uint32_t nPitch = (width + MEMORY_ALGN_DEVICE) & ~MEMORY_ALGN_DEVICE;
|
NPP_CHECK_CUDA(cudaHostAlloc((void**)&mY, nPitch * height, cudaHostAllocDefault));
|
nPitch = (width/2 + MEMORY_ALGN_DEVICE) & ~MEMORY_ALGN_DEVICE;
|
NPP_CHECK_CUDA(cudaHostAlloc((void**)&mU, nPitch * height / 2, cudaHostAllocDefault));
|
NPP_CHECK_CUDA(cudaHostAlloc((void**)&mV, nPitch * height / 2, cudaHostAllocDefault));
|
|
}
|
|
void convertor::init_resize(){
|
if (initialized_resize) return;
|
initialized_resize = true;
|
|
setGPUDevice(gpu_index);
|
|
NppiSize oDstImageSize;
|
oDstImageSize.width = std::max(1, rWidth);
|
oDstImageSize.height = std::max(1, rHeight);
|
|
sizeResize.width = oDstImageSize.width;
|
sizeResize.height = oDstImageSize.height;
|
|
for (int i=0; i < 3; ++i)
|
{
|
NppiSize oBlocks;
|
NppiSize oBlocksPerMCU = { aSamplingFactors[i] & 0x0f, aSamplingFactors[i] >> 4};
|
|
oBlocks.width = (int)ceil((oDstImageSize.width + 7)/8 *
|
static_cast<float>(oBlocksPerMCU.width)/nMCUBlocksH);
|
oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;
|
|
oBlocks.height = (int)ceil((oDstImageSize.height+7)/8 *
|
static_cast<float>(oBlocksPerMCU.height)/nMCUBlocksV);
|
oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;
|
|
aDstSize[i].width = oBlocks.width * 8;
|
aDstSize[i].height = oBlocks.height * 8;
|
|
// Allocate Memory
|
size_t nPitch;
|
NPP_CHECK_CUDA(cudaMallocPitch((void**)&apDstImage[i], &nPitch, aDstSize[i].width, aDstSize[i].height));
|
aDstImageStep[i] = static_cast<Npp32s>(nPitch);
|
}
|
|
if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){
|
fx = (float)(rWidth) / (float)(width);
|
fy = (float)(rHeight) / (float)(height);
|
}
|
|
if (imgResize == 0){
|
if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){
|
NPP_CHECK_CUDA(cudaMallocPitch((void**)&imgResize, &pitchResize, rWidth * 3, rHeight));
|
}
|
}
|
if (!bgrScale){
|
if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){
|
bgrScalePitch = rWidth * 3;
|
bgrScaleLen = bgrScalePitch * rHeight;
|
NPP_CHECK_CUDA(cudaHostAlloc((void**)&bgrScale, bgrScaleLen, cudaHostAllocDefault));
|
}
|
}
|
}
|
|
void convertor::init_resize_bgr(){
|
if (initialized_resize_bgr) return;
|
initialized_resize_bgr = true;
|
|
setGPUDevice(gpu_index);
|
if (originBGR == 0){
|
originBGR = nppiMalloc_8u_C3(width, height, &pitchOriginBGR);
|
}
|
if (resizedBGR == 0){
|
resizedBGR = nppiMalloc_8u_C3(rWidth, rHeight, &pitchResizedBGR);
|
}
|
if (hostResizedBGR == NULL){
|
NPP_CHECK_CUDA(cudaHostAlloc((void**)&hostResizedBGR, rWidth * 3 * rHeight, cudaHostAllocDefault));
|
}
|
}
|
|
void convertor::init_resize_yuv(){
|
if (initialized_resize_yuv) return;
|
initialized_resize_yuv = true;
|
|
if (rWidth > 0 && rHeight > 0){
|
fx = (float)(width) / (float)(rWidth);
|
fy = (float)(height) / (float)(rHeight);
|
}
|
|
nv12 = (unsigned char*)malloc(rWidth*rHeight*3/2);
|
}
|
|
convertor::~convertor(){
|
setGPUDevice(gpu_index);
|
|
if(mY) cudaFreeHost(mY);
|
if(mU) cudaFreeHost(mU);
|
if(mV) cudaFreeHost(mV);
|
|
for (int i = 0; i < 3; ++i)//内存释放
|
{
|
cudaFree(apSrcImage[i]);
|
cudaFree(apDstImage[i]);
|
}
|
|
if (imgOrigin) cudaFree(imgOrigin);
|
if (imgResize) cudaFree(imgResize);
|
|
if (bgrOrigin) cudaFreeHost(bgrOrigin);
|
if (bgrScale) cudaFreeHost(bgrScale);
|
|
if (originBGR) nppiFree(originBGR);
|
if (resizedBGR) nppiFree(resizedBGR);
|
if (hostResizedBGR) cudaFreeHost(hostResizedBGR);
|
|
if (nv12) free(nv12);
|
}
|
|
int convertor::fill_yuv(const unsigned char *yuv){
|
init_yuv();
|
int ret = set_data((uint8_t*)yuv, width, height, mY, mU, mV);
|
if (ret < 0) return ret;
|
|
setGPUDevice(gpu_index);
|
|
NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[0], mY, aSrcPitch[0] * height, cudaMemcpyHostToDevice));
|
NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[1], mU, aSrcPitch[1] * height / 2, cudaMemcpyHostToDevice));
|
NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[2], mV, aSrcPitch[2] * height / 2, cudaMemcpyHostToDevice));
|
return 0;
|
}
|
|
int convertor::yuv2bgr(unsigned char **bgr, int *bgrLen){
|
|
*bgr = NULL;
|
*bgrLen = 0;
|
|
setGPUDevice(gpu_index);
|
|
NPP_CHECK_NPP(nppiYUV420ToBGR_8u_P3C3R(apSrcImage, aSrcImageStep, imgOrigin, pitchOrigin, sizeOrigin));
|
|
NPP_CHECK_CUDA(cudaMemcpy2D(bgrOrigin, bgrOriginPitch, imgOrigin, pitchOrigin, bgrOriginPitch, height, cudaMemcpyDeviceToHost));
|
*bgr = bgrOrigin;
|
*bgrLen = bgrOriginLen;
|
|
return 0;
|
}
|
|
int convertor::resize2bgr(unsigned char *in, unsigned char **data, int *data_len){
|
*data = NULL;
|
*data_len = 0;
|
|
if ((rWidth < 0 && rHeight < 0) || (rWidth > width && rHeight > height)){
|
return -1;
|
}
|
|
setGPUDevice(gpu_index);
|
|
if (!in){
|
|
init_resize();
|
|
NppiSize oDstImageSize;
|
oDstImageSize.width = std::max(1, rWidth);
|
oDstImageSize.height = std::max(1, rHeight);
|
for (int i = 0; i < 3; ++i)
|
{
|
NppiSize oBlocksPerMCU = { aSamplingFactors[i] & 0x0f, aSamplingFactors[i] >> 4};
|
NppiSize oSrcImageSize = {(width * oBlocksPerMCU.width) / nMCUBlocksH, (height * oBlocksPerMCU.height)/nMCUBlocksV};
|
NppiRect oSrcImageROI = {0,0,oSrcImageSize.width, oSrcImageSize.height};
|
NppiRect oDstImageROI;
|
NppiInterpolationMode eInterploationMode = NPPI_INTER_SUPER;
|
NPP_CHECK_NPP(nppiGetResizeRect(oSrcImageROI, &oDstImageROI,
|
fx,
|
fy,
|
0.0, 0.0, eInterploationMode));
|
NPP_CHECK_NPP(nppiResizeSqrPixel_8u_C1R(apSrcImage[i], oSrcImageSize, aSrcImageStep[i], oSrcImageROI,
|
apDstImage[i], aDstImageStep[i], oDstImageROI ,
|
fx,
|
fy,
|
0.0, 0.0, eInterploationMode));
|
}
|
NPP_CHECK_NPP(nppiYUV420ToBGR_8u_P3C3R(apDstImage, aDstImageStep, imgResize, pitchResize, sizeResize));
|
NPP_CHECK_CUDA(cudaMemcpy2D(bgrScale, bgrScalePitch, imgResize, pitchResize, bgrScalePitch, rHeight, cudaMemcpyDeviceToHost));
|
*data = bgrScale;
|
*data_len = bgrScaleLen;
|
}else{
|
|
init_resize_bgr();
|
|
NppiSize oSrcSize;
|
oSrcSize.width = width;
|
oSrcSize.height = height;
|
|
NPP_CHECK_CUDA(cudaMemcpy2D(originBGR, pitchOriginBGR, in, width*3, width*3, height, cudaMemcpyHostToDevice));
|
|
NppiRect oSrcROI;
|
oSrcROI.x = 0;
|
oSrcROI.y = 0;
|
oSrcROI.width = width;
|
oSrcROI.height = height;
|
|
|
NppiRect oDstROI;
|
oDstROI.x = 0;
|
oDstROI.y = 0;
|
oDstROI.width = rWidth;
|
oDstROI.height = rHeight;
|
|
// Scale Factor
|
double nXFactor = double(oDstROI.width) / double(oSrcROI.width);
|
double nYFactor = double(oDstROI.height) / double(oSrcROI.height);
|
|
// Scaled X/Y Shift
|
double nXShift = - oSrcROI.x * nXFactor ;
|
double nYShift = - oSrcROI.y * nYFactor;
|
int eInterpolation = NPPI_INTER_SUPER;
|
if (nXFactor >= 1.f || nYFactor >= 1.f)
|
eInterpolation = NPPI_INTER_LANCZOS;
|
|
NppStatus ret = nppiResizeSqrPixel_8u_C3R(originBGR, oSrcSize, pitchOriginBGR, oSrcROI,
|
resizedBGR, pitchResizedBGR, oDstROI, nXFactor, nYFactor, nXShift, nYShift, eInterpolation );
|
|
if(ret != NPP_SUCCESS) {
|
printf("imageResize_8u_C3R failed %d.\n", ret);
|
return -2;
|
}
|
size_t pitch = rWidth * 3;
|
*data_len = pitch * rHeight;
|
NPP_CHECK_CUDA(cudaMemcpy2D(hostResizedBGR, pitch, resizedBGR, pitchResizedBGR, pitch, rHeight, cudaMemcpyDeviceToHost));
|
*data = hostResizedBGR;
|
}
|
return 0;
|
}
|
|
static int nv12_nearest_scale(uint8_t* __restrict src, uint8_t* __restrict dst,
|
int srcWidth, int srcHeight, int dstWidth, int dstHeight)
|
{
|
register int sw = srcWidth; //register keyword is for local var to accelorate
|
register int sh = srcHeight;
|
register int dw = dstWidth;
|
register int dh = dstHeight;
|
register int y, x;
|
unsigned long int srcy, srcx, src_index, dst_index;
|
unsigned long int xrIntFloat_16 = (sw << 16) / dw + 1; //better than float division
|
unsigned long int yrIntFloat_16 = (sh << 16) / dh + 1;
|
|
uint8_t* dst_uv = dst + dh * dw; //memory start pointer of dest uv
|
uint8_t* src_uv = src + sh * sw; //memory start pointer of source uv
|
uint8_t* dst_uv_yScanline;
|
uint8_t* src_uv_yScanline;
|
uint8_t* dst_y_slice = dst; //memory start pointer of dest y
|
uint8_t* src_y_slice;
|
uint8_t* sp;
|
uint8_t* dp;
|
|
for (y = 0; y < (dh & ~7); ++y) //'dh & ~7' is to generate faster assembly code
|
{
|
srcy = (y * yrIntFloat_16) >> 16;
|
src_y_slice = src + srcy * sw;
|
|
if((y & 1) == 0)
|
{
|
dst_uv_yScanline = dst_uv + (y / 2) * dw;
|
src_uv_yScanline = src_uv + (srcy / 2) * sw;
|
}
|
|
for(x = 0; x < (dw & ~7); ++x)
|
{
|
srcx = (x * xrIntFloat_16) >> 16;
|
dst_y_slice[x] = src_y_slice[srcx];
|
|
if((y & 1) == 0) //y is even
|
{
|
if((x & 1) == 0) //x is even
|
{
|
src_index = (srcx / 2) * 2;
|
|
sp = dst_uv_yScanline + x;
|
dp = src_uv_yScanline + src_index;
|
*sp = *dp;
|
++sp;
|
++dp;
|
*sp = *dp;
|
}
|
}
|
}
|
dst_y_slice += dw;
|
}
|
return 0;
|
}
|
|
int convertor::resizeyuv(unsigned char *in, unsigned char **data, int *data_len){
|
|
init_resize_yuv();
|
|
*data_len = rWidth*rHeight*3/2;
|
*data = nv12;
|
|
return nv12_nearest_scale(in, nv12, width, height, rWidth, rHeight);
|
}
|
|
convHandle conv_create(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu){
|
if (gpu < 0) return NULL;
|
|
convertor *conv = new convertor(srcW, srcH, dstW, dstH, gpu);
|
return conv;
|
}
|
|
void conv_destroy(convHandle h){
|
if (!h) return;
|
convertor *conv = (convertor*)h;
|
delete conv;
|
}
|
|
int yuv2bgrandresize(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen, unsigned char **scaleBGR, int *scaleBGRLen){
|
if (!h) return -2;
|
convertor *conv = (convertor*)h;
|
int ret = conv->fill_yuv((unsigned char*)yuv);
|
if (ret != 0) return ret;
|
ret = conv->yuv2bgr(bgr, bgrLen);
|
if (ret != 0) return ret;
|
ret = conv->resize2bgr(NULL, scaleBGR, scaleBGRLen);
|
return ret;
|
}
|
|
int yuv2bgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen){
|
if (!h) return -2;
|
convertor *conv = (convertor*)h;
|
int ret = conv->fill_yuv((unsigned char*)yuv);
|
if (ret != 0) return ret;
|
return conv->yuv2bgr(bgr, bgrLen);
|
}
|
|
int yuv2resizedbgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen){
|
if (!h) return -2;
|
convertor *conv = (convertor*)h;
|
int ret = conv->fill_yuv((unsigned char*)yuv);
|
if (ret != 0) return ret;
|
ret = conv->resize2bgr(NULL, bgr, bgrLen);
|
return ret;
|
}
|
|
int resizebgr(convHandle h, void *data, unsigned char **resized, int *len){
|
if (!h) return -2;
|
convertor *conv = (convertor*)h;
|
return conv->resize2bgr((unsigned char*)data, resized, len);
|
}
|
|
int resizeyuv(convHandle h, void *data, unsigned char **resized, int *len){
|
if (!h) return -2;
|
convertor *conv = (convertor*)h;
|
return conv->resizeyuv((unsigned char*)data, resized, len);
|
}
|