From a10f7bf10c195421b089cda0c13f2195c614cb80 Mon Sep 17 00:00:00 2001 From: zhangmeng <775834166@qq.com> Date: 星期一, 06 一月 2020 14:45:04 +0800 Subject: [PATCH] remove .so --- goconv/conv.cpp | 592 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 592 insertions(+), 0 deletions(-) diff --git a/goconv/conv.cpp b/goconv/conv.cpp new file mode 100644 index 0000000..098a396 --- /dev/null +++ b/goconv/conv.cpp @@ -0,0 +1,592 @@ +#include "conv.h" + +#include <cmath> +#include <sys/time.h> + +#include <npp.h> +#include <helper_cuda.h> +#include <helper_string.h> +#include "Exceptions.h" + + +static const int MEMORY_ALGN_DEVICE = 511; +static const int HD_MEMORY_ALGN_DEVICE = 511; + +static inline int DivUp(int x, int d) +{ + return (x + d - 1) / d; +} + +static int set_data(uint8_t *data, const int width, const int height, unsigned char *mY, unsigned char *mU, unsigned char *mV) +{ + uint8_t* yuv_data = (uint8_t*)data; + if (!yuv_data) + { + return -1; + } + + uint32_t i, j; + uint32_t off; + uint32_t off_yuv; + uint32_t half_h; + uint32_t half_w; + uint32_t u_size; + uint8_t* yuv_ptr; + uint8_t* u_ptr; + uint8_t* v_ptr; + + int w = width; + int h = height; + + //浠庤繖涓�鍙ユ潵鐪嬶紝鍗充娇鏄悓涓�绉嶆牸寮忥紝杩涙潵涔熻澶勭悊涓�涓嬨�� + size_t nPitch = (w + HD_MEMORY_ALGN_DEVICE) & ~HD_MEMORY_ALGN_DEVICE; + off = 0; + off_yuv = 0; + for (i = 0; i < (uint32_t)h; i++) + { + memcpy(mY + off, yuv_data + off_yuv, w); + off += nPitch; + off_yuv += w; + } + + half_w = w >> 1; + half_h = h >> 1; + u_size = half_w * half_h; + nPitch = (half_w + HD_MEMORY_ALGN_DEVICE) & ~HD_MEMORY_ALGN_DEVICE; + + off_yuv = w * h; + off = 0; + for (i = 0; i < half_h; i++) + { + yuv_ptr = yuv_data + off_yuv; + u_ptr = mU + off; + v_ptr = mV + off; + for (j = 0; j < (uint32_t)w; j += 2) + { + *u_ptr++ = *yuv_ptr++; + *v_ptr++ = *yuv_ptr++; + } + off_yuv += w; + off += nPitch; + } + + return 0; +} + +/////////////handle +class convertor{ +public: + convertor(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu); + ~convertor(); + int yuv2bgr(unsigned char **bgr, int *bgrLen); + int resize2bgr(unsigned char *in, unsigned char **data, int *data_len); + int resizeyuv(unsigned char *in, unsigned char **data, int *data_len); + int fill_yuv(const unsigned char *yuv); +private: + void init_yuv(); + void init_resize(); + void init_resize_bgr(); + void init_resize_yuv(); +private: + int width; + int height; + + unsigned char aSamplingFactors[3]; + int nMCUBlocksH; + int nMCUBlocksV; + + Npp8u *apSrcImage[3]; + NppiSize aSrcSize[3]; + Npp32s aSrcImageStep[3]; + size_t aSrcPitch[3]; + + uint8_t *mY; + uint8_t *mU; + uint8_t *mV; + +/////////////////////////// + int rWidth; + int rHeight; + float fx; + float fy; + + Npp8u *apDstImage[3] = {0,0,0}; + Npp32s aDstImageStep[3]; + NppiSize aDstSize[3]; + +///////////////////////////// + Npp8u *imgOrigin; + size_t pitchOrigin; + NppiSize sizeOrigin; + + unsigned char *bgrOrigin; + int bgrOriginLen; + size_t bgrOriginPitch; + +//////////////////////////// + Npp8u *imgResize; + size_t pitchResize; + NppiSize sizeResize; + + unsigned char *bgrScale; + int bgrScaleLen; + size_t bgrScalePitch; + +// resize only +//////////////////////////// + Npp8u *originBGR; + int pitchOriginBGR; + Npp8u *resizedBGR; + int pitchResizedBGR; + unsigned char *hostResizedBGR; + +/////////////////////////// + unsigned char *nv12; + + bool initialized_yuv, initialized_resize, initialized_resize_bgr, initialized_resize_yuv; + int gpu_index; +}; + + +convertor::convertor(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu) +:width(srcW) +,height(srcH) +,rWidth(dstW) +,rHeight(dstH) +,fx(-1) +,fy(-1) +,mY(NULL) +,mU(NULL) +,mV(NULL) +,imgOrigin(0) +,imgResize(0) +,bgrOrigin(NULL) +,bgrOriginLen(0) +,bgrScale(NULL) +,bgrScaleLen(0) +,originBGR(0) +,pitchOriginBGR(0) +,resizedBGR(0) +,pitchResizedBGR(0) +,hostResizedBGR(NULL) +,nv12(NULL) +,initialized_yuv(false) +,initialized_resize(false) +,initialized_resize_bgr(false) +,initialized_resize_yuv(false) +,gpu_index(gpu) +{} + +static void setGPUDevice(const int gpu){ + if (gpu >= 0){ + cudaSetDevice(gpu); + } +} + +void convertor::init_yuv(){ + if (initialized_yuv) return; + initialized_yuv = true; + + setGPUDevice(gpu_index); + + for(int i = 0; i < 3; i++){ + apSrcImage[i] = 0; + apDstImage[i] = 0; + } + + aSamplingFactors[0] = 34; + aSamplingFactors[1] = 17; + aSamplingFactors[2] = 17; + + nMCUBlocksH = 0; + nMCUBlocksV = 0; + + for (int i = 0; i < 3; ++i) + { + nMCUBlocksV = std::max(nMCUBlocksV, aSamplingFactors[i] & 0x0f); + nMCUBlocksH = std::max(nMCUBlocksH, aSamplingFactors[i] >> 4); + } + + for (int i = 0; i < 3; ++i) + { + NppiSize oBlocks; + NppiSize oBlocksPerMCU = { aSamplingFactors[i] >> 4, aSamplingFactors[i] & 0x0f }; + + oBlocks.width = (int)ceil((width + 7) / 8 * + static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksH); + oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; + + oBlocks.height = (int)ceil((height + 7) / 8 * + static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksV); + oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height; + + aSrcSize[i].width = oBlocks.width * 8; + aSrcSize[i].height = oBlocks.height * 8; + + // Allocate Memory + size_t nPitch; + NPP_CHECK_CUDA(cudaMallocPitch((void**)&(apSrcImage[i]), &nPitch, aSrcSize[i].width, aSrcSize[i].height)); + aSrcPitch[i] = nPitch; + aSrcImageStep[i] = static_cast<Npp32s>(nPitch); + } + + NPP_CHECK_CUDA(cudaMallocPitch((void**)&imgOrigin, &pitchOrigin, width * 3, height)); + + bgrOriginPitch = width * 3; + bgrOriginLen = bgrOriginPitch * height; + NPP_CHECK_CUDA(cudaHostAlloc((void**)&bgrOrigin, bgrOriginLen, cudaHostAllocDefault)); + + sizeOrigin.width = width; + sizeOrigin.height = height; + + uint32_t nPitch = (width + MEMORY_ALGN_DEVICE) & ~MEMORY_ALGN_DEVICE; + NPP_CHECK_CUDA(cudaHostAlloc((void**)&mY, nPitch * height, cudaHostAllocDefault)); + nPitch = (width/2 + MEMORY_ALGN_DEVICE) & ~MEMORY_ALGN_DEVICE; + NPP_CHECK_CUDA(cudaHostAlloc((void**)&mU, nPitch * height / 2, cudaHostAllocDefault)); + NPP_CHECK_CUDA(cudaHostAlloc((void**)&mV, nPitch * height / 2, cudaHostAllocDefault)); + +} + +void convertor::init_resize(){ + if (initialized_resize) return; + initialized_resize = true; + + setGPUDevice(gpu_index); + + NppiSize oDstImageSize; + oDstImageSize.width = std::max(1, rWidth); + oDstImageSize.height = std::max(1, rHeight); + + sizeResize.width = oDstImageSize.width; + sizeResize.height = oDstImageSize.height; + + for (int i=0; i < 3; ++i) + { + NppiSize oBlocks; + NppiSize oBlocksPerMCU = { aSamplingFactors[i] & 0x0f, aSamplingFactors[i] >> 4}; + + oBlocks.width = (int)ceil((oDstImageSize.width + 7)/8 * + static_cast<float>(oBlocksPerMCU.width)/nMCUBlocksH); + oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; + + oBlocks.height = (int)ceil((oDstImageSize.height+7)/8 * + static_cast<float>(oBlocksPerMCU.height)/nMCUBlocksV); + oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height; + + aDstSize[i].width = oBlocks.width * 8; + aDstSize[i].height = oBlocks.height * 8; + + // Allocate Memory + size_t nPitch; + NPP_CHECK_CUDA(cudaMallocPitch((void**)&apDstImage[i], &nPitch, aDstSize[i].width, aDstSize[i].height)); + aDstImageStep[i] = static_cast<Npp32s>(nPitch); + } + + if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){ + fx = (float)(rWidth) / (float)(width); + fy = (float)(rHeight) / (float)(height); + } + + if (imgResize == 0){ + if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){ + NPP_CHECK_CUDA(cudaMallocPitch((void**)&imgResize, &pitchResize, rWidth * 3, rHeight)); + } + } + if (!bgrScale){ + if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){ + bgrScalePitch = rWidth * 3; + bgrScaleLen = bgrScalePitch * rHeight; + NPP_CHECK_CUDA(cudaHostAlloc((void**)&bgrScale, bgrScaleLen, cudaHostAllocDefault)); + } + } +} + +void convertor::init_resize_bgr(){ + if (initialized_resize_bgr) return; + initialized_resize_bgr = true; + + setGPUDevice(gpu_index); + if (originBGR == 0){ + originBGR = nppiMalloc_8u_C3(width, height, &pitchOriginBGR); + } + if (resizedBGR == 0){ + resizedBGR = nppiMalloc_8u_C3(rWidth, rHeight, &pitchResizedBGR); + } + if (hostResizedBGR == NULL){ + NPP_CHECK_CUDA(cudaHostAlloc((void**)&hostResizedBGR, rWidth * 3 * rHeight, cudaHostAllocDefault)); + } +} + +void convertor::init_resize_yuv(){ + if (initialized_resize_yuv) return; + initialized_resize_yuv = true; + + if (rWidth > 0 && rHeight > 0){ + fx = (float)(width) / (float)(rWidth); + fy = (float)(height) / (float)(rHeight); + } + + nv12 = (unsigned char*)malloc(rWidth*rHeight*3/2); +} + +convertor::~convertor(){ + setGPUDevice(gpu_index); + + if(mY) cudaFreeHost(mY); + if(mU) cudaFreeHost(mU); + if(mV) cudaFreeHost(mV); + + for (int i = 0; i < 3; ++i)//鍐呭瓨閲婃斁 + { + cudaFree(apSrcImage[i]); + cudaFree(apDstImage[i]); + } + + if (imgOrigin) cudaFree(imgOrigin); + if (imgResize) cudaFree(imgResize); + + if (bgrOrigin) cudaFreeHost(bgrOrigin); + if (bgrScale) cudaFreeHost(bgrScale); + + if (originBGR) nppiFree(originBGR); + if (resizedBGR) nppiFree(resizedBGR); + if (hostResizedBGR) cudaFreeHost(hostResizedBGR); + + if (nv12) free(nv12); +} + +int convertor::fill_yuv(const unsigned char *yuv){ + init_yuv(); + int ret = set_data((uint8_t*)yuv, width, height, mY, mU, mV); + if (ret < 0) return ret; + + setGPUDevice(gpu_index); + + NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[0], mY, aSrcPitch[0] * height, cudaMemcpyHostToDevice)); + NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[1], mU, aSrcPitch[1] * height / 2, cudaMemcpyHostToDevice)); + NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[2], mV, aSrcPitch[2] * height / 2, cudaMemcpyHostToDevice)); + return 0; +} + +int convertor::yuv2bgr(unsigned char **bgr, int *bgrLen){ + + *bgr = NULL; + *bgrLen = 0; + + setGPUDevice(gpu_index); + + NPP_CHECK_NPP(nppiYUV420ToBGR_8u_P3C3R(apSrcImage, aSrcImageStep, imgOrigin, pitchOrigin, sizeOrigin)); + + NPP_CHECK_CUDA(cudaMemcpy2D(bgrOrigin, bgrOriginPitch, imgOrigin, pitchOrigin, bgrOriginPitch, height, cudaMemcpyDeviceToHost)); + *bgr = bgrOrigin; + *bgrLen = bgrOriginLen; + + return 0; +} + +int convertor::resize2bgr(unsigned char *in, unsigned char **data, int *data_len){ + *data = NULL; + *data_len = 0; + + if ((rWidth < 0 && rHeight < 0) || (rWidth > width && rHeight > height)){ + return -1; + } + + setGPUDevice(gpu_index); + + if (!in){ + + init_resize(); + + NppiSize oDstImageSize; + oDstImageSize.width = std::max(1, rWidth); + oDstImageSize.height = std::max(1, rHeight); + for (int i = 0; i < 3; ++i) + { + NppiSize oBlocksPerMCU = { aSamplingFactors[i] & 0x0f, aSamplingFactors[i] >> 4}; + NppiSize oSrcImageSize = {(width * oBlocksPerMCU.width) / nMCUBlocksH, (height * oBlocksPerMCU.height)/nMCUBlocksV}; + NppiRect oSrcImageROI = {0,0,oSrcImageSize.width, oSrcImageSize.height}; + NppiRect oDstImageROI; + NppiInterpolationMode eInterploationMode = NPPI_INTER_SUPER; + NPP_CHECK_NPP(nppiGetResizeRect(oSrcImageROI, &oDstImageROI, + fx, + fy, + 0.0, 0.0, eInterploationMode)); + NPP_CHECK_NPP(nppiResizeSqrPixel_8u_C1R(apSrcImage[i], oSrcImageSize, aSrcImageStep[i], oSrcImageROI, + apDstImage[i], aDstImageStep[i], oDstImageROI , + fx, + fy, + 0.0, 0.0, eInterploationMode)); + } + NPP_CHECK_NPP(nppiYUV420ToBGR_8u_P3C3R(apDstImage, aDstImageStep, imgResize, pitchResize, sizeResize)); + NPP_CHECK_CUDA(cudaMemcpy2D(bgrScale, bgrScalePitch, imgResize, pitchResize, bgrScalePitch, rHeight, cudaMemcpyDeviceToHost)); + *data = bgrScale; + *data_len = bgrScaleLen; + }else{ + + init_resize_bgr(); + + NppiSize oSrcSize; + oSrcSize.width = width; + oSrcSize.height = height; + + NPP_CHECK_CUDA(cudaMemcpy2D(originBGR, pitchOriginBGR, in, width*3, width*3, height, cudaMemcpyHostToDevice)); + + NppiRect oSrcROI; + oSrcROI.x = 0; + oSrcROI.y = 0; + oSrcROI.width = width; + oSrcROI.height = height; + + + NppiRect oDstROI; + oDstROI.x = 0; + oDstROI.y = 0; + oDstROI.width = rWidth; + oDstROI.height = rHeight; + + // Scale Factor + double nXFactor = double(oDstROI.width) / double(oSrcROI.width); + double nYFactor = double(oDstROI.height) / double(oSrcROI.height); + + // Scaled X/Y Shift + double nXShift = - oSrcROI.x * nXFactor ; + double nYShift = - oSrcROI.y * nYFactor; + int eInterpolation = NPPI_INTER_SUPER; + if (nXFactor >= 1.f || nYFactor >= 1.f) + eInterpolation = NPPI_INTER_LANCZOS; + + NppStatus ret = nppiResizeSqrPixel_8u_C3R(originBGR, oSrcSize, pitchOriginBGR, oSrcROI, + resizedBGR, pitchResizedBGR, oDstROI, nXFactor, nYFactor, nXShift, nYShift, eInterpolation ); + + if(ret != NPP_SUCCESS) { + printf("imageResize_8u_C3R failed %d.\n", ret); + return -2; + } + size_t pitch = rWidth * 3; + *data_len = pitch * rHeight; + NPP_CHECK_CUDA(cudaMemcpy2D(hostResizedBGR, pitch, resizedBGR, pitchResizedBGR, pitch, rHeight, cudaMemcpyDeviceToHost)); + *data = hostResizedBGR; + } + return 0; +} + +static int nv12_nearest_scale(uint8_t* __restrict src, uint8_t* __restrict dst, + int srcWidth, int srcHeight, int dstWidth, int dstHeight) +{ + register int sw = srcWidth; //register keyword is for local var to accelorate + register int sh = srcHeight; + register int dw = dstWidth; + register int dh = dstHeight; + register int y, x; + unsigned long int srcy, srcx, src_index, dst_index; + unsigned long int xrIntFloat_16 = (sw << 16) / dw + 1; //better than float division + unsigned long int yrIntFloat_16 = (sh << 16) / dh + 1; + + uint8_t* dst_uv = dst + dh * dw; //memory start pointer of dest uv + uint8_t* src_uv = src + sh * sw; //memory start pointer of source uv + uint8_t* dst_uv_yScanline; + uint8_t* src_uv_yScanline; + uint8_t* dst_y_slice = dst; //memory start pointer of dest y + uint8_t* src_y_slice; + uint8_t* sp; + uint8_t* dp; + + for (y = 0; y < (dh & ~7); ++y) //'dh & ~7' is to generate faster assembly code + { + srcy = (y * yrIntFloat_16) >> 16; + src_y_slice = src + srcy * sw; + + if((y & 1) == 0) + { + dst_uv_yScanline = dst_uv + (y / 2) * dw; + src_uv_yScanline = src_uv + (srcy / 2) * sw; + } + + for(x = 0; x < (dw & ~7); ++x) + { + srcx = (x * xrIntFloat_16) >> 16; + dst_y_slice[x] = src_y_slice[srcx]; + + if((y & 1) == 0) //y is even + { + if((x & 1) == 0) //x is even + { + src_index = (srcx / 2) * 2; + + sp = dst_uv_yScanline + x; + dp = src_uv_yScanline + src_index; + *sp = *dp; + ++sp; + ++dp; + *sp = *dp; + } + } + } + dst_y_slice += dw; + } + return 0; +} + +int convertor::resizeyuv(unsigned char *in, unsigned char **data, int *data_len){ + + init_resize_yuv(); + + *data_len = rWidth*rHeight*3/2; + *data = nv12; + + return nv12_nearest_scale(in, nv12, width, height, rWidth, rHeight); +} + +convHandle conv_create(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu){ + if (gpu < 0) return NULL; + + convertor *conv = new convertor(srcW, srcH, dstW, dstH, gpu); + return conv; +} + +void conv_destroy(convHandle h){ + if (!h) return; + convertor *conv = (convertor*)h; + delete conv; +} + +int yuv2bgrandresize(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen, unsigned char **scaleBGR, int *scaleBGRLen){ + if (!h) return -2; + convertor *conv = (convertor*)h; + int ret = conv->fill_yuv((unsigned char*)yuv); + if (ret != 0) return ret; + ret = conv->yuv2bgr(bgr, bgrLen); + if (ret != 0) return ret; + ret = conv->resize2bgr(NULL, scaleBGR, scaleBGRLen); + return ret; +} + +int yuv2bgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen){ + if (!h) return -2; + convertor *conv = (convertor*)h; + int ret = conv->fill_yuv((unsigned char*)yuv); + if (ret != 0) return ret; + return conv->yuv2bgr(bgr, bgrLen); +} + +int yuv2resizedbgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen){ + if (!h) return -2; + convertor *conv = (convertor*)h; + int ret = conv->fill_yuv((unsigned char*)yuv); + if (ret != 0) return ret; + ret = conv->resize2bgr(NULL, bgr, bgrLen); + return ret; +} + +int resizebgr(convHandle h, void *data, unsigned char **resized, int *len){ + if (!h) return -2; + convertor *conv = (convertor*)h; + return conv->resize2bgr((unsigned char*)data, resized, len); +} + +int resizeyuv(convHandle h, void *data, unsigned char **resized, int *len){ + if (!h) return -2; + convertor *conv = (convertor*)h; + return conv->resizeyuv((unsigned char*)data, resized, len); +} -- Gitblit v1.8.0