From a10f7bf10c195421b089cda0c13f2195c614cb80 Mon Sep 17 00:00:00 2001
From: zhangmeng <775834166@qq.com>
Date: 星期一, 06 一月 2020 14:45:04 +0800
Subject: [PATCH] remove .so

---
 goconv/conv.cpp |  592 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 592 insertions(+), 0 deletions(-)

diff --git a/goconv/conv.cpp b/goconv/conv.cpp
new file mode 100644
index 0000000..098a396
--- /dev/null
+++ b/goconv/conv.cpp
@@ -0,0 +1,592 @@
+#include "conv.h"
+
+#include <cmath>
+#include <sys/time.h>
+
+#include <npp.h>
+#include <helper_cuda.h>
+#include <helper_string.h>
+#include "Exceptions.h"
+
+
+static const int MEMORY_ALGN_DEVICE = 511;
+static const int HD_MEMORY_ALGN_DEVICE = 511;
+
+static inline int DivUp(int x, int d)
+{
+    return (x + d - 1) / d;
+}
+
+static int set_data(uint8_t *data, const int width, const int height, unsigned char *mY, unsigned char *mU, unsigned char *mV)
+{
+    uint8_t* yuv_data = (uint8_t*)data;
+    if (!yuv_data)
+    {
+        return -1;
+    }
+
+    uint32_t    i, j;
+    uint32_t    off;
+    uint32_t    off_yuv;
+    uint32_t    half_h;
+    uint32_t    half_w;
+    uint32_t    u_size;
+    uint8_t*    yuv_ptr;
+    uint8_t*    u_ptr;
+    uint8_t*    v_ptr;
+
+    int w = width;
+    int h = height;
+
+    //浠庤繖涓�鍙ユ潵鐪嬶紝鍗充娇鏄悓涓�绉嶆牸寮忥紝杩涙潵涔熻澶勭悊涓�涓嬨��
+    size_t nPitch  = (w + HD_MEMORY_ALGN_DEVICE) & ~HD_MEMORY_ALGN_DEVICE;
+    off     = 0;
+    off_yuv = 0;
+    for (i = 0; i < (uint32_t)h; i++)
+    {
+        memcpy(mY + off, yuv_data + off_yuv, w);
+        off     += nPitch;
+        off_yuv += w;
+    }
+
+    half_w = w >> 1;
+    half_h = h >> 1;
+    u_size = half_w * half_h;
+    nPitch = (half_w + HD_MEMORY_ALGN_DEVICE) & ~HD_MEMORY_ALGN_DEVICE;
+    
+    off_yuv = w * h;
+    off = 0;
+    for (i = 0; i < half_h; i++)
+    {  
+        yuv_ptr = yuv_data + off_yuv;
+        u_ptr = mU + off;
+        v_ptr = mV + off;
+        for (j = 0; j < (uint32_t)w; j += 2)
+        {
+            *u_ptr++ = *yuv_ptr++;
+            *v_ptr++ = *yuv_ptr++;
+        }
+        off_yuv += w;
+        off += nPitch;
+    }
+
+    return 0;
+}
+
+/////////////handle
+class convertor{
+public: 
+    convertor(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu);
+    ~convertor();
+    int yuv2bgr(unsigned char **bgr, int *bgrLen);
+    int resize2bgr(unsigned char *in, unsigned char **data, int *data_len);
+    int resizeyuv(unsigned char *in, unsigned char **data, int *data_len);
+    int fill_yuv(const unsigned char *yuv);
+private: 
+    void init_yuv();
+    void init_resize();
+    void init_resize_bgr();
+    void init_resize_yuv();
+private: 
+    int width;
+    int height;
+
+    unsigned char aSamplingFactors[3];
+    int nMCUBlocksH;
+    int nMCUBlocksV;
+
+    Npp8u   *apSrcImage[3];
+    NppiSize aSrcSize[3];
+    Npp32s   aSrcImageStep[3];
+    size_t   aSrcPitch[3];
+
+    uint8_t *mY;
+    uint8_t *mU;
+    uint8_t *mV;
+
+///////////////////////////
+    int rWidth;
+    int rHeight;
+    float fx;
+    float fy;
+
+    Npp8u *apDstImage[3] = {0,0,0};
+    Npp32s aDstImageStep[3];
+    NppiSize aDstSize[3];
+
+/////////////////////////////
+    Npp8u *imgOrigin;
+    size_t pitchOrigin;
+    NppiSize sizeOrigin;
+
+    unsigned char *bgrOrigin;
+    int bgrOriginLen;
+    size_t bgrOriginPitch;
+
+////////////////////////////
+    Npp8u *imgResize;
+    size_t pitchResize;
+    NppiSize sizeResize;
+
+    unsigned char *bgrScale;
+    int bgrScaleLen;
+    size_t bgrScalePitch;
+
+// resize only
+////////////////////////////
+    Npp8u *originBGR;
+    int pitchOriginBGR;
+    Npp8u *resizedBGR;
+    int pitchResizedBGR;
+    unsigned char *hostResizedBGR;
+
+///////////////////////////
+    unsigned char *nv12;
+
+    bool initialized_yuv, initialized_resize, initialized_resize_bgr, initialized_resize_yuv;
+    int gpu_index;
+};
+
+
+convertor::convertor(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu)
+:width(srcW)
+,height(srcH)
+,rWidth(dstW) 
+,rHeight(dstH)
+,fx(-1)
+,fy(-1)
+,mY(NULL)
+,mU(NULL)
+,mV(NULL)
+,imgOrigin(0)
+,imgResize(0)
+,bgrOrigin(NULL)
+,bgrOriginLen(0)
+,bgrScale(NULL)
+,bgrScaleLen(0)
+,originBGR(0)
+,pitchOriginBGR(0)
+,resizedBGR(0)
+,pitchResizedBGR(0)
+,hostResizedBGR(NULL)
+,nv12(NULL)
+,initialized_yuv(false)
+,initialized_resize(false)
+,initialized_resize_bgr(false)
+,initialized_resize_yuv(false)
+,gpu_index(gpu)
+{}
+
+static void setGPUDevice(const int gpu){
+    if (gpu >= 0){
+        cudaSetDevice(gpu);
+    }
+}
+
+void convertor::init_yuv(){
+    if (initialized_yuv) return;
+    initialized_yuv = true;
+
+    setGPUDevice(gpu_index);
+
+    for(int i = 0; i < 3; i++){
+        apSrcImage[i] = 0;
+        apDstImage[i] = 0;
+    }
+
+    aSamplingFactors[0] = 34;
+    aSamplingFactors[1] = 17;
+    aSamplingFactors[2] = 17;
+
+    nMCUBlocksH = 0;
+    nMCUBlocksV = 0;
+
+    for (int i = 0; i < 3; ++i)
+    {
+        nMCUBlocksV = std::max(nMCUBlocksV, aSamplingFactors[i] & 0x0f);
+        nMCUBlocksH = std::max(nMCUBlocksH, aSamplingFactors[i] >> 4);
+    }
+
+    for (int i = 0; i < 3; ++i)
+    {
+        NppiSize oBlocks;
+        NppiSize oBlocksPerMCU = { aSamplingFactors[i] >> 4, aSamplingFactors[i] & 0x0f };
+
+        oBlocks.width = (int)ceil((width   + 7) / 8 *
+            static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksH);
+        oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;
+
+        oBlocks.height = (int)ceil((height + 7) / 8 *
+            static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksV);
+        oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;
+
+        aSrcSize[i].width  = oBlocks.width  * 8;
+        aSrcSize[i].height = oBlocks.height * 8;
+
+        // Allocate Memory
+        size_t nPitch;
+        NPP_CHECK_CUDA(cudaMallocPitch((void**)&(apSrcImage[i]), &nPitch, aSrcSize[i].width, aSrcSize[i].height));
+        aSrcPitch[i] = nPitch;
+        aSrcImageStep[i] = static_cast<Npp32s>(nPitch);
+    }
+
+    NPP_CHECK_CUDA(cudaMallocPitch((void**)&imgOrigin, &pitchOrigin, width * 3, height));
+
+    bgrOriginPitch = width * 3;
+    bgrOriginLen = bgrOriginPitch * height;
+    NPP_CHECK_CUDA(cudaHostAlloc((void**)&bgrOrigin, bgrOriginLen, cudaHostAllocDefault));
+
+    sizeOrigin.width = width;
+    sizeOrigin.height = height;
+
+    uint32_t nPitch = (width + MEMORY_ALGN_DEVICE) & ~MEMORY_ALGN_DEVICE;
+    NPP_CHECK_CUDA(cudaHostAlloc((void**)&mY, nPitch * height, cudaHostAllocDefault));
+    nPitch = (width/2 + MEMORY_ALGN_DEVICE) & ~MEMORY_ALGN_DEVICE;
+    NPP_CHECK_CUDA(cudaHostAlloc((void**)&mU, nPitch * height / 2, cudaHostAllocDefault));
+    NPP_CHECK_CUDA(cudaHostAlloc((void**)&mV, nPitch * height / 2, cudaHostAllocDefault));
+
+}
+
+void convertor::init_resize(){
+    if (initialized_resize) return;
+    initialized_resize = true;
+
+    setGPUDevice(gpu_index);
+
+    NppiSize oDstImageSize;
+    oDstImageSize.width  = std::max(1, rWidth);
+    oDstImageSize.height = std::max(1, rHeight);
+    
+    sizeResize.width = oDstImageSize.width; 
+    sizeResize.height = oDstImageSize.height;
+
+    for (int i=0; i < 3; ++i)
+    {
+        NppiSize oBlocks;
+        NppiSize oBlocksPerMCU = { aSamplingFactors[i] & 0x0f, aSamplingFactors[i] >> 4};
+
+        oBlocks.width = (int)ceil((oDstImageSize.width + 7)/8  *
+                                  static_cast<float>(oBlocksPerMCU.width)/nMCUBlocksH);
+        oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;
+
+        oBlocks.height = (int)ceil((oDstImageSize.height+7)/8 *
+                                   static_cast<float>(oBlocksPerMCU.height)/nMCUBlocksV);
+        oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;
+
+        aDstSize[i].width = oBlocks.width * 8;
+        aDstSize[i].height = oBlocks.height * 8;
+
+        // Allocate Memory
+        size_t nPitch;
+        NPP_CHECK_CUDA(cudaMallocPitch((void**)&apDstImage[i], &nPitch, aDstSize[i].width, aDstSize[i].height));
+        aDstImageStep[i] = static_cast<Npp32s>(nPitch);
+    }
+
+    if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){
+        fx = (float)(rWidth) / (float)(width);
+        fy = (float)(rHeight) / (float)(height);
+    }
+
+    if (imgResize == 0){
+        if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){
+            NPP_CHECK_CUDA(cudaMallocPitch((void**)&imgResize, &pitchResize, rWidth * 3, rHeight));
+        }
+    }
+    if (!bgrScale){
+        if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){
+            bgrScalePitch = rWidth * 3;
+            bgrScaleLen = bgrScalePitch * rHeight;
+            NPP_CHECK_CUDA(cudaHostAlloc((void**)&bgrScale, bgrScaleLen, cudaHostAllocDefault));
+        }
+    }
+}
+
+void convertor::init_resize_bgr(){
+    if (initialized_resize_bgr) return;
+    initialized_resize_bgr = true;
+
+    setGPUDevice(gpu_index);
+    if (originBGR == 0){
+        originBGR = nppiMalloc_8u_C3(width, height, &pitchOriginBGR);
+    }
+    if (resizedBGR == 0){
+        resizedBGR = nppiMalloc_8u_C3(rWidth, rHeight, &pitchResizedBGR);
+    }
+    if (hostResizedBGR == NULL){
+        NPP_CHECK_CUDA(cudaHostAlloc((void**)&hostResizedBGR, rWidth * 3 * rHeight, cudaHostAllocDefault));
+    }
+}
+
+void convertor::init_resize_yuv(){
+    if (initialized_resize_yuv) return;
+    initialized_resize_yuv = true;
+
+    if (rWidth > 0 && rHeight > 0){
+        fx = (float)(width) / (float)(rWidth);
+        fy = (float)(height) / (float)(rHeight);
+    }
+
+    nv12 = (unsigned char*)malloc(rWidth*rHeight*3/2);
+}
+
+convertor::~convertor(){
+    setGPUDevice(gpu_index);
+
+    if(mY) cudaFreeHost(mY);
+    if(mU) cudaFreeHost(mU);
+    if(mV) cudaFreeHost(mV);
+
+    for (int i = 0; i < 3; ++i)//鍐呭瓨閲婃斁
+    {
+    	cudaFree(apSrcImage[i]);
+        cudaFree(apDstImage[i]);
+    }
+
+    if (imgOrigin) cudaFree(imgOrigin);
+    if (imgResize) cudaFree(imgResize);
+
+    if (bgrOrigin) cudaFreeHost(bgrOrigin);
+    if (bgrScale) cudaFreeHost(bgrScale);
+
+    if (originBGR) nppiFree(originBGR);
+    if (resizedBGR) nppiFree(resizedBGR);
+    if (hostResizedBGR) cudaFreeHost(hostResizedBGR);
+
+    if (nv12) free(nv12);
+}
+
+int convertor::fill_yuv(const unsigned char *yuv){
+    init_yuv();
+    int ret = set_data((uint8_t*)yuv, width, height, mY, mU, mV);
+    if (ret < 0) return ret;
+
+    setGPUDevice(gpu_index);
+
+    NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[0], mY, aSrcPitch[0] * height,     cudaMemcpyHostToDevice));
+    NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[1], mU, aSrcPitch[1] * height / 2, cudaMemcpyHostToDevice));
+    NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[2], mV, aSrcPitch[2] * height / 2, cudaMemcpyHostToDevice));
+    return 0;
+}
+
+int convertor::yuv2bgr(unsigned char **bgr, int *bgrLen){
+
+    *bgr = NULL;
+    *bgrLen = 0;
+
+    setGPUDevice(gpu_index);
+
+    NPP_CHECK_NPP(nppiYUV420ToBGR_8u_P3C3R(apSrcImage, aSrcImageStep, imgOrigin, pitchOrigin, sizeOrigin));
+
+    NPP_CHECK_CUDA(cudaMemcpy2D(bgrOrigin, bgrOriginPitch, imgOrigin, pitchOrigin, bgrOriginPitch, height, cudaMemcpyDeviceToHost));
+    *bgr = bgrOrigin;
+    *bgrLen = bgrOriginLen;
+
+    return 0;
+}
+
+int convertor::resize2bgr(unsigned char *in, unsigned char **data, int *data_len){
+    *data = NULL;
+    *data_len = 0;
+
+    if ((rWidth < 0 && rHeight < 0) || (rWidth > width && rHeight > height)){
+        return -1;
+    }
+
+    setGPUDevice(gpu_index);
+
+    if (!in){
+
+        init_resize();
+
+        NppiSize oDstImageSize;
+        oDstImageSize.width  = std::max(1, rWidth);
+        oDstImageSize.height = std::max(1, rHeight);
+        for (int i = 0; i < 3; ++i)
+        {
+            NppiSize oBlocksPerMCU = { aSamplingFactors[i] & 0x0f, aSamplingFactors[i] >> 4};
+            NppiSize oSrcImageSize = {(width * oBlocksPerMCU.width) / nMCUBlocksH, (height * oBlocksPerMCU.height)/nMCUBlocksV};
+            NppiRect oSrcImageROI = {0,0,oSrcImageSize.width, oSrcImageSize.height};
+            NppiRect oDstImageROI;
+            NppiInterpolationMode eInterploationMode = NPPI_INTER_SUPER;
+            NPP_CHECK_NPP(nppiGetResizeRect(oSrcImageROI, &oDstImageROI,
+                                            fx,
+                                            fy,
+                                            0.0, 0.0, eInterploationMode));
+            NPP_CHECK_NPP(nppiResizeSqrPixel_8u_C1R(apSrcImage[i], oSrcImageSize, aSrcImageStep[i], oSrcImageROI,
+                                                    apDstImage[i], aDstImageStep[i], oDstImageROI ,
+                                                    fx,
+                                                    fy,
+                                                    0.0, 0.0, eInterploationMode));
+        }
+        NPP_CHECK_NPP(nppiYUV420ToBGR_8u_P3C3R(apDstImage, aDstImageStep, imgResize, pitchResize, sizeResize));
+        NPP_CHECK_CUDA(cudaMemcpy2D(bgrScale, bgrScalePitch, imgResize, pitchResize, bgrScalePitch, rHeight, cudaMemcpyDeviceToHost));
+        *data = bgrScale;
+        *data_len = bgrScaleLen;
+    }else{
+        
+        init_resize_bgr();
+
+        NppiSize oSrcSize;
+        oSrcSize.width = width;
+        oSrcSize.height = height;
+
+        NPP_CHECK_CUDA(cudaMemcpy2D(originBGR, pitchOriginBGR, in, width*3, width*3, height, cudaMemcpyHostToDevice));
+
+        NppiRect oSrcROI;
+        oSrcROI.x = 0;
+        oSrcROI.y = 0;
+        oSrcROI.width = width;
+        oSrcROI.height = height;
+
+
+        NppiRect oDstROI;
+        oDstROI.x = 0;
+        oDstROI.y = 0;
+        oDstROI.width = rWidth;
+        oDstROI.height = rHeight;
+
+        // Scale Factor
+        double nXFactor = double(oDstROI.width) / double(oSrcROI.width);
+        double nYFactor = double(oDstROI.height) / double(oSrcROI.height);
+
+        // Scaled X/Y  Shift
+        double nXShift = - oSrcROI.x * nXFactor ;
+        double nYShift = - oSrcROI.y * nYFactor;
+        int eInterpolation = NPPI_INTER_SUPER;
+        if (nXFactor >= 1.f || nYFactor >= 1.f)
+            eInterpolation = NPPI_INTER_LANCZOS;
+
+        NppStatus ret = nppiResizeSqrPixel_8u_C3R(originBGR, oSrcSize, pitchOriginBGR, oSrcROI, 
+            resizedBGR, pitchResizedBGR, oDstROI, nXFactor, nYFactor, nXShift, nYShift, eInterpolation );
+
+        if(ret != NPP_SUCCESS) {
+            printf("imageResize_8u_C3R failed %d.\n", ret);
+            return -2;
+        }
+        size_t pitch = rWidth * 3;
+        *data_len = pitch * rHeight;
+        NPP_CHECK_CUDA(cudaMemcpy2D(hostResizedBGR, pitch, resizedBGR, pitchResizedBGR, pitch, rHeight, cudaMemcpyDeviceToHost));
+        *data = hostResizedBGR;
+    }
+    return 0;
+}
+
+static int nv12_nearest_scale(uint8_t* __restrict src, uint8_t* __restrict dst,   
+                        int srcWidth, int srcHeight, int dstWidth, int dstHeight)
+{   
+    register int sw = srcWidth;  //register keyword is for local var to accelorate    
+    register int sh = srcHeight;   
+    register int dw = dstWidth;   
+    register int dh = dstHeight;   
+    register int y, x;   
+    unsigned long int srcy, srcx, src_index, dst_index;   
+    unsigned long int xrIntFloat_16 = (sw << 16) / dw + 1; //better than float division   
+    unsigned long int yrIntFloat_16 = (sh << 16) / dh + 1;   
+   
+    uint8_t* dst_uv = dst + dh * dw; //memory start pointer of dest uv   
+    uint8_t* src_uv = src + sh * sw; //memory start pointer of source uv   
+    uint8_t* dst_uv_yScanline;   
+    uint8_t* src_uv_yScanline;   
+    uint8_t* dst_y_slice = dst; //memory start pointer of dest y   
+    uint8_t* src_y_slice;   
+    uint8_t* sp;   
+    uint8_t* dp;   
+    
+    for (y = 0; y < (dh & ~7); ++y)  //'dh & ~7' is to generate faster assembly code   
+    {   
+        srcy = (y * yrIntFloat_16) >> 16;   
+        src_y_slice = src + srcy * sw;   
+   
+        if((y & 1) == 0)   
+        {   
+            dst_uv_yScanline = dst_uv + (y / 2) * dw;   
+            src_uv_yScanline = src_uv + (srcy / 2) * sw;   
+        }   
+   
+        for(x = 0; x < (dw & ~7); ++x)   
+        {   
+            srcx = (x * xrIntFloat_16) >> 16;   
+            dst_y_slice[x] = src_y_slice[srcx];   
+   
+            if((y & 1) == 0) //y is even   
+            {   
+                if((x & 1) == 0) //x is even   
+                {   
+                    src_index = (srcx / 2) * 2;   
+               
+                    sp = dst_uv_yScanline + x;   
+                    dp = src_uv_yScanline + src_index;   
+                    *sp = *dp;   
+                    ++sp;   
+                    ++dp;   
+                    *sp = *dp;   
+                }   
+             }   
+         }   
+         dst_y_slice += dw;   
+    }
+    return 0;
+}
+
+int convertor::resizeyuv(unsigned char *in, unsigned char **data, int *data_len){
+        
+    init_resize_yuv();
+
+    *data_len = rWidth*rHeight*3/2;
+    *data = nv12;
+
+    return nv12_nearest_scale(in, nv12, width, height, rWidth, rHeight);
+}
+
+convHandle conv_create(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu){
+    if (gpu < 0) return NULL;
+
+    convertor *conv = new convertor(srcW, srcH, dstW, dstH, gpu);
+    return conv;
+}
+
+void conv_destroy(convHandle h){
+    if (!h) return;    
+    convertor *conv = (convertor*)h;
+    delete conv;
+}
+
+int yuv2bgrandresize(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen, unsigned char **scaleBGR, int *scaleBGRLen){
+    if (!h) return -2;
+    convertor *conv = (convertor*)h;
+    int ret = conv->fill_yuv((unsigned char*)yuv);
+    if (ret != 0) return ret;
+    ret = conv->yuv2bgr(bgr, bgrLen);
+    if (ret != 0) return ret;
+    ret = conv->resize2bgr(NULL, scaleBGR, scaleBGRLen);
+    return ret;
+}
+
+int yuv2bgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen){
+    if (!h) return -2;
+    convertor *conv = (convertor*)h;
+    int ret = conv->fill_yuv((unsigned char*)yuv);
+    if (ret != 0) return ret;
+    return conv->yuv2bgr(bgr, bgrLen);
+}
+
+int yuv2resizedbgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen){
+    if (!h) return -2;
+    convertor *conv = (convertor*)h;
+    int ret = conv->fill_yuv((unsigned char*)yuv);
+    if (ret != 0) return ret;
+    ret = conv->resize2bgr(NULL, bgr, bgrLen);
+    return ret;
+}
+
+int resizebgr(convHandle h, void *data, unsigned char **resized, int *len){
+    if (!h) return -2;
+    convertor *conv = (convertor*)h;
+    return conv->resize2bgr((unsigned char*)data, resized, len);
+}
+
+int resizeyuv(convHandle h, void *data, unsigned char **resized, int *len){
+    if (!h) return -2;
+    convertor *conv = (convertor*)h;
+    return conv->resizeyuv((unsigned char*)data, resized, len);
+}

--
Gitblit v1.8.0