#include "conv.h" #include #include #include #include #include #include "Exceptions.h" static const int MEMORY_ALGN_DEVICE = 511; static const int HD_MEMORY_ALGN_DEVICE = 511; static inline int DivUp(int x, int d) { return (x + d - 1) / d; } static int set_data(uint8_t *data, const int width, const int height, unsigned char *mY, unsigned char *mU, unsigned char *mV) { uint8_t* yuv_data = (uint8_t*)data; if (!yuv_data) { return -1; } uint32_t i, j; uint32_t off; uint32_t off_yuv; uint32_t half_h; uint32_t half_w; uint32_t u_size; uint8_t* yuv_ptr; uint8_t* u_ptr; uint8_t* v_ptr; int w = width; int h = height; //从这一句来看,即使是同一种格式,进来也要处理一下。 size_t nPitch = (w + HD_MEMORY_ALGN_DEVICE) & ~HD_MEMORY_ALGN_DEVICE; off = 0; off_yuv = 0; for (i = 0; i < (uint32_t)h; i++) { memcpy(mY + off, yuv_data + off_yuv, w); off += nPitch; off_yuv += w; } half_w = w >> 1; half_h = h >> 1; u_size = half_w * half_h; nPitch = (half_w + HD_MEMORY_ALGN_DEVICE) & ~HD_MEMORY_ALGN_DEVICE; off_yuv = w * h; off = 0; for (i = 0; i < half_h; i++) { yuv_ptr = yuv_data + off_yuv; u_ptr = mU + off; v_ptr = mV + off; for (j = 0; j < (uint32_t)w; j += 2) { *u_ptr++ = *yuv_ptr++; *v_ptr++ = *yuv_ptr++; } off_yuv += w; off += nPitch; } return 0; } /////////////handle class convertor{ public: convertor(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu); ~convertor(); int yuv2bgr(unsigned char **bgr, int *bgrLen); int resize2bgr(unsigned char *in, unsigned char **data, int *data_len); int resizeyuv(unsigned char *in, unsigned char **data, int *data_len); int fill_yuv(const unsigned char *yuv); private: void init_yuv(); void init_resize(); void init_resize_bgr(); void init_resize_yuv(); private: int width; int height; unsigned char aSamplingFactors[3]; int nMCUBlocksH; int nMCUBlocksV; Npp8u *apSrcImage[3]; NppiSize aSrcSize[3]; Npp32s aSrcImageStep[3]; size_t aSrcPitch[3]; uint8_t *mY; uint8_t *mU; uint8_t *mV; /////////////////////////// int rWidth; int rHeight; float fx; float fy; Npp8u *apDstImage[3] = {0,0,0}; Npp32s aDstImageStep[3]; NppiSize aDstSize[3]; ///////////////////////////// Npp8u *imgOrigin; size_t pitchOrigin; NppiSize sizeOrigin; unsigned char *bgrOrigin; int bgrOriginLen; size_t bgrOriginPitch; //////////////////////////// Npp8u *imgResize; size_t pitchResize; NppiSize sizeResize; unsigned char *bgrScale; int bgrScaleLen; size_t bgrScalePitch; // resize only //////////////////////////// Npp8u *originBGR; int pitchOriginBGR; Npp8u *resizedBGR; int pitchResizedBGR; unsigned char *hostResizedBGR; /////////////////////////// unsigned char *nv12; bool initialized_yuv, initialized_resize, initialized_resize_bgr, initialized_resize_yuv; int gpu_index; }; convertor::convertor(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu) :width(srcW) ,height(srcH) ,rWidth(dstW) ,rHeight(dstH) ,fx(-1) ,fy(-1) ,mY(NULL) ,mU(NULL) ,mV(NULL) ,imgOrigin(0) ,imgResize(0) ,bgrOrigin(NULL) ,bgrOriginLen(0) ,bgrScale(NULL) ,bgrScaleLen(0) ,originBGR(0) ,pitchOriginBGR(0) ,resizedBGR(0) ,pitchResizedBGR(0) ,hostResizedBGR(NULL) ,nv12(NULL) ,initialized_yuv(false) ,initialized_resize(false) ,initialized_resize_bgr(false) ,initialized_resize_yuv(false) ,gpu_index(gpu) {} static void setGPUDevice(const int gpu){ if (gpu >= 0){ cudaSetDevice(gpu); } } void convertor::init_yuv(){ if (initialized_yuv) return; initialized_yuv = true; setGPUDevice(gpu_index); for(int i = 0; i < 3; i++){ apSrcImage[i] = 0; apDstImage[i] = 0; } aSamplingFactors[0] = 34; aSamplingFactors[1] = 17; aSamplingFactors[2] = 17; nMCUBlocksH = 0; nMCUBlocksV = 0; for (int i = 0; i < 3; ++i) { nMCUBlocksV = std::max(nMCUBlocksV, aSamplingFactors[i] & 0x0f); nMCUBlocksH = std::max(nMCUBlocksH, aSamplingFactors[i] >> 4); } for (int i = 0; i < 3; ++i) { NppiSize oBlocks; NppiSize oBlocksPerMCU = { aSamplingFactors[i] >> 4, aSamplingFactors[i] & 0x0f }; oBlocks.width = (int)ceil((width + 7) / 8 * static_cast(oBlocksPerMCU.width) / nMCUBlocksH); oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; oBlocks.height = (int)ceil((height + 7) / 8 * static_cast(oBlocksPerMCU.height) / nMCUBlocksV); oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height; aSrcSize[i].width = oBlocks.width * 8; aSrcSize[i].height = oBlocks.height * 8; // Allocate Memory size_t nPitch; NPP_CHECK_CUDA(cudaMallocPitch((void**)&(apSrcImage[i]), &nPitch, aSrcSize[i].width, aSrcSize[i].height)); aSrcPitch[i] = nPitch; aSrcImageStep[i] = static_cast(nPitch); } NPP_CHECK_CUDA(cudaMallocPitch((void**)&imgOrigin, &pitchOrigin, width * 3, height)); bgrOriginPitch = width * 3; bgrOriginLen = bgrOriginPitch * height; NPP_CHECK_CUDA(cudaHostAlloc((void**)&bgrOrigin, bgrOriginLen, cudaHostAllocDefault)); sizeOrigin.width = width; sizeOrigin.height = height; uint32_t nPitch = (width + MEMORY_ALGN_DEVICE) & ~MEMORY_ALGN_DEVICE; NPP_CHECK_CUDA(cudaHostAlloc((void**)&mY, nPitch * height, cudaHostAllocDefault)); nPitch = (width/2 + MEMORY_ALGN_DEVICE) & ~MEMORY_ALGN_DEVICE; NPP_CHECK_CUDA(cudaHostAlloc((void**)&mU, nPitch * height / 2, cudaHostAllocDefault)); NPP_CHECK_CUDA(cudaHostAlloc((void**)&mV, nPitch * height / 2, cudaHostAllocDefault)); } void convertor::init_resize(){ if (initialized_resize) return; initialized_resize = true; setGPUDevice(gpu_index); NppiSize oDstImageSize; oDstImageSize.width = std::max(1, rWidth); oDstImageSize.height = std::max(1, rHeight); sizeResize.width = oDstImageSize.width; sizeResize.height = oDstImageSize.height; for (int i=0; i < 3; ++i) { NppiSize oBlocks; NppiSize oBlocksPerMCU = { aSamplingFactors[i] & 0x0f, aSamplingFactors[i] >> 4}; oBlocks.width = (int)ceil((oDstImageSize.width + 7)/8 * static_cast(oBlocksPerMCU.width)/nMCUBlocksH); oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width; oBlocks.height = (int)ceil((oDstImageSize.height+7)/8 * static_cast(oBlocksPerMCU.height)/nMCUBlocksV); oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height; aDstSize[i].width = oBlocks.width * 8; aDstSize[i].height = oBlocks.height * 8; // Allocate Memory size_t nPitch; NPP_CHECK_CUDA(cudaMallocPitch((void**)&apDstImage[i], &nPitch, aDstSize[i].width, aDstSize[i].height)); aDstImageStep[i] = static_cast(nPitch); } if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){ fx = (float)(rWidth) / (float)(width); fy = (float)(rHeight) / (float)(height); } if (imgResize == 0){ if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){ NPP_CHECK_CUDA(cudaMallocPitch((void**)&imgResize, &pitchResize, rWidth * 3, rHeight)); } } if (!bgrScale){ if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){ bgrScalePitch = rWidth * 3; bgrScaleLen = bgrScalePitch * rHeight; NPP_CHECK_CUDA(cudaHostAlloc((void**)&bgrScale, bgrScaleLen, cudaHostAllocDefault)); } } } void convertor::init_resize_bgr(){ if (initialized_resize_bgr) return; initialized_resize_bgr = true; setGPUDevice(gpu_index); if (originBGR == 0){ originBGR = nppiMalloc_8u_C3(width, height, &pitchOriginBGR); } if (resizedBGR == 0){ resizedBGR = nppiMalloc_8u_C3(rWidth, rHeight, &pitchResizedBGR); } if (hostResizedBGR == NULL){ NPP_CHECK_CUDA(cudaHostAlloc((void**)&hostResizedBGR, rWidth * 3 * rHeight, cudaHostAllocDefault)); } } void convertor::init_resize_yuv(){ if (initialized_resize_yuv) return; initialized_resize_yuv = true; if (rWidth > 0 && rHeight > 0){ fx = (float)(width) / (float)(rWidth); fy = (float)(height) / (float)(rHeight); } nv12 = (unsigned char*)malloc(rWidth*rHeight*3/2); } convertor::~convertor(){ setGPUDevice(gpu_index); if(mY) cudaFreeHost(mY); if(mU) cudaFreeHost(mU); if(mV) cudaFreeHost(mV); for (int i = 0; i < 3; ++i)//内存释放 { cudaFree(apSrcImage[i]); cudaFree(apDstImage[i]); } if (imgOrigin) cudaFree(imgOrigin); if (imgResize) cudaFree(imgResize); if (bgrOrigin) cudaFreeHost(bgrOrigin); if (bgrScale) cudaFreeHost(bgrScale); if (originBGR) nppiFree(originBGR); if (resizedBGR) nppiFree(resizedBGR); if (hostResizedBGR) cudaFreeHost(hostResizedBGR); if (nv12) free(nv12); } int convertor::fill_yuv(const unsigned char *yuv){ init_yuv(); int ret = set_data((uint8_t*)yuv, width, height, mY, mU, mV); if (ret < 0) return ret; setGPUDevice(gpu_index); NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[0], mY, aSrcPitch[0] * height, cudaMemcpyHostToDevice)); NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[1], mU, aSrcPitch[1] * height / 2, cudaMemcpyHostToDevice)); NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[2], mV, aSrcPitch[2] * height / 2, cudaMemcpyHostToDevice)); return 0; } int convertor::yuv2bgr(unsigned char **bgr, int *bgrLen){ *bgr = NULL; *bgrLen = 0; setGPUDevice(gpu_index); NPP_CHECK_NPP(nppiYUV420ToBGR_8u_P3C3R(apSrcImage, aSrcImageStep, imgOrigin, pitchOrigin, sizeOrigin)); NPP_CHECK_CUDA(cudaMemcpy2D(bgrOrigin, bgrOriginPitch, imgOrigin, pitchOrigin, bgrOriginPitch, height, cudaMemcpyDeviceToHost)); *bgr = bgrOrigin; *bgrLen = bgrOriginLen; return 0; } int convertor::resize2bgr(unsigned char *in, unsigned char **data, int *data_len){ *data = NULL; *data_len = 0; if ((rWidth < 0 && rHeight < 0) || (rWidth > width && rHeight > height)){ return -1; } setGPUDevice(gpu_index); if (!in){ init_resize(); NppiSize oDstImageSize; oDstImageSize.width = std::max(1, rWidth); oDstImageSize.height = std::max(1, rHeight); for (int i = 0; i < 3; ++i) { NppiSize oBlocksPerMCU = { aSamplingFactors[i] & 0x0f, aSamplingFactors[i] >> 4}; NppiSize oSrcImageSize = {(width * oBlocksPerMCU.width) / nMCUBlocksH, (height * oBlocksPerMCU.height)/nMCUBlocksV}; NppiRect oSrcImageROI = {0,0,oSrcImageSize.width, oSrcImageSize.height}; NppiRect oDstImageROI; NppiInterpolationMode eInterploationMode = NPPI_INTER_SUPER; NPP_CHECK_NPP(nppiGetResizeRect(oSrcImageROI, &oDstImageROI, fx, fy, 0.0, 0.0, eInterploationMode)); NPP_CHECK_NPP(nppiResizeSqrPixel_8u_C1R(apSrcImage[i], oSrcImageSize, aSrcImageStep[i], oSrcImageROI, apDstImage[i], aDstImageStep[i], oDstImageROI , fx, fy, 0.0, 0.0, eInterploationMode)); } NPP_CHECK_NPP(nppiYUV420ToBGR_8u_P3C3R(apDstImage, aDstImageStep, imgResize, pitchResize, sizeResize)); NPP_CHECK_CUDA(cudaMemcpy2D(bgrScale, bgrScalePitch, imgResize, pitchResize, bgrScalePitch, rHeight, cudaMemcpyDeviceToHost)); *data = bgrScale; *data_len = bgrScaleLen; }else{ init_resize_bgr(); NppiSize oSrcSize; oSrcSize.width = width; oSrcSize.height = height; NPP_CHECK_CUDA(cudaMemcpy2D(originBGR, pitchOriginBGR, in, width*3, width*3, height, cudaMemcpyHostToDevice)); NppiRect oSrcROI; oSrcROI.x = 0; oSrcROI.y = 0; oSrcROI.width = width; oSrcROI.height = height; NppiRect oDstROI; oDstROI.x = 0; oDstROI.y = 0; oDstROI.width = rWidth; oDstROI.height = rHeight; // Scale Factor double nXFactor = double(oDstROI.width) / double(oSrcROI.width); double nYFactor = double(oDstROI.height) / double(oSrcROI.height); // Scaled X/Y Shift double nXShift = - oSrcROI.x * nXFactor ; double nYShift = - oSrcROI.y * nYFactor; int eInterpolation = NPPI_INTER_SUPER; if (nXFactor >= 1.f || nYFactor >= 1.f) eInterpolation = NPPI_INTER_LANCZOS; NppStatus ret = nppiResizeSqrPixel_8u_C3R(originBGR, oSrcSize, pitchOriginBGR, oSrcROI, resizedBGR, pitchResizedBGR, oDstROI, nXFactor, nYFactor, nXShift, nYShift, eInterpolation ); if(ret != NPP_SUCCESS) { printf("imageResize_8u_C3R failed %d.\n", ret); return -2; } size_t pitch = rWidth * 3; *data_len = pitch * rHeight; NPP_CHECK_CUDA(cudaMemcpy2D(hostResizedBGR, pitch, resizedBGR, pitchResizedBGR, pitch, rHeight, cudaMemcpyDeviceToHost)); *data = hostResizedBGR; } return 0; } static int nv12_nearest_scale(uint8_t* __restrict src, uint8_t* __restrict dst, int srcWidth, int srcHeight, int dstWidth, int dstHeight) { register int sw = srcWidth; //register keyword is for local var to accelorate register int sh = srcHeight; register int dw = dstWidth; register int dh = dstHeight; register int y, x; unsigned long int srcy, srcx, src_index, dst_index; unsigned long int xrIntFloat_16 = (sw << 16) / dw + 1; //better than float division unsigned long int yrIntFloat_16 = (sh << 16) / dh + 1; uint8_t* dst_uv = dst + dh * dw; //memory start pointer of dest uv uint8_t* src_uv = src + sh * sw; //memory start pointer of source uv uint8_t* dst_uv_yScanline; uint8_t* src_uv_yScanline; uint8_t* dst_y_slice = dst; //memory start pointer of dest y uint8_t* src_y_slice; uint8_t* sp; uint8_t* dp; for (y = 0; y < (dh & ~7); ++y) //'dh & ~7' is to generate faster assembly code { srcy = (y * yrIntFloat_16) >> 16; src_y_slice = src + srcy * sw; if((y & 1) == 0) { dst_uv_yScanline = dst_uv + (y / 2) * dw; src_uv_yScanline = src_uv + (srcy / 2) * sw; } for(x = 0; x < (dw & ~7); ++x) { srcx = (x * xrIntFloat_16) >> 16; dst_y_slice[x] = src_y_slice[srcx]; if((y & 1) == 0) //y is even { if((x & 1) == 0) //x is even { src_index = (srcx / 2) * 2; sp = dst_uv_yScanline + x; dp = src_uv_yScanline + src_index; *sp = *dp; ++sp; ++dp; *sp = *dp; } } } dst_y_slice += dw; } return 0; } int convertor::resizeyuv(unsigned char *in, unsigned char **data, int *data_len){ init_resize_yuv(); *data_len = rWidth*rHeight*3/2; *data = nv12; return nv12_nearest_scale(in, nv12, width, height, rWidth, rHeight); } convHandle conv_create(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu){ if (gpu < 0) return NULL; convertor *conv = new convertor(srcW, srcH, dstW, dstH, gpu); return conv; } void conv_destroy(convHandle h){ if (!h) return; convertor *conv = (convertor*)h; delete conv; } int yuv2bgrandresize(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen, unsigned char **scaleBGR, int *scaleBGRLen){ if (!h) return -2; convertor *conv = (convertor*)h; int ret = conv->fill_yuv((unsigned char*)yuv); if (ret != 0) return ret; ret = conv->yuv2bgr(bgr, bgrLen); if (ret != 0) return ret; ret = conv->resize2bgr(NULL, scaleBGR, scaleBGRLen); return ret; } int yuv2bgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen){ if (!h) return -2; convertor *conv = (convertor*)h; int ret = conv->fill_yuv((unsigned char*)yuv); if (ret != 0) return ret; return conv->yuv2bgr(bgr, bgrLen); } int yuv2resizedbgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen){ if (!h) return -2; convertor *conv = (convertor*)h; int ret = conv->fill_yuv((unsigned char*)yuv); if (ret != 0) return ret; ret = conv->resize2bgr(NULL, bgr, bgrLen); return ret; } int resizebgr(convHandle h, void *data, unsigned char **resized, int *len){ if (!h) return -2; convertor *conv = (convertor*)h; return conv->resize2bgr((unsigned char*)data, resized, len); } int resizeyuv(convHandle h, void *data, unsigned char **resized, int *len){ if (!h) return -2; convertor *conv = (convertor*)h; return conv->resizeyuv((unsigned char*)data, resized, len); }