From 6d6e6d425c4fe63a487ff27be9341671b2c1dd93 Mon Sep 17 00:00:00 2001
From: stydb <suntianyu0923@gmail.com>
Date: Tue, 03 Dec 2019 11:12:36 +0800
Subject: [PATCH] update

---
 goconv/inc/helper_string.h   |  526 ++++++++++
 goconv/goconv.go             |  258 +++++
 .gitignore                   |    1 
 go.sum                       |   22 
 goconv/inc/helper_cuda.h     | 1261 +++++++++++++++++++++++++
 goconv/inc/Exceptions.h      |  181 +++
 go.mod                       |    8 
 goconv/conv.cpp              |  592 +++++++++++
 main.go                      |   90 +
 gohumantrack/gohumantrack.go |   19 
 runtime/libcffmpeg.so        |    0 
 goconv/conv.h                |   23 
 12 files changed, 2,932 insertions(+), 49 deletions(-)

diff --git a/.gitignore b/.gitignore
index 553d97f..3dee989 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 *.avi
 track
+FPN_REID*
diff --git a/go.mod b/go.mod
index 5af2311..2a00f22 100644
--- a/go.mod
+++ b/go.mod
@@ -2,4 +2,10 @@
 
 go 1.12
 
-require basic.com/valib/goffmpeg.git v0.0.0-20191129092141-7363d175bd62
+require (
+	basic.com/valib/godraw.git v0.0.0-20191122082247-26e9987cd183
+	basic.com/valib/goffmpeg.git v0.0.0-20191203025021-783b80757681
+	basic.com/valib/gogpu.git v0.0.0-20190711044327-62043b070865
+	github.com/disintegration/imaging v1.6.2
+	github.com/llgcode/draw2d v0.0.0-20190810100245-79e59b6b8fbc // indirect
+)
diff --git a/go.sum b/go.sum
index 4bb58e5..f28b48d 100644
--- a/go.sum
+++ b/go.sum
@@ -1,2 +1,20 @@
-basic.com/valib/goffmpeg.git v0.0.0-20191129092141-7363d175bd62 h1:KzkPzJE76RkHeYBgAMfSiz1vzJaQRKkRDCXnw2XmxqA=
-basic.com/valib/goffmpeg.git v0.0.0-20191129092141-7363d175bd62/go.mod h1:1x75Hawh/BjgPsQtuJ24px89gzk3uAslD8e0Xs6Z7GQ=
+basic.com/valib/godraw.git v0.0.0-20191122082247-26e9987cd183 h1:QQ1L0Ev4vcSD23d99+rW5S/mnmdTAPAI2GZ7tkMgCE4=
+basic.com/valib/godraw.git v0.0.0-20191122082247-26e9987cd183/go.mod h1:LntbWczdG87utrKx7rWYmIh1VZ+X2oPN7Q2IXb6oRE0=
+basic.com/valib/goffmpeg.git v0.0.0-20191203025021-783b80757681 h1:n5zinCkvVghdKw0ZenxMo+lFjaXhHSr9ecICuQZLjNw=
+basic.com/valib/goffmpeg.git v0.0.0-20191203025021-783b80757681/go.mod h1:1x75Hawh/BjgPsQtuJ24px89gzk3uAslD8e0Xs6Z7GQ=
+basic.com/valib/gogpu.git v0.0.0-20190711044327-62043b070865 h1:3XvkNdRlJDXV45ie8U0uGA9ImJZtyTT0C/h+4Rizv0Y=
+basic.com/valib/gogpu.git v0.0.0-20190711044327-62043b070865/go.mod h1:yxux5RP4A6a591vWljXxGlHdERVVyWDD3TwwQjuyogw=
+github.com/disintegration/imaging v1.6.2 h1:w1LecBlG2Lnp8B3jk5zSuNqd7b4DXhcjwek1ei82L+c=
+github.com/disintegration/imaging v1.6.2/go.mod h1:44/5580QXChDfwIclfc/PCwrr44amcmDAg8hxG0Ewe4=
+github.com/go-gl/gl v0.0.0-20180407155706-68e253793080/go.mod h1:482civXOzJJCPzJ4ZOX/pwvXBWSnzD4OKMdH4ClKGbk=
+github.com/go-gl/glfw v0.0.0-20180426074136-46a8d530c326/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU=
+github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
+github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
+github.com/jung-kurt/gofpdf v1.0.0/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes=
+github.com/llgcode/draw2d v0.0.0-20190810100245-79e59b6b8fbc h1:ZvNhCJfRtl03A0VOIfvO9W22/0b6dmn1APa4Q6j9oHM=
+github.com/llgcode/draw2d v0.0.0-20190810100245-79e59b6b8fbc/go.mod h1:mVa0dA29Db2S4LVqDYLlsePDzRJLDfdhVZiI15uY0FA=
+github.com/llgcode/ps v0.0.0-20150911083025-f1443b32eedb/go.mod h1:1l8ky+Ew27CMX29uG+a2hNOKpeNYEQjjtiALiBlFQbY=
+golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
+golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8 h1:hVwzHzIUGRjiF7EcUjqNxk3NCfkPxbDKRdnNE1Rpg0U=
+golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
diff --git a/goconv/conv.cpp b/goconv/conv.cpp
new file mode 100644
index 0000000..098a396
--- /dev/null
+++ b/goconv/conv.cpp
@@ -0,0 +1,592 @@
+#include "conv.h"
+
+#include <cmath>
+#include <sys/time.h>
+
+#include <npp.h>
+#include <helper_cuda.h>
+#include <helper_string.h>
+#include "Exceptions.h"
+
+
+static const int MEMORY_ALGN_DEVICE = 511; // alignment mask: (x + 511) & ~511 rounds a row pitch up to a multiple of 512 bytes
+static const int HD_MEMORY_ALGN_DEVICE = 511; // same mask value, used by set_data() for the host staging planes
+
+// Integer division of x by d rounded up (valid for positive operands).
+static inline int DivUp(int x, int d) {
+    return (d - 1 + x) / d;
+}
+
+// Splits a tightly packed frame (w*h luma bytes followed by interleaved
+// chroma byte pairs; the first byte of each pair goes to mU, the second to
+// mV) into three planes whose rows are padded to a multiple of 512 bytes.
+// data          packed source frame
+// width/height  frame size in pixels
+// mY, mU, mV    destination planes, large enough for the padded rows
+//               (luma: h rows, chroma: h/2 rows each)
+// Returns 0 on success, -1 if any of the buffers is NULL.
+static int set_data(uint8_t *data, const int width, const int height, unsigned char *mY, unsigned char *mU, unsigned char *mV)
+{
+    if (!data || !mY || !mU || !mV)
+    {
+        return -1;
+    }
+
+    const uint8_t *yuv_data = data;
+    const int w = width;
+    const int h = height;
+
+    // Translated note from the original author: even when the input already
+    // has the expected format, it still goes through this repacking step.
+    // Luma: copy row by row, padding each destination row to the aligned pitch.
+    size_t nPitch = (w + HD_MEMORY_ALGN_DEVICE) & ~HD_MEMORY_ALGN_DEVICE;
+    uint32_t off = 0;
+    uint32_t off_yuv = 0;
+    for (uint32_t i = 0; i < (uint32_t)h; i++)
+    {
+        memcpy(mY + off, yuv_data + off_yuv, w);
+        off     += nPitch;
+        off_yuv += w;
+    }
+
+    // Chroma: h/2 rows of w interleaved bytes, de-interleaved into mU and mV.
+    const uint32_t half_w = w >> 1;
+    const uint32_t half_h = h >> 1;
+    nPitch = (half_w + HD_MEMORY_ALGN_DEVICE) & ~HD_MEMORY_ALGN_DEVICE;
+
+    off_yuv = w * h;
+    off = 0;
+    for (uint32_t i = 0; i < half_h; i++)
+    {
+        const uint8_t *yuv_ptr = yuv_data + off_yuv;
+        uint8_t *u_ptr = mU + off;
+        uint8_t *v_ptr = mV + off;
+        for (uint32_t j = 0; j < (uint32_t)w; j += 2)
+        {
+            *u_ptr++ = *yuv_ptr++;
+            *v_ptr++ = *yuv_ptr++;
+        }
+        off_yuv += w;
+        off += nPitch;
+    }
+
+    return 0;
+}
+
+/////////////handle
+class convertor{ // conversion context for one source/target size pair; owns all buffers used by the methods below
+public: 
+    convertor(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu);
+    ~convertor();
+    int yuv2bgr(unsigned char **bgr, int *bgrLen); // converts the planes uploaded by fill_yuv() to source-size BGR
+    int resize2bgr(unsigned char *in, unsigned char **data, int *data_len); // in == NULL: resize the uploaded YUV planes; else resize the host BGR image `in`
+    int resizeyuv(unsigned char *in, unsigned char **data, int *data_len); // CPU nearest-neighbour resize of an NV12 frame
+    int fill_yuv(const unsigned char *yuv); // splits a frame into planes and uploads them to the device
+private: 
+    void init_yuv();
+    void init_resize();
+    void init_resize_bgr();
+    void init_resize_yuv();
+private: 
+    int width; // source width in pixels
+    int height; // source height in pixels
+
+    unsigned char aSamplingFactors[3]; // per-plane sampling factors, one nibble per direction (assigned in init_yuv)
+    int nMCUBlocksH; // largest high-nibble factor over the three planes
+    int nMCUBlocksV; // largest low-nibble factor over the three planes
+
+    Npp8u   *apSrcImage[3]; // device Y/U/V planes at source size (cudaMallocPitch in init_yuv)
+    NppiSize aSrcSize[3]; // plane sizes rounded up to whole 8x8 blocks
+    Npp32s   aSrcImageStep[3]; // device pitch of each plane as Npp32s
+    size_t   aSrcPitch[3]; // the same pitch as size_t
+
+    uint8_t *mY; // pinned host staging planes filled by set_data()
+    uint8_t *mU;
+    uint8_t *mV;
+
+    // ---- target size, scale factors and resized device planes (init_resize) ----
+    int rWidth;
+    int rHeight;
+    float fx; // rWidth / width once init_resize() has computed it, -1 before
+    float fy; // rHeight / height once init_resize() has computed it, -1 before
+
+    Npp8u *apDstImage[3] = {0,0,0};
+    Npp32s aDstImageStep[3];
+    NppiSize aDstSize[3];
+
+    // ---- source-size BGR output: device image and pinned host copy (init_yuv) ----
+    Npp8u *imgOrigin;
+    size_t pitchOrigin;
+    NppiSize sizeOrigin;
+
+    unsigned char *bgrOrigin;
+    int bgrOriginLen; // bgrOriginPitch * height
+    size_t bgrOriginPitch; // width * 3, i.e. tightly packed rows
+
+    // ---- resized BGR output of the YUV path: device image and pinned host copy (init_resize) ----
+    Npp8u *imgResize;
+    size_t pitchResize;
+    NppiSize sizeResize;
+
+    unsigned char *bgrScale;
+    int bgrScaleLen; // bgrScalePitch * rHeight
+    size_t bgrScalePitch; // rWidth * 3
+
+// resize only
+    // ---- BGR -> BGR resize path: nppiMalloc_8u_C3 device images and pinned host result (init_resize_bgr) ----
+    Npp8u *originBGR;
+    int pitchOriginBGR;
+    Npp8u *resizedBGR;
+    int pitchResizedBGR;
+    unsigned char *hostResizedBGR;
+
+    // ---- malloc'd output buffer of resizeyuv() (init_resize_yuv) ----
+    unsigned char *nv12;
+
+    bool initialized_yuv, initialized_resize, initialized_resize_bgr, initialized_resize_yuv; // one-shot guards of the init_*() helpers
+    int gpu_index; // CUDA device selected via setGPUDevice() before GPU work
+};
+
+
+// Stores the source/target geometry and the GPU index. The body is empty:
+// buffers are allocated lazily by the init_*() helpers on first use.
+// Initialisers follow the member declaration order (the order C++ actually
+// uses, which avoids -Wreorder), and the source plane pointers start out as
+// NULL so they never hold indeterminate values before init_yuv() runs.
+convertor::convertor(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu)
+:width(srcW)
+,height(srcH)
+,nMCUBlocksH(0), nMCUBlocksV(0)
+,apSrcImage()
+,mY(NULL), mU(NULL), mV(NULL)
+,rWidth(dstW)
+,rHeight(dstH)
+,fx(-1), fy(-1)              // -1 = scale factors not computed yet
+,imgOrigin(0)
+,bgrOrigin(NULL), bgrOriginLen(0)
+,imgResize(0)
+,bgrScale(NULL), bgrScaleLen(0)
+,originBGR(0), pitchOriginBGR(0)
+,resizedBGR(0), pitchResizedBGR(0)
+,hostResizedBGR(NULL)
+,nv12(NULL)
+,initialized_yuv(false)
+,initialized_resize(false)
+,initialized_resize_bgr(false)
+,initialized_resize_yuv(false)
+,gpu_index(gpu)
+{}
+
+// Makes `gpu` the current CUDA device; negative indices are ignored.
+static void setGPUDevice(const int gpu){
+    if (gpu < 0) return;
+    cudaSetDevice(gpu);
+}
+
+void convertor::init_yuv(){ // one-time allocation of the source planes, the source-size BGR buffers and the host staging planes
+    if (initialized_yuv) return;
+    initialized_yuv = true;
+    // The flag is set up front, so the rest of this body runs at most once per convertor.
+    setGPUDevice(gpu_index);
+
+    for(int i = 0; i < 3; i++){
+        apSrcImage[i] = 0;
+        apDstImage[i] = 0;
+    }
+    // Sampling factors: high nibble and low nibble are read separately below (>> 4 and & 0x0f).
+    aSamplingFactors[0] = 34; // 0x22: 2 and 2 for plane 0
+    aSamplingFactors[1] = 17; // 0x11: 1 and 1 for plane 1
+    aSamplingFactors[2] = 17; // 0x11: 1 and 1 for plane 2
+
+    nMCUBlocksH = 0;
+    nMCUBlocksV = 0;
+
+    for (int i = 0; i < 3; ++i)
+    {
+        nMCUBlocksV = std::max(nMCUBlocksV, aSamplingFactors[i] & 0x0f);
+        nMCUBlocksH = std::max(nMCUBlocksH, aSamplingFactors[i] >> 4);
+    }
+    // Round each plane up to whole 8x8 blocks and allocate it as pitched device memory.
+    for (int i = 0; i < 3; ++i)
+    {
+        NppiSize oBlocks;
+        NppiSize oBlocksPerMCU = { aSamplingFactors[i] >> 4, aSamplingFactors[i] & 0x0f };
+
+        oBlocks.width = (int)ceil((width   + 7) / 8 *
+            static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksH);
+        oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;
+
+        oBlocks.height = (int)ceil((height + 7) / 8 *
+            static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksV);
+        oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;
+
+        aSrcSize[i].width  = oBlocks.width  * 8;
+        aSrcSize[i].height = oBlocks.height * 8;
+
+        // Allocate Memory
+        size_t nPitch;
+        NPP_CHECK_CUDA(cudaMallocPitch((void**)&(apSrcImage[i]), &nPitch, aSrcSize[i].width, aSrcSize[i].height));
+        aSrcPitch[i] = nPitch;
+        aSrcImageStep[i] = static_cast<Npp32s>(nPitch);
+    }
+    // Device-side interleaved BGR image at source resolution (3 bytes per pixel).
+    NPP_CHECK_CUDA(cudaMallocPitch((void**)&imgOrigin, &pitchOrigin, width * 3, height));
+    // Pinned host copy of that image, tightly packed (pitch = width * 3).
+    bgrOriginPitch = width * 3;
+    bgrOriginLen = bgrOriginPitch * height;
+    NPP_CHECK_CUDA(cudaHostAlloc((void**)&bgrOrigin, bgrOriginLen, cudaHostAllocDefault));
+
+    sizeOrigin.width = width;
+    sizeOrigin.height = height;
+    // Pinned host staging planes; the row pitches use the same 512-byte rounding as set_data().
+    uint32_t nPitch = (width + MEMORY_ALGN_DEVICE) & ~MEMORY_ALGN_DEVICE;
+    NPP_CHECK_CUDA(cudaHostAlloc((void**)&mY, nPitch * height, cudaHostAllocDefault));
+    nPitch = (width/2 + MEMORY_ALGN_DEVICE) & ~MEMORY_ALGN_DEVICE;
+    NPP_CHECK_CUDA(cudaHostAlloc((void**)&mU, nPitch * height / 2, cudaHostAllocDefault));
+    NPP_CHECK_CUDA(cudaHostAlloc((void**)&mV, nPitch * height / 2, cudaHostAllocDefault));
+
+}
+
+void convertor::init_resize(){ // one-time allocation of the buffers for the planar-YUV -> resized-BGR path
+    if (initialized_resize) return;
+    initialized_resize = true;
+
+    setGPUDevice(gpu_index);
+    // Destination size, clamped to at least 1x1 for the plane allocations below.
+    NppiSize oDstImageSize;
+    oDstImageSize.width  = std::max(1, rWidth);
+    oDstImageSize.height = std::max(1, rHeight);
+    
+    sizeResize.width = oDstImageSize.width; 
+    sizeResize.height = oDstImageSize.height;
+    // Resized planes, rounded up to whole 8x8 blocks; relies on the sampling factors assigned in init_yuv().
+    for (int i=0; i < 3; ++i)
+    {
+        NppiSize oBlocks;
+        NppiSize oBlocksPerMCU = { aSamplingFactors[i] & 0x0f, aSamplingFactors[i] >> 4}; // nibble order is the reverse of init_yuv(); identical for 0x22/0x11
+
+        oBlocks.width = (int)ceil((oDstImageSize.width + 7)/8  *
+                                  static_cast<float>(oBlocksPerMCU.width)/nMCUBlocksH);
+        oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;
+
+        oBlocks.height = (int)ceil((oDstImageSize.height+7)/8 *
+                                   static_cast<float>(oBlocksPerMCU.height)/nMCUBlocksV);
+        oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;
+
+        aDstSize[i].width = oBlocks.width * 8;
+        aDstSize[i].height = oBlocks.height * 8;
+
+        // Allocate Memory
+        size_t nPitch;
+        NPP_CHECK_CUDA(cudaMallocPitch((void**)&apDstImage[i], &nPitch, aDstSize[i].width, aDstSize[i].height));
+        aDstImageStep[i] = static_cast<Npp32s>(nPitch);
+    }
+    // Scale factors and the BGR buffers below are only set up for a strict downscale in both dimensions.
+    if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){
+        fx = (float)(rWidth) / (float)(width);
+        fy = (float)(rHeight) / (float)(height);
+    }
+
+    if (imgResize == 0){
+        if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){
+            NPP_CHECK_CUDA(cudaMallocPitch((void**)&imgResize, &pitchResize, rWidth * 3, rHeight));
+        }
+    }
+    if (!bgrScale){
+        if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){
+            bgrScalePitch = rWidth * 3;
+            bgrScaleLen = bgrScalePitch * rHeight;
+            NPP_CHECK_CUDA(cudaHostAlloc((void**)&bgrScale, bgrScaleLen, cudaHostAllocDefault));
+        }
+    }
+}
+
+// Lazily allocates the buffers for the BGR -> BGR resize path: device images
+// at source and target size plus a pinned host buffer for the result.
+void convertor::init_resize_bgr(){
+    if (!initialized_resize_bgr){
+        initialized_resize_bgr = true;
+        setGPUDevice(gpu_index);
+
+        if (!originBGR) originBGR = nppiMalloc_8u_C3(width, height, &pitchOriginBGR);
+        if (!resizedBGR) resizedBGR = nppiMalloc_8u_C3(rWidth, rHeight, &pitchResizedBGR);
+        if (!hostResizedBGR){
+            const int hostBytes = rWidth * 3 * rHeight;
+            NPP_CHECK_CUDA(cudaHostAlloc((void**)&hostResizedBGR, hostBytes, cudaHostAllocDefault));
+        }
+    }
+}
+
+// Lazily allocates the host output buffer used by resizeyuv().
+// The nearest-neighbour scaler derives its own fixed-point ratios, so fx/fy
+// are deliberately left alone: they belong to the NPP path (init_resize) and
+// overwriting them here with the inverse ratio would corrupt resize2bgr().
+void convertor::init_resize_yuv(){
+    if (initialized_resize_yuv) return;
+    initialized_resize_yuv = true;
+
+    if (rWidth <= 0 || rHeight <= 0) return;   // nothing sensible to allocate; nv12 stays NULL
+    nv12 = (unsigned char*)malloc((size_t)rWidth * rHeight * 3 / 2);
+}
+
+// Releases every buffer the convertor may have allocated (device, pinned host and heap).
+convertor::~convertor(){
+    setGPUDevice(gpu_index);
+
+    if(mY) cudaFreeHost(mY);
+    if(mU) cudaFreeHost(mU);
+    if(mV) cudaFreeHost(mV);
+
+    // Release device memory. apSrcImage is filled in by init_yuv(), so it is
+    // skipped when that never ran (e.g. a convertor used only for resizing).
+    for (int i = 0; i < 3; ++i)
+    {
+        if (initialized_yuv) cudaFree(apSrcImage[i]);
+        cudaFree(apDstImage[i]);
+    }
+
+    if (imgOrigin) cudaFree(imgOrigin);
+    if (imgResize) cudaFree(imgResize);
+    if (bgrOrigin) cudaFreeHost(bgrOrigin);
+    if (bgrScale) cudaFreeHost(bgrScale);
+    if (originBGR) nppiFree(originBGR);
+    if (resizedBGR) nppiFree(resizedBGR);
+    if (hostResizedBGR) cudaFreeHost(hostResizedBGR);
+    free(nv12);
+}
+
+int convertor::fill_yuv(const unsigned char *yuv){
+    init_yuv();
+    int ret = set_data((uint8_t*)yuv, width, height, mY, mU, mV);
+    if (ret < 0) return ret;
+
+    setGPUDevice(gpu_index);
+
+    NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[0], mY, aSrcPitch[0] * height,     cudaMemcpyHostToDevice));
+    NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[1], mU, aSrcPitch[1] * height / 2, cudaMemcpyHostToDevice));
+    NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[2], mV, aSrcPitch[2] * height / 2, cudaMemcpyHostToDevice));
+    return 0;
+}
+
+// Converts the planes previously uploaded by fill_yuv() into packed BGR at
+// source resolution. On return *bgr points at the convertor's own pinned host
+// buffer (do not free it; it is overwritten by the next call) and *bgrLen is
+// its size in bytes. Returns 0.
+int convertor::yuv2bgr(unsigned char **bgr, int *bgrLen){
+    *bgrLen = 0;
+    *bgr = NULL;
+    setGPUDevice(gpu_index);
+    // Planar YUV 4:2:0 -> interleaved BGR on the device, then copy it back row by row.
+    NPP_CHECK_NPP(nppiYUV420ToBGR_8u_P3C3R(apSrcImage, aSrcImageStep, imgOrigin, pitchOrigin, sizeOrigin));
+    NPP_CHECK_CUDA(cudaMemcpy2D(bgrOrigin, bgrOriginPitch, imgOrigin, pitchOrigin, bgrOriginPitch, height, cudaMemcpyDeviceToHost));
+    *bgrLen = bgrOriginLen;
+    *bgr = bgrOrigin;
+    return 0;
+}
+
+// Produces a resized BGR image in one of two ways:
+//   in == NULL : resizes the YUV planes uploaded by fill_yuv() and converts
+//                them to BGR; only a strict downscale is supported because
+//                init_resize() allocates no output buffers otherwise.
+//   in != NULL : resizes the packed host BGR image `in` (width x height).
+// On success returns 0 and points *data at a buffer owned by the convertor
+// (*data_len bytes, overwritten by the next call). Returns -1 for an
+// unsupported target size and -2 if the NPP resize of `in` fails.
+int convertor::resize2bgr(unsigned char *in, unsigned char **data, int *data_len){
+    *data = NULL;
+    *data_len = 0;
+
+    if (rWidth <= 0 || rHeight <= 0 || (rWidth > width && rHeight > height)){
+        return -1;
+    }
+    // Without a strict downscale imgResize/bgrScale stay NULL and fx/fy stay
+    // at -1, so the YUV path would hand NULL buffers to NPP: reject it here.
+    if (!in && (rWidth >= width || rHeight >= height)){
+        return -1;
+    }
+
+    setGPUDevice(gpu_index);
+
+    if (!in){
+        init_resize();
+        for (int i = 0; i < 3; ++i)
+        {
+            NppiSize oBlocksPerMCU = { aSamplingFactors[i] & 0x0f, aSamplingFactors[i] >> 4};
+            NppiSize oSrcImageSize = {(width * oBlocksPerMCU.width) / nMCUBlocksH, (height * oBlocksPerMCU.height)/nMCUBlocksV};
+            NppiRect oSrcImageROI = {0,0,oSrcImageSize.width, oSrcImageSize.height};
+            NppiRect oDstImageROI;
+            NppiInterpolationMode eInterploationMode = NPPI_INTER_SUPER;
+            NPP_CHECK_NPP(nppiGetResizeRect(oSrcImageROI, &oDstImageROI,
+                                            fx,
+                                            fy,
+                                            0.0, 0.0, eInterploationMode));
+            NPP_CHECK_NPP(nppiResizeSqrPixel_8u_C1R(apSrcImage[i], oSrcImageSize, aSrcImageStep[i], oSrcImageROI,
+                                                    apDstImage[i], aDstImageStep[i], oDstImageROI ,
+                                                    fx,
+                                                    fy,
+                                                    0.0, 0.0, eInterploationMode));
+        }
+        NPP_CHECK_NPP(nppiYUV420ToBGR_8u_P3C3R(apDstImage, aDstImageStep, imgResize, pitchResize, sizeResize));
+        NPP_CHECK_CUDA(cudaMemcpy2D(bgrScale, bgrScalePitch, imgResize, pitchResize, bgrScalePitch, rHeight, cudaMemcpyDeviceToHost));
+        *data = bgrScale;
+        *data_len = bgrScaleLen;
+    }else{
+        init_resize_bgr();
+        NppiSize oSrcSize;
+        oSrcSize.width = width;
+        oSrcSize.height = height;
+        NPP_CHECK_CUDA(cudaMemcpy2D(originBGR, pitchOriginBGR, in, width*3, width*3, height, cudaMemcpyHostToDevice));
+
+        NppiRect oSrcROI;
+        oSrcROI.x = 0;
+        oSrcROI.y = 0;
+        oSrcROI.width = width;
+        oSrcROI.height = height;
+        NppiRect oDstROI;
+        oDstROI.x = 0;
+        oDstROI.y = 0;
+        oDstROI.width = rWidth;
+        oDstROI.height = rHeight;
+        // Scale Factor
+        double nXFactor = double(oDstROI.width) / double(oSrcROI.width);
+        double nYFactor = double(oDstROI.height) / double(oSrcROI.height);
+        // Scaled X/Y  Shift
+        double nXShift = - oSrcROI.x * nXFactor ;
+        double nYShift = - oSrcROI.y * nYFactor;
+        int eInterpolation = NPPI_INTER_SUPER;
+        if (nXFactor >= 1.f || nYFactor >= 1.f)
+            eInterpolation = NPPI_INTER_LANCZOS;
+
+        NppStatus ret = nppiResizeSqrPixel_8u_C3R(originBGR, oSrcSize, pitchOriginBGR, oSrcROI, 
+            resizedBGR, pitchResizedBGR, oDstROI, nXFactor, nYFactor, nXShift, nYShift, eInterpolation );
+        if(ret != NPP_SUCCESS) {
+            printf("imageResize_8u_C3R failed %d.\n", ret);
+            return -2;
+        }
+        size_t pitch = rWidth * 3;
+        *data_len = pitch * rHeight;
+        NPP_CHECK_CUDA(cudaMemcpy2D(hostResizedBGR, pitch, resizedBGR, pitchResizedBGR, pitch, rHeight, cudaMemcpyDeviceToHost));
+        *data = hostResizedBGR;
+    }
+    return 0;
+}
+
+// Nearest-neighbour resize of an NV12 frame (Y plane followed by interleaved
+// UV pairs at half vertical resolution) from srcWidth x srcHeight to
+// dstWidth x dstHeight using 16.16 fixed-point source steps.
+// Every luma sample of dst is written; the loops are not rounded down to a
+// multiple of 8, which would leave the right/bottom edge of dst unwritten.
+// Chroma is written for complete 2x2 blocks only, so odd destination sizes
+// cannot run past the dstWidth * dstHeight * 3 / 2 bytes dst must provide.
+// Source dimensions are assumed to be even (not checked).
+// Returns 0 on success, -1 on NULL buffers or non-positive dimensions.
+static int nv12_nearest_scale(uint8_t* __restrict src, uint8_t* __restrict dst,
+                        int srcWidth, int srcHeight, int dstWidth, int dstHeight)
+{
+    if (!src || !dst || srcWidth <= 0 || srcHeight <= 0 || dstWidth <= 0 || dstHeight <= 0)
+        return -1;
+
+    const unsigned long sw = (unsigned long)srcWidth;
+    const unsigned long sh = (unsigned long)srcHeight;
+    const unsigned long dw = (unsigned long)dstWidth;
+    const unsigned long dh = (unsigned long)dstHeight;
+
+    // Source step per destination pixel in 16.16 fixed point (no float
+    // division); unsigned long keeps sw << 16 from overflowing int.
+    const unsigned long xStep = (sw << 16) / dw + 1;
+    const unsigned long yStep = (sh << 16) / dh + 1;
+
+    const uint8_t *src_uv = src + sh * sw;  // start of the source UV plane
+    uint8_t *dst_uv = dst + dh * dw;        // start of the destination UV plane
+
+    for (unsigned long y = 0; y < dh; ++y)
+    {
+        unsigned long srcy = (y * yStep) >> 16;
+        if (srcy >= sh) srcy = sh - 1;      // the +1 in the step can overshoot on extreme upscales
+        const uint8_t *src_y_row = src + srcy * sw;
+        uint8_t *dst_y_row = dst + y * dw;
+
+        // One chroma row serves two luma rows: emit it on even rows that have a partner row below.
+        const bool chroma_row = (y & 1) == 0 && y + 1 < dh;
+        const uint8_t *src_uv_row = src_uv + (srcy / 2) * sw;
+        uint8_t *dst_uv_row = dst_uv + (y / 2) * dw;
+
+        for (unsigned long x = 0; x < dw; ++x)
+        {
+            unsigned long srcx = (x * xStep) >> 16;
+            if (srcx >= sw) srcx = sw - 1;
+            dst_y_row[x] = src_y_row[srcx];
+
+            if (chroma_row && (x & 1) == 0 && x + 1 < dw)
+            {
+                const unsigned long src_index = (srcx / 2) * 2;  // start of the UV pair
+                dst_uv_row[x]     = src_uv_row[src_index];
+                dst_uv_row[x + 1] = src_uv_row[src_index + 1];
+            }
+        }
+    }
+    return 0;
+}
+
+// CPU nearest-neighbour resize of the NV12 frame `in` (width x height) to rWidth x rHeight.
+// On success *data points at the convertor-owned buffer (*data_len bytes) and the scaler's
+// result is returned; with a NULL input, a missing buffer or a bad size it returns -1.
+int convertor::resizeyuv(unsigned char *in, unsigned char **data, int *data_len){
+    init_resize_yuv();
+    const bool ok = in && nv12 && rWidth > 0 && rHeight > 0;
+    *data = ok ? nv12 : NULL; *data_len = ok ? rWidth*rHeight*3/2 : 0;
+    return ok ? nv12_nearest_scale(in, nv12, width, height, rWidth, rHeight) : -1;
+}
+
+// Creates a convertor for the given source/target size on GPU `gpu`; returns NULL if `gpu` is negative.
+convHandle conv_create(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu){
+    if (gpu >= 0)
+        return new convertor(srcW, srcH, dstW, dstH, gpu);
+    return NULL;
+}
+
+// Destroys a handle returned by conv_create(); a NULL handle is ignored.
+void conv_destroy(convHandle h){
+    if (h)
+        delete static_cast<convertor*>(h);
+}
+
+// Uploads `yuv`, then produces both the source-size BGR image and the resized BGR image.
+// Returns -2 for a NULL handle, otherwise the first non-zero step result, or 0 if every step succeeded.
+int yuv2bgrandresize(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen, unsigned char **scaleBGR, int *scaleBGRLen){
+    if (!h) return -2;
+    convertor *self = static_cast<convertor*>(h);
+    int rc = self->fill_yuv((unsigned char*)yuv);
+    if (rc == 0) rc = self->yuv2bgr(bgr, bgrLen);
+    if (rc == 0) rc = self->resize2bgr(NULL, scaleBGR, scaleBGRLen);
+    return rc;
+}
+
+// Uploads `yuv` and converts it to source-size BGR; -2 for a NULL handle, else the failing step's code or 0.
+int yuv2bgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen){
+    if (!h) return -2;
+    convertor *self = static_cast<convertor*>(h);
+    const int rc = self->fill_yuv((unsigned char*)yuv);
+    return rc != 0 ? rc : self->yuv2bgr(bgr, bgrLen);
+}
+
+// Uploads `yuv` and returns only the resized BGR image.
+// Returns -2 for a NULL handle, otherwise the failing step's code or 0.
+int yuv2resizedbgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen){
+    if (!h) return -2;
+    convertor *self = static_cast<convertor*>(h);
+    const int rc = self->fill_yuv((unsigned char*)yuv);
+    return rc != 0 ? rc : self->resize2bgr(NULL, bgr, bgrLen);
+}
+
+// Resizes the packed BGR image `data`; returns -2 for a NULL handle, else convertor::resize2bgr()'s result.
+int resizebgr(convHandle h, void *data, unsigned char **resized, int *len){
+    if (!h) return -2;
+    return static_cast<convertor*>(h)->resize2bgr(static_cast<unsigned char*>(data), resized, len);
+}
+
+// Resizes the NV12 frame `data` on the CPU; returns -2 for a NULL handle, else convertor::resizeyuv()'s result.
+int resizeyuv(convHandle h, void *data, unsigned char **resized, int *len){
+    if (!h) return -2;
+    return static_cast<convertor*>(h)->resizeyuv(static_cast<unsigned char*>(data), resized, len);
+}
diff --git a/goconv/conv.h b/goconv/conv.h
new file mode 100644
index 0000000..a55d9a8
--- /dev/null
+++ b/goconv/conv.h
@@ -0,0 +1,23 @@
+#ifndef __RESIZE_NPP_H__
+#define __RESIZE_NPP_H__
+// C interface to the YUV/BGR conversion and resize helpers implemented in conv.cpp.
+#ifdef __cplusplus
+extern "C"{
+#endif
+// Opaque handle to one convertor; conv_create() returns NULL when gpu is negative.
+typedef void* convHandle;
+convHandle conv_create(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu);
+// The int-returning calls yield 0 on success and -2 for a NULL handle; output buffers stay owned by the handle.
+void conv_destroy(convHandle h);
+int yuv2bgrandresize(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen, unsigned char **scaleBGR, int *scaleBGRLen);
+int yuv2bgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen);
+int yuv2resizedbgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen);
+// Resize-only entry points: packed BGR input (resizebgr) and NV12 input scaled on the CPU (resizeyuv).
+int resizebgr(convHandle h, void *data, unsigned char **resized, int *len);
+int resizeyuv(convHandle h, void *data, unsigned char **resized, int *len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //__RESIZE_NPP_H__
diff --git a/goconv/goconv.go b/goconv/goconv.go
new file mode 100644
index 0000000..db38a94
--- /dev/null
+++ b/goconv/goconv.go
@@ -0,0 +1,258 @@
+package goconv
+
+/*
+#cgo CFLAGS: -I./ -I./inc -I/usr/local/cuda/include
+#cgo CXXFLAGS: -I./ -I./inc -I/usr/local/cuda/include -std=c++11
+#cgo LDFLAGS: -L/usr/local/cuda/lib64 -lnppig -lnppicc -lnppial -lnppisu -lcudart -ldl
+#include <stdlib.h>
+#include "conv.h"
+*/
+import "C"
+import (
+	"unsafe"
+
+	"basic.com/valib/godraw.git"
+	"basic.com/valib/gogpu.git"
+	"github.com/disintegration/imaging"
+)
+
+const (
+	need     = 200 // second argument to gogpu.SatisfyGPU - presumably the memory a convertor needs; TODO confirm meaning and units
+	reserved = 512 // third SatisfyGPU argument when falling back to lastIndex - presumably memory to keep free; TODO confirm
+)
+
+// gpuIndex picks the GPU for a new convertor: the first index from
+// gogpu.RankGPU() other than lastIndex that passes SatisfyGPU(v, need, need/2),
+// else lastIndex if it passes SatisfyGPU(lastIndex, need, reserved).
+// It returns -1 when no GPU is reported or none qualifies.
+func gpuIndex(lastIndex int) int {
+	ranked := gogpu.RankGPU()
+	if len(ranked) == 0 {
+		return -1
+	}
+	for _, candidate := range ranked {
+		if candidate != lastIndex && gogpu.SatisfyGPU(candidate, need, need/2) {
+			return candidate
+		}
+	}
+	if !gogpu.SatisfyGPU(lastIndex, need, reserved) {
+		return -1
+	}
+	return lastIndex
+}
+
+type convertor struct {
+	width   int // source width in pixels
+	height  int // source height in pixels
+	rWidth  int // target width; YUV2BGR passes -1 because it never resizes
+	rHeight int // target height; -1 likewise
+	conv    C.convHandle // handle from C.conv_create, destroyed by Free
+}
+
+var convts []*convertor // cache of created convertors, searched linearly by find; no lock guards it
+
+// find returns the cached convertor for (w, h) -> (rw, rh), creating and caching one on the GPU
+// chosen by gpuIndex(0) if needed; nil when no GPU qualifies or C.conv_create fails.
+func find(w, h, rw, rh int) *convertor {
+	for _, cached := range convts {
+		if cached.width == w && cached.height == h && cached.rWidth == rw && cached.rHeight == rh {
+			return cached
+		}
+	}
+	gpu := gpuIndex(0)
+	if gpu < 0 {
+		return nil
+	}
+	if handle := C.conv_create(C.int(w), C.int(h), C.int(rw), C.int(rh), C.int(gpu)); handle != nil {
+		convts = append(convts, &convertor{width: w, height: h, rWidth: rw, rHeight: rh, conv: handle})
+		return convts[len(convts)-1]
+	}
+	return nil
+}
+
+// YUV2BGR converts the w x h NV12 frame yuv to packed BGR, on the GPU when a
+// convertor is available and in software otherwise; nil on invalid input or
+// failure. A GPU result aliases convertor-owned C memory (reused by the next call).
+func YUV2BGR(yuv []byte, w, h int) []byte {
+	if w <= 0 || h <= 0 || len(yuv) < w*h+(h/2)*w { // C reads w*h luma bytes plus h/2 rows of w chroma bytes
+		return nil
+	}
+	cw := find(w, h, -1, -1)
+	if cw == nil {
+		return yuv2bgr(yuv, w, h)
+	}
+	var bgr *C.uchar
+	var bgrLen C.int
+	if C.yuv2bgr(cw.conv, unsafe.Pointer(&yuv[0]), &bgr, &bgrLen) != 0 || bgrLen <= 0 {
+		return nil
+	}
+	const maxLen = 0x7fffffff
+	n := int(bgrLen)
+	return (*[maxLen]byte)(unsafe.Pointer(bgr))[:n:n]
+}
+
+// YUV2ResizedBGR converts the w x h NV12 frame yuv to BGR scaled to rw x rh,
+// on the GPU when a convertor is available and in software otherwise. It
+// returns nil on invalid input or failure. A GPU result aliases
+// convertor-owned C memory: it is reused by the next call and invalid after Free.
+func YUV2ResizedBGR(yuv []byte, w, h, rw, rh int) []byte {
+	if w <= 0 || h <= 0 || len(yuv) < w*h+(h/2)*w { // C reads w*h luma bytes plus h/2 rows of w chroma bytes
+		return nil
+	}
+	cw := find(w, h, rw, rh)
+	if cw == nil {
+		bgr := yuv2bgr(yuv, w, h)
+		return bgresize(bgr, w, h, rw, rh)
+	}
+	var bgr *C.uchar
+	var bgrLen C.int
+	if C.yuv2resizedbgr(cw.conv, unsafe.Pointer(&yuv[0]), &bgr, &bgrLen) != 0 || bgrLen <= 0 {
+		return nil
+	}
+	const maxLen = 0x7fffffff
+	n := int(bgrLen)
+	return (*[maxLen]byte)(unsafe.Pointer(bgr))[:n:n]
+}
+
+// ResizeBGR scales the packed w x h BGR image bgrO to rw x rh. The input is
+// returned unchanged when both target dimensions are negative or both exceed
+// the source; nil is returned on invalid input or failure. A GPU result
+// aliases convertor-owned C memory (reused by the next call, invalid after Free).
+func ResizeBGR(bgrO []byte, w, h, rw, rh int) []byte {
+	if (rw < 0 && rh < 0) || (rw > w && rh > h) {
+		return bgrO
+	}
+	if w <= 0 || h <= 0 || len(bgrO) < w*h*3 { // the C side copies w*h*3 bytes starting at &bgrO[0]
+		return nil
+	}
+	cw := find(w, h, rw, rh)
+	if cw == nil {
+		return bgresize(bgrO, w, h, rw, rh)
+	}
+	var bgr *C.uchar
+	var bgrLen C.int
+	if C.resizebgr(cw.conv, unsafe.Pointer(&bgrO[0]), &bgr, &bgrLen) != 0 || bgrLen <= 0 {
+		return nil
+	}
+	const maxLen = 0x7fffffff
+	n := int(bgrLen)
+	return (*[maxLen]byte)(unsafe.Pointer(bgr))[:n:n]
+}
+
+// ResizeYUV scales the w x h NV12 frame yuv to rw x rh with the C side's CPU
+// nearest-neighbour scaler. The input is returned unchanged when both target
+// dimensions are negative, both exceed the source, or no convertor can be
+// obtained; nil is returned on invalid input or failure. The result aliases
+// convertor-owned C memory (reused by the next call, invalid after Free).
+func ResizeYUV(yuv []byte, w, h, rw, rh int) []byte {
+	if (rw < 0 && rh < 0) || (rw > w && rh > h) {
+		return yuv
+	}
+	if w <= 0 || h <= 0 || len(yuv) < w*h+(h/2)*w { // the scaler reads the luma plane plus h/2 chroma rows
+		return nil
+	}
+	cw := find(w, h, rw, rh)
+	if cw == nil {
+		return yuv
+	}
+	var resized *C.uchar
+	var resizedLen C.int
+	if C.resizeyuv(cw.conv, unsafe.Pointer(&yuv[0]), &resized, &resizedLen) != 0 || resizedLen <= 0 {
+		return nil
+	}
+	const maxLen = 0x7fffffff
+	n := int(resizedLen)
+	return (*[maxLen]byte)(unsafe.Pointer(resized))[:n:n]
+}
+
+// YUV2BGRandResize converts the w x h NV12 frame yuv to BGR and also returns
+// a copy scaled to rw x rh (source-size image first, resized image second).
+// Both are nil on invalid input or failure. GPU results alias convertor-owned
+// C memory (reused by the next call, invalid after Free).
+func YUV2BGRandResize(yuv []byte, w, h, rw, rh int) ([]byte, []byte) {
+	if w <= 0 || h <= 0 || len(yuv) < w*h+(h/2)*w { // C reads w*h luma bytes plus h/2 rows of w chroma bytes
+		return nil, nil
+	}
+	cw := find(w, h, rw, rh)
+	if cw == nil {
+		origin := yuv2bgr(yuv, w, h)
+		resized := bgresize(origin, w, h, rw, rh)
+		return origin, resized
+	}
+	var bgr, scale *C.uchar
+	var bgrLen, scaleLen C.int
+	ret := C.yuv2bgrandresize(cw.conv, unsafe.Pointer(&yuv[0]), &bgr, &bgrLen, &scale, &scaleLen)
+	if ret != 0 {
+		return nil, nil
+	}
+
+	var out, resized []byte
+	const maxLen = 0x7fffffff
+	goBGRLen, goScaleLen := int(bgrLen), int(scaleLen)
+	if goBGRLen > 0 {
+		out = (*[maxLen]byte)(unsafe.Pointer(bgr))[:goBGRLen:goBGRLen]
+	}
+	if goScaleLen > 0 {
+		resized = (*[maxLen]byte)(unsafe.Pointer(scale))[:goScaleLen:goScaleLen]
+	}
+	return out, resized
+}
+
+// Free destroys all cached convertors and empties the cache (safe to call twice); earlier GPU-backed results become invalid.
+func Free() {
+	for _, v := range convts {
+		C.conv_destroy(v.conv) // conv_destroy ignores a NULL handle
+		v.conv = nil
+	}
+	convts = nil // later calls to find build fresh convertors instead of reusing destroyed handles
+}
+
+// yuv2bgr is the software fallback for YUV2BGR. It converts a w x h NV12
+// frame (w*h luma bytes, then interleaved U,V pairs - the same layout the GPU
+// path in conv.cpp de-interleaves) into packed pixels in B, G, R byte order,
+// matching what the GPU path produces.
+// It returns nil when the dimensions are not positive or yuv is too short.
+func yuv2bgr(yuv []byte, w, h int) []byte {
+	start := w * h
+	// The chroma pair of pixel (j, i) sits at start + i/2*w + (j &^ 1).
+	if w <= 0 || h <= 0 || len(yuv) < start+(h-1)/2*w+((w-1)&^1)+2 {
+		return nil
+	}
+	// clamp saturates a computed channel value to the 0..255 byte range.
+	clamp := func(c int32) byte {
+		if c > 255 {
+			return 255
+		}
+		if c < 0 {
+			return 0
+		}
+		return byte(c)
+	}
+
+	data := make([]byte, 0, w*h*3)
+	for i := 0; i < h; i++ {
+		for j := 0; j < w; j++ {
+			index := start + i/2*w + (j &^ 1)
+
+			y := int32(yuv[j+i*w])
+			u := int32(yuv[index]) - 128
+			v := int32(yuv[index+1]) - 128
+
+			r := y + (140*v)/100
+			g := y - (34*u+71*v)/100
+			b := y + (177*u)/100
+
+			data = append(data, clamp(b), clamp(g), clamp(r))
+		}
+	}
+	return data
+}
+
+// bgresize is the software resize fallback (nearest neighbour, w x h -> rw x rh); nil if godraw.ToImage fails.
+func bgresize(bgr []byte, w, h, rw, rh int) []byte {
+	src, err := godraw.ToImage(bgr, w, h)
+	if err == nil {
+		return godraw.Image2BGR(imaging.Resize(src, rw, rh, imaging.NearestNeighbor))
+	}
+	return nil
+}
diff --git a/goconv/inc/Exceptions.h b/goconv/inc/Exceptions.h
new file mode 100644
index 0000000..627bfd9
--- /dev/null
+++ b/goconv/inc/Exceptions.h
@@ -0,0 +1,181 @@
+/**
+ * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+#ifndef NV_UTIL_NPP_EXCEPTIONS_H
+#define NV_UTIL_NPP_EXCEPTIONS_H
+
+
+#include <string>
+#include <sstream>
+#include <iostream>
+
+/// All npp related C++ classes are put into the npp namespace.
+namespace npp
+{
+
+    /// Exception base class.
+    ///     This exception base class will be used for everything C++ throughout
+    /// the NPP project.
+    ///     The exception contains a string message, as well as data fields for a string
+    /// containing the name of the file as well as the line number where the exception was thrown.
+    ///     The easiest way of throwing exceptions and providing filename and line number is
+    /// to use one of the ASSERT macros defined for that purpose.
+    class Exception
+    {
+        public:
+            /// Constructor.
+            /// \param rMessage A message with information as to why the exception was thrown.
+            /// \param rFileName The name of the file where the exception was thrown.
+            /// \param nLineNumber Line number in the file where the exception was thrown.
+            explicit
+            Exception(const std::string &rMessage = "", const std::string &rFileName = "", unsigned int nLineNumber = 0)
+                : sMessage_(rMessage), sFileName_(rFileName), nLineNumber_(nLineNumber)
+            { };
+
+            Exception(const Exception &rException)
+                : sMessage_(rException.sMessage_), sFileName_(rException.sFileName_), nLineNumber_(rException.nLineNumber_)
+            { };
+
+            virtual
+            ~Exception()
+            { };
+
+            /// Get the exception's message.
+            const
+            std::string &
+            message()
+            const
+            {
+                return sMessage_;
+            }
+
+            /// Get the name of the file where the exception was thrown.
+            const
+            std::string &
+            fileName()
+            const
+            {
+                return sFileName_;
+            }
+
+            /// Get the line number where the exception was thrown.
+            unsigned int
+            lineNumber()
+            const
+            {
+                return nLineNumber_;
+            }
+
+
+            /// Create a clone of this exception.
+            ///      This creates a new Exception object on the heap. It is
+            /// the responsibility of the user of this function to free this memory
+            /// (delete x).
+            virtual
+            Exception *
+            clone()
+            const
+            {
+                return new Exception(*this);
+            }
+
+            /// Create a single string with all the exception's information,
+            /// formatted as "<fileName>:<lineNumber>: <message>".
+            ///     The virtual toString() method is used by the operator<<()
+            /// so that all exceptions derived from this base-class can print
+            /// their full information correctly even if only a reference to
+            /// the base type is available at the point of printing.
+            virtual
+            std::string
+            toString()
+            const
+            {
+                std::ostringstream oOutputString;
+                oOutputString << fileName() << ":" << lineNumber() << ": " << message();
+                return oOutputString.str();
+            }
+
+        private:
+            std::string sMessage_;      ///< Message regarding the cause of the exception.
+            std::string sFileName_;     ///< Name of the file where the exception was thrown.
+            unsigned int nLineNumber_;  ///< Line number in the file where the exception was thrown.
+    };
+
+    /// Output stream inserter for Exception.
+    /// \param rOutputStream The stream the exception information is written to.
+    /// \param rException The exception that's being written.
+    /// \return Reference to the output stream being used.
+    static std::ostream &
+    operator << (std::ostream &rOutputStream, const Exception &rException)
+    {
+        // Insert the text produced by the virtual toString() and hand the stream back.
+        return rOutputStream << rException.toString();
+    }
+
+    /// Basic assert macro.
+    ///     This macro should be used to enforce any kind of pre- or postcondition.
+    /// Unlike the C-runtime assert macro, this macro does not abort execution, but throws
+    /// an npp::Exception. Its message is the stringified condition C plus a fixed suffix,
+    /// and its file name and line number come from __FILE__ and __LINE__ at the point of use.
+    /// \note The macro is written in such a way that omitting a semicolon after its usage
+    ///     causes a compiler error. The correct way to invoke this macro is:
+    /// NPP_ASSERT(n < MAX);
+#define NPP_ASSERT(C) do {if (!(C)) throw npp::Exception(#C " assertion faild!", __FILE__, __LINE__);} while(false)
+
+    // ASSERT macro.
+    //  Same functionality as the basic assert macro with the added ability to pass
+    //  a message M. M must be a string literal: it is pasted onto the stringified condition.
+    //  Note: Never rely on side effects inside an assert condition; NPP_DEBUG_ASSERT expands
+    //      to nothing unless _DEBUG is defined.
+#define NPP_ASSERT_MSG(C, M) do {if (!(C)) throw npp::Exception(#C " assertion faild! Message: " M, __FILE__, __LINE__);} while(false)
+
+#ifdef _DEBUG
+    /// Basic debug assert macro.
+    ///     With _DEBUG defined this macro behaves like NPP_ASSERT(C) (only the message text differs);
+    /// otherwise it expands to nothing. It is therefore of utmost importance to not put statements
+    /// into this macro that cause side effects required for correct program execution.
+#define NPP_DEBUG_ASSERT(C) do {if (!(C)) throw npp::Exception(#C " debug assertion faild!", __FILE__, __LINE__);} while(false)
+#else
+#define NPP_DEBUG_ASSERT(C)
+#endif
+
+    /// ASSERT for null-pointer test: throws an npp::Exception when P compares equal to 0.
+    /// It is safe to put code with side effects into this macro, because P is evaluated exactly
+    /// once and the macro never gets compiled to a no-op: resource allocation may fail based on
+    /// external causes not under control of a software developer.
+#define NPP_ASSERT_NOT_NULL(P) do {if ((P) == 0) throw npp::Exception(#P " not null assertion faild!", __FILE__, __LINE__);} while(false)
+
+    /// Macro for flagging methods as not implemented.
+    /// It unconditionally throws an npp::Exception stating that the implementation is missing.
+#define NPP_NOT_IMPLEMENTED() do {throw npp::Exception("Implementation missing!", __FILE__, __LINE__);} while(false)
+
+    /// Macro for checking the error return code of CUDA (runtime) calls: on anything other than
+    /// cudaSuccess it prints the code to std::cout and then throws via NPP_ASSERT. Never disabled.
+#define NPP_CHECK_CUDA(S) do {cudaError_t eCUDAResult; \
+        eCUDAResult = S; \
+        if (eCUDAResult != cudaSuccess) std::cout << "NPP_CHECK_CUDA - eCUDAResult = " << eCUDAResult << std::endl; \
+        NPP_ASSERT(eCUDAResult == cudaSuccess);} while (false)
+
+    /// Macro for checking the status of NPP calls: unless it is NPP_SUCCESS, prints its name and value to std::cout and throws.
+#define NPP_CHECK_NPP(S) do {NppStatus eStatusNPP; \
+        eStatusNPP = S; \
+        if (eStatusNPP != NPP_SUCCESS) std::cout << "NPP_CHECK_NPP - eStatusNPP = " << _cudaGetErrorEnum(eStatusNPP) << "("<< eStatusNPP << ")" << std::endl; \
+        NPP_ASSERT(eStatusNPP == NPP_SUCCESS);} while (false)
+
+    /// Macro for checking error return codes from cuFFT calls.
+#define NPP_CHECK_CUFFT(S) do {cufftResult eCUFFTResult; \
+        eCUFFTResult = S; \
+        if (eCUFFTResult != NPP_SUCCESS) std::cout << "NPP_CHECK_CUFFT - eCUFFTResult = " << eCUFFTResult << std::endl; \
+        NPP_ASSERT(eCUFFTResult == CUFFT_SUCCESS);} while (false)
+
+} // npp namespace
+
+#endif // NV_UTIL_NPP_EXCEPTIONS_H
diff --git a/goconv/inc/helper_cuda.h b/goconv/inc/helper_cuda.h
new file mode 100644
index 0000000..b24684c
--- /dev/null
+++ b/goconv/inc/helper_cuda.h
@@ -0,0 +1,1261 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef HELPER_CUDA_H
+#define HELPER_CUDA_H
+
+#pragma once
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <helper_string.h>
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// Note: your SDK sample is required to include the proper header files. Please
+// refer to the CUDA examples for the needed CUDA headers, which may change depending
+// on which CUDA functions are used.
+
+// CUDA Runtime error messages: maps a cudaError_t to its enumerator name as a string literal ("<unknown>" if not listed).
+#ifdef __DRIVER_TYPES_H__
+static const char *_cudaGetErrorEnum(cudaError_t error)
+{
+    switch (error)
+    {
+        case cudaSuccess:
+            return "cudaSuccess";
+
+        case cudaErrorMissingConfiguration:
+            return "cudaErrorMissingConfiguration";
+
+        case cudaErrorMemoryAllocation:
+            return "cudaErrorMemoryAllocation";
+
+        case cudaErrorInitializationError:
+            return "cudaErrorInitializationError";
+
+        case cudaErrorLaunchFailure:
+            return "cudaErrorLaunchFailure";
+
+        case cudaErrorPriorLaunchFailure:
+            return "cudaErrorPriorLaunchFailure";
+
+        case cudaErrorLaunchTimeout:
+            return "cudaErrorLaunchTimeout";
+
+        case cudaErrorLaunchOutOfResources:
+            return "cudaErrorLaunchOutOfResources";
+
+        case cudaErrorInvalidDeviceFunction:
+            return "cudaErrorInvalidDeviceFunction";
+
+        case cudaErrorInvalidConfiguration:
+            return "cudaErrorInvalidConfiguration";
+
+        case cudaErrorInvalidDevice:
+            return "cudaErrorInvalidDevice";
+
+        case cudaErrorInvalidValue:
+            return "cudaErrorInvalidValue";
+
+        case cudaErrorInvalidPitchValue:
+            return "cudaErrorInvalidPitchValue";
+
+        case cudaErrorInvalidSymbol:
+            return "cudaErrorInvalidSymbol";
+
+        case cudaErrorMapBufferObjectFailed:
+            return "cudaErrorMapBufferObjectFailed";
+
+        case cudaErrorUnmapBufferObjectFailed:
+            return "cudaErrorUnmapBufferObjectFailed";
+
+        case cudaErrorInvalidHostPointer:
+            return "cudaErrorInvalidHostPointer";
+
+        case cudaErrorInvalidDevicePointer:
+            return "cudaErrorInvalidDevicePointer";
+
+        case cudaErrorInvalidTexture:
+            return "cudaErrorInvalidTexture";
+
+        case cudaErrorInvalidTextureBinding:
+            return "cudaErrorInvalidTextureBinding";
+
+        case cudaErrorInvalidChannelDescriptor:
+            return "cudaErrorInvalidChannelDescriptor";
+
+        case cudaErrorInvalidMemcpyDirection:
+            return "cudaErrorInvalidMemcpyDirection";
+
+        case cudaErrorAddressOfConstant:
+            return "cudaErrorAddressOfConstant";
+
+        case cudaErrorTextureFetchFailed:
+            return "cudaErrorTextureFetchFailed";
+
+        case cudaErrorTextureNotBound:
+            return "cudaErrorTextureNotBound";
+
+        case cudaErrorSynchronizationError:
+            return "cudaErrorSynchronizationError";
+
+        case cudaErrorInvalidFilterSetting:
+            return "cudaErrorInvalidFilterSetting";
+
+        case cudaErrorInvalidNormSetting:
+            return "cudaErrorInvalidNormSetting";
+
+        case cudaErrorMixedDeviceExecution:
+            return "cudaErrorMixedDeviceExecution";
+
+        case cudaErrorCudartUnloading:
+            return "cudaErrorCudartUnloading";
+
+        case cudaErrorUnknown:
+            return "cudaErrorUnknown";
+
+        case cudaErrorNotYetImplemented:
+            return "cudaErrorNotYetImplemented";
+
+        case cudaErrorMemoryValueTooLarge:
+            return "cudaErrorMemoryValueTooLarge";
+
+        case cudaErrorInvalidResourceHandle:
+            return "cudaErrorInvalidResourceHandle";
+
+        case cudaErrorNotReady:
+            return "cudaErrorNotReady";
+
+        case cudaErrorInsufficientDriver:
+            return "cudaErrorInsufficientDriver";
+
+        case cudaErrorSetOnActiveProcess:
+            return "cudaErrorSetOnActiveProcess";
+
+        case cudaErrorInvalidSurface:
+            return "cudaErrorInvalidSurface";
+
+        case cudaErrorNoDevice:
+            return "cudaErrorNoDevice";
+
+        case cudaErrorECCUncorrectable:
+            return "cudaErrorECCUncorrectable";
+
+        case cudaErrorSharedObjectSymbolNotFound:
+            return "cudaErrorSharedObjectSymbolNotFound";
+
+        case cudaErrorSharedObjectInitFailed:
+            return "cudaErrorSharedObjectInitFailed";
+
+        case cudaErrorUnsupportedLimit:
+            return "cudaErrorUnsupportedLimit";
+
+        case cudaErrorDuplicateVariableName:
+            return "cudaErrorDuplicateVariableName";
+
+        case cudaErrorDuplicateTextureName:
+            return "cudaErrorDuplicateTextureName";
+
+        case cudaErrorDuplicateSurfaceName:
+            return "cudaErrorDuplicateSurfaceName";
+
+        case cudaErrorDevicesUnavailable:
+            return "cudaErrorDevicesUnavailable";
+
+        case cudaErrorInvalidKernelImage:
+            return "cudaErrorInvalidKernelImage";
+
+        case cudaErrorNoKernelImageForDevice:
+            return "cudaErrorNoKernelImageForDevice";
+
+        case cudaErrorIncompatibleDriverContext:
+            return "cudaErrorIncompatibleDriverContext";
+
+        case cudaErrorPeerAccessAlreadyEnabled:
+            return "cudaErrorPeerAccessAlreadyEnabled";
+
+        case cudaErrorPeerAccessNotEnabled:
+            return "cudaErrorPeerAccessNotEnabled";
+
+        case cudaErrorDeviceAlreadyInUse:
+            return "cudaErrorDeviceAlreadyInUse";
+
+        case cudaErrorProfilerDisabled:
+            return "cudaErrorProfilerDisabled";
+
+        case cudaErrorProfilerNotInitialized:
+            return "cudaErrorProfilerNotInitialized";
+
+        case cudaErrorProfilerAlreadyStarted:
+            return "cudaErrorProfilerAlreadyStarted";
+
+        case cudaErrorProfilerAlreadyStopped:
+            return "cudaErrorProfilerAlreadyStopped";
+
+        /* Since CUDA 4.0*/
+        case cudaErrorAssert:
+            return "cudaErrorAssert";
+
+        case cudaErrorTooManyPeers:
+            return "cudaErrorTooManyPeers";
+
+        case cudaErrorHostMemoryAlreadyRegistered:
+            return "cudaErrorHostMemoryAlreadyRegistered";
+
+        case cudaErrorHostMemoryNotRegistered:
+            return "cudaErrorHostMemoryNotRegistered";
+
+        /* Since CUDA 5.0 */
+        case cudaErrorOperatingSystem:
+            return "cudaErrorOperatingSystem";
+
+        case cudaErrorPeerAccessUnsupported:
+            return "cudaErrorPeerAccessUnsupported";
+
+        case cudaErrorLaunchMaxDepthExceeded:
+            return "cudaErrorLaunchMaxDepthExceeded";
+
+        case cudaErrorLaunchFileScopedTex:
+            return "cudaErrorLaunchFileScopedTex";
+
+        case cudaErrorLaunchFileScopedSurf:
+            return "cudaErrorLaunchFileScopedSurf";
+
+        case cudaErrorSyncDepthExceeded:
+            return "cudaErrorSyncDepthExceeded";
+
+        case cudaErrorLaunchPendingCountExceeded:
+            return "cudaErrorLaunchPendingCountExceeded";
+
+        case cudaErrorNotPermitted:
+            return "cudaErrorNotPermitted";
+
+        case cudaErrorNotSupported:
+            return "cudaErrorNotSupported";
+
+        /* Since CUDA 6.0 */
+        case cudaErrorHardwareStackError:
+            return "cudaErrorHardwareStackError";
+
+        case cudaErrorIllegalInstruction:
+            return "cudaErrorIllegalInstruction";
+
+        case cudaErrorMisalignedAddress:
+            return "cudaErrorMisalignedAddress";
+
+        case cudaErrorInvalidAddressSpace:
+            return "cudaErrorInvalidAddressSpace";
+
+        case cudaErrorInvalidPc:
+            return "cudaErrorInvalidPc";
+
+        case cudaErrorIllegalAddress:
+            return "cudaErrorIllegalAddress";
+
+        /* Since CUDA 6.5*/
+        case cudaErrorInvalidPtx:
+            return "cudaErrorInvalidPtx";
+
+        case cudaErrorInvalidGraphicsContext:
+            return "cudaErrorInvalidGraphicsContext";
+
+        case cudaErrorStartupFailure:
+            return "cudaErrorStartupFailure";
+
+        case cudaErrorApiFailureBase:
+            return "cudaErrorApiFailureBase";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef __cuda_cuda_h__
+// CUDA Driver API errors: maps a CUresult to its enumerator name as a string literal ("<unknown>" if not listed).
+static const char *_cudaGetErrorEnum(CUresult error)
+{
+    switch (error)
+    {
+        case CUDA_SUCCESS:
+            return "CUDA_SUCCESS";
+
+        case CUDA_ERROR_INVALID_VALUE:
+            return "CUDA_ERROR_INVALID_VALUE";
+
+        case CUDA_ERROR_OUT_OF_MEMORY:
+            return "CUDA_ERROR_OUT_OF_MEMORY";
+
+        case CUDA_ERROR_NOT_INITIALIZED:
+            return "CUDA_ERROR_NOT_INITIALIZED";
+
+        case CUDA_ERROR_DEINITIALIZED:
+            return "CUDA_ERROR_DEINITIALIZED";
+
+        case CUDA_ERROR_PROFILER_DISABLED:
+            return "CUDA_ERROR_PROFILER_DISABLED";
+
+        case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
+            return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
+
+        case CUDA_ERROR_PROFILER_ALREADY_STARTED:
+            return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
+
+        case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
+            return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
+
+        case CUDA_ERROR_NO_DEVICE:
+            return "CUDA_ERROR_NO_DEVICE";
+
+        case CUDA_ERROR_INVALID_DEVICE:
+            return "CUDA_ERROR_INVALID_DEVICE";
+
+        case CUDA_ERROR_INVALID_IMAGE:
+            return "CUDA_ERROR_INVALID_IMAGE";
+
+        case CUDA_ERROR_INVALID_CONTEXT:
+            return "CUDA_ERROR_INVALID_CONTEXT";
+
+        case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
+            return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
+
+        case CUDA_ERROR_MAP_FAILED:
+            return "CUDA_ERROR_MAP_FAILED";
+
+        case CUDA_ERROR_UNMAP_FAILED:
+            return "CUDA_ERROR_UNMAP_FAILED";
+
+        case CUDA_ERROR_ARRAY_IS_MAPPED:
+            return "CUDA_ERROR_ARRAY_IS_MAPPED";
+
+        case CUDA_ERROR_ALREADY_MAPPED:
+            return "CUDA_ERROR_ALREADY_MAPPED";
+
+        case CUDA_ERROR_NO_BINARY_FOR_GPU:
+            return "CUDA_ERROR_NO_BINARY_FOR_GPU";
+
+        case CUDA_ERROR_ALREADY_ACQUIRED:
+            return "CUDA_ERROR_ALREADY_ACQUIRED";
+
+        case CUDA_ERROR_NOT_MAPPED:
+            return "CUDA_ERROR_NOT_MAPPED";
+
+        case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
+            return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
+
+        case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
+            return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
+
+        case CUDA_ERROR_ECC_UNCORRECTABLE:
+            return "CUDA_ERROR_ECC_UNCORRECTABLE";
+
+        case CUDA_ERROR_UNSUPPORTED_LIMIT:
+            return "CUDA_ERROR_UNSUPPORTED_LIMIT";
+
+        case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
+            return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
+
+        case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED:
+            return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED";
+
+        case CUDA_ERROR_INVALID_PTX:
+            return "CUDA_ERROR_INVALID_PTX";
+
+        case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT:
+            return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT";
+
+        case CUDA_ERROR_INVALID_SOURCE:
+            return "CUDA_ERROR_INVALID_SOURCE";
+
+        case CUDA_ERROR_FILE_NOT_FOUND:
+            return "CUDA_ERROR_FILE_NOT_FOUND";
+
+        case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
+            return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
+
+        case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
+            return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
+
+        case CUDA_ERROR_OPERATING_SYSTEM:
+            return "CUDA_ERROR_OPERATING_SYSTEM";
+
+        case CUDA_ERROR_INVALID_HANDLE:
+            return "CUDA_ERROR_INVALID_HANDLE";
+
+        case CUDA_ERROR_NOT_FOUND:
+            return "CUDA_ERROR_NOT_FOUND";
+
+        case CUDA_ERROR_NOT_READY:
+            return "CUDA_ERROR_NOT_READY";
+
+        case CUDA_ERROR_ILLEGAL_ADDRESS:
+            return "CUDA_ERROR_ILLEGAL_ADDRESS";
+
+        case CUDA_ERROR_LAUNCH_FAILED:
+            return "CUDA_ERROR_LAUNCH_FAILED";
+
+        case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
+            return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
+
+        case CUDA_ERROR_LAUNCH_TIMEOUT:
+            return "CUDA_ERROR_LAUNCH_TIMEOUT";
+
+        case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
+            return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
+
+        case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
+            return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
+
+        case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
+            return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
+
+        case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
+            return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
+
+        case CUDA_ERROR_CONTEXT_IS_DESTROYED:
+            return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
+
+        case CUDA_ERROR_ASSERT:
+            return "CUDA_ERROR_ASSERT";
+
+        case CUDA_ERROR_TOO_MANY_PEERS:
+            return "CUDA_ERROR_TOO_MANY_PEERS";
+
+        case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
+            return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
+
+        case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
+            return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
+
+        case CUDA_ERROR_HARDWARE_STACK_ERROR:
+            return "CUDA_ERROR_HARDWARE_STACK_ERROR";
+
+        case CUDA_ERROR_ILLEGAL_INSTRUCTION:
+            return "CUDA_ERROR_ILLEGAL_INSTRUCTION";
+
+        case CUDA_ERROR_MISALIGNED_ADDRESS:
+            return "CUDA_ERROR_MISALIGNED_ADDRESS";
+
+        case CUDA_ERROR_INVALID_ADDRESS_SPACE:
+            return "CUDA_ERROR_INVALID_ADDRESS_SPACE";
+
+        case CUDA_ERROR_INVALID_PC:
+            return "CUDA_ERROR_INVALID_PC";
+
+        case CUDA_ERROR_NOT_PERMITTED:
+            return "CUDA_ERROR_NOT_PERMITTED";
+
+        case CUDA_ERROR_NOT_SUPPORTED:
+            return "CUDA_ERROR_NOT_SUPPORTED";
+
+        case CUDA_ERROR_UNKNOWN:
+            return "CUDA_ERROR_UNKNOWN";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef CUBLAS_API_H_
+// cuBLAS API errors: maps a cublasStatus_t to its enumerator name as a string literal ("<unknown>" if not listed).
+static const char *_cudaGetErrorEnum(cublasStatus_t error)
+{
+    switch (error)
+    {
+        case CUBLAS_STATUS_SUCCESS:
+            return "CUBLAS_STATUS_SUCCESS";
+
+        case CUBLAS_STATUS_NOT_INITIALIZED:
+            return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+        case CUBLAS_STATUS_ALLOC_FAILED:
+            return "CUBLAS_STATUS_ALLOC_FAILED";
+
+        case CUBLAS_STATUS_INVALID_VALUE:
+            return "CUBLAS_STATUS_INVALID_VALUE";
+
+        case CUBLAS_STATUS_ARCH_MISMATCH:
+            return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+        case CUBLAS_STATUS_MAPPING_ERROR:
+            return "CUBLAS_STATUS_MAPPING_ERROR";
+
+        case CUBLAS_STATUS_EXECUTION_FAILED:
+            return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+        case CUBLAS_STATUS_INTERNAL_ERROR:
+            return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+        case CUBLAS_STATUS_NOT_SUPPORTED:
+            return "CUBLAS_STATUS_NOT_SUPPORTED";
+
+        case CUBLAS_STATUS_LICENSE_ERROR:
+            return "CUBLAS_STATUS_LICENSE_ERROR";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef _CUFFT_H_
+// cuFFT API errors: maps a cufftResult to its enumerator name as a string literal ("<unknown>" if not listed).
+static const char *_cudaGetErrorEnum(cufftResult error)
+{
+    switch (error)
+    {
+        case CUFFT_SUCCESS:
+            return "CUFFT_SUCCESS";
+
+        case CUFFT_INVALID_PLAN:
+            return "CUFFT_INVALID_PLAN";
+
+        case CUFFT_ALLOC_FAILED:
+            return "CUFFT_ALLOC_FAILED";
+
+        case CUFFT_INVALID_TYPE:
+            return "CUFFT_INVALID_TYPE";
+
+        case CUFFT_INVALID_VALUE:
+            return "CUFFT_INVALID_VALUE";
+
+        case CUFFT_INTERNAL_ERROR:
+            return "CUFFT_INTERNAL_ERROR";
+
+        case CUFFT_EXEC_FAILED:
+            return "CUFFT_EXEC_FAILED";
+
+        case CUFFT_SETUP_FAILED:
+            return "CUFFT_SETUP_FAILED";
+
+        case CUFFT_INVALID_SIZE:
+            return "CUFFT_INVALID_SIZE";
+
+        case CUFFT_UNALIGNED_DATA:
+            return "CUFFT_UNALIGNED_DATA";
+
+        case CUFFT_INCOMPLETE_PARAMETER_LIST:
+            return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+
+        case CUFFT_INVALID_DEVICE:
+            return "CUFFT_INVALID_DEVICE";
+
+        case CUFFT_PARSE_ERROR:
+            return "CUFFT_PARSE_ERROR";
+
+        case CUFFT_NO_WORKSPACE:
+            return "CUFFT_NO_WORKSPACE";
+
+        case CUFFT_NOT_IMPLEMENTED:
+            return "CUFFT_NOT_IMPLEMENTED";
+
+        case CUFFT_LICENSE_ERROR:
+            return "CUFFT_LICENSE_ERROR";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+
+#ifdef CUSPARSEAPI
+// cuSPARSE API errors: maps a cusparseStatus_t to its enumerator name as a string literal ("<unknown>" if not listed).
+static const char *_cudaGetErrorEnum(cusparseStatus_t error)
+{
+    switch (error)
+    {
+        case CUSPARSE_STATUS_SUCCESS:
+            return "CUSPARSE_STATUS_SUCCESS";
+
+        case CUSPARSE_STATUS_NOT_INITIALIZED:
+            return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+        case CUSPARSE_STATUS_ALLOC_FAILED:
+            return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+        case CUSPARSE_STATUS_INVALID_VALUE:
+            return "CUSPARSE_STATUS_INVALID_VALUE";
+
+        case CUSPARSE_STATUS_ARCH_MISMATCH:
+            return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+        case CUSPARSE_STATUS_MAPPING_ERROR:
+            return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+        case CUSPARSE_STATUS_EXECUTION_FAILED:
+            return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+        case CUSPARSE_STATUS_INTERNAL_ERROR:
+            return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+            return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef CUSOLVER_COMMON_H_
+// cuSOLVER API errors: maps a cusolverStatus_t to its enumerator name as a string literal ("<unknown>" if not listed).
+static const char *_cudaGetErrorEnum(cusolverStatus_t error)
+{
+   switch(error)
+   {
+       case CUSOLVER_STATUS_SUCCESS:
+           return "CUSOLVER_STATUS_SUCCESS";
+       case CUSOLVER_STATUS_NOT_INITIALIZED:
+           return "CUSOLVER_STATUS_NOT_INITIALIZED";
+       case CUSOLVER_STATUS_ALLOC_FAILED:
+           return "CUSOLVER_STATUS_ALLOC_FAILED";
+       case CUSOLVER_STATUS_INVALID_VALUE:
+           return "CUSOLVER_STATUS_INVALID_VALUE";
+       case CUSOLVER_STATUS_ARCH_MISMATCH:
+           return "CUSOLVER_STATUS_ARCH_MISMATCH";
+       case CUSOLVER_STATUS_MAPPING_ERROR:
+           return "CUSOLVER_STATUS_MAPPING_ERROR";
+       case CUSOLVER_STATUS_EXECUTION_FAILED:
+           return "CUSOLVER_STATUS_EXECUTION_FAILED";
+       case CUSOLVER_STATUS_INTERNAL_ERROR:
+           return "CUSOLVER_STATUS_INTERNAL_ERROR";
+       case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+           return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+       case CUSOLVER_STATUS_NOT_SUPPORTED:
+           return "CUSOLVER_STATUS_NOT_SUPPORTED";
+       case CUSOLVER_STATUS_ZERO_PIVOT:
+           return "CUSOLVER_STATUS_ZERO_PIVOT";
+       case CUSOLVER_STATUS_INVALID_LICENSE:
+           return "CUSOLVER_STATUS_INVALID_LICENSE";
+    }
+
+    return "<unknown>";
+
+}
+#endif
+
+#ifdef CURAND_H_
+// cuRAND API errors: maps a curandStatus_t to its enumerator name as a string literal ("<unknown>" if not listed).
+static const char *_cudaGetErrorEnum(curandStatus_t error)
+{
+    switch (error)
+    {
+        case CURAND_STATUS_SUCCESS:
+            return "CURAND_STATUS_SUCCESS";
+
+        case CURAND_STATUS_VERSION_MISMATCH:
+            return "CURAND_STATUS_VERSION_MISMATCH";
+
+        case CURAND_STATUS_NOT_INITIALIZED:
+            return "CURAND_STATUS_NOT_INITIALIZED";
+
+        case CURAND_STATUS_ALLOCATION_FAILED:
+            return "CURAND_STATUS_ALLOCATION_FAILED";
+
+        case CURAND_STATUS_TYPE_ERROR:
+            return "CURAND_STATUS_TYPE_ERROR";
+
+        case CURAND_STATUS_OUT_OF_RANGE:
+            return "CURAND_STATUS_OUT_OF_RANGE";
+
+        case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+            return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+
+        case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+            return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+
+        case CURAND_STATUS_LAUNCH_FAILURE:
+            return "CURAND_STATUS_LAUNCH_FAILURE";
+
+        case CURAND_STATUS_PREEXISTING_FAILURE:
+            return "CURAND_STATUS_PREEXISTING_FAILURE";
+
+        case CURAND_STATUS_INITIALIZATION_FAILED:
+            return "CURAND_STATUS_INITIALIZATION_FAILED";
+
+        case CURAND_STATUS_ARCH_MISMATCH:
+            return "CURAND_STATUS_ARCH_MISMATCH";
+
+        case CURAND_STATUS_INTERNAL_ERROR:
+            return "CURAND_STATUS_INTERNAL_ERROR";
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef NV_NPPIDEFS_H
+// NPP API errors
+static const char *_cudaGetErrorEnum(NppStatus error)
+{
+    switch (error)
+    {
+        case NPP_NOT_SUPPORTED_MODE_ERROR:
+            return "NPP_NOT_SUPPORTED_MODE_ERROR";
+
+        case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
+            return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
+
+        case NPP_RESIZE_NO_OPERATION_ERROR:
+            return "NPP_RESIZE_NO_OPERATION_ERROR";
+
+        case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
+            return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+        case NPP_BAD_ARG_ERROR:
+            return "NPP_BAD_ARGUMENT_ERROR";
+
+        case NPP_COEFF_ERROR:
+            return "NPP_COEFFICIENT_ERROR";
+
+        case NPP_RECT_ERROR:
+            return "NPP_RECTANGLE_ERROR";
+
+        case NPP_QUAD_ERROR:
+            return "NPP_QUADRANGLE_ERROR";
+
+        case NPP_MEM_ALLOC_ERR:
+            return "NPP_MEMORY_ALLOCATION_ERROR";
+
+        case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
+            return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+        case NPP_INVALID_INPUT:
+            return "NPP_INVALID_INPUT";
+
+        case NPP_POINTER_ERROR:
+            return "NPP_POINTER_ERROR";
+
+        case NPP_WARNING:
+            return "NPP_WARNING";
+
+        case NPP_ODD_ROI_WARNING:
+            return "NPP_ODD_ROI_WARNING";
+#else
+
+            // These are for CUDA 5.5 or higher
+        case NPP_BAD_ARGUMENT_ERROR:
+            return "NPP_BAD_ARGUMENT_ERROR";
+
+        case NPP_COEFFICIENT_ERROR:
+            return "NPP_COEFFICIENT_ERROR";
+
+        case NPP_RECTANGLE_ERROR:
+            return "NPP_RECTANGLE_ERROR";
+
+        case NPP_QUADRANGLE_ERROR:
+            return "NPP_QUADRANGLE_ERROR";
+
+        case NPP_MEMORY_ALLOCATION_ERR:
+            return "NPP_MEMORY_ALLOCATION_ERROR";
+
+        case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
+            return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+        case NPP_INVALID_HOST_POINTER_ERROR:
+            return "NPP_INVALID_HOST_POINTER_ERROR";
+
+        case NPP_INVALID_DEVICE_POINTER_ERROR:
+            return "NPP_INVALID_DEVICE_POINTER_ERROR";
+#endif
+
+        case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
+            return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
+
+        case NPP_TEXTURE_BIND_ERROR:
+            return "NPP_TEXTURE_BIND_ERROR";
+
+        case NPP_WRONG_INTERSECTION_ROI_ERROR:
+            return "NPP_WRONG_INTERSECTION_ROI_ERROR";
+
+        case NPP_NOT_EVEN_STEP_ERROR:
+            return "NPP_NOT_EVEN_STEP_ERROR";
+
+        case NPP_INTERPOLATION_ERROR:
+            return "NPP_INTERPOLATION_ERROR";
+
+        case NPP_RESIZE_FACTOR_ERROR:
+            return "NPP_RESIZE_FACTOR_ERROR";
+
+        case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
+            return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
+
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+        case NPP_MEMFREE_ERR:
+            return "NPP_MEMFREE_ERR";
+
+        case NPP_MEMSET_ERR:
+            return "NPP_MEMSET_ERR";
+
+        case NPP_MEMCPY_ERR:
+            return "NPP_MEMCPY_ERROR";
+
+        case NPP_MIRROR_FLIP_ERR:
+            return "NPP_MIRROR_FLIP_ERR";
+#else
+
+        case NPP_MEMFREE_ERROR:
+            return "NPP_MEMFREE_ERROR";
+
+        case NPP_MEMSET_ERROR:
+            return "NPP_MEMSET_ERROR";
+
+        case NPP_MEMCPY_ERROR:
+            return "NPP_MEMCPY_ERROR";
+
+        case NPP_MIRROR_FLIP_ERROR:
+            return "NPP_MIRROR_FLIP_ERROR";
+#endif
+
+        case NPP_ALIGNMENT_ERROR:
+            return "NPP_ALIGNMENT_ERROR";
+
+        case NPP_STEP_ERROR:
+            return "NPP_STEP_ERROR";
+
+        case NPP_SIZE_ERROR:
+            return "NPP_SIZE_ERROR";
+
+        case NPP_NULL_POINTER_ERROR:
+            return "NPP_NULL_POINTER_ERROR";
+
+        case NPP_CUDA_KERNEL_EXECUTION_ERROR:
+            return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
+
+        case NPP_NOT_IMPLEMENTED_ERROR:
+            return "NPP_NOT_IMPLEMENTED_ERROR";
+
+        case NPP_ERROR:
+            return "NPP_ERROR";
+
+        case NPP_SUCCESS:
+            return "NPP_SUCCESS";
+
+        case NPP_WRONG_INTERSECTION_QUAD_WARNING:
+            return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
+
+        case NPP_MISALIGNED_DST_ROI_WARNING:
+            return "NPP_MISALIGNED_DST_ROI_WARNING";
+
+        case NPP_AFFINE_QUAD_INCORRECT_WARNING:
+            return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
+
+        case NPP_DOUBLE_SIZE_WARNING:
+            return "NPP_DOUBLE_SIZE_WARNING";
+
+        case NPP_WRONG_INTERSECTION_ROI_WARNING:
+            return "NPP_WRONG_INTERSECTION_ROI_WARNING";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
+        /* These are 6.0 or higher */
+        case NPP_LUT_PALETTE_BITSIZE_ERROR:
+            return "NPP_LUT_PALETTE_BITSIZE_ERROR";
+
+        case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
+            return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
+
+        case NPP_QUALITY_INDEX_ERROR:
+            return "NPP_QUALITY_INDEX_ERROR";
+
+        case NPP_CHANNEL_ORDER_ERROR:
+            return "NPP_CHANNEL_ORDER_ERROR";
+
+        case NPP_ZERO_MASK_VALUE_ERROR:
+            return "NPP_ZERO_MASK_VALUE_ERROR";
+
+        case NPP_NUMBER_OF_CHANNELS_ERROR:
+            return "NPP_NUMBER_OF_CHANNELS_ERROR";
+
+        case NPP_COI_ERROR:
+            return "NPP_COI_ERROR";
+
+        case NPP_DIVISOR_ERROR:
+            return "NPP_DIVISOR_ERROR";
+
+        case NPP_CHANNEL_ERROR:
+            return "NPP_CHANNEL_ERROR";
+
+        case NPP_STRIDE_ERROR:
+            return "NPP_STRIDE_ERROR";
+
+        case NPP_ANCHOR_ERROR:
+            return "NPP_ANCHOR_ERROR";
+
+        case NPP_MASK_SIZE_ERROR:
+            return "NPP_MASK_SIZE_ERROR";
+
+        case NPP_MOMENT_00_ZERO_ERROR:
+            return "NPP_MOMENT_00_ZERO_ERROR";
+
+        case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
+            return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
+
+        case NPP_THRESHOLD_ERROR:
+            return "NPP_THRESHOLD_ERROR";
+
+        case NPP_CONTEXT_MATCH_ERROR:
+            return "NPP_CONTEXT_MATCH_ERROR";
+
+        case NPP_FFT_FLAG_ERROR:
+            return "NPP_FFT_FLAG_ERROR";
+
+        case NPP_FFT_ORDER_ERROR:
+            return "NPP_FFT_ORDER_ERROR";
+
+        case NPP_SCALE_RANGE_ERROR:
+            return "NPP_SCALE_RANGE_ERROR";
+
+        case NPP_DATA_TYPE_ERROR:
+            return "NPP_DATA_TYPE_ERROR";
+
+        case NPP_OUT_OFF_RANGE_ERROR:
+            return "NPP_OUT_OFF_RANGE_ERROR";
+
+        case NPP_DIVIDE_BY_ZERO_ERROR:
+            return "NPP_DIVIDE_BY_ZERO_ERROR";
+
+        case NPP_RANGE_ERROR:
+            return "NPP_RANGE_ERROR";
+
+        case NPP_NO_MEMORY_ERROR:
+            return "NPP_NO_MEMORY_ERROR";
+
+        case NPP_ERROR_RESERVED:
+            return "NPP_ERROR_RESERVED";
+
+        case NPP_NO_OPERATION_WARNING:
+            return "NPP_NO_OPERATION_WARNING";
+
+        case NPP_DIVIDE_BY_ZERO_WARNING:
+            return "NPP_DIVIDE_BY_ZERO_WARNING";
+#endif
+
+    }
+
+    return "<unknown>";
+}
+#endif
+
+#ifdef __DRIVER_TYPES_H__ // NOTE(review): presumably the include guard of CUDA's driver_types.h - confirm
+#ifndef DEVICE_RESET
+#define DEVICE_RESET cudaDeviceReset(); // expands to a complete statement: the trailing ';' is part of the macro
+#endif
+#else
+#ifndef DEVICE_RESET
+#define DEVICE_RESET // expands to nothing when __DRIVER_TYPES_H__ is not defined
+#endif
+#endif
+
+template< typename T >
+void check(T result, char const *const func, const char *const file, int const line)
+{
+    // Any non-zero status is fatal: report it on stderr, run DEVICE_RESET and exit with EXIT_FAILURE.
+    if (result)
+    {
+        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
+                file, line, static_cast<int>(result), _cudaGetErrorEnum(result), func); // int cast matches %d
+        DEVICE_RESET // must run before exit(); the macro supplies its own ';' when it is non-empty
+        exit(EXIT_FAILURE);
+    }
+}
+
+#ifdef __DRIVER_TYPES_H__
+// Passes the status of a CUDA host call to check(), together with the call's source text (#val) and its location; check() exits on a non-zero status
+#define checkCudaErrors(val)           check ( (val), #val, __FILE__, __LINE__ )
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg)      __getLastCudaError (msg, __FILE__, __LINE__)
+// Backend of getLastCudaError: if cudaGetLastError() is not cudaSuccess, prints file(line), errorMessage, the code and its string, then runs DEVICE_RESET and exits
+inline void __getLastCudaError(const char *errorMessage, const char *file, const int line)
+{
+    cudaError_t err = cudaGetLastError();
+
+    if (cudaSuccess != err)
+    {
+        fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
+                file, line, errorMessage, (int)err, cudaGetErrorString(err));
+        DEVICE_RESET
+        exit(EXIT_FAILURE);
+    }
+}
+#endif
+
+#ifndef MAX // fallback definition; being a macro, it evaluates the larger argument twice
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#endif
+
+// Float To Int conversion: rounds to the nearest integer, halfway cases away from zero
+inline int ftoi(float value)
+{
+    return (value >= 0 ? (int)(value + 0.5) : (int)(value - 0.5)); // 0.5 is a double literal: the sum is computed in double, then truncated
+}
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2Cores(int major, int minor)
+{
+    // Returns the CUDA cores per multiprocessor for SM version major.minor; an unlisted version gets the last real entry.
+    // SM is encoded as 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
+    typedef struct { int SM; int Cores; } sSMtoCores;
+    static const sSMtoCores nGpuArchCoresPerSM[] =
+    {
+        { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
+        { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
+        { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
+        { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
+        { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
+        { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
+        { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
+        { 0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class
+        { 0x53, 128}, // Maxwell Generation (SM 5.3) GM20x class
+        { 0x60, 64 }, // Pascal Generation (SM 6.0) GP100 class
+        { 0x61, 128}, // Pascal Generation (SM 6.1) GP10x class
+        { 0x62, 128}, // Pascal Generation (SM 6.2) GP10x class
+        { 0x70, 64 }, // Volta Generation (SM 7.0) GV100 class
+        { 0x72, 64 }, // Volta Generation (SM 7.2) GV11b class
+        { 0x75, 64 }, // Turing Generation (SM 7.5) TU10x class
+        {   -1, -1 }  // end-of-table sentinel
+    };
+
+    const int sm_version = (major << 4) + minor;
+    int index = 0;
+    for (; nGpuArchCoresPerSM[index].SM != -1; index++)
+    {
+        if (nGpuArchCoresPerSM[index].SM == sm_version)
+        {
+            return nGpuArchCoresPerSM[index].Cores;
+        }
+    }
+    // index now sits on the sentinel, so index-1 is the newest listed architecture; use it as the default
+    printf("MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores);
+    return nGpuArchCoresPerSM[index-1].Cores;
+}
+// end of GPU Architecture definitions
+
+#ifdef __CUDA_RUNTIME_H__
+// General GPU Device CUDA Initialization: validates devID (negative means device 0), makes it current and returns it; returns a negative value for an out-of-range id or a device in prohibited compute mode, and exits when no usable CUDA device exists
+inline int gpuDeviceInit(int devID)
+{
+    int device_count = 0;
+    checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+    if (device_count == 0)
+    {
+        fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    if (devID < 0)
+    {
+        devID = 0;
+    }
+    // device_count is at least 1 here, so a rejected devID is >= 1 and -devID is a negative error value
+    if (devID > device_count-1)
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count);
+        fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID);
+        fprintf(stderr, "\n");
+        return -devID;
+    }
+
+    cudaDeviceProp deviceProp;
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
+
+    if (deviceProp.computeMode == cudaComputeModeProhibited)
+    {
+        fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
+        return -1;
+    }
+
+    if (deviceProp.major < 1)
+    {
+        fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    checkCudaErrors(cudaSetDevice(devID));
+    printf("gpuDeviceInit() CUDA Device [%d]: \"%s\"\n", devID, deviceProp.name); // closing quote after the name was missing
+
+    return devID;
+}
+
+// This function returns the best GPU (with maximum GFLOPS): the non-prohibited device with the largest multiProcessorCount * cores-per-SM * clockRate, restricted to the highest SM major version present when that version is above 2
+inline int gpuGetMaxGflopsDeviceId()
+{
+    int current_device     = 0, sm_per_multiproc  = 0;
+    int max_perf_device    = 0;
+    int device_count       = 0, best_SM_arch      = 0;
+    int devices_prohibited = 0;
+
+    unsigned long long max_compute_perf = 0;
+    cudaDeviceProp deviceProp;
+
+    // The device count is queried once, through checkCudaErrors, so a failed query aborts with a diagnostic
+    checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+    if (device_count == 0)
+    {
+        fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Find the best major SM Architecture GPU device
+    while (current_device < device_count)
+    {
+        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device));
+
+        // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
+        if (deviceProp.computeMode != cudaComputeModeProhibited)
+        {
+            if (deviceProp.major > 0 && deviceProp.major < 9999)
+            {
+                best_SM_arch = MAX(best_SM_arch, deviceProp.major);
+            }
+        }
+        else
+        {
+            devices_prohibited++;
+        }
+
+        current_device++;
+    }
+
+    if (devices_prohibited == device_count)
+    {
+        fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have compute mode prohibited.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Find the best CUDA capable GPU device
+    current_device = 0;
+
+    while (current_device < device_count)
+    {
+        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device));
+
+        // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
+        if (deviceProp.computeMode != cudaComputeModeProhibited)
+        {
+            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
+            {
+                sm_per_multiproc = 1;
+            }
+            else
+            {
+                sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
+            }
+
+            unsigned long long compute_perf  = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
+
+            if (compute_perf  > max_compute_perf)
+            {
+                // If we find GPU with SM major > 2, search only these
+                if (best_SM_arch > 2)
+                {
+                    // If our device==best_SM_arch, choose this, or else pass
+                    if (deviceProp.major == best_SM_arch)
+                    {
+                        max_compute_perf  = compute_perf;
+                        max_perf_device   = current_device;
+                    }
+                }
+                else
+                {
+                    max_compute_perf  = compute_perf;
+                    max_perf_device   = current_device;
+                }
+            }
+        }
+
+        ++current_device;
+    }
+
+    return max_perf_device;
+}
+
+
+// Initialization code to find the best CUDA Device: uses the id given by a "device" command-line option (through gpuDeviceInit), otherwise the device chosen by gpuGetMaxGflopsDeviceId(); returns the selected id and exits the process when the requested id is rejected
+inline int findCudaDevice(int argc, const char **argv)
+{
+    cudaDeviceProp deviceProp;
+    int devID = 0;
+
+    // If the command-line has a device number specified, use it
+    if (checkCmdLineFlag(argc, argv, "device"))
+    {
+        devID = getCmdLineArgumentInt(argc, argv, "device=");
+
+        if (devID < 0)
+        {
+            printf("Invalid command line parameter\n ");
+            exit(EXIT_FAILURE);
+        }
+        else
+        {
+            devID = gpuDeviceInit(devID); // negative return value: the id was rejected
+
+            if (devID < 0)
+            {
+                printf("exiting...\n");
+                exit(EXIT_FAILURE);
+            }
+        }
+    }
+    else
+    {
+        // Otherwise pick the device with highest Gflops/s
+        devID = gpuGetMaxGflopsDeviceId();
+        checkCudaErrors(cudaSetDevice(devID));
+        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
+        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
+    }
+
+    return devID;
+}
+
+// General check for CUDA GPU SM Capabilities: returns true when the current device's compute capability is at least major_version.minor_version, and prints the outcome on stdout either way
+inline bool checkCudaCapabilities(int major_version, int minor_version)
+{
+    cudaDeviceProp deviceProp;
+    deviceProp.major = 0;
+    deviceProp.minor = 0;
+    int dev;
+
+    checkCudaErrors(cudaGetDevice(&dev));
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
+
+    if ((deviceProp.major > major_version) ||
+        (deviceProp.major == major_version && deviceProp.minor >= minor_version))
+    {
+        printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
+        return true;
+    }
+    else
+    {
+        printf("  No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version);
+        return false;
+    }
+}
+#endif
+
+// end of CUDA Helper Functions
+
+
+#endif
diff --git a/goconv/inc/helper_string.h b/goconv/inc/helper_string.h
new file mode 100644
index 0000000..9b68cc7
--- /dev/null
+++ b/goconv/inc/helper_string.h
@@ -0,0 +1,526 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, etc)
+#ifndef STRING_HELPER_H
+#define STRING_HELPER_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <string>
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+#ifndef STRCASECMP
+#define STRCASECMP  _stricmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP _strnicmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
+#endif
+// FOPEN_FAIL takes the value of the FOPEN expression: in this branch that is the fopen_s return code
+#ifndef FOPEN
+#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf_s
+#endif
+#else // Linux Includes
+#include <string.h>
+#include <strings.h>
+
+#ifndef STRCASECMP
+#define STRCASECMP  strcasecmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP strncasecmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) // nLength is ignored in this branch: no bounds check
+#endif
+// In this branch FOPEN is an assignment, so the value handed to FOPEN_FAIL is the FILE pointer itself
+#ifndef FOPEN
+#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf
+#endif
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// CUDA Utility Helper Functions
+inline int stringRemoveDelimiter(char delimiter, const char *string)
+{
+    // Returns the index of the first character of `string` that is not `delimiter`,
+    // or 0 when the string consists of delimiters only (nothing would be left).
+    int string_start = 0;
+
+    while (string[string_start] == delimiter)
+    {
+        string_start++;
+    }
+
+    // For a non-NUL delimiter the scan stops at the terminating NUL at the latest, so
+    // landing on it means "delimiters only". The former `strlen - 1` test also gave up
+    // when a single character followed the delimiters, so "-h" was never stripped.
+    return (string[string_start] == '\0') ? 0 : string_start;
+}
+
+inline int getFileExtension(char *filename, char **extension)
+{
+    // Finds the text after the last '.' of `filename`: *extension points at it and its
+    // index is returned (a trailing '.' gives an empty extension). With no '.', or with
+    // the last '.' in first position, *extension is set to NULL and 0 is returned.
+    //
+    // strrchr replaces the former backwards scan, which read filename[-1] for an empty
+    // name and reported no extension when the last '.' sat at index 1 (e.g. "a.b").
+    char *last_dot = strrchr(filename, '.');
+
+    if (last_dot == NULL || last_dot == filename)
+    {
+        *extension = NULL;
+        return 0;
+    }
+
+    *extension = last_dot + 1;
+    return (int)(last_dot - filename) + 1;
+}
+
+
+inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
+{
+    bool bFound = false;
+    // Returns true when some argv[i], i >= 1, names the flag string_ref (case-insensitive); argv[0] is never examined
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+            // Only the part before an optional '=' counts as the flag name
+            const char *equal_pos = strchr(string_argv, '=');
+            int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
+
+            int length = (int)strlen(string_ref);
+            // The name must have exactly the length of string_ref, so a longer name sharing the prefix does not match
+            if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
+            {
+                bFound = true;
+                continue; // last statement of the body: the remaining arguments are still scanned
+            }
+        }
+    }
+
+    return bFound;
+}
+
+// Parses the value of command-line option string_ref with atoi and stores it, cast to T, in *value; returns whether the option was found
+template <class T>
+inline bool getCmdLineArgumentValue(const int argc, const char **argv, const char *string_ref, T *value)
+{
+    bool bFound = false;
+    // argv[0] is skipped; *value is left untouched when the matching argument has no text after the option name
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+            int length = (int)strlen(string_ref);
+            // Case-insensitive prefix comparison: any argument that starts with string_ref matches
+            if (!STRNCASECMP(string_argv, string_ref, length))
+            {
+                if (length+1 <= (int)strlen(string_argv))
+                {
+                    int auto_inc = (string_argv[length] == '=') ? 1 : 0; // skip an '=' placed right after the name
+                    *value = (T)atoi(&string_argv[length + auto_inc]);
+                }
+
+                bFound = true;
+                i=argc; // ends the loop: the first matching argument wins
+            }
+        }
+    }
+
+    return bFound;
+}
+
+inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
+{
+    bool bFound = false;
+    int value = -1;
+    // Returns the atoi() value of the last argument (argv[0] excluded) that starts with string_ref; 0 when there is none or it has no value
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+            int length = (int)strlen(string_ref);
+            // Case-insensitive prefix comparison: any argument that starts with string_ref matches
+            if (!STRNCASECMP(string_argv, string_ref, length))
+            {
+                if (length+1 <= (int)strlen(string_argv))
+                {
+                    int auto_inc = (string_argv[length] == '=') ? 1 : 0; // skip an '=' placed right after the name
+                    value = atoi(&string_argv[length + auto_inc]);
+                }
+                else
+                {
+                    value = 0;
+                }
+
+                bFound = true;
+                continue; // last statement of the body: later matches overwrite value
+            }
+        }
+    }
+
+    if (bFound)
+    {
+        return value;
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref)
+{
+    bool bFound = false;
+    float value = -1;
+    // Returns the atof() value of the last argument (argv[0] excluded) that starts with string_ref; 0 when there is none or it has no value
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+            int length = (int)strlen(string_ref);
+            // Case-insensitive prefix comparison: any argument that starts with string_ref matches
+            if (!STRNCASECMP(string_argv, string_ref, length))
+            {
+                if (length+1 <= (int)strlen(string_argv))
+                {
+                    int auto_inc = (string_argv[length] == '=') ? 1 : 0; // skip an '=' placed right after the name
+                    value = (float)atof(&string_argv[length + auto_inc]);
+                }
+                else
+                {
+                    value = 0.f;
+                }
+
+                bFound = true;
+                continue; // last statement of the body: later matches overwrite value
+            }
+        }
+    }
+
+    if (bFound)
+    {
+        return value;
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+inline bool getCmdLineArgumentString(const int argc, const char **argv,
+                                     const char *string_ref, char **string_retval)
+{
+    // Points *string_retval at the text that follows option string_ref and one separator character
+    // (the pointer aliases argv). The last match wins; without a match it is NULL. Returns "found".
+    bool bFound = false;
+
+    for (int i=1; i < argc; i++)
+    {
+        int string_start = stringRemoveDelimiter('-', argv[i]);
+        char *string_argv = (char *)&argv[i][string_start];
+        int length = (int)strlen(string_ref);
+
+        if (!STRNCASECMP(string_argv, string_ref, length))
+        {
+            // An argument that is exactly the option name has nothing after it: return the empty
+            // string at its terminator rather than a pointer one past the NUL.
+            *string_retval = (string_argv[length] == '\0') ? &string_argv[length] : &string_argv[length+1];
+            bFound = true;
+        }
+    }
+
+    if (!bFound)
+    {
+        *string_retval = NULL;
+    }
+
+    return bFound;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Find the path for a file assuming that
+//! files are found in the searchPath.
+//!
+//! @return the path if succeeded, otherwise 0
+//! @param filename         name of the file
+//! @param executable_path  optional absolute path of the executable
+//////////////////////////////////////////////////////////////////////////////
+inline char *sdkFindFilePath(const char *filename, const char *executable_path)
+{
+    // <executable_name> defines a variable that is replaced with the name of the executable
+
+    // Typical relative search paths to locate needed companion files (e.g. sample input data, or JIT source files)
+    // The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching the .exe or .bat, etc
+    const char *searchPath[] =
+    {
+        "./",                                       // same dir
+        "./common/",                                // "/common/" subdir
+        "./common/data/",                           // "/common/data/" subdir
+        "./data/",                                  // "/data/" subdir
+        "./src/",                                   // "/src/" subdir
+        "./src/<executable_name>/data/",            // "/src/<executable_name>/data/" subdir
+        "./inc/",                                   // "/inc/" subdir
+        "./0_Simple/",                              // "/0_Simple/" subdir
+        "./1_Utilities/",                           // "/1_Utilities/" subdir
+        "./2_Graphics/",                            // "/2_Graphics/" subdir
+        "./3_Imaging/",                             // "/3_Imaging/" subdir
+        "./4_Finance/",                             // "/4_Finance/" subdir
+        "./5_Simulations/",                         // "/5_Simulations/" subdir
+        "./6_Advanced/",                            // "/6_Advanced/" subdir
+        "./7_CUDALibraries/",                       // "/7_CUDALibraries/" subdir
+        "./8_Android/",                             // "/8_Android/" subdir
+        "./samples/",                               // "/samples/" subdir
+
+        "./0_Simple/<executable_name>/data/",        // "/0_Simple/<executable_name>/data/" subdir
+        "./1_Utilities/<executable_name>/data/",     // "/1_Utilities/<executable_name>/data/" subdir
+        "./2_Graphics/<executable_name>/data/",      // "/2_Graphics/<executable_name>/data/" subdir
+        "./3_Imaging/<executable_name>/data/",       // "/3_Imaging/<executable_name>/data/" subdir
+        "./4_Finance/<executable_name>/data/",       // "/4_Finance/<executable_name>/data/" subdir
+        "./5_Simulations/<executable_name>/data/",   // "/5_Simulations/<executable_name>/data/" subdir
+        "./6_Advanced/<executable_name>/data/",      // "/6_Advanced/<executable_name>/data/" subdir
+        "./7_CUDALibraries/<executable_name>/",      // "/7_CUDALibraries/<executable_name>/" subdir
+        "./7_CUDALibraries/<executable_name>/data/", // "/7_CUDALibraries/<executable_name>/data/" subdir
+
+        "../",                                      // up 1 in tree
+        "../common/",                               // up 1 in tree, "/common/" subdir
+        "../common/data/",                          // up 1 in tree, "/common/data/" subdir
+        "../data/",                                 // up 1 in tree, "/data/" subdir
+        "../src/",                                  // up 1 in tree, "/src/" subdir
+        "../inc/",                                  // up 1 in tree, "/inc/" subdir
+
+        "../0_Simple/<executable_name>/data/",       // up 1 in tree, "/0_Simple/<executable_name>/" subdir
+        "../1_Utilities/<executable_name>/data/",    // up 1 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../2_Graphics/<executable_name>/data/",     // up 1 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../3_Imaging/<executable_name>/data/",      // up 1 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../4_Finance/<executable_name>/data/",      // up 1 in tree, "/4_Finance/<executable_name>/" subdir
+        "../5_Simulations/<executable_name>/data/",  // up 1 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../6_Advanced/<executable_name>/data/",     // up 1 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../7_CUDALibraries/<executable_name>/data/",// up 1 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../8_Android/<executable_name>/data/",      // up 1 in tree, "/8_Android/<executable_name>/" subdir
+        "../samples/<executable_name>/data/",        // up 1 in tree, "/samples/<executable_name>/" subdir
+        "../../",                                        // up 2 in tree
+        "../../common/",                                 // up 2 in tree, "/common/" subdir
+        "../../common/data/",                            // up 2 in tree, "/common/data/" subdir
+        "../../data/",                                   // up 2 in tree, "/data/" subdir
+        "../../src/",                                    // up 2 in tree, "/src/" subdir
+        "../../inc/",                                    // up 2 in tree, "/inc/" subdir
+        "../../sandbox/<executable_name>/data/",         // up 2 in tree, "/sandbox/<executable_name>/" subdir
+        "../../0_Simple/<executable_name>/data/",        // up 2 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../1_Utilities/<executable_name>/data/",     // up 2 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../2_Graphics/<executable_name>/data/",      // up 2 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../3_Imaging/<executable_name>/data/",       // up 2 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../4_Finance/<executable_name>/data/",       // up 2 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../5_Simulations/<executable_name>/data/",   // up 2 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../6_Advanced/<executable_name>/data/",      // up 2 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../7_CUDALibraries/<executable_name>/data/", // up 2 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../8_Android/<executable_name>/data/",       // up 2 in tree, "/8_Android/<executable_name>/" subdir
+        "../../samples/<executable_name>/data/",         // up 2 in tree, "/samples/<executable_name>/" subdir
+        "../../../",                                        // up 3 in tree
+        "../../../src/<executable_name>/",                  // up 3 in tree, "/src/<executable_name>/" subdir
+        "../../../src/<executable_name>/data/",             // up 3 in tree, "/src/<executable_name>/data/" subdir
+        "../../../src/<executable_name>/src/",              // up 3 in tree, "/src/<executable_name>/src/" subdir
+        "../../../src/<executable_name>/inc/",              // up 3 in tree, "/src/<executable_name>/inc/" subdir
+        "../../../sandbox/<executable_name>/",              // up 3 in tree, "/sandbox/<executable_name>/" subdir
+        "../../../sandbox/<executable_name>/data/",         // up 3 in tree, "/sandbox/<executable_name>/data/" subdir
+        "../../../sandbox/<executable_name>/src/",          // up 3 in tree, "/sandbox/<executable_name>/src/" subdir
+        "../../../sandbox/<executable_name>/inc/",          // up 3 in tree, "/sandbox/<executable_name>/inc/" subdir
+        "../../../0_Simple/<executable_name>/data/",        // up 3 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../1_Utilities/<executable_name>/data/",     // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../2_Graphics/<executable_name>/data/",      // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../3_Imaging/<executable_name>/data/",       // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../4_Finance/<executable_name>/data/",       // up 3 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../../5_Simulations/<executable_name>/data/",   // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../6_Advanced/<executable_name>/data/",      // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../7_CUDALibraries/<executable_name>/data/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../8_Android/<executable_name>/data/",       // up 3 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../0_Simple/<executable_name>/",        // up 3 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../1_Utilities/<executable_name>/",     // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../2_Graphics/<executable_name>/",      // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../3_Imaging/<executable_name>/",       // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../4_Finance/<executable_name>/",       // up 3 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../../5_Simulations/<executable_name>/",   // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../6_Advanced/<executable_name>/",      // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../7_CUDALibraries/<executable_name>/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../8_Android/<executable_name>/",       // up 3 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../samples/<executable_name>/data/",         // up 3 in tree, "/samples/<executable_name>/" subdir
+        "../../../common/",                                 // up 3 in tree, "../../../common/" subdir
+        "../../../common/data/",                            // up 3 in tree, "../../../common/data/" subdir
+        "../../../data/",                                   // up 3 in tree, "../../../data/" subdir
+        "../../../../",                                // up 4 in tree
+        "../../../../src/<executable_name>/",          // up 4 in tree, "/src/<executable_name>/" subdir
+        "../../../../src/<executable_name>/data/",     // up 4 in tree, "/src/<executable_name>/data/" subdir
+        "../../../../src/<executable_name>/src/",      // up 4 in tree, "/src/<executable_name>/src/" subdir
+        "../../../../src/<executable_name>/inc/",      // up 4 in tree, "/src/<executable_name>/inc/" subdir
+        "../../../../sandbox/<executable_name>/",      // up 4 in tree, "/sandbox/<executable_name>/" subdir
+        "../../../../sandbox/<executable_name>/data/", // up 4 in tree, "/sandbox/<executable_name>/data/" subdir
+        "../../../../sandbox/<executable_name>/src/",  // up 4 in tree, "/sandbox/<executable_name>/src/" subdir
+        "../../../../sandbox/<executable_name>/inc/",   // up 4 in tree, "/sandbox/<executable_name>/inc/" subdir
+        "../../../../0_Simple/<executable_name>/data/",     // up 4 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../../1_Utilities/<executable_name>/data/",  // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../../2_Graphics/<executable_name>/data/",   // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../../3_Imaging/<executable_name>/data/",    // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../../4_Finance/<executable_name>/data/",    // up 4 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../../../5_Simulations/<executable_name>/data/",// up 4 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../../6_Advanced/<executable_name>/data/",   // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../../7_CUDALibraries/<executable_name>/data/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../../8_Android/<executable_name>/data/",    // up 4 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../../0_Simple/<executable_name>/",     // up 4 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../../1_Utilities/<executable_name>/",  // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../../2_Graphics/<executable_name>/",   // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../../3_Imaging/<executable_name>/",    // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../../4_Finance/<executable_name>/",    // up 4 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../../../5_Simulations/<executable_name>/",// up 4 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../../6_Advanced/<executable_name>/",   // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../../7_CUDALibraries/<executable_name>/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../../8_Android/<executable_name>/",    // up 4 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../../samples/<executable_name>/data/",      // up 4 in tree, "/samples/<executable_name>/" subdir
+        "../../../../common/",                              // up 4 in tree, "../../../common/" subdir
+        "../../../../common/data/",                         // up 4 in tree, "../../../common/data/" subdir
+        "../../../../data/",                                // up 4 in tree, "../../../data/" subdir
+        "../../../../../",                                // up 5 in tree
+        "../../../../../src/<executable_name>/",          // up 5 in tree, "/src/<executable_name>/" subdir
+        "../../../../../src/<executable_name>/data/",     // up 5 in tree, "/src/<executable_name>/data/" subdir
+        "../../../../../src/<executable_name>/src/",      // up 5 in tree, "/src/<executable_name>/src/" subdir
+        "../../../../../src/<executable_name>/inc/",      // up 5 in tree, "/src/<executable_name>/inc/" subdir
+        "../../../../../sandbox/<executable_name>/",      // up 5 in tree, "/sandbox/<executable_name>/" subdir
+        "../../../../../sandbox/<executable_name>/data/", // up 5 in tree, "/sandbox/<executable_name>/data/" subdir
+        "../../../../../sandbox/<executable_name>/src/",  // up 5 in tree, "/sandbox/<executable_name>/src/" subdir
+        "../../../../../sandbox/<executable_name>/inc/",   // up 5 in tree, "/sandbox/<executable_name>/inc/" subdir
+        "../../../../../0_Simple/<executable_name>/data/",     // up 5 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../../../1_Utilities/<executable_name>/data/",  // up 5 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../../../2_Graphics/<executable_name>/data/",   // up 5 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../../../3_Imaging/<executable_name>/data/",    // up 5 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../../../4_Finance/<executable_name>/data/",    // up 5 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../../../../5_Simulations/<executable_name>/data/",// up 5 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../../../6_Advanced/<executable_name>/data/",   // up 5 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../../../7_CUDALibraries/<executable_name>/data/", // up 5 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../../../8_Android/<executable_name>/data/",    // up 5 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../../../samples/<executable_name>/data/",      // up 5 in tree, "/samples/<executable_name>/" subdir
+        "../../../../../common/",                         // up 5 in tree, "../../../common/" subdir
+        "../../../../../common/data/",                    // up 5 in tree, "../../../common/data/" subdir
+    };
+
+    // Extract the executable name
+    std::string executable_name;
+
+    if (executable_path != 0)
+    {
+        executable_name = std::string(executable_path);
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        // Windows path delimiter
+        size_t delimiter_pos = executable_name.find_last_of('\\');
+        executable_name.erase(0, delimiter_pos + 1);
+
+        if (executable_name.rfind(".exe") != std::string::npos)
+        {
+            // we strip .exe, only if the .exe is found
+            executable_name.resize(executable_name.size() - 4);
+        }
+
+#else
+        // Linux & OSX path delimiter
+        size_t delimiter_pos = executable_name.find_last_of('/');
+        executable_name.erase(0,delimiter_pos+1);
+#endif
+    }
+
+    // Loop over all search paths and return the first hit
+    for (unsigned int i = 0; i < sizeof(searchPath)/sizeof(char *); ++i)
+    {
+        std::string path(searchPath[i]);
+        size_t executable_name_pos = path.find("<executable_name>");
+
+        // If there is executable_name variable in the searchPath
+        // replace it with the value
+        if (executable_name_pos != std::string::npos)
+        {
+            if (executable_path != 0)
+            {
+                path.replace(executable_name_pos, strlen("<executable_name>"), executable_name);
+            }
+            else
+            {
+                // Skip this path entry if no executable argument is given
+                continue;
+            }
+        }
+
+#ifdef _DEBUG
+        printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
+#endif
+
+        // Test if the file exists
+        path.append(filename);
+        FILE *fp;
+        FOPEN(fp, path.c_str(), "rb");
+
+        if (fp != NULL)
+        {
+            fclose(fp);
+            // File found
+            // returning an allocated array here for backwards compatibility reasons
+            char *file_path = (char *) malloc(path.length() + 1);
+            STRCPY(file_path, path.length() + 1, path.c_str());
+            return file_path;
+        }
+
+        if (fp)
+        {
+            fclose(fp);
+        }
+    }
+
+    // File not found
+    return 0;
+}
+
+#endif
diff --git a/gohumantrack/gohumantrack.go b/gohumantrack/gohumantrack.go
index b0a8198..8c65fc8 100644
--- a/gohumantrack/gohumantrack.go
+++ b/gohumantrack/gohumantrack.go
@@ -17,12 +17,15 @@
 
 void *create_batch_image(const int size){
 	c_img *imgs = (c_img*)malloc(size * sizeof(c_img));
+	for(int i = 0; imgs && i < size; i++){ // imgs is NULL when malloc failed: skip the loop and return NULL
+		imgs[i].data_ = NULL; // only data_ is cleared; w_/h_/c_ stay uninitialized
+	}
 	return imgs;
 }
 int fill_images(void *imgs, const int size, const int index, void *data, const int w, const int h, const int c){
 	if(!imgs || !data || size <= index) return -1;
 	c_img *images = (c_img*)imgs;
-	images[index].data_ = data;
+	images[index].data_ = (unsigned char*)data;
 	images[index].w_ = w;
 	images[index].h_ = h;
 	images[index].c_ = c;
@@ -36,9 +39,9 @@
 	}
 	return ret;
 }
-void *process(void *handle, void *imgs, const int size){
+void *process(void *handle, void *imgs, const int size, void *result){
 	c_img *images = (c_img*)imgs;
-	c_fgRet *res = init_fgres(size);
+	c_fgRet *res = (c_fgRet *)result;
 	int ret = c_human_tracker_process(handle, images, size, res);
 	if (ret != 0)
 		return NULL;
@@ -95,6 +98,7 @@
 // HumanTracker struct
 type HumanTracker struct {
 	handle    unsafe.Pointer
+	result    unsafe.Pointer // C result buffer from init_fgres; passed to every C.process call, freed in Free
 	batchSize int
 }
 
@@ -105,7 +109,8 @@
 	}
 	p := C.c_human_tracker_create(C.int(gpu), C.int(batchSize), C.int(flag))
 	if p != nil {
-		return &HumanTracker{p, batchSize}
+		res := C.init_fgres(C.int(batchSize))
+		return &HumanTracker{p, res, batchSize}
 	}
 	return nil
 }
@@ -114,6 +119,11 @@
 func (h *HumanTracker) Free() {
 	if h.handle != nil {
 		C.c_human_tracker_release(&h.handle)
+	}
+	// result is the C buffer obtained from C.init_fgres when the tracker was created.
+	if h.result != nil {
+		C.free(h.result)
+		h.result = nil // a second Free must not release the same buffer again
 	}
 }
 
@@ -146,11 +154,10 @@
 		}
 	}
 
-	cRet := C.process(h.handle, cImgs, C.int(h.batchSize))
+	cRet := C.process(h.handle, cImgs, C.int(h.batchSize), h.result)
 	if cRet == nil {
 		return nil, errors.New("create C results error")
 	}
-	defer C.free(unsafe.Pointer(cRet))
 
 	var result []FgResult
 	p := uintptr(cRet)
diff --git a/main.go b/main.go
index 411172c..44c902a 100644
--- a/main.go
+++ b/main.go
@@ -6,7 +6,7 @@
 	"time"
 
 	"track/gohumantrack"
-
+	"track/goconv"
 	"basic.com/valib/goffmpeg.git"
 )
 
@@ -19,6 +19,54 @@
 	flag.StringVar(&url1, "cam1", "", "url")
 	flag.StringVar(&url2, "cam2", "", "url")
 }
+
+// run pulls one YUV frame from each camera, converts both to BGR and hands
+// them to the tracker as a two-image batch, printing each detection whose
+// confidence is positive.
+//
+// It returns false when a frame reports a non-positive size or a conversion
+// yields nil, and true once Process has been called, whether or not it failed.
+func run(cam1, cam2 *goffmpeg.GoFFMPEG, tracker *gohumantrack.HumanTracker) bool {
+	yuv1, w1, h1, _ := cam1.GetYUV()
+	yuv2, w2, h2, _ := cam2.GetYUV()
+	haveFrames := w1 > 0 && h1 > 0 && w2 > 0 && h2 > 0
+	if !haveFrames {
+		return false
+	}
+
+	bgr1 := goconv.YUV2BGR(yuv1, w1, h1)
+	bgr2 := goconv.YUV2BGR(yuv2, w2, h2)
+	if bgr1 == nil || bgr2 == nil {
+		return false
+	}
+
+	// Both frames travel to the tracker as one batch, camera 1 first.
+	batch := []gohumantrack.ImageHumanTracker{
+		{Data: bgr1, Width: w1, Height: h1, Channel: 3},
+		{Data: bgr2, Width: w2, Height: h2, Channel: 3},
+	}
+
+	results, err := tracker.Process(batch)
+	if err != nil {
+		fmt.Println("process error: ", err)
+		return true
+	}
+
+	for _, fg := range results {
+		fmt.Printf("result size: %d\n", fg.FgNum)
+		for idx := 0; idx < int(fg.FgNum); idx++ {
+			info := fg.Fginfo[idx]
+			if info.Confidence > 0 {
+				fmt.Printf(" Index %d Rect: %dx%dx%dx%d", idx, info.Left, info.Top, info.Right, info.Bottom)
+				fmt.Printf(" Confidence: %f", info.Confidence*100)
+				fmt.Printf(" Center: %dx%d", info.X, info.Y)
+				fmt.Printf(" ID: %d\n", info.ID)
+			}
+		}
+	}
+	return true
+}
+
 func main() {
 	flag.Parse()
 	fmt.Println("url1: ", url1, " url2: ", url2)
@@ -42,45 +90,7 @@
 	cam2.CloseStream()
 
 	for {
-		data1, ow1, oh1, _ := cam1.GetYUV()
-		data2, ow2, oh2, _ := cam2.GetYUV()
-		if ow1 > 0 && oh1 > 0 && ow2 > 0 && oh2 > 0 {
-
-			var images []gohumantrack.ImageHumanTracker
-			img := gohumantrack.ImageHumanTracker{
-				Data:    data1,
-				Width:   ow1,
-				Height:  oh1,
-				Channel: 3,
-			}
-			images = append(images, img)
-			img = gohumantrack.ImageHumanTracker{
-				Data:    data2,
-				Width:   ow2,
-				Height:  oh2,
-				Channel: 3,
-			}
-			images = append(images, img)
-			res, err := tracker.Process(images)
-			if err == nil {
-				for _, v := range res {
-					fmt.Printf("result size: %d\n", v.FgNum)
-					for i := 0; i < int(v.FgNum); i++ {
-						r := v.Fginfo[i]
-						if r.Confidence > 0 {
-							fmt.Printf(" Index %d Rect: %dx%dx%dx%d", i, r.Left, r.Top, r.Right, r.Bottom)
-							fmt.Printf(" Confidence: %f", r.Confidence*100)
-							fmt.Printf(" Center: %dx%d", r.X, r.Y)
-							fmt.Printf(" ID: %d\n", r.ID)
-						}
-					}
-
-				}
-			} else {
-				fmt.Println("process error: ", err)
-			}
-		} else {
-			//			fmt.Println("cam1 size: ", ow1, "x", oh1, " cam2 size: ", ow2, "x", oh2)
+		if !run(cam1, cam2, tracker){
 			time.Sleep(38 * time.Millisecond)
 		}
 
diff --git a/runtime/libcffmpeg.so b/runtime/libcffmpeg.so
index 707acbb..3989dea 100755
--- a/runtime/libcffmpeg.so
+++ b/runtime/libcffmpeg.so
Binary files differ

--
Gitblit v1.8.0