~chenshijun/analysis.git

parent: 1118eadd | 补丁 | 提交 | ignore whitespace

zhangmeng

2019-12-19 ab80a1fb4d519b46540eb23751fe3751219729e8

add conv

6个文件已添加

	goconv/conv.cpp	592 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	goconv/conv.h	23 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	goconv/goconv.go	258 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	goconv/inc/Exceptions.h	181 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	goconv/inc/helper_cuda.h	1261 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	goconv/inc/helper_string.h	526 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史

 goconv/conv.cpp

New file
@@ -0,0 +1,592 @@
#include "conv.h"

#include <cmath>
#include <sys/time.h>

#include <npp.h>
#include <helper_cuda.h>
#include <helper_string.h>
#include "Exceptions.h"


// Alignment mask (alignment - 1): host row pitches allocated in init_yuv() are
// rounded up to a multiple of 512 bytes with `(n + mask) & ~mask`.
static const int MEMORY_ALGN_DEVICE = 511;
// Same 512-byte mask, used by set_data() when laying out the host planes.
// Both masks must stay equal, otherwise allocation and layout disagree.
static const int HD_MEMORY_ALGN_DEVICE = 511;

// Divides x by d and rounds the quotient up (intended for x >= 0, d > 0).
static inline int DivUp(int x, int d)
{
    const int biased = x + (d - 1);
    return biased / d;
}

static int set_data(uint8_t *data, const int width, const int height, unsigned char *mY, unsigned char *mU, unsigned char *mV)
{
    uint8_t* yuv_data = (uint8_t*)data;
    if (!yuv_data)
    {
        return -1;
    }

    uint32_t    i, j;
    uint32_t    off;
    uint32_t    off_yuv;
    uint32_t    half_h;
    uint32_t    half_w;
    uint32_t    u_size;
    uint8_t*    yuv_ptr;
    uint8_t*    u_ptr;
    uint8_t*    v_ptr;

    int w = width;
    int h = height;

    //从这一句来看，即使是同一种格式，进来也要处理一下。
    size_t nPitch  = (w + HD_MEMORY_ALGN_DEVICE) & ~HD_MEMORY_ALGN_DEVICE;
    off     = 0;
    off_yuv = 0;
    for (i = 0; i < (uint32_t)h; i++)
    {
        memcpy(mY + off, yuv_data + off_yuv, w);
        off     += nPitch;
        off_yuv += w;
    }

    half_w = w >> 1;
    half_h = h >> 1;
    u_size = half_w * half_h;
    nPitch = (half_w + HD_MEMORY_ALGN_DEVICE) & ~HD_MEMORY_ALGN_DEVICE;
    
    off_yuv = w * h;
    off = 0;
    for (i = 0; i < half_h; i++)
    {  
        yuv_ptr = yuv_data + off_yuv;
        u_ptr = mU + off;
        v_ptr = mV + off;
        for (j = 0; j < (uint32_t)w; j += 2)
        {
            *u_ptr++ = *yuv_ptr++;
            *v_ptr++ = *yuv_ptr++;
        }
        off_yuv += w;
        off += nPitch;
    }

    return 0;
}

/////////////handle
class convertor{
public: 
    convertor(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu);
    ~convertor();
    int yuv2bgr(unsigned char **bgr, int *bgrLen);
    int resize2bgr(unsigned char *in, unsigned char **data, int *data_len);
    int resizeyuv(unsigned char *in, unsigned char **data, int *data_len);
    int fill_yuv(const unsigned char *yuv);
private: 
    void init_yuv();
    void init_resize();
    void init_resize_bgr();
    void init_resize_yuv();
private: 
    int width;
    int height;

    unsigned char aSamplingFactors[3];
    int nMCUBlocksH;
    int nMCUBlocksV;

    Npp8u   *apSrcImage[3];
    NppiSize aSrcSize[3];
    Npp32s   aSrcImageStep[3];
    size_t   aSrcPitch[3];

    uint8_t *mY;
    uint8_t *mU;
    uint8_t *mV;

///////////////////////////
    int rWidth;
    int rHeight;
    float fx;
    float fy;

    Npp8u *apDstImage[3] = {0,0,0};
    Npp32s aDstImageStep[3];
    NppiSize aDstSize[3];

/////////////////////////////
    Npp8u *imgOrigin;
    size_t pitchOrigin;
    NppiSize sizeOrigin;

    unsigned char *bgrOrigin;
    int bgrOriginLen;
    size_t bgrOriginPitch;

////////////////////////////
    Npp8u *imgResize;
    size_t pitchResize;
    NppiSize sizeResize;

    unsigned char *bgrScale;
    int bgrScaleLen;
    size_t bgrScalePitch;

// resize only
////////////////////////////
    Npp8u *originBGR;
    int pitchOriginBGR;
    Npp8u *resizedBGR;
    int pitchResizedBGR;
    unsigned char *hostResizedBGR;

///////////////////////////
    unsigned char *nv12;

    bool initialized_yuv, initialized_resize, initialized_resize_bgr, initialized_resize_yuv;
    int gpu_index;
};


// Records the geometry and GPU ordinal only; every buffer pointer starts out
// NULL and is allocated on first use by the init_* methods.
// The initialisers are listed in member declaration order.
convertor::convertor(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu)
    : width(srcW),
      height(srcH),
      mY(NULL),
      mU(NULL),
      mV(NULL),
      rWidth(dstW),
      rHeight(dstH),
      fx(-1),
      fy(-1),
      imgOrigin(0),
      bgrOrigin(NULL),
      bgrOriginLen(0),
      imgResize(0),
      bgrScale(NULL),
      bgrScaleLen(0),
      originBGR(0),
      pitchOriginBGR(0),
      resizedBGR(0),
      pitchResizedBGR(0),
      hostResizedBGR(NULL),
      nv12(NULL),
      initialized_yuv(false),
      initialized_resize(false),
      initialized_resize_bgr(false),
      initialized_resize_yuv(false),
      gpu_index(gpu)
{
}

static void setGPUDevice(const int gpu){
    if (gpu >= 0){
        cudaSetDevice(gpu);
    }
}

void convertor::init_yuv(){
    if (initialized_yuv) return;
    initialized_yuv = true;

    setGPUDevice(gpu_index);

    for(int i = 0; i < 3; i++){
        apSrcImage[i] = 0;
        apDstImage[i] = 0;
    }

    aSamplingFactors[0] = 34;
    aSamplingFactors[1] = 17;
    aSamplingFactors[2] = 17;

    nMCUBlocksH = 0;
    nMCUBlocksV = 0;

    for (int i = 0; i < 3; ++i)
    {
        nMCUBlocksV = std::max(nMCUBlocksV, aSamplingFactors[i] & 0x0f);
        nMCUBlocksH = std::max(nMCUBlocksH, aSamplingFactors[i] >> 4);
    }

    for (int i = 0; i < 3; ++i)
    {
        NppiSize oBlocks;
        NppiSize oBlocksPerMCU = { aSamplingFactors[i] >> 4, aSamplingFactors[i] & 0x0f };

        oBlocks.width = (int)ceil((width   + 7) / 8 *
            static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksH);
        oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;

        oBlocks.height = (int)ceil((height + 7) / 8 *
            static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksV);
        oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;

        aSrcSize[i].width  = oBlocks.width  * 8;
        aSrcSize[i].height = oBlocks.height * 8;

        // Allocate Memory
        size_t nPitch;
        NPP_CHECK_CUDA(cudaMallocPitch((void**)&(apSrcImage[i]), &nPitch, aSrcSize[i].width, aSrcSize[i].height));
        aSrcPitch[i] = nPitch;
        aSrcImageStep[i] = static_cast<Npp32s>(nPitch);
    }

    NPP_CHECK_CUDA(cudaMallocPitch((void**)&imgOrigin, &pitchOrigin, width * 3, height));

    bgrOriginPitch = width * 3;
    bgrOriginLen = bgrOriginPitch * height;
    NPP_CHECK_CUDA(cudaHostAlloc((void**)&bgrOrigin, bgrOriginLen, cudaHostAllocDefault));

    sizeOrigin.width = width;
    sizeOrigin.height = height;

    uint32_t nPitch = (width + MEMORY_ALGN_DEVICE) & ~MEMORY_ALGN_DEVICE;
    NPP_CHECK_CUDA(cudaHostAlloc((void**)&mY, nPitch * height, cudaHostAllocDefault));
    nPitch = (width/2 + MEMORY_ALGN_DEVICE) & ~MEMORY_ALGN_DEVICE;
    NPP_CHECK_CUDA(cudaHostAlloc((void**)&mU, nPitch * height / 2, cudaHostAllocDefault));
    NPP_CHECK_CUDA(cudaHostAlloc((void**)&mV, nPitch * height / 2, cudaHostAllocDefault));

}

void convertor::init_resize(){
    if (initialized_resize) return;
    initialized_resize = true;

    setGPUDevice(gpu_index);

    NppiSize oDstImageSize;
    oDstImageSize.width  = std::max(1, rWidth);
    oDstImageSize.height = std::max(1, rHeight);
    
    sizeResize.width = oDstImageSize.width; 
    sizeResize.height = oDstImageSize.height;

    for (int i=0; i < 3; ++i)
    {
        NppiSize oBlocks;
        NppiSize oBlocksPerMCU = { aSamplingFactors[i] & 0x0f, aSamplingFactors[i] >> 4};

        oBlocks.width = (int)ceil((oDstImageSize.width + 7)/8  *
                                  static_cast<float>(oBlocksPerMCU.width)/nMCUBlocksH);
        oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;

        oBlocks.height = (int)ceil((oDstImageSize.height+7)/8 *
                                   static_cast<float>(oBlocksPerMCU.height)/nMCUBlocksV);
        oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;

        aDstSize[i].width = oBlocks.width * 8;
        aDstSize[i].height = oBlocks.height * 8;

        // Allocate Memory
        size_t nPitch;
        NPP_CHECK_CUDA(cudaMallocPitch((void**)&apDstImage[i], &nPitch, aDstSize[i].width, aDstSize[i].height));
        aDstImageStep[i] = static_cast<Npp32s>(nPitch);
    }

    if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){
        fx = (float)(rWidth) / (float)(width);
        fy = (float)(rHeight) / (float)(height);
    }

    if (imgResize == 0){
        if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){
            NPP_CHECK_CUDA(cudaMallocPitch((void**)&imgResize, &pitchResize, rWidth * 3, rHeight));
        }
    }
    if (!bgrScale){
        if (rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height){
            bgrScalePitch = rWidth * 3;
            bgrScaleLen = bgrScalePitch * rHeight;
            NPP_CHECK_CUDA(cudaHostAlloc((void**)&bgrScale, bgrScaleLen, cudaHostAllocDefault));
        }
    }
}

void convertor::init_resize_bgr(){
    if (initialized_resize_bgr) return;
    initialized_resize_bgr = true;

    setGPUDevice(gpu_index);
    if (originBGR == 0){
        originBGR = nppiMalloc_8u_C3(width, height, &pitchOriginBGR);
    }
    if (resizedBGR == 0){
        resizedBGR = nppiMalloc_8u_C3(rWidth, rHeight, &pitchResizedBGR);
    }
    if (hostResizedBGR == NULL){
        NPP_CHECK_CUDA(cudaHostAlloc((void**)&hostResizedBGR, rWidth * 3 * rHeight, cudaHostAllocDefault));
    }
}

void convertor::init_resize_yuv(){
    if (initialized_resize_yuv) return;
    initialized_resize_yuv = true;

    if (rWidth > 0 && rHeight > 0){
        fx = (float)(width) / (float)(rWidth);
        fy = (float)(height) / (float)(rHeight);
    }

    nv12 = (unsigned char*)malloc(rWidth*rHeight*3/2);
}

convertor::~convertor(){
    setGPUDevice(gpu_index);

    if(mY) cudaFreeHost(mY);
    if(mU) cudaFreeHost(mU);
    if(mV) cudaFreeHost(mV);

    for (int i = 0; i < 3; ++i)//内存释放
    {
        cudaFree(apSrcImage[i]);
        cudaFree(apDstImage[i]);
    }

    if (imgOrigin) cudaFree(imgOrigin);
    if (imgResize) cudaFree(imgResize);

    if (bgrOrigin) cudaFreeHost(bgrOrigin);
    if (bgrScale) cudaFreeHost(bgrScale);

    if (originBGR) nppiFree(originBGR);
    if (resizedBGR) nppiFree(resizedBGR);
    if (hostResizedBGR) cudaFreeHost(hostResizedBGR);

    if (nv12) free(nv12);
}

int convertor::fill_yuv(const unsigned char *yuv){
    init_yuv();
    int ret = set_data((uint8_t*)yuv, width, height, mY, mU, mV);
    if (ret < 0) return ret;

    setGPUDevice(gpu_index);

    NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[0], mY, aSrcPitch[0] * height,     cudaMemcpyHostToDevice));
    NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[1], mU, aSrcPitch[1] * height / 2, cudaMemcpyHostToDevice));
    NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[2], mV, aSrcPitch[2] * height / 2, cudaMemcpyHostToDevice));
    return 0;
}

int convertor::yuv2bgr(unsigned char **bgr, int *bgrLen){

    *bgr = NULL;
    *bgrLen = 0;

    setGPUDevice(gpu_index);

    NPP_CHECK_NPP(nppiYUV420ToBGR_8u_P3C3R(apSrcImage, aSrcImageStep, imgOrigin, pitchOrigin, sizeOrigin));

    NPP_CHECK_CUDA(cudaMemcpy2D(bgrOrigin, bgrOriginPitch, imgOrigin, pitchOrigin, bgrOriginPitch, height, cudaMemcpyDeviceToHost));
    *bgr = bgrOrigin;
    *bgrLen = bgrOriginLen;

    return 0;
}

int convertor::resize2bgr(unsigned char *in, unsigned char **data, int *data_len){
    *data = NULL;
    *data_len = 0;

    if ((rWidth < 0 && rHeight < 0) || (rWidth > width && rHeight > height)){
        return -1;
    }

    setGPUDevice(gpu_index);

    if (!in){

        init_resize();

        NppiSize oDstImageSize;
        oDstImageSize.width  = std::max(1, rWidth);
        oDstImageSize.height = std::max(1, rHeight);
        for (int i = 0; i < 3; ++i)
        {
            NppiSize oBlocksPerMCU = { aSamplingFactors[i] & 0x0f, aSamplingFactors[i] >> 4};
            NppiSize oSrcImageSize = {(width * oBlocksPerMCU.width) / nMCUBlocksH, (height * oBlocksPerMCU.height)/nMCUBlocksV};
            NppiRect oSrcImageROI = {0,0,oSrcImageSize.width, oSrcImageSize.height};
            NppiRect oDstImageROI;
            NppiInterpolationMode eInterploationMode = NPPI_INTER_SUPER;
            NPP_CHECK_NPP(nppiGetResizeRect(oSrcImageROI, &oDstImageROI,
                                            fx,
                                            fy,
                                            0.0, 0.0, eInterploationMode));
            NPP_CHECK_NPP(nppiResizeSqrPixel_8u_C1R(apSrcImage[i], oSrcImageSize, aSrcImageStep[i], oSrcImageROI,
                                                    apDstImage[i], aDstImageStep[i], oDstImageROI ,
                                                    fx,
                                                    fy,
                                                    0.0, 0.0, eInterploationMode));
        }
        NPP_CHECK_NPP(nppiYUV420ToBGR_8u_P3C3R(apDstImage, aDstImageStep, imgResize, pitchResize, sizeResize));
        NPP_CHECK_CUDA(cudaMemcpy2D(bgrScale, bgrScalePitch, imgResize, pitchResize, bgrScalePitch, rHeight, cudaMemcpyDeviceToHost));
        *data = bgrScale;
        *data_len = bgrScaleLen;
    }else{
        
        init_resize_bgr();

        NppiSize oSrcSize;
        oSrcSize.width = width;
        oSrcSize.height = height;

        NPP_CHECK_CUDA(cudaMemcpy2D(originBGR, pitchOriginBGR, in, width*3, width*3, height, cudaMemcpyHostToDevice));

        NppiRect oSrcROI;
        oSrcROI.x = 0;
        oSrcROI.y = 0;
        oSrcROI.width = width;
        oSrcROI.height = height;


        NppiRect oDstROI;
        oDstROI.x = 0;
        oDstROI.y = 0;
        oDstROI.width = rWidth;
        oDstROI.height = rHeight;

        // Scale Factor
        double nXFactor = double(oDstROI.width) / double(oSrcROI.width);
        double nYFactor = double(oDstROI.height) / double(oSrcROI.height);

        // Scaled X/Y  Shift
        double nXShift = - oSrcROI.x * nXFactor ;
        double nYShift = - oSrcROI.y * nYFactor;
        int eInterpolation = NPPI_INTER_SUPER;
        if (nXFactor >= 1.f || nYFactor >= 1.f)
            eInterpolation = NPPI_INTER_LANCZOS;

        NppStatus ret = nppiResizeSqrPixel_8u_C3R(originBGR, oSrcSize, pitchOriginBGR, oSrcROI, 
            resizedBGR, pitchResizedBGR, oDstROI, nXFactor, nYFactor, nXShift, nYShift, eInterpolation );

        if(ret != NPP_SUCCESS) {
            printf("imageResize_8u_C3R failed %d.\n", ret);
            return -2;
        }
        size_t pitch = rWidth * 3;
        *data_len = pitch * rHeight;
        NPP_CHECK_CUDA(cudaMemcpy2D(hostResizedBGR, pitch, resizedBGR, pitchResizedBGR, pitch, rHeight, cudaMemcpyDeviceToHost));
        *data = hostResizedBGR;
    }
    return 0;
}

// Nearest-neighbour resize of an NV12 frame on the CPU.
//
// Layout assumed for both buffers: width*height luma bytes followed by
// height/2 rows of interleaved chroma pairs, each row `width` bytes long.
//
// src        source frame
// dst        destination frame, at least dstWidth * dstHeight * 3 / 2 bytes
// srcWidth, srcHeight, dstWidth, dstHeight   sizes in pixels, all > 0
//
// Every destination luma sample and every complete destination chroma pair is
// written; there is no rounding of the destination size down to a multiple of
// 8, which used to leave the tail of the output untouched. Coordinates are
// mapped with 16.16 fixed-point steps held in 64 bits so large frames cannot
// overflow.
//
// Returns 0 on success, -1 on a NULL pointer or a non-positive dimension.
static int nv12_nearest_scale(uint8_t* __restrict src, uint8_t* __restrict dst,
                        int srcWidth, int srcHeight, int dstWidth, int dstHeight)
{
    if (!src || !dst || srcWidth <= 0 || srcHeight <= 0 || dstWidth <= 0 || dstHeight <= 0)
    {
        return -1;
    }

    const size_t sw = (size_t)srcWidth;
    const size_t sh = (size_t)srcHeight;
    const size_t dw = (size_t)dstWidth;
    const size_t dh = (size_t)dstHeight;

    // 16.16 fixed-point source step per destination pixel (avoids float division).
    const uint64_t xStep = ((uint64_t)sw << 16) / dw + 1;
    const uint64_t yStep = ((uint64_t)sh << 16) / dh + 1;

    const uint8_t* src_uv = src + sh * sw;  // start of the source chroma rows
    uint8_t*       dst_uv = dst + dh * dw;  // start of the destination chroma rows
    const size_t src_uv_rows = sh / 2;
    const size_t dst_uv_rows = dh / 2;

    for (size_t y = 0; y < dh; ++y)
    {
        size_t srcy = (size_t)((y * yStep) >> 16);
        if (srcy >= sh) srcy = sh - 1;

        const uint8_t* src_row = src + srcy * sw;
        uint8_t*       dst_row = dst + y * dw;

        for (size_t x = 0; x < dw; ++x)
        {
            size_t srcx = (size_t)((x * xStep) >> 16);
            if (srcx >= sw) srcx = sw - 1;
            dst_row[x] = src_row[srcx];
        }

        // One chroma row per two luma rows; skip rows that do not exist in a
        // buffer of dw * dh * 3 / 2 bytes (odd heights).
        if ((y & 1) != 0 || (y / 2) >= dst_uv_rows || src_uv_rows == 0)
        {
            continue;
        }

        size_t src_uv_row = srcy / 2;
        if (src_uv_row >= src_uv_rows) src_uv_row = src_uv_rows - 1;

        const uint8_t* src_uv_line = src_uv + src_uv_row * sw;
        uint8_t*       dst_uv_line = dst_uv + (y / 2) * dw;

        // Copy whole U/V pairs; `x + 1 < dw` keeps the pair inside the row.
        for (size_t x = 0; x + 1 < dw; x += 2)
        {
            size_t srcx = (size_t)((x * xStep) >> 16);
            if (srcx >= sw) srcx = sw - 1;

            size_t pair = srcx & ~(size_t)1;  // even offset = first byte of a pair
            if (pair + 1 >= sw)
            {
                // Odd source width: step back to the last complete pair.
                if (pair < 2) continue;
                pair -= 2;
            }

            dst_uv_line[x]     = src_uv_line[pair];
            dst_uv_line[x + 1] = src_uv_line[pair + 1];
        }
    }
    return 0;
}

int convertor::resizeyuv(unsigned char *in, unsigned char **data, int *data_len){
        
    init_resize_yuv();

    *data_len = rWidth*rHeight*3/2;
    *data = nv12;

    return nv12_nearest_scale(in, nv12, width, height, rWidth, rHeight);
}

convHandle conv_create(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu){
    if (gpu < 0) return NULL;

    convertor *conv = new convertor(srcW, srcH, dstW, dstH, gpu);
    return conv;
}

void conv_destroy(convHandle h){
    if (!h) return;    
    convertor *conv = (convertor*)h;
    delete conv;
}

int yuv2bgrandresize(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen, unsigned char **scaleBGR, int *scaleBGRLen){
    if (!h) return -2;
    convertor *conv = (convertor*)h;
    int ret = conv->fill_yuv((unsigned char*)yuv);
    if (ret != 0) return ret;
    ret = conv->yuv2bgr(bgr, bgrLen);
    if (ret != 0) return ret;
    ret = conv->resize2bgr(NULL, scaleBGR, scaleBGRLen);
    return ret;
}

int yuv2bgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen){
    if (!h) return -2;
    convertor *conv = (convertor*)h;
    int ret = conv->fill_yuv((unsigned char*)yuv);
    if (ret != 0) return ret;
    return conv->yuv2bgr(bgr, bgrLen);
}

int yuv2resizedbgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen){
    if (!h) return -2;
    convertor *conv = (convertor*)h;
    int ret = conv->fill_yuv((unsigned char*)yuv);
    if (ret != 0) return ret;
    ret = conv->resize2bgr(NULL, bgr, bgrLen);
    return ret;
}

int resizebgr(convHandle h, void *data, unsigned char **resized, int *len){
    if (!h) return -2;
    convertor *conv = (convertor*)h;
    return conv->resize2bgr((unsigned char*)data, resized, len);
}

int resizeyuv(convHandle h, void *data, unsigned char **resized, int *len){
    if (!h) return -2;
    convertor *conv = (convertor*)h;
    return conv->resizeyuv((unsigned char*)data, resized, len);
}

 goconv/conv.h

New file
@@ -0,0 +1,23 @@
/* C interface of the GPU colour-conversion / resize helper (callable from cgo). */
#ifndef __RESIZE_NPP_H__
#define __RESIZE_NPP_H__

#ifdef __cplusplus
extern "C"{
#endif

/* Opaque handle to one conversion context (fixed source size, target size and GPU). */
typedef void* convHandle;
/* Creates a context; returns NULL when gpu is negative. */
convHandle conv_create(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu);

/* Destroys a context and the buffers it owns; NULL is ignored. */
void conv_destroy(convHandle h);
/*
 * All functions below return 0 on success, -2 when h is NULL and another
 * non-zero value on failure. Output pointers refer to buffers owned by the
 * handle: they are reused by later calls and released by conv_destroy(), so
 * the caller must not free them.
 */
/* Converts a YUV frame to BGR and also returns a resized BGR copy. */
int yuv2bgrandresize(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen, unsigned char **scaleBGR, int *scaleBGRLen);
/* Converts a YUV frame to full-size BGR. */
int yuv2bgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen);
/* Converts a YUV frame and returns only the resized BGR image. */
int yuv2resizedbgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen);

/* Resizes a packed BGR image. */
int resizebgr(convHandle h, void *data, unsigned char **resized, int *len);
/* Resizes an NV12 frame on the CPU (nearest neighbour). */
int resizeyuv(convHandle h, void *data, unsigned char **resized, int *len);

#ifdef __cplusplus
}
#endif

#endif //__RESIZE_NPP_H__

 goconv/goconv.go

New file
@@ -0,0 +1,258 @@
package goconv

/*
#cgo CFLAGS: -I./ -I./inc -I/usr/local/cuda/include
#cgo CXXFLAGS: -I./ -I./inc -I/usr/local/cuda/include -std=c++11
#cgo LDFLAGS: -L/usr/local/cuda/lib64 -lnppig -lnppicc -lnppial -lnppisu -lcudart -ldl
#include <stdlib.h>
#include "conv.h"
*/
import "C"
import (
    "unsafe"

    "basic.com/valib/godraw.git"
    "basic.com/valib/gogpu.git"
    "github.com/disintegration/imaging"
)

const (
    // need is passed as the second argument of gogpu.SatisfyGPU when probing a GPU.
    // NOTE(review): the unit is not visible here (presumably MB of GPU memory) — confirm against gogpu.
    need     = 200
    // reserved is passed as the third argument of gogpu.SatisfyGPU when the
    // previously used GPU is re-checked. NOTE(review): unit presumably the same as need — confirm.
    reserved = 512
)

func gpuIndex(lastIndex int) int {
    indices := gogpu.RankGPU()
    if len(indices) == 0 {
        return -1
    }

    for _, v := range indices {
        if v != lastIndex {
            if gogpu.SatisfyGPU(v, need, need/2) {
                return v
            }
        }
    }

    if gogpu.SatisfyGPU(lastIndex, need, reserved) {
        return lastIndex
    }
    return -1
}

type convertor struct {
    width   int
    height  int
    rWidth  int
    rHeight int
    conv    C.convHandle
}

var convts []*convertor

func find(w, h, rw, rh int) *convertor {
    for _, v := range convts {
        if v.width == w && v.height == h && v.rWidth == rw && v.rHeight == rh {
            return v
        }
    }
    gpu := gpuIndex(0)
    if gpu < 0 {
        return nil
    }
    cw := C.conv_create(C.int(w), C.int(h), C.int(rw), C.int(rh), C.int(gpu))
    if cw == nil {
        return nil
    }
    c := &convertor{w, h, rw, rh, cw}
    convts = append(convts, c)
    return c
}

// YUV2BGR yuv->bgr
func YUV2BGR(yuv []byte, w, h int) []byte {

    cw := find(w, h, -1, -1)
    if cw == nil {
        return yuv2bgr(yuv, w, h)
    }
    var bgr *C.uchar
    var bgrLen C.int
    ret := C.yuv2bgr(cw.conv, unsafe.Pointer(&yuv[0]), &bgr, &bgrLen)
    if ret != 0 {
        return nil
    }
    const maxLen = 0x7fffffff
    goBGRLen := int(bgrLen)
    if goBGRLen > 0 {
        return (*[maxLen]byte)(unsafe.Pointer(bgr))[:goBGRLen:goBGRLen]
    }
    return nil
}

// YUV2ResizedBGR yuv -> resized bgr
func YUV2ResizedBGR(yuv []byte, w, h, rw, rh int) []byte {

    cw := find(w, h, rw, rh)
    if cw == nil {
        bgr := yuv2bgr(yuv, w, h)
        return bgresize(bgr, w, h, rw, rh)
    }
    var bgr *C.uchar
    var bgrLen C.int
    ret := C.yuv2resizedbgr(cw.conv, unsafe.Pointer(&yuv[0]), &bgr, &bgrLen)
    if ret != 0 {
        return nil
    }
    const maxLen = 0x7fffffff
    goBGRLen := int(bgrLen)
    if goBGRLen > 0 {
        return (*[maxLen]byte)(unsafe.Pointer(bgr))[:goBGRLen:goBGRLen]
    }
    return nil

}

// ResizeBGR resize
func ResizeBGR(bgrO []byte, w, h, rw, rh int) []byte {
    if (rw < 0 && rh < 0) || (rw > w && rh > h) {
        return bgrO
    }

    cw := find(w, h, rw, rh)
    if cw == nil {
        return bgresize(bgrO, w, h, rw, rh)
    }

    var bgr *C.uchar
    var bgrLen C.int
    ret := C.resizebgr(cw.conv, unsafe.Pointer(&bgrO[0]), &bgr, &bgrLen)
    if ret != 0 {
        return nil
    }
    const maxLen = 0x7fffffff
    goBGRLen := int(bgrLen)
    if goBGRLen > 0 {
        return (*[maxLen]byte)(unsafe.Pointer(bgr))[:goBGRLen:goBGRLen]
    }
    return nil
}

// ResizeYUV yuv
func ResizeYUV(yuv []byte, w, h, rw, rh int) []byte {
    if (rw < 0 && rh < 0) || (rw > w && rh > h) {
        return yuv
    }

    cw := find(w, h, rw, rh)
    if cw == nil {
        return yuv
    }

    var resized *C.uchar
    var resizedLen C.int
    ret := C.resizeyuv(cw.conv, unsafe.Pointer(&yuv[0]), &resized, &resizedLen)
    if ret != 0 {
        return nil
    }

    const maxLen = 0x7fffffff
    goResizedLen := int(resizedLen)
    if goResizedLen > 0 {
        return (*[maxLen]byte)(unsafe.Pointer(resized))[:goResizedLen:goResizedLen]
    }
    return nil
}

// YUV2BGRandResize conv and resize
func YUV2BGRandResize(yuv []byte, w, h, rw, rh int) ([]byte, []byte) {
    cw := find(w, h, rw, rh)
    if cw == nil {
        origin := yuv2bgr(yuv, w, h)
        resized := bgresize(origin, w, h, rw, rh)
        return origin, resized
    }

    var bgr *C.uchar
    var bgrLen C.int
    var scale *C.uchar
    var scaleLen C.int

    ret := C.yuv2bgrandresize(cw.conv, unsafe.Pointer(&yuv[0]), &bgr, &bgrLen, &scale, &scaleLen)

    if ret != 0 {
        return nil, nil
    }
    var out, resized []byte

    const maxLen = 0x7fffffff
    goBGRLen, goScaleLen := int(bgrLen), int(scaleLen)
    if goBGRLen > 0 {
        out = (*[maxLen]byte)(unsafe.Pointer(bgr))[:goBGRLen:goBGRLen]
    }
    if goScaleLen > 0 {
        resized = (*[maxLen]byte)(unsafe.Pointer(scale))[:goScaleLen:goScaleLen]
    }
    return out, resized

}

// Free free
func Free() {
    for _, v := range convts {
        if v.conv != nil {
            C.conv_destroy(v.conv)
        }
    }
}

func yuv2bgr(yuv []byte, w, h int) []byte {

    data := make([]byte, 0, w*h*3)
    start := w * h
    for i := 0; i < h; i++ {
        for j := 0; j < w; j++ {

            index := i/2*w + j - (j & 0x01)

            y := int32(yuv[j+i*w])
            v := int32(yuv[start+index])
            u := int32(yuv[start+index+1])

            r := y + (140*(v-128))/100
            g := y - (34*(u-128)+71*(v-128))/100
            b := y + (177*(u-128))/100

            if r > 255 {
                r = 255
            }
            if r < 0 {
                r = 0
            }
            if g > 255 {
                g = 255
            }
            if g < 0 {
                g = 0
            }
            if b > 255 {
                b = 255
            }
            if b < 0 {
                b = 0
            }
            data = append(data, byte(r), byte(g), byte(b))
        }
    }
    return data
}

func bgresize(bgr []byte, w, h, rw, rh int) []byte {
    img, err := godraw.ToImage(bgr, w, h)
    if err != nil {
        return nil
    }
    dstImg := imaging.Resize(img, rw, rh, imaging.NearestNeighbor)
    return godraw.Image2BGR(dstImg)
}

 goconv/inc/Exceptions.h

New file
@@ -0,0 +1,181 @@
/**
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#ifndef NV_UTIL_NPP_EXCEPTIONS_H
#define NV_UTIL_NPP_EXCEPTIONS_H


#include <string>
#include <sstream>
#include <iostream>

/// All npp related C++ classes are put into the npp namespace.
namespace npp
{

    /// Exception base class.
    ///     This exception base class will be used for everything C++ throught
    /// the NPP project.
    ///     The exception contains a string message, as well as data fields for a string
    /// containing the name of the file as well as the line number where the exception was thrown.
    ///     The easiest way of throwing exceptions and providing filename and line number is
    /// to use one of the ASSERT macros defined for that purpose.
    class Exception
    {
        public:
            /// Constructor.
            /// \param rMessage A message with information as to why the exception was thrown.
            /// \param rFileName The name of the file where the exception was thrown.
            /// \param nLineNumber Line number in the file where the exception was thrown.
            explicit
            Exception(const std::string &rMessage = "", const std::string &rFileName = "", unsigned int nLineNumber = 0)
                : sMessage_(rMessage), sFileName_(rFileName), nLineNumber_(nLineNumber)
            { };

            Exception(const Exception &rException)
                : sMessage_(rException.sMessage_), sFileName_(rException.sFileName_), nLineNumber_(rException.nLineNumber_)
            { };

            virtual
            ~Exception()
            { };

            /// Get the exception's message.
            const
            std::string &
            message()
            const
            {
                return sMessage_;
            }

            /// Get the exception's file info.
            const
            std::string &
            fileName()
            const
            {
                return sFileName_;
            }

            /// Get the exceptions's line info.
            unsigned int
            lineNumber()
            const
            {
                return nLineNumber_;
            }


            /// Create a clone of this exception.
            ///      This creates a new Exception object on the heap. It is
            /// the responsibility of the user of this function to free this memory
            /// (delete x).
            virtual
            Exception *
            clone()
            const
            {
                return new Exception(*this);
            }

            /// Create a single string with all the exceptions information.
            ///     The virtual toString() method is used by the operator<<()
            /// so that all exceptions derived from this base-class can print
            /// their full information correctly even if a reference to their
            /// exact type is not had at the time of printing (i.e. the basic
            /// operator<<() is used).
            virtual
            std::string
            toString()
            const
            {
                std::ostringstream oOutputString;
                oOutputString << fileName() << ":" << lineNumber() << ": " << message();
                return oOutputString.str();
            }

        private:
            std::string sMessage_;      ///< Message regarding the cause of the exception.
            std::string sFileName_;     ///< Name of the file where the exception was thrown.
            unsigned int nLineNumber_;  ///< Line number in the file where the exception was thrown
    };

    /// Output stream inserter for Exception.
    /// \param rOutputStream The stream the exception information is written to.
    /// \param rException The exception that's being written.
    /// \return Reference to the output stream being used.
    static std::ostream &
    operator << (std::ostream &rOutputStream, const Exception &rException)
    {
        rOutputStream << rException.toString();
        return rOutputStream;
    }

    /// Basic assert macro.
    ///     This macro should be used to enforce any kind of pre or post conditions.
    /// Unlike the C-runtime assert macro, this macro does not abort execution, but throws
    /// a C++ exception. The exception is automatically filled with information about the failing
    /// condition, the filename and line number where the exception was thrown.
    /// \note The macro is written in such a way that omitting a semicolon after its usage
    ///     causes a compiler error. The correct way to invoke this macro is:
    /// NPP_ASSERT(n < MAX);
#define NPP_ASSERT(C) do {if (!(C)) throw npp::Exception(#C " assertion faild!", __FILE__, __LINE__);} while(false)

    // ASSERT macro with message.
    //  Same functionality as the basic assert macro with the added ability to pass
    //  a message M. M must be a string literal (it is concatenated at compile time).
    //  Note: avoid side effects in the condition. As defined here this macro is
    //      always compiled in; only NPP_DEBUG_ASSERT becomes a no-op outside _DEBUG builds.
#define NPP_ASSERT_MSG(C, M) do {if (!(C)) throw npp::Exception(#C " assertion faild! Message: " M, __FILE__, __LINE__);} while(false)

#ifdef _DEBUG
    /// Basic debug assert macro.
    ///     This macro is identical in every respect to NPP_ASSERT(C) but it does get compiled to a
    /// no-op in release builds. It is therefore of utmost importance to not put statements into
    /// this macro that cause side effects required for correct program execution.
#define NPP_DEBUG_ASSERT(C) do {if (!(C)) throw npp::Exception(#C " debug assertion faild!", __FILE__, __LINE__);} while(false)
#else
#define NPP_DEBUG_ASSERT(C)
#endif

    /// ASSERT for null-pointer test.
    /// It is safe to put code with side effects into this macro. Also: This macro never
    /// gets compiled to a no-op because resource allocation may fail based on external causes not under
    /// control of a software developer.
    /// Note that P appears once as an evaluated expression and once stringified for the message.
#define NPP_ASSERT_NOT_NULL(P) do {if ((P) == 0) throw npp::Exception(#P " not null assertion faild!", __FILE__, __LINE__);} while(false)

    /// Macro for flagging methods as not implemented.
    /// The macro throws an exception with a message that an implementation was missing
#define NPP_NOT_IMPLEMENTED() do {throw npp::Exception("Implementation missing!", __FILE__, __LINE__);} while(false)

    /// Macro for checking error return code of CUDA (runtime) calls.
    /// Evaluates S once, prints the numeric error code to std::cout on failure and
    /// then throws npp::Exception through NPP_ASSERT.
    /// This macro never gets disabled.
#define NPP_CHECK_CUDA(S) do {cudaError_t eCUDAResult; \
        eCUDAResult = S; \
        if (eCUDAResult != cudaSuccess) std::cout << "NPP_CHECK_CUDA - eCUDAResult = " << eCUDAResult << std::endl; \
        NPP_ASSERT(eCUDAResult == cudaSuccess);} while (false)

    /// Macro for checking error return code for NPP calls.
    /// Evaluates S once, prints the status name (via _cudaGetErrorEnum) and numeric
    /// value to std::cout on failure and then throws npp::Exception through NPP_ASSERT.
#define NPP_CHECK_NPP(S) do {NppStatus eStatusNPP; \
        eStatusNPP = S; \
        if (eStatusNPP != NPP_SUCCESS) std::cout << "NPP_CHECK_NPP - eStatusNPP = " << _cudaGetErrorEnum(eStatusNPP) << "("<< eStatusNPP << ")" << std::endl; \
        NPP_ASSERT(eStatusNPP == NPP_SUCCESS);} while (false)

    /// Macro for checking error return codes from cuFFT calls.
#define NPP_CHECK_CUFFT(S) do {cufftResult eCUFFTResult; \
        eCUFFTResult = S; \
        if (eCUFFTResult != NPP_SUCCESS) std::cout << "NPP_CHECK_CUFFT - eCUFFTResult = " << eCUFFTResult << std::endl; \
        NPP_ASSERT(eCUFFTResult == CUFFT_SUCCESS);} while (false)

} // npp namespace

#endif // NV_UTIL_NPP_EXCEPTIONS_H

 goconv/inc/helper_cuda.h

New file
@@ -0,0 +1,1261 @@
/**
 * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions for initialization and error checking

#ifndef HELPER_CUDA_H
#define HELPER_CUDA_H

#pragma once

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <helper_string.h>

// Exit code 2 for EXIT_WAIVED, provided only when the includer has not defined it already.
#if !defined(EXIT_WAIVED)
#define EXIT_WAIVED 2
#endif

// Note: your SDK sample is required to include the proper header files. Please
// refer to the CUDA examples for the needed CUDA headers, which may change depending
// on which CUDA functions are used.

// CUDA Runtime error messages
#ifdef __DRIVER_TYPES_H__
static const char *_cudaGetErrorEnum(cudaError_t error)
{
    switch (error)
    {
        case cudaSuccess:
            return "cudaSuccess";

        case cudaErrorMissingConfiguration:
            return "cudaErrorMissingConfiguration";

        case cudaErrorMemoryAllocation:
            return "cudaErrorMemoryAllocation";

        case cudaErrorInitializationError:
            return "cudaErrorInitializationError";

        case cudaErrorLaunchFailure:
            return "cudaErrorLaunchFailure";

        case cudaErrorPriorLaunchFailure:
            return "cudaErrorPriorLaunchFailure";

        case cudaErrorLaunchTimeout:
            return "cudaErrorLaunchTimeout";

        case cudaErrorLaunchOutOfResources:
            return "cudaErrorLaunchOutOfResources";

        case cudaErrorInvalidDeviceFunction:
            return "cudaErrorInvalidDeviceFunction";

        case cudaErrorInvalidConfiguration:
            return "cudaErrorInvalidConfiguration";

        case cudaErrorInvalidDevice:
            return "cudaErrorInvalidDevice";

        case cudaErrorInvalidValue:
            return "cudaErrorInvalidValue";

        case cudaErrorInvalidPitchValue:
            return "cudaErrorInvalidPitchValue";

        case cudaErrorInvalidSymbol:
            return "cudaErrorInvalidSymbol";

        case cudaErrorMapBufferObjectFailed:
            return "cudaErrorMapBufferObjectFailed";

        case cudaErrorUnmapBufferObjectFailed:
            return "cudaErrorUnmapBufferObjectFailed";

        case cudaErrorInvalidHostPointer:
            return "cudaErrorInvalidHostPointer";

        case cudaErrorInvalidDevicePointer:
            return "cudaErrorInvalidDevicePointer";

        case cudaErrorInvalidTexture:
            return "cudaErrorInvalidTexture";

        case cudaErrorInvalidTextureBinding:
            return "cudaErrorInvalidTextureBinding";

        case cudaErrorInvalidChannelDescriptor:
            return "cudaErrorInvalidChannelDescriptor";

        case cudaErrorInvalidMemcpyDirection:
            return "cudaErrorInvalidMemcpyDirection";

        case cudaErrorAddressOfConstant:
            return "cudaErrorAddressOfConstant";

        case cudaErrorTextureFetchFailed:
            return "cudaErrorTextureFetchFailed";

        case cudaErrorTextureNotBound:
            return "cudaErrorTextureNotBound";

        case cudaErrorSynchronizationError:
            return "cudaErrorSynchronizationError";

        case cudaErrorInvalidFilterSetting:
            return "cudaErrorInvalidFilterSetting";

        case cudaErrorInvalidNormSetting:
            return "cudaErrorInvalidNormSetting";

        case cudaErrorMixedDeviceExecution:
            return "cudaErrorMixedDeviceExecution";

        case cudaErrorCudartUnloading:
            return "cudaErrorCudartUnloading";

        case cudaErrorUnknown:
            return "cudaErrorUnknown";

        case cudaErrorNotYetImplemented:
            return "cudaErrorNotYetImplemented";

        case cudaErrorMemoryValueTooLarge:
            return "cudaErrorMemoryValueTooLarge";

        case cudaErrorInvalidResourceHandle:
            return "cudaErrorInvalidResourceHandle";

        case cudaErrorNotReady:
            return "cudaErrorNotReady";

        case cudaErrorInsufficientDriver:
            return "cudaErrorInsufficientDriver";

        case cudaErrorSetOnActiveProcess:
            return "cudaErrorSetOnActiveProcess";

        case cudaErrorInvalidSurface:
            return "cudaErrorInvalidSurface";

        case cudaErrorNoDevice:
            return "cudaErrorNoDevice";

        case cudaErrorECCUncorrectable:
            return "cudaErrorECCUncorrectable";

        case cudaErrorSharedObjectSymbolNotFound:
            return "cudaErrorSharedObjectSymbolNotFound";

        case cudaErrorSharedObjectInitFailed:
            return "cudaErrorSharedObjectInitFailed";

        case cudaErrorUnsupportedLimit:
            return "cudaErrorUnsupportedLimit";

        case cudaErrorDuplicateVariableName:
            return "cudaErrorDuplicateVariableName";

        case cudaErrorDuplicateTextureName:
            return "cudaErrorDuplicateTextureName";

        case cudaErrorDuplicateSurfaceName:
            return "cudaErrorDuplicateSurfaceName";

        case cudaErrorDevicesUnavailable:
            return "cudaErrorDevicesUnavailable";

        case cudaErrorInvalidKernelImage:
            return "cudaErrorInvalidKernelImage";

        case cudaErrorNoKernelImageForDevice:
            return "cudaErrorNoKernelImageForDevice";

        case cudaErrorIncompatibleDriverContext:
            return "cudaErrorIncompatibleDriverContext";

        case cudaErrorPeerAccessAlreadyEnabled:
            return "cudaErrorPeerAccessAlreadyEnabled";

        case cudaErrorPeerAccessNotEnabled:
            return "cudaErrorPeerAccessNotEnabled";

        case cudaErrorDeviceAlreadyInUse:
            return "cudaErrorDeviceAlreadyInUse";

        case cudaErrorProfilerDisabled:
            return "cudaErrorProfilerDisabled";

        case cudaErrorProfilerNotInitialized:
            return "cudaErrorProfilerNotInitialized";

        case cudaErrorProfilerAlreadyStarted:
            return "cudaErrorProfilerAlreadyStarted";

        case cudaErrorProfilerAlreadyStopped:
            return "cudaErrorProfilerAlreadyStopped";

        /* Since CUDA 4.0*/
        case cudaErrorAssert:
            return "cudaErrorAssert";

        case cudaErrorTooManyPeers:
            return "cudaErrorTooManyPeers";

        case cudaErrorHostMemoryAlreadyRegistered:
            return "cudaErrorHostMemoryAlreadyRegistered";

        case cudaErrorHostMemoryNotRegistered:
            return "cudaErrorHostMemoryNotRegistered";

        /* Since CUDA 5.0 */
        case cudaErrorOperatingSystem:
            return "cudaErrorOperatingSystem";

        case cudaErrorPeerAccessUnsupported:
            return "cudaErrorPeerAccessUnsupported";

        case cudaErrorLaunchMaxDepthExceeded:
            return "cudaErrorLaunchMaxDepthExceeded";

        case cudaErrorLaunchFileScopedTex:
            return "cudaErrorLaunchFileScopedTex";

        case cudaErrorLaunchFileScopedSurf:
            return "cudaErrorLaunchFileScopedSurf";

        case cudaErrorSyncDepthExceeded:
            return "cudaErrorSyncDepthExceeded";

        case cudaErrorLaunchPendingCountExceeded:
            return "cudaErrorLaunchPendingCountExceeded";

        case cudaErrorNotPermitted:
            return "cudaErrorNotPermitted";

        case cudaErrorNotSupported:
            return "cudaErrorNotSupported";

        /* Since CUDA 6.0 */
        case cudaErrorHardwareStackError:
            return "cudaErrorHardwareStackError";

        case cudaErrorIllegalInstruction:
            return "cudaErrorIllegalInstruction";

        case cudaErrorMisalignedAddress:
            return "cudaErrorMisalignedAddress";

        case cudaErrorInvalidAddressSpace:
            return "cudaErrorInvalidAddressSpace";

        case cudaErrorInvalidPc:
            return "cudaErrorInvalidPc";

        case cudaErrorIllegalAddress:
            return "cudaErrorIllegalAddress";

        /* Since CUDA 6.5*/
        case cudaErrorInvalidPtx:
            return "cudaErrorInvalidPtx";

        case cudaErrorInvalidGraphicsContext:
            return "cudaErrorInvalidGraphicsContext";

        case cudaErrorStartupFailure:
            return "cudaErrorStartupFailure";

        case cudaErrorApiFailureBase:
            return "cudaErrorApiFailureBase";
    }

    return "<unknown>";
}
#endif

#ifdef __cuda_cuda_h__
// CUDA Driver API errors
static const char *_cudaGetErrorEnum(CUresult error)
{
    switch (error)
    {
        case CUDA_SUCCESS:
            return "CUDA_SUCCESS";

        case CUDA_ERROR_INVALID_VALUE:
            return "CUDA_ERROR_INVALID_VALUE";

        case CUDA_ERROR_OUT_OF_MEMORY:
            return "CUDA_ERROR_OUT_OF_MEMORY";

        case CUDA_ERROR_NOT_INITIALIZED:
            return "CUDA_ERROR_NOT_INITIALIZED";

        case CUDA_ERROR_DEINITIALIZED:
            return "CUDA_ERROR_DEINITIALIZED";

        case CUDA_ERROR_PROFILER_DISABLED:
            return "CUDA_ERROR_PROFILER_DISABLED";

        case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
            return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";

        case CUDA_ERROR_PROFILER_ALREADY_STARTED:
            return "CUDA_ERROR_PROFILER_ALREADY_STARTED";

        case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
            return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";

        case CUDA_ERROR_NO_DEVICE:
            return "CUDA_ERROR_NO_DEVICE";

        case CUDA_ERROR_INVALID_DEVICE:
            return "CUDA_ERROR_INVALID_DEVICE";

        case CUDA_ERROR_INVALID_IMAGE:
            return "CUDA_ERROR_INVALID_IMAGE";

        case CUDA_ERROR_INVALID_CONTEXT:
            return "CUDA_ERROR_INVALID_CONTEXT";

        case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
            return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";

        case CUDA_ERROR_MAP_FAILED:
            return "CUDA_ERROR_MAP_FAILED";

        case CUDA_ERROR_UNMAP_FAILED:
            return "CUDA_ERROR_UNMAP_FAILED";

        case CUDA_ERROR_ARRAY_IS_MAPPED:
            return "CUDA_ERROR_ARRAY_IS_MAPPED";

        case CUDA_ERROR_ALREADY_MAPPED:
            return "CUDA_ERROR_ALREADY_MAPPED";

        case CUDA_ERROR_NO_BINARY_FOR_GPU:
            return "CUDA_ERROR_NO_BINARY_FOR_GPU";

        case CUDA_ERROR_ALREADY_ACQUIRED:
            return "CUDA_ERROR_ALREADY_ACQUIRED";

        case CUDA_ERROR_NOT_MAPPED:
            return "CUDA_ERROR_NOT_MAPPED";

        case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
            return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";

        case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
            return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";

        case CUDA_ERROR_ECC_UNCORRECTABLE:
            return "CUDA_ERROR_ECC_UNCORRECTABLE";

        case CUDA_ERROR_UNSUPPORTED_LIMIT:
            return "CUDA_ERROR_UNSUPPORTED_LIMIT";

        case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
            return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";

        case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED:
            return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED";

        case CUDA_ERROR_INVALID_PTX:
            return "CUDA_ERROR_INVALID_PTX";

        case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT:
            return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT";

        case CUDA_ERROR_INVALID_SOURCE:
            return "CUDA_ERROR_INVALID_SOURCE";

        case CUDA_ERROR_FILE_NOT_FOUND:
            return "CUDA_ERROR_FILE_NOT_FOUND";

        case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
            return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";

        case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
            return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";

        case CUDA_ERROR_OPERATING_SYSTEM:
            return "CUDA_ERROR_OPERATING_SYSTEM";

        case CUDA_ERROR_INVALID_HANDLE:
            return "CUDA_ERROR_INVALID_HANDLE";

        case CUDA_ERROR_NOT_FOUND:
            return "CUDA_ERROR_NOT_FOUND";

        case CUDA_ERROR_NOT_READY:
            return "CUDA_ERROR_NOT_READY";

        case CUDA_ERROR_ILLEGAL_ADDRESS:
            return "CUDA_ERROR_ILLEGAL_ADDRESS";

        case CUDA_ERROR_LAUNCH_FAILED:
            return "CUDA_ERROR_LAUNCH_FAILED";

        case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
            return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";

        case CUDA_ERROR_LAUNCH_TIMEOUT:
            return "CUDA_ERROR_LAUNCH_TIMEOUT";

        case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
            return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";

        case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
            return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";

        case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
            return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";

        case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
            return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";

        case CUDA_ERROR_CONTEXT_IS_DESTROYED:
            return "CUDA_ERROR_CONTEXT_IS_DESTROYED";

        case CUDA_ERROR_ASSERT:
            return "CUDA_ERROR_ASSERT";

        case CUDA_ERROR_TOO_MANY_PEERS:
            return "CUDA_ERROR_TOO_MANY_PEERS";

        case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
            return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";

        case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
            return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";

        case CUDA_ERROR_HARDWARE_STACK_ERROR:
            return "CUDA_ERROR_HARDWARE_STACK_ERROR";

        case CUDA_ERROR_ILLEGAL_INSTRUCTION:
            return "CUDA_ERROR_ILLEGAL_INSTRUCTION";

        case CUDA_ERROR_MISALIGNED_ADDRESS:
            return "CUDA_ERROR_MISALIGNED_ADDRESS";

        case CUDA_ERROR_INVALID_ADDRESS_SPACE:
            return "CUDA_ERROR_INVALID_ADDRESS_SPACE";

        case CUDA_ERROR_INVALID_PC:
            return "CUDA_ERROR_INVALID_PC";

        case CUDA_ERROR_NOT_PERMITTED:
            return "CUDA_ERROR_NOT_PERMITTED";

        case CUDA_ERROR_NOT_SUPPORTED:
            return "CUDA_ERROR_NOT_SUPPORTED";

        case CUDA_ERROR_UNKNOWN:
            return "CUDA_ERROR_UNKNOWN";
    }

    return "<unknown>";
}
#endif

#ifdef CUBLAS_API_H_
// cuBLAS API errors
static const char *_cudaGetErrorEnum(cublasStatus_t error)
{
    switch (error)
    {
        case CUBLAS_STATUS_SUCCESS:
            return "CUBLAS_STATUS_SUCCESS";

        case CUBLAS_STATUS_NOT_INITIALIZED:
            return "CUBLAS_STATUS_NOT_INITIALIZED";

        case CUBLAS_STATUS_ALLOC_FAILED:
            return "CUBLAS_STATUS_ALLOC_FAILED";

        case CUBLAS_STATUS_INVALID_VALUE:
            return "CUBLAS_STATUS_INVALID_VALUE";

        case CUBLAS_STATUS_ARCH_MISMATCH:
            return "CUBLAS_STATUS_ARCH_MISMATCH";

        case CUBLAS_STATUS_MAPPING_ERROR:
            return "CUBLAS_STATUS_MAPPING_ERROR";

        case CUBLAS_STATUS_EXECUTION_FAILED:
            return "CUBLAS_STATUS_EXECUTION_FAILED";

        case CUBLAS_STATUS_INTERNAL_ERROR:
            return "CUBLAS_STATUS_INTERNAL_ERROR";

        case CUBLAS_STATUS_NOT_SUPPORTED:
            return "CUBLAS_STATUS_NOT_SUPPORTED";

        case CUBLAS_STATUS_LICENSE_ERROR:
            return "CUBLAS_STATUS_LICENSE_ERROR";
    }

    return "<unknown>";
}
#endif

#ifdef _CUFFT_H_
// cuFFT API errors
static const char *_cudaGetErrorEnum(cufftResult error)
{
    switch (error)
    {
        case CUFFT_SUCCESS:
            return "CUFFT_SUCCESS";

        case CUFFT_INVALID_PLAN:
            return "CUFFT_INVALID_PLAN";

        case CUFFT_ALLOC_FAILED:
            return "CUFFT_ALLOC_FAILED";

        case CUFFT_INVALID_TYPE:
            return "CUFFT_INVALID_TYPE";

        case CUFFT_INVALID_VALUE:
            return "CUFFT_INVALID_VALUE";

        case CUFFT_INTERNAL_ERROR:
            return "CUFFT_INTERNAL_ERROR";

        case CUFFT_EXEC_FAILED:
            return "CUFFT_EXEC_FAILED";

        case CUFFT_SETUP_FAILED:
            return "CUFFT_SETUP_FAILED";

        case CUFFT_INVALID_SIZE:
            return "CUFFT_INVALID_SIZE";

        case CUFFT_UNALIGNED_DATA:
            return "CUFFT_UNALIGNED_DATA";

        case CUFFT_INCOMPLETE_PARAMETER_LIST:
            return "CUFFT_INCOMPLETE_PARAMETER_LIST";

        case CUFFT_INVALID_DEVICE:
            return "CUFFT_INVALID_DEVICE";

        case CUFFT_PARSE_ERROR:
            return "CUFFT_PARSE_ERROR";

        case CUFFT_NO_WORKSPACE:
            return "CUFFT_NO_WORKSPACE";

        case CUFFT_NOT_IMPLEMENTED:
            return "CUFFT_NOT_IMPLEMENTED";

        case CUFFT_LICENSE_ERROR:
            return "CUFFT_LICENSE_ERROR";
    }

    return "<unknown>";
}
#endif


#ifdef CUSPARSEAPI
// cuSPARSE API errors
static const char *_cudaGetErrorEnum(cusparseStatus_t error)
{
    switch (error)
    {
        case CUSPARSE_STATUS_SUCCESS:
            return "CUSPARSE_STATUS_SUCCESS";

        case CUSPARSE_STATUS_NOT_INITIALIZED:
            return "CUSPARSE_STATUS_NOT_INITIALIZED";

        case CUSPARSE_STATUS_ALLOC_FAILED:
            return "CUSPARSE_STATUS_ALLOC_FAILED";

        case CUSPARSE_STATUS_INVALID_VALUE:
            return "CUSPARSE_STATUS_INVALID_VALUE";

        case CUSPARSE_STATUS_ARCH_MISMATCH:
            return "CUSPARSE_STATUS_ARCH_MISMATCH";

        case CUSPARSE_STATUS_MAPPING_ERROR:
            return "CUSPARSE_STATUS_MAPPING_ERROR";

        case CUSPARSE_STATUS_EXECUTION_FAILED:
            return "CUSPARSE_STATUS_EXECUTION_FAILED";

        case CUSPARSE_STATUS_INTERNAL_ERROR:
            return "CUSPARSE_STATUS_INTERNAL_ERROR";

        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
    }

    return "<unknown>";
}
#endif

#ifdef CUSOLVER_COMMON_H_
//cuSOLVER API errors
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
   switch(error)
   {
       case CUSOLVER_STATUS_SUCCESS:
           return "CUSOLVER_STATUS_SUCCESS";
       case CUSOLVER_STATUS_NOT_INITIALIZED:
           return "CUSOLVER_STATUS_NOT_INITIALIZED";
       case CUSOLVER_STATUS_ALLOC_FAILED:
           return "CUSOLVER_STATUS_ALLOC_FAILED";
       case CUSOLVER_STATUS_INVALID_VALUE:
           return "CUSOLVER_STATUS_INVALID_VALUE";
       case CUSOLVER_STATUS_ARCH_MISMATCH:
           return "CUSOLVER_STATUS_ARCH_MISMATCH";
       case CUSOLVER_STATUS_MAPPING_ERROR:
           return "CUSOLVER_STATUS_MAPPING_ERROR";
       case CUSOLVER_STATUS_EXECUTION_FAILED:
           return "CUSOLVER_STATUS_EXECUTION_FAILED";
       case CUSOLVER_STATUS_INTERNAL_ERROR:
           return "CUSOLVER_STATUS_INTERNAL_ERROR";
       case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
           return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
       case CUSOLVER_STATUS_NOT_SUPPORTED :
           return "CUSOLVER_STATUS_NOT_SUPPORTED ";
       case CUSOLVER_STATUS_ZERO_PIVOT:
           return "CUSOLVER_STATUS_ZERO_PIVOT";
       case CUSOLVER_STATUS_INVALID_LICENSE:
           return "CUSOLVER_STATUS_INVALID_LICENSE";
    }

    return "<unknown>";

}
#endif

#ifdef CURAND_H_
// cuRAND API errors
static const char *_cudaGetErrorEnum(curandStatus_t error)
{
    switch (error)
    {
        case CURAND_STATUS_SUCCESS:
            return "CURAND_STATUS_SUCCESS";

        case CURAND_STATUS_VERSION_MISMATCH:
            return "CURAND_STATUS_VERSION_MISMATCH";

        case CURAND_STATUS_NOT_INITIALIZED:
            return "CURAND_STATUS_NOT_INITIALIZED";

        case CURAND_STATUS_ALLOCATION_FAILED:
            return "CURAND_STATUS_ALLOCATION_FAILED";

        case CURAND_STATUS_TYPE_ERROR:
            return "CURAND_STATUS_TYPE_ERROR";

        case CURAND_STATUS_OUT_OF_RANGE:
            return "CURAND_STATUS_OUT_OF_RANGE";

        case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
            return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";

        case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
            return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";

        case CURAND_STATUS_LAUNCH_FAILURE:
            return "CURAND_STATUS_LAUNCH_FAILURE";

        case CURAND_STATUS_PREEXISTING_FAILURE:
            return "CURAND_STATUS_PREEXISTING_FAILURE";

        case CURAND_STATUS_INITIALIZATION_FAILED:
            return "CURAND_STATUS_INITIALIZATION_FAILED";

        case CURAND_STATUS_ARCH_MISMATCH:
            return "CURAND_STATUS_ARCH_MISMATCH";

        case CURAND_STATUS_INTERNAL_ERROR:
            return "CURAND_STATUS_INTERNAL_ERROR";
    }

    return "<unknown>";
}
#endif

#ifdef NV_NPPIDEFS_H
// NPP API errors
static const char *_cudaGetErrorEnum(NppStatus error)
{
    switch (error)
    {
        case NPP_NOT_SUPPORTED_MODE_ERROR:
            return "NPP_NOT_SUPPORTED_MODE_ERROR";

        case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
            return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";

        case NPP_RESIZE_NO_OPERATION_ERROR:
            return "NPP_RESIZE_NO_OPERATION_ERROR";

        case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
            return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";

#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000

        case NPP_BAD_ARG_ERROR:
            return "NPP_BAD_ARGUMENT_ERROR";

        case NPP_COEFF_ERROR:
            return "NPP_COEFFICIENT_ERROR";

        case NPP_RECT_ERROR:
            return "NPP_RECTANGLE_ERROR";

        case NPP_QUAD_ERROR:
            return "NPP_QUADRANGLE_ERROR";

        case NPP_MEM_ALLOC_ERR:
            return "NPP_MEMORY_ALLOCATION_ERROR";

        case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
            return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";

        case NPP_INVALID_INPUT:
            return "NPP_INVALID_INPUT";

        case NPP_POINTER_ERROR:
            return "NPP_POINTER_ERROR";

        case NPP_WARNING:
            return "NPP_WARNING";

        case NPP_ODD_ROI_WARNING:
            return "NPP_ODD_ROI_WARNING";
#else

            // These are for CUDA 5.5 or higher
        case NPP_BAD_ARGUMENT_ERROR:
            return "NPP_BAD_ARGUMENT_ERROR";

        case NPP_COEFFICIENT_ERROR:
            return "NPP_COEFFICIENT_ERROR";

        case NPP_RECTANGLE_ERROR:
            return "NPP_RECTANGLE_ERROR";

        case NPP_QUADRANGLE_ERROR:
            return "NPP_QUADRANGLE_ERROR";

        case NPP_MEMORY_ALLOCATION_ERR:
            return "NPP_MEMORY_ALLOCATION_ERROR";

        case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
            return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";

        case NPP_INVALID_HOST_POINTER_ERROR:
            return "NPP_INVALID_HOST_POINTER_ERROR";

        case NPP_INVALID_DEVICE_POINTER_ERROR:
            return "NPP_INVALID_DEVICE_POINTER_ERROR";
#endif

        case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
            return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";

        case NPP_TEXTURE_BIND_ERROR:
            return "NPP_TEXTURE_BIND_ERROR";

        case NPP_WRONG_INTERSECTION_ROI_ERROR:
            return "NPP_WRONG_INTERSECTION_ROI_ERROR";

        case NPP_NOT_EVEN_STEP_ERROR:
            return "NPP_NOT_EVEN_STEP_ERROR";

        case NPP_INTERPOLATION_ERROR:
            return "NPP_INTERPOLATION_ERROR";

        case NPP_RESIZE_FACTOR_ERROR:
            return "NPP_RESIZE_FACTOR_ERROR";

        case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
            return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";


#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000

        case NPP_MEMFREE_ERR:
            return "NPP_MEMFREE_ERR";

        case NPP_MEMSET_ERR:
            return "NPP_MEMSET_ERR";

        case NPP_MEMCPY_ERR:
            return "NPP_MEMCPY_ERROR";

        case NPP_MIRROR_FLIP_ERR:
            return "NPP_MIRROR_FLIP_ERR";
#else

        case NPP_MEMFREE_ERROR:
            return "NPP_MEMFREE_ERROR";

        case NPP_MEMSET_ERROR:
            return "NPP_MEMSET_ERROR";

        case NPP_MEMCPY_ERROR:
            return "NPP_MEMCPY_ERROR";

        case NPP_MIRROR_FLIP_ERROR:
            return "NPP_MIRROR_FLIP_ERROR";
#endif

        case NPP_ALIGNMENT_ERROR:
            return "NPP_ALIGNMENT_ERROR";

        case NPP_STEP_ERROR:
            return "NPP_STEP_ERROR";

        case NPP_SIZE_ERROR:
            return "NPP_SIZE_ERROR";

        case NPP_NULL_POINTER_ERROR:
            return "NPP_NULL_POINTER_ERROR";

        case NPP_CUDA_KERNEL_EXECUTION_ERROR:
            return "NPP_CUDA_KERNEL_EXECUTION_ERROR";

        case NPP_NOT_IMPLEMENTED_ERROR:
            return "NPP_NOT_IMPLEMENTED_ERROR";

        case NPP_ERROR:
            return "NPP_ERROR";

        case NPP_SUCCESS:
            return "NPP_SUCCESS";

        case NPP_WRONG_INTERSECTION_QUAD_WARNING:
            return "NPP_WRONG_INTERSECTION_QUAD_WARNING";

        case NPP_MISALIGNED_DST_ROI_WARNING:
            return "NPP_MISALIGNED_DST_ROI_WARNING";

        case NPP_AFFINE_QUAD_INCORRECT_WARNING:
            return "NPP_AFFINE_QUAD_INCORRECT_WARNING";

        case NPP_DOUBLE_SIZE_WARNING:
            return "NPP_DOUBLE_SIZE_WARNING";

        case NPP_WRONG_INTERSECTION_ROI_WARNING:
            return "NPP_WRONG_INTERSECTION_ROI_WARNING";

#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
        /* These are 6.0 or higher */
        case NPP_LUT_PALETTE_BITSIZE_ERROR:
            return "NPP_LUT_PALETTE_BITSIZE_ERROR";

        case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
            return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";

        case NPP_QUALITY_INDEX_ERROR:
            return "NPP_QUALITY_INDEX_ERROR";

        case NPP_CHANNEL_ORDER_ERROR:
            return "NPP_CHANNEL_ORDER_ERROR";

        case NPP_ZERO_MASK_VALUE_ERROR:
            return "NPP_ZERO_MASK_VALUE_ERROR";

        case NPP_NUMBER_OF_CHANNELS_ERROR:
            return "NPP_NUMBER_OF_CHANNELS_ERROR";

        case NPP_COI_ERROR:
            return "NPP_COI_ERROR";

        case NPP_DIVISOR_ERROR:
            return "NPP_DIVISOR_ERROR";

        case NPP_CHANNEL_ERROR:
            return "NPP_CHANNEL_ERROR";

        case NPP_STRIDE_ERROR:
            return "NPP_STRIDE_ERROR";

        case NPP_ANCHOR_ERROR:
            return "NPP_ANCHOR_ERROR";

        case NPP_MASK_SIZE_ERROR:
            return "NPP_MASK_SIZE_ERROR";

        case NPP_MOMENT_00_ZERO_ERROR:
            return "NPP_MOMENT_00_ZERO_ERROR";

        case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
            return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";

        case NPP_THRESHOLD_ERROR:
            return "NPP_THRESHOLD_ERROR";

        case NPP_CONTEXT_MATCH_ERROR:
            return "NPP_CONTEXT_MATCH_ERROR";

        case NPP_FFT_FLAG_ERROR:
            return "NPP_FFT_FLAG_ERROR";

        case NPP_FFT_ORDER_ERROR:
            return "NPP_FFT_ORDER_ERROR";

        case NPP_SCALE_RANGE_ERROR:
            return "NPP_SCALE_RANGE_ERROR";

        case NPP_DATA_TYPE_ERROR:
            return "NPP_DATA_TYPE_ERROR";

        case NPP_OUT_OFF_RANGE_ERROR:
            return "NPP_OUT_OFF_RANGE_ERROR";

        case NPP_DIVIDE_BY_ZERO_ERROR:
            return "NPP_DIVIDE_BY_ZERO_ERROR";

        case NPP_RANGE_ERROR:
            return "NPP_RANGE_ERROR";

        case NPP_NO_MEMORY_ERROR:
            return "NPP_NO_MEMORY_ERROR";

        case NPP_ERROR_RESERVED:
            return "NPP_ERROR_RESERVED";

        case NPP_NO_OPERATION_WARNING:
            return "NPP_NO_OPERATION_WARNING";

        case NPP_DIVIDE_BY_ZERO_WARNING:
            return "NPP_DIVIDE_BY_ZERO_WARNING";
#endif

    }

    return "<unknown>";
}
#endif

// DEVICE_RESET is a statement-like macro used before exiting on a fatal error.
// An existing definition always wins. Otherwise it expands to a cudaDeviceReset()
// call when the CUDA runtime types header has been seen (__DRIVER_TYPES_H__), and
// to nothing when it has not.
#ifndef DEVICE_RESET
#ifdef __DRIVER_TYPES_H__
#define DEVICE_RESET cudaDeviceReset();
#else
#define DEVICE_RESET
#endif
#endif

template< typename T >
void check(T result, char const *const func, const char *const file, int const line)
{
    if (result)
    {
        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
                file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
        DEVICE_RESET
        // Make sure we call CUDA Device Reset before exiting
        exit(EXIT_FAILURE);
    }
}

#if defined(__DRIVER_TYPES_H__)
// Wraps a CUDA host call: forwards its status, the call text and the call site to check()
#define checkCudaErrors(val)           check ( (val), #val, __FILE__, __LINE__ )

// Queries cudaGetLastError and reports it together with msg and the call site
#define getLastCudaError(msg)      __getLastCudaError (msg, __FILE__, __LINE__)

// Fetches the last CUDA runtime error via cudaGetLastError(). If one is pending, prints
// it with the caller-supplied message, expands DEVICE_RESET and exits with EXIT_FAILURE.
inline void __getLastCudaError(const char *errorMessage, const char *file, const int line)
{
    const cudaError_t status = cudaGetLastError();

    if (status == cudaSuccess)
    {
        return;
    }

    fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
            file, line, errorMessage, (int)status, cudaGetErrorString(status));
    DEVICE_RESET
    exit(EXIT_FAILURE);
}
#endif

// Larger of two values. Arguments and the whole expansion are parenthesized so that
// operands containing lower-precedence operators (e.g. MAX(x & 3, y)) group as written.
// Note: the selected argument is still evaluated twice, so avoid side effects.
#ifndef MAX
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
#endif

// Float to int conversion, rounding half away from zero:
// 0.5 is added to non-negative inputs and subtracted from the rest (in double
// precision) before the truncating cast to int.
inline int ftoi(float value)
{
    if (value >= 0)
    {
        return (int)(value + 0.5);
    }

    return (int)(value - 0.5);
}

// Beginning of GPU Architecture definitions
/**
 * Maps a compute capability to the number of CUDA cores per multiprocessor (SM).
 *
 * @param major  major SM version
 * @param minor  minor SM version
 * @return cores per SM for the given version. For a version that is not in the
 *         table a notice is printed to stdout and the value of the last table
 *         entry is returned so that callers can keep running.
 *
 * The table covers Fermi through Hopper; without the post-Maxwell entries every
 * newer GPU would silently fall back to the last listed value.
 */
inline int _ConvertSMVer2Cores(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
    typedef struct
    {
        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores;
    } sSMtoCores;

    const sSMtoCores nGpuArchCoresPerSM[] =
    {
        { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
        { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
        { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
        { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
        { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
        { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
        { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
        { 0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class
        { 0x53, 128}, // Maxwell Generation (SM 5.3) GM20x class
        { 0x60, 64 }, // Pascal Generation (SM 6.0) GP100 class
        { 0x61, 128}, // Pascal Generation (SM 6.1) GP10x class
        { 0x62, 128}, // Pascal Generation (SM 6.2) GP10x class
        { 0x70, 64 }, // Volta Generation (SM 7.0) GV100 class
        { 0x72, 64 }, // Volta Generation (SM 7.2) GV11b class
        { 0x75, 64 }, // Turing Generation (SM 7.5) TU10x class
        { 0x80, 64 }, // Ampere Generation (SM 8.0) GA100 class
        { 0x86, 128}, // Ampere Generation (SM 8.6) GA10x class
        { 0x87, 128}, // Ampere Generation (SM 8.7) GA10x class
        { 0x89, 128}, // Ada Generation (SM 8.9) AD10x class
        { 0x90, 128}, // Hopper Generation (SM 9.0) GH100 class
        {   -1, -1 }  // sentinel, must stay last
    };

    // Encode the requested version the same way as the table keys
    const int wantedSM = (major << 4) + minor;
    int index = 0;

    while (nGpuArchCoresPerSM[index].SM != -1)
    {
        if (nGpuArchCoresPerSM[index].SM == wantedSM)
        {
            return nGpuArchCoresPerSM[index].Cores;
        }

        index++;
    }

    // If we don't find the values, we default use the previous one to run properly
    // (index now points at the sentinel, so index-1 is the last real entry)
    printf("MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores);
    return nGpuArchCoresPerSM[index-1].Cores;
}
// end of GPU Architecture definitions

#ifdef __CUDA_RUNTIME_H__
// General GPU Device CUDA Initialization
inline int gpuDeviceInit(int devID)
{
    int device_count;
    checkCudaErrors(cudaGetDeviceCount(&device_count));

    if (device_count == 0)
    {
        fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    if (devID < 0)
    {
        devID = 0;
    }

    if (devID > device_count-1)
    {
        fprintf(stderr, "\n");
        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count);
        fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID);
        fprintf(stderr, "\n");
        return -devID;
    }

    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    if (deviceProp.computeMode == cudaComputeModeProhibited)
    {
        fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
        return -1;
    }

    if (deviceProp.major < 1)
    {
        fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
        exit(EXIT_FAILURE);
    }

    checkCudaErrors(cudaSetDevice(devID));
    printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);

    return devID;
}

// This function returns the best GPU (with maximum GFLOPS)
inline int gpuGetMaxGflopsDeviceId()
{
    int current_device     = 0, sm_per_multiproc  = 0;
    int max_perf_device    = 0;
    int device_count       = 0, best_SM_arch      = 0;
    int devices_prohibited = 0;
    
    unsigned long long max_compute_perf = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceCount(&device_count);
    
    checkCudaErrors(cudaGetDeviceCount(&device_count));

    if (device_count == 0)
    {
        fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    // Find the best major SM Architecture GPU device
    while (current_device < device_count)
    {
        cudaGetDeviceProperties(&deviceProp, current_device);

        // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
        if (deviceProp.computeMode != cudaComputeModeProhibited)
        {
            if (deviceProp.major > 0 && deviceProp.major < 9999)
            {
                best_SM_arch = MAX(best_SM_arch, deviceProp.major);
            }
        }
        else
        {
            devices_prohibited++;
        }

        current_device++;
    }

    if (devices_prohibited == device_count)
    {
        fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have compute mode prohibited.\n");
        exit(EXIT_FAILURE);
    }

    // Find the best CUDA capable GPU device
    current_device = 0;

    while (current_device < device_count)
    {
        cudaGetDeviceProperties(&deviceProp, current_device);

        // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
        if (deviceProp.computeMode != cudaComputeModeProhibited)
        {
            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
            {
                sm_per_multiproc = 1;
            }
            else
            {
                sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
            }

            unsigned long long compute_perf  = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;

            if (compute_perf  > max_compute_perf)
            {
                // If we find GPU with SM major > 2, search only these
                if (best_SM_arch > 2)
                {
                    // If our device==dest_SM_arch, choose this, or else pass
                    if (deviceProp.major == best_SM_arch)
                    {
                        max_compute_perf  = compute_perf;
                        max_perf_device   = current_device;
                    }
                }
                else
                {
                    max_compute_perf  = compute_perf;
                    max_perf_device   = current_device;
                }
            }
        }

        ++current_device;
    }

    return max_perf_device;
}


// Initialization code to find the best CUDA Device
inline int findCudaDevice(int argc, const char **argv)
{
    cudaDeviceProp deviceProp;
    int devID = 0;

    // If the command-line has a device number specified, use it
    if (checkCmdLineFlag(argc, argv, "device"))
    {
        devID = getCmdLineArgumentInt(argc, argv, "device=");

        if (devID < 0)
        {
            printf("Invalid command line parameter\n ");
            exit(EXIT_FAILURE);
        }
        else
        {
            devID = gpuDeviceInit(devID);

            if (devID < 0)
            {
                printf("exiting...\n");
                exit(EXIT_FAILURE);
            }
        }
    }
    else
    {
        // Otherwise pick the device with highest Gflops/s
        devID = gpuGetMaxGflopsDeviceId();
        checkCudaErrors(cudaSetDevice(devID));
        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
    }

    return devID;
}

// General check for CUDA GPU SM Capabilities
inline bool checkCudaCapabilities(int major_version, int minor_version)
{
    cudaDeviceProp deviceProp;
    deviceProp.major = 0;
    deviceProp.minor = 0;
    int dev;

    checkCudaErrors(cudaGetDevice(&dev));
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));

    if ((deviceProp.major > major_version) ||
        (deviceProp.major == major_version && deviceProp.minor >= minor_version))
    {
        printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
        return true;
    }
    else
    {
        printf("  No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version);
        return false;
    }
}
#endif

// end of CUDA Helper Functions


#endif

 goconv/inc/helper_string.h

New file
@@ -0,0 +1,526 @@
/**
 * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

// These are helper functions for the SDK samples (string parsing, timers, etc)
#ifndef STRING_HELPER_H
#define STRING_HELPER_H

#include <stdio.h>
#include <stdlib.h>
#include <fstream>
#include <string>

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#ifndef _CRT_SECURE_NO_DEPRECATE
#define _CRT_SECURE_NO_DEPRECATE
#endif
#ifndef STRCASECMP
#define STRCASECMP  _stricmp
#endif
#ifndef STRNCASECMP
#define STRNCASECMP _strnicmp
#endif
#ifndef STRCPY
#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
#endif

#ifndef FOPEN
#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode)
#endif
#ifndef FOPEN_FAIL
#define FOPEN_FAIL(result) (result != 0)
#endif
#ifndef SSCANF
#define SSCANF sscanf_s
#endif
#ifndef SPRINTF
#define SPRINTF sprintf_s
#endif
#else // Linux Includes
#include <string.h>
#include <strings.h>

#ifndef STRCASECMP
#define STRCASECMP  strcasecmp
#endif
#ifndef STRNCASECMP
#define STRNCASECMP strncasecmp
#endif
#ifndef STRCPY
#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
#endif

#ifndef FOPEN
#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode))
#endif
#ifndef FOPEN_FAIL
#define FOPEN_FAIL(result) (result == NULL)
#endif
#ifndef SSCANF
#define SSCANF sscanf
#endif
#ifndef SPRINTF
#define SPRINTF sprintf
#endif
#endif

#ifndef EXIT_WAIVED
#define EXIT_WAIVED 2
#endif

// CUDA Utility Helper Functions
// Counts the leading `delimiter` characters of `string`.
//
// Returns the index of the first character that is not `delimiter`, or 0 when
// that index is at or beyond strlen(string) - 1 (i.e. at most one character
// would remain after the delimiters).
inline int stringRemoveDelimiter(char delimiter, const char *string)
{
    const char *cursor = string;

    while (*cursor == delimiter)
    {
        ++cursor;
    }

    const int skipped = (int)(cursor - string);
    const int lastIndex = (int)strlen(string) - 1;

    return (skipped >= lastIndex) ? 0 : skipped;
}

// Locates the extension of a file name.
//
// filename  : NUL-terminated name to inspect (may be NULL)
// extension : receives a pointer to the first character after the last '.'
//             inside `filename`, or NULL when there is no extension
//
// Returns the index of the first extension character, or 0 when there is no
// extension. A name is considered to have no extension when it is NULL or
// empty, contains no '.', starts with its only '.', or when the last '.' is
// followed by a path separator (the dot then belongs to a directory name).
inline int getFileExtension(char *filename, char **extension)
{
    *extension = NULL;

    if (filename == NULL)
    {
        return 0;
    }

    char *dot = strrchr(filename, '.');

    // No dot at all, or a leading dot with nothing in front of it
    if (dot == NULL || dot == filename)
    {
        return 0;
    }

    // A separator after the dot means the dot is part of a directory component
    if (strchr(dot, '/') != NULL || strchr(dot, '\\') != NULL)
    {
        return 0;
    }

    *extension = dot + 1;
    return (int)(dot + 1 - filename);
}


inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
{
    bool bFound = false;

    if (argc >= 1)
    {
        for (int i=1; i < argc; i++)
        {
            int string_start = stringRemoveDelimiter('-', argv[i]);
            const char *string_argv = &argv[i][string_start];

            const char *equal_pos = strchr(string_argv, '=');
            int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);

            int length = (int)strlen(string_ref);

            if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
            {
                bFound = true;
                continue;
            }
        }
    }

    return bFound;
}

// This function wraps the CUDA Driver API into a template function
template <class T>
inline bool getCmdLineArgumentValue(const int argc, const char **argv, const char *string_ref, T *value)
{
    bool bFound = false;

    if (argc >= 1)
    {
        for (int i=1; i < argc; i++)
        {
            int string_start = stringRemoveDelimiter('-', argv[i]);
            const char *string_argv = &argv[i][string_start];
            int length = (int)strlen(string_ref);

            if (!STRNCASECMP(string_argv, string_ref, length))
            {
                if (length+1 <= (int)strlen(string_argv))
                {
                    int auto_inc = (string_argv[length] == '=') ? 1 : 0;
                    *value = (T)atoi(&string_argv[length + auto_inc]);
                }

                bFound = true;
                i=argc;
            }
        }
    }

    return bFound;
}

inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
{
    bool bFound = false;
    int value = -1;

    if (argc >= 1)
    {
        for (int i=1; i < argc; i++)
        {
            int string_start = stringRemoveDelimiter('-', argv[i]);
            const char *string_argv = &argv[i][string_start];
            int length = (int)strlen(string_ref);

            if (!STRNCASECMP(string_argv, string_ref, length))
            {
                if (length+1 <= (int)strlen(string_argv))
                {
                    int auto_inc = (string_argv[length] == '=') ? 1 : 0;
                    value = atoi(&string_argv[length + auto_inc]);
                }
                else
                {
                    value = 0;
                }

                bFound = true;
                continue;
            }
        }
    }

    if (bFound)
    {
        return value;
    }
    else
    {
        return 0;
    }
}

inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref)
{
    bool bFound = false;
    float value = -1;

    if (argc >= 1)
    {
        for (int i=1; i < argc; i++)
        {
            int string_start = stringRemoveDelimiter('-', argv[i]);
            const char *string_argv = &argv[i][string_start];
            int length = (int)strlen(string_ref);

            if (!STRNCASECMP(string_argv, string_ref, length))
            {
                if (length+1 <= (int)strlen(string_argv))
                {
                    int auto_inc = (string_argv[length] == '=') ? 1 : 0;
                    value = (float)atof(&string_argv[length + auto_inc]);
                }
                else
                {
                    value = 0.f;
                }

                bFound = true;
                continue;
            }
        }
    }

    if (bFound)
    {
        return value;
    }
    else
    {
        return 0;
    }
}

inline bool getCmdLineArgumentString(const int argc, const char **argv,
                                     const char *string_ref, char **string_retval)
{
    bool bFound = false;

    if (argc >= 1)
    {
        for (int i=1; i < argc; i++)
        {
            int string_start = stringRemoveDelimiter('-', argv[i]);
            char *string_argv = (char *)&argv[i][string_start];
            int length = (int)strlen(string_ref);

            if (!STRNCASECMP(string_argv, string_ref, length))
            {
                *string_retval = &string_argv[length+1];
                bFound = true;
                continue;
            }
        }
    }

    if (!bFound)
    {
        *string_retval = NULL;
    }

    return bFound;
}

//////////////////////////////////////////////////////////////////////////////
//! Find the path for a file assuming that
//! files are found in the searchPath.
//!
//! @return the path if succeeded, otherwise 0
//! @param filename         name of the file
//! @param executable_path  optional absolute path of the executable
//////////////////////////////////////////////////////////////////////////////
inline char *sdkFindFilePath(const char *filename, const char *executable_path)
{
    // <executable_name> defines a variable that is replaced with the name of the executable

    // Typical relative search paths to locate needed companion files (e.g. sample input data, or JIT source files)
    // The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching the .exe or .bat, etc
    const char *searchPath[] =
    {
        "./",                                       // same dir
        "./common/",                                // "/common/" subdir
        "./common/data/",                           // "/common/data/" subdir
        "./data/",                                  // "/data/" subdir
        "./src/",                                   // "/src/" subdir
        "./src/<executable_name>/data/",            // "/src/<executable_name>/data/" subdir
        "./inc/",                                   // "/inc/" subdir
        "./0_Simple/",                              // "/0_Simple/" subdir
        "./1_Utilities/",                           // "/1_Utilities/" subdir
        "./2_Graphics/",                            // "/2_Graphics/" subdir
        "./3_Imaging/",                             // "/3_Imaging/" subdir
        "./4_Finance/",                             // "/4_Finance/" subdir
        "./5_Simulations/",                         // "/5_Simulations/" subdir
        "./6_Advanced/",                            // "/6_Advanced/" subdir
        "./7_CUDALibraries/",                       // "/7_CUDALibraries/" subdir
        "./8_Android/",                             // "/8_Android/" subdir
        "./samples/",                               // "/samples/" subdir

        "./0_Simple/<executable_name>/data/",        // "/0_Simple/<executable_name>/data/" subdir
        "./1_Utilities/<executable_name>/data/",     // "/1_Utilities/<executable_name>/data/" subdir
        "./2_Graphics/<executable_name>/data/",      // "/2_Graphics/<executable_name>/data/" subdir
        "./3_Imaging/<executable_name>/data/",       // "/3_Imaging/<executable_name>/data/" subdir
        "./4_Finance/<executable_name>/data/",       // "/4_Finance/<executable_name>/data/" subdir
        "./5_Simulations/<executable_name>/data/",   // "/5_Simulations/<executable_name>/data/" subdir
        "./6_Advanced/<executable_name>/data/",      // "/6_Advanced/<executable_name>/data/" subdir
        "./7_CUDALibraries/<executable_name>/",      // "/7_CUDALibraries/<executable_name>/" subdir
        "./7_CUDALibraries/<executable_name>/data/", // "/7_CUDALibraries/<executable_name>/data/" subdir

        "../",                                      // up 1 in tree
        "../common/",                               // up 1 in tree, "/common/" subdir
        "../common/data/",                          // up 1 in tree, "/common/data/" subdir
        "../data/",                                 // up 1 in tree, "/data/" subdir
        "../src/",                                  // up 1 in tree, "/src/" subdir
        "../inc/",                                  // up 1 in tree, "/inc/" subdir

        "../0_Simple/<executable_name>/data/",       // up 1 in tree, "/0_Simple/<executable_name>/" subdir
        "../1_Utilities/<executable_name>/data/",    // up 1 in tree, "/1_Utilities/<executable_name>/" subdir
        "../2_Graphics/<executable_name>/data/",     // up 1 in tree, "/2_Graphics/<executable_name>/" subdir
        "../3_Imaging/<executable_name>/data/",      // up 1 in tree, "/3_Imaging/<executable_name>/" subdir
        "../4_Finance/<executable_name>/data/",      // up 1 in tree, "/4_Finance/<executable_name>/" subdir
        "../5_Simulations/<executable_name>/data/",  // up 1 in tree, "/5_Simulations/<executable_name>/" subdir
        "../6_Advanced/<executable_name>/data/",     // up 1 in tree, "/6_Advanced/<executable_name>/" subdir
        "../7_CUDALibraries/<executable_name>/data/",// up 1 in tree, "/7_CUDALibraries/<executable_name>/" subdir
        "../8_Android/<executable_name>/data/",      // up 1 in tree, "/8_Android/<executable_name>/" subdir
        "../samples/<executable_name>/data/",        // up 1 in tree, "/samples/<executable_name>/" subdir
        "../../",                                        // up 2 in tree
        "../../common/",                                 // up 2 in tree, "/common/" subdir
        "../../common/data/",                            // up 2 in tree, "/common/data/" subdir
        "../../data/",                                   // up 2 in tree, "/data/" subdir
        "../../src/",                                    // up 2 in tree, "/src/" subdir
        "../../inc/",                                    // up 2 in tree, "/inc/" subdir
        "../../sandbox/<executable_name>/data/",         // up 2 in tree, "/sandbox/<executable_name>/" subdir
        "../../0_Simple/<executable_name>/data/",        // up 2 in tree, "/0_Simple/<executable_name>/" subdir
        "../../1_Utilities/<executable_name>/data/",     // up 2 in tree, "/1_Utilities/<executable_name>/" subdir
        "../../2_Graphics/<executable_name>/data/",      // up 2 in tree, "/2_Graphics/<executable_name>/" subdir
        "../../3_Imaging/<executable_name>/data/",       // up 2 in tree, "/3_Imaging/<executable_name>/" subdir
        "../../4_Finance/<executable_name>/data/",       // up 2 in tree, "/4_Finance/<executable_name>/" subdir
        "../../5_Simulations/<executable_name>/data/",   // up 2 in tree, "/5_Simulations/<executable_name>/" subdir
        "../../6_Advanced/<executable_name>/data/",      // up 2 in tree, "/6_Advanced/<executable_name>/" subdir
        "../../7_CUDALibraries/<executable_name>/data/", // up 2 in tree, "/7_CUDALibraries/<executable_name>/" subdir
        "../../8_Android/<executable_name>/data/",       // up 2 in tree, "/8_Android/<executable_name>/" subdir
        "../../samples/<executable_name>/data/",         // up 2 in tree, "/samples/<executable_name>/" subdir
        "../../../",                                        // up 3 in tree
        "../../../src/<executable_name>/",                  // up 3 in tree, "/src/<executable_name>/" subdir
        "../../../src/<executable_name>/data/",             // up 3 in tree, "/src/<executable_name>/data/" subdir
        "../../../src/<executable_name>/src/",              // up 3 in tree, "/src/<executable_name>/src/" subdir
        "../../../src/<executable_name>/inc/",              // up 3 in tree, "/src/<executable_name>/inc/" subdir
        "../../../sandbox/<executable_name>/",              // up 3 in tree, "/sandbox/<executable_name>/" subdir
        "../../../sandbox/<executable_name>/data/",         // up 3 in tree, "/sandbox/<executable_name>/data/" subdir
        "../../../sandbox/<executable_name>/src/",          // up 3 in tree, "/sandbox/<executable_name>/src/" subdir
        "../../../sandbox/<executable_name>/inc/",          // up 3 in tree, "/sandbox/<executable_name>/inc/" subdir
        "../../../0_Simple/<executable_name>/data/",        // up 3 in tree, "/0_Simple/<executable_name>/" subdir
        "../../../1_Utilities/<executable_name>/data/",     // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
        "../../../2_Graphics/<executable_name>/data/",      // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
        "../../../3_Imaging/<executable_name>/data/",       // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
        "../../../4_Finance/<executable_name>/data/",       // up 3 in tree, "/4_Finance/<executable_name>/" subdir
        "../../../5_Simulations/<executable_name>/data/",   // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
        "../../../6_Advanced/<executable_name>/data/",      // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
        "../../../7_CUDALibraries/<executable_name>/data/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/" subdir
        "../../../8_Android/<executable_name>/data/",       // up 3 in tree, "/8_Android/<executable_name>/" subdir
        "../../../0_Simple/<executable_name>/",        // up 3 in tree, "/0_Simple/<executable_name>/" subdir
        "../../../1_Utilities/<executable_name>/",     // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
        "../../../2_Graphics/<executable_name>/",      // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
        "../../../3_Imaging/<executable_name>/",       // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
        "../../../4_Finance/<executable_name>/",       // up 3 in tree, "/4_Finance/<executable_name>/" subdir
        "../../../5_Simulations/<executable_name>/",   // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
        "../../../6_Advanced/<executable_name>/",      // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
        "../../../7_CUDALibraries/<executable_name>/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/" subdir
        "../../../8_Android/<executable_name>/",       // up 3 in tree, "/8_Android/<executable_name>/" subdir
        "../../../samples/<executable_name>/data/",         // up 3 in tree, "/samples/<executable_name>/" subdir
        "../../../common/",                                 // up 3 in tree, "../../../common/" subdir
        "../../../common/data/",                            // up 3 in tree, "../../../common/data/" subdir
        "../../../data/",                                   // up 3 in tree, "../../../data/" subdir
        "../../../../",                                // up 4 in tree
        "../../../../src/<executable_name>/",          // up 4 in tree, "/src/<executable_name>/" subdir
        "../../../../src/<executable_name>/data/",     // up 4 in tree, "/src/<executable_name>/data/" subdir
        "../../../../src/<executable_name>/src/",      // up 4 in tree, "/src/<executable_name>/src/" subdir
        "../../../../src/<executable_name>/inc/",      // up 4 in tree, "/src/<executable_name>/inc/" subdir
        "../../../../sandbox/<executable_name>/",      // up 4 in tree, "/sandbox/<executable_name>/" subdir
        "../../../../sandbox/<executable_name>/data/", // up 4 in tree, "/sandbox/<executable_name>/data/" subdir
        "../../../../sandbox/<executable_name>/src/",  // up 4 in tree, "/sandbox/<executable_name>/src/" subdir
        "../../../../sandbox/<executable_name>/inc/",   // up 4 in tree, "/sandbox/<executable_name>/inc/" subdir
        "../../../../0_Simple/<executable_name>/data/",     // up 4 in tree, "/0_Simple/<executable_name>/" subdir
        "../../../../1_Utilities/<executable_name>/data/",  // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
        "../../../../2_Graphics/<executable_name>/data/",   // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
        "../../../../3_Imaging/<executable_name>/data/",    // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
        "../../../../4_Finance/<executable_name>/data/",    // up 4 in tree, "/4_Finance/<executable_name>/" subdir
        "../../../../5_Simulations/<executable_name>/data/",// up 4 in tree, "/5_Simulations/<executable_name>/" subdir
        "../../../../6_Advanced/<executable_name>/data/",   // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
        "../../../../7_CUDALibraries/<executable_name>/data/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/" subdir
        "../../../../8_Android/<executable_name>/data/",    // up 4 in tree, "/8_Android/<executable_name>/" subdir
        "../../../../0_Simple/<executable_name>/",     // up 4 in tree, "/0_Simple/<executable_name>/" subdir
        "../../../../1_Utilities/<executable_name>/",  // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
        "../../../../2_Graphics/<executable_name>/",   // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
        "../../../../3_Imaging/<executable_name>/",    // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
        "../../../../4_Finance/<executable_name>/",    // up 4 in tree, "/4_Finance/<executable_name>/" subdir
        "../../../../5_Simulations/<executable_name>/",// up 4 in tree, "/5_Simulations/<executable_name>/" subdir
        "../../../../6_Advanced/<executable_name>/",   // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
        "../../../../7_CUDALibraries/<executable_name>/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/" subdir
        "../../../../8_Android/<executable_name>/",    // up 4 in tree, "/8_Android/<executable_name>/" subdir
        "../../../../samples/<executable_name>/data/",      // up 4 in tree, "/samples/<executable_name>/" subdir
        "../../../../common/",                              // up 4 in tree, "../../../common/" subdir
        "../../../../common/data/",                         // up 4 in tree, "../../../common/data/" subdir
        "../../../../data/",                                // up 4 in tree, "../../../data/" subdir
        "../../../../../",                                // up 5 in tree
        "../../../../../src/<executable_name>/",          // up 5 in tree, "/src/<executable_name>/" subdir
        "../../../../../src/<executable_name>/data/",     // up 5 in tree, "/src/<executable_name>/data/" subdir
        "../../../../../src/<executable_name>/src/",      // up 5 in tree, "/src/<executable_name>/src/" subdir
        "../../../../../src/<executable_name>/inc/",      // up 5 in tree, "/src/<executable_name>/inc/" subdir
        "../../../../../sandbox/<executable_name>/",      // up 5 in tree, "/sandbox/<executable_name>/" subdir
        "../../../../../sandbox/<executable_name>/data/", // up 5 in tree, "/sandbox/<executable_name>/data/" subdir
        "../../../../../sandbox/<executable_name>/src/",  // up 5 in tree, "/sandbox/<executable_name>/src/" subdir
        "../../../../../sandbox/<executable_name>/inc/",   // up 5 in tree, "/sandbox/<executable_name>/inc/" subdir
        "../../../../../0_Simple/<executable_name>/data/",     // up 5 in tree, "/0_Simple/<executable_name>/" subdir
        "../../../../../1_Utilities/<executable_name>/data/",  // up 5 in tree, "/1_Utilities/<executable_name>/" subdir
        "../../../../../2_Graphics/<executable_name>/data/",   // up 5 in tree, "/2_Graphics/<executable_name>/" subdir
        "../../../../../3_Imaging/<executable_name>/data/",    // up 5 in tree, "/3_Imaging/<executable_name>/" subdir
        "../../../../../4_Finance/<executable_name>/data/",    // up 5 in tree, "/4_Finance/<executable_name>/" subdir
        "../../../../../5_Simulations/<executable_name>/data/",// up 5 in tree, "/5_Simulations/<executable_name>/" subdir
        "../../../../../6_Advanced/<executable_name>/data/",   // up 5 in tree, "/6_Advanced/<executable_name>/" subdir
        "../../../../../7_CUDALibraries/<executable_name>/data/", // up 5 in tree, "/7_CUDALibraries/<executable_name>/" subdir
        "../../../../../8_Android/<executable_name>/data/",    // up 5 in tree, "/8_Android/<executable_name>/" subdir
        "../../../../../samples/<executable_name>/data/",      // up 5 in tree, "/samples/<executable_name>/" subdir
        "../../../../../common/",                         // up 5 in tree, "../../../common/" subdir
        "../../../../../common/data/",                    // up 5 in tree, "../../../common/data/" subdir
    };

    // Extract the executable name
    std::string executable_name;

    if (executable_path != 0)
    {
        executable_name = std::string(executable_path);

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        // Windows path delimiter
        size_t delimiter_pos = executable_name.find_last_of('\\');
        executable_name.erase(0, delimiter_pos + 1);

        if (executable_name.rfind(".exe") != std::string::npos)
        {
            // we strip .exe, only if the .exe is found
            executable_name.resize(executable_name.size() - 4);
        }

#else
        // Linux & OSX path delimiter
        size_t delimiter_pos = executable_name.find_last_of('/');
        executable_name.erase(0,delimiter_pos+1);
#endif
    }

    // Loop over all search paths and return the first hit
    for (unsigned int i = 0; i < sizeof(searchPath)/sizeof(char *); ++i)
    {
        std::string path(searchPath[i]);
        size_t executable_name_pos = path.find("<executable_name>");

        // If there is executable_name variable in the searchPath
        // replace it with the value
        if (executable_name_pos != std::string::npos)
        {
            if (executable_path != 0)
            {
                path.replace(executable_name_pos, strlen("<executable_name>"), executable_name);
            }
            else
            {
                // Skip this path entry if no executable argument is given
                continue;
            }
        }

#ifdef _DEBUG
        printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
#endif

        // Test if the file exists
        path.append(filename);
        FILE *fp;
        FOPEN(fp, path.c_str(), "rb");

        if (fp != NULL)
        {
            fclose(fp);
            // File found
            // returning an allocated array here for backwards compatibility reasons
            char *file_path = (char *) malloc(path.length() + 1);
            STRCPY(file_path, path.length() + 1, path.c_str());
            return file_path;
        }

        if (fp)
        {
            fclose(fp);
        }
    }

    // File not found
    return 0;
}

#endif

New file
			@@ -0,0 +1,592 @@
			#include "conv.h"

			#include <cmath>
			#include <sys/time.h>

			#include <npp.h>
			#include <helper_cuda.h>
			#include <helper_string.h>
			#include "Exceptions.h"


			// Alignment masks (alignment - 1). They are used in this file as
			// `(n + MASK) & ~MASK` to round a host row pitch up to a multiple of 512.
			static const int MEMORY_ALGN_DEVICE = 511;
			static const int HD_MEMORY_ALGN_DEVICE = 511;

			// Divide x by d, rounding the quotient up (for the positive operands used in this file).
			static inline int DivUp(int x, int d)
			{
			    const int numerator = x + d - 1;
			    return numerator / d;
			}

			// Split a tightly packed frame (width*height luma bytes followed by rows of
			// interleaved chroma byte pairs) into three pitched host planes.
			//
			//   data   : packed input frame
			//   width  : luma width in bytes/pixels
			//   height : luma height in rows
			//   mY     : destination luma plane, row pitch = width rounded up to 512
			//   mU/mV  : destination chroma planes, row pitch = width/2 rounded up to 512;
			//            the first byte of each input pair goes to mU, the second to mV
			//
			// Returns 0 on success, -1 when an argument is NULL or a dimension is not positive.
			static int set_data(uint8_t *data, const int width, const int height, unsigned char *mY, unsigned char *mU, unsigned char *mV)
			{
			    if (!data || !mY || !mU || !mV || width <= 0 || height <= 0)
			    {
			        return -1;
			    }

			    const uint8_t *yuv_data = data;
			    const size_t w = (size_t)width;
			    const size_t h = (size_t)height;

			    // Judging from this step, the data is repacked on the way in even when it
			    // already has the same format (rows are re-laid-out to the padded pitch).
			    size_t nPitch = (size_t)((width + HD_MEMORY_ALGN_DEVICE) & ~HD_MEMORY_ALGN_DEVICE);
			    for (size_t i = 0; i < h; i++)
			    {
			        memcpy(mY + i * nPitch, yuv_data + i * w, w);
			    }

			    const size_t half_w = w >> 1;
			    const size_t half_h = h >> 1;
			    nPitch = (half_w + HD_MEMORY_ALGN_DEVICE) & ~(size_t)HD_MEMORY_ALGN_DEVICE;

			    // The interleaved chroma rows start right after the luma plane and
			    // are `w` bytes apart.
			    const uint8_t *uv_base = yuv_data + w * h;
			    for (size_t i = 0; i < half_h; i++)
			    {
			        const uint8_t *yuv_ptr = uv_base + i * w;
			        uint8_t *u_ptr = mU + i * nPitch;
			        uint8_t *v_ptr = mV + i * nPitch;
			        for (size_t j = 0; j < w; j += 2)
			        {
			            *u_ptr++ = *yuv_ptr++;
			            *v_ptr++ = *yuv_ptr++;
			        }
			    }

			    return 0;
			}

			/////////////handle
			// Per-resolution conversion context. It owns the host/device buffers used
			// for YUV -> BGR conversion and for resizing; buffers are allocated lazily
			// by the init_* helpers and released in the destructor.
			class convertor{
			public:
			    convertor(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu);
			    ~convertor();
			    // Convert the planes loaded by fill_yuv(); *bgr/*bgrLen receive an internal buffer and its size.
			    int yuv2bgr(unsigned char **bgr, int *bgrLen);
			    // in == NULL: resize the loaded YUV planes and convert to BGR.
			    // in != NULL: resize the given packed BGR image.
			    int resize2bgr(unsigned char *in, unsigned char **data, int *data_len);
			    // Nearest-neighbour resize of a packed frame on the host.
			    int resizeyuv(unsigned char *in, unsigned char **data, int *data_len);
			    // Split a packed frame into planes and upload them to the device.
			    int fill_yuv(const unsigned char *yuv);
			private:
			    void init_yuv();
			    void init_resize();
			    void init_resize_bgr();
			    void init_resize_yuv();
			private:
			    // Source dimensions.
			    int width;
			    int height;

			    // Per-plane sampling factors packed as two nibbles, and their maxima.
			    unsigned char aSamplingFactors[3];
			    int nMCUBlocksH;
			    int nMCUBlocksV;

			    // Device source planes.
			    Npp8u *apSrcImage[3];
			    NppiSize aSrcSize[3];
			    Npp32s aSrcImageStep[3];
			    size_t aSrcPitch[3];

			    // Pinned host staging planes filled by set_data().
			    uint8_t *mY;
			    uint8_t *mU;
			    uint8_t *mV;

			    ///////////////////////////
			    // Target dimensions and scale factors.
			    int rWidth;
			    int rHeight;
			    float fx;
			    float fy;

			    // Device planes holding the resized YUV image.
			    Npp8u *apDstImage[3] = {0,0,0};
			    Npp32s aDstImageStep[3];
			    NppiSize aDstSize[3];

			    /////////////////////////////
			    // Full-size BGR: device image and pinned host copy.
			    Npp8u *imgOrigin;
			    size_t pitchOrigin;
			    NppiSize sizeOrigin;

			    unsigned char *bgrOrigin;
			    int bgrOriginLen;
			    size_t bgrOriginPitch;

			    ////////////////////////////
			    // Resized BGR: device image and pinned host copy.
			    Npp8u *imgResize;
			    size_t pitchResize;
			    NppiSize sizeResize;

			    unsigned char *bgrScale;
			    int bgrScaleLen;
			    size_t bgrScalePitch;

			    // resize only
			    ////////////////////////////
			    Npp8u *originBGR;
			    int pitchOriginBGR;
			    Npp8u *resizedBGR;
			    int pitchResizedBGR;
			    unsigned char *hostResizedBGR;

			    ///////////////////////////
			    // Host buffer for resizeyuv() output.
			    unsigned char *nv12;

			    bool initialized_yuv, initialized_resize, initialized_resize_bgr, initialized_resize_yuv;
			    int gpu_index;
			};


			// Record dimensions and target GPU; no device or host buffers are allocated
			// here (the init_* helpers do that lazily). Every pointer and pitch starts
			// at zero so the destructor is safe even when init_yuv() never ran.
			convertor::convertor(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu)
			:width(srcW)
			,height(srcH)
			,nMCUBlocksH(0)
			,nMCUBlocksV(0)
			,mY(NULL)
			,mU(NULL)
			,mV(NULL)
			,rWidth(dstW)
			,rHeight(dstH)
			,fx(-1)
			,fy(-1)
			,imgOrigin(0)
			,pitchOrigin(0)
			,bgrOrigin(NULL)
			,bgrOriginLen(0)
			,bgrOriginPitch(0)
			,imgResize(0)
			,pitchResize(0)
			,bgrScale(NULL)
			,bgrScaleLen(0)
			,bgrScalePitch(0)
			,originBGR(0)
			,pitchOriginBGR(0)
			,resizedBGR(0)
			,pitchResizedBGR(0)
			,hostResizedBGR(NULL)
			,nv12(NULL)
			,initialized_yuv(false)
			,initialized_resize(false)
			,initialized_resize_bgr(false)
			,initialized_resize_yuv(false)
			,gpu_index(gpu)
			{
			    // The destructor hands these to cudaFree() unconditionally, so they
			    // must never hold indeterminate values.
			    for (int i = 0; i < 3; ++i)
			    {
			        apSrcImage[i] = 0;
			        apDstImage[i] = 0;
			        aSrcImageStep[i] = 0;
			        aDstImageStep[i] = 0;
			        aSrcPitch[i] = 0;
			    }
			}

			// Select `gpu` as the current CUDA device; negative indices are ignored.
			static void setGPUDevice(const int gpu){
			    if (gpu < 0){
			        return;
			    }
			    cudaSetDevice(gpu);
			}

			// One-time allocation of everything the YUV -> BGR path needs: the three
			// device source planes, the device BGR image, the pinned host BGR buffer and
			// the pinned host staging planes. Subsequent calls are no-ops.
			void convertor::init_yuv(){
			    if (initialized_yuv) return;
			    initialized_yuv = true;

			    setGPUDevice(gpu_index);

			    for(int i = 0; i < 3; i++){
			        apSrcImage[i] = 0;
			        apDstImage[i] = 0;
			    }

			    // Packed as (high nibble, low nibble): 34 == 0x22, 17 == 0x11.
			    aSamplingFactors[0] = 34;
			    aSamplingFactors[1] = 17;
			    aSamplingFactors[2] = 17;

			    nMCUBlocksH = 0;
			    nMCUBlocksV = 0;

			    for (int i = 0; i < 3; ++i)
			    {
			        nMCUBlocksV = std::max(nMCUBlocksV, aSamplingFactors[i] & 0x0f);
			        nMCUBlocksH = std::max(nMCUBlocksH, aSamplingFactors[i] >> 4);
			    }

			    for (int i = 0; i < 3; ++i)
			    {
			        NppiSize oBlocks;
			        NppiSize oBlocksPerMCU = { aSamplingFactors[i] >> 4, aSamplingFactors[i] & 0x0f };

			        // Plane size in 8x8 blocks, rounded up to a whole number of MCUs.
			        oBlocks.width = (int)ceil((width + 7) / 8 *
			                                  static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksH);
			        oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;

			        oBlocks.height = (int)ceil((height + 7) / 8 *
			                                   static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksV);
			        oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;

			        aSrcSize[i].width = oBlocks.width * 8;
			        aSrcSize[i].height = oBlocks.height * 8;

			        // Allocate Memory
			        size_t nPitch;
			        NPP_CHECK_CUDA(cudaMallocPitch((void**)&(apSrcImage[i]), &nPitch, aSrcSize[i].width, aSrcSize[i].height));
			        aSrcPitch[i] = nPitch;
			        aSrcImageStep[i] = static_cast<Npp32s>(nPitch);
			    }

			    // Device BGR image: 3 bytes per pixel.
			    NPP_CHECK_CUDA(cudaMallocPitch((void**)&imgOrigin, &pitchOrigin, width * 3, height));

			    // Tightly packed pinned host copy of the BGR image.
			    bgrOriginPitch = width * 3;
			    bgrOriginLen = bgrOriginPitch * height;
			    NPP_CHECK_CUDA(cudaHostAlloc((void**)&bgrOrigin, bgrOriginLen, cudaHostAllocDefault));

			    sizeOrigin.width = width;
			    sizeOrigin.height = height;

			    // Pinned host staging planes; row pitch is rounded up to 512 bytes to
			    // match the layout written by set_data().
			    uint32_t nPitch = (width + MEMORY_ALGN_DEVICE) & ~MEMORY_ALGN_DEVICE;
			    NPP_CHECK_CUDA(cudaHostAlloc((void**)&mY, nPitch * height, cudaHostAllocDefault));
			    nPitch = (width/2 + MEMORY_ALGN_DEVICE) & ~MEMORY_ALGN_DEVICE;
			    NPP_CHECK_CUDA(cudaHostAlloc((void**)&mU, nPitch * height / 2, cudaHostAllocDefault));
			    NPP_CHECK_CUDA(cudaHostAlloc((void**)&mV, nPitch * height / 2, cudaHostAllocDefault));

			}

			// One-time allocation for the "resize YUV planes, then convert to BGR" path:
			// the three resized device planes, the resized device BGR image and its
			// pinned host copy. The BGR buffers and the scale factors are only set up
			// when the target is strictly smaller than the source in both dimensions.
			void convertor::init_resize(){
			    if (initialized_resize) return;
			    initialized_resize = true;

			    setGPUDevice(gpu_index);

			    NppiSize oDstImageSize;
			    oDstImageSize.width = std::max(1, rWidth);
			    oDstImageSize.height = std::max(1, rHeight);

			    sizeResize.width = oDstImageSize.width;
			    sizeResize.height = oDstImageSize.height;

			    for (int i=0; i < 3; ++i)
			    {
			        NppiSize oBlocks;
			        NppiSize oBlocksPerMCU = { aSamplingFactors[i] & 0x0f, aSamplingFactors[i] >> 4};

			        oBlocks.width = (int)ceil((oDstImageSize.width + 7)/8 *
			                                  static_cast<float>(oBlocksPerMCU.width)/nMCUBlocksH);
			        oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;

			        oBlocks.height = (int)ceil((oDstImageSize.height+7)/8 *
			                                   static_cast<float>(oBlocksPerMCU.height)/nMCUBlocksV);
			        oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;

			        aDstSize[i].width = oBlocks.width * 8;
			        aDstSize[i].height = oBlocks.height * 8;

			        // Allocate Memory
			        size_t nPitch;
			        NPP_CHECK_CUDA(cudaMallocPitch((void**)&apDstImage[i], &nPitch, aDstSize[i].width, aDstSize[i].height));
			        aDstImageStep[i] = static_cast<Npp32s>(nPitch);
			    }

			    const bool shrinking = rWidth > 0 && rHeight > 0 && rWidth < width && rHeight < height;

			    if (shrinking){
			        fx = (float)(rWidth) / (float)(width);
			        fy = (float)(rHeight) / (float)(height);
			    }

			    if (imgResize == 0 && shrinking){
			        // Device BGR image: 3 bytes per pixel.
			        NPP_CHECK_CUDA(cudaMallocPitch((void**)&imgResize, &pitchResize, rWidth * 3, rHeight));
			    }
			    if (!bgrScale && shrinking){
			        bgrScalePitch = rWidth * 3;
			        bgrScaleLen = bgrScalePitch * rHeight;
			        NPP_CHECK_CUDA(cudaHostAlloc((void**)&bgrScale, bgrScaleLen, cudaHostAllocDefault));
			    }
			}

			void convertor::init_resize_bgr(){
			if (initialized_resize_bgr) return;
			initialized_resize_bgr = true;

			setGPUDevice(gpu_index);
			if (originBGR == 0){
			originBGR = nppiMalloc_8u_C3(width, height, &pitchOriginBGR);
			}
			if (resizedBGR == 0){
			resizedBGR = nppiMalloc_8u_C3(rWidth, rHeight, &pitchResizedBGR);
			}
			if (hostResizedBGR == NULL){
			NPP_CHECK_CUDA(cudaHostAlloc((void*)&hostResizedBGR, rWidth 3 * rHeight, cudaHostAllocDefault));
			}
			}

			// One-time allocation of the host output buffer used by resizeyuv()
			// (rWidth * rHeight * 3 / 2 bytes). `nv12` stays NULL when the target size is
			// not positive or the allocation fails.
			//
			// fx/fy are deliberately left alone: the host scaler does not read them, and
			// overwriting them here with the source/target ratio would corrupt the
			// target/source factors that resize2bgr() relies on.
			void convertor::init_resize_yuv(){
			    if (initialized_resize_yuv) return;
			    initialized_resize_yuv = true;

			    if (rWidth > 0 && rHeight > 0){
			        nv12 = (unsigned char*)malloc((size_t)rWidth * (size_t)rHeight * 3 / 2);
			    }
			}

			// Release every buffer this object may have allocated, on its own GPU.
			convertor::~convertor(){
			    setGPUDevice(gpu_index);

			    // Pinned host staging planes.
			    if (mY) { cudaFreeHost(mY); }
			    if (mU) { cudaFreeHost(mU); }
			    if (mV) { cudaFreeHost(mV); }

			    // Free the device source/destination planes.
			    for (int plane = 0; plane < 3; ++plane)
			    {
			        cudaFree(apSrcImage[plane]);
			        cudaFree(apDstImage[plane]);
			    }

			    // Device BGR images.
			    if (imgOrigin) { cudaFree(imgOrigin); }
			    if (imgResize) { cudaFree(imgResize); }

			    // Pinned host BGR copies.
			    if (bgrOrigin) { cudaFreeHost(bgrOrigin); }
			    if (bgrScale) { cudaFreeHost(bgrScale); }

			    // Buffers of the BGR-only resize path.
			    if (originBGR) { nppiFree(originBGR); }
			    if (resizedBGR) { nppiFree(resizedBGR); }
			    if (hostResizedBGR) { cudaFreeHost(hostResizedBGR); }

			    // Host buffer of the YUV resize path.
			    if (nv12) { free(nv12); }
			}

			int convertor::fill_yuv(const unsigned char *yuv){
			init_yuv();
			int ret = set_data((uint8_t*)yuv, width, height, mY, mU, mV);
			if (ret < 0) return ret;

			setGPUDevice(gpu_index);

			NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[0], mY, aSrcPitch[0] * height, cudaMemcpyHostToDevice));
			NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[1], mU, aSrcPitch[1] * height / 2, cudaMemcpyHostToDevice));
			NPP_CHECK_CUDA(cudaMemcpy(apSrcImage[2], mV, aSrcPitch[2] * height / 2, cudaMemcpyHostToDevice));
			return 0;
			}

			// Convert the device planes loaded by fill_yuv() to packed BGR and copy the
			// result into the object's pinned host buffer.
			//   bgr    : receives a pointer to that internal buffer (not owned by the caller)
			//   bgrLen : receives its size in bytes
			// Returns 0 on success, -1 on NULL output arguments or when fill_yuv() has
			// not prepared the buffers yet.
			int convertor::yuv2bgr(unsigned char **bgr, int *bgrLen){

			    if (!bgr || !bgrLen) return -1;

			    *bgr = NULL;
			    *bgrLen = 0;

			    if (!imgOrigin || !bgrOrigin) return -1;

			    setGPUDevice(gpu_index);

			    NPP_CHECK_NPP(nppiYUV420ToBGR_8u_P3C3R(apSrcImage, aSrcImageStep, imgOrigin, pitchOrigin, sizeOrigin));

			    // Device rows are pitched; the host copy is tightly packed.
			    NPP_CHECK_CUDA(cudaMemcpy2D(bgrOrigin, bgrOriginPitch, imgOrigin, pitchOrigin, bgrOriginPitch, height, cudaMemcpyDeviceToHost));
			    *bgr = bgrOrigin;
			    *bgrLen = bgrOriginLen;

			    return 0;
			}

			int convertor::resize2bgr(unsigned char in, unsigned char data, int data_len){
			*data = NULL;
			*data_len = 0;

			if ((rWidth < 0 && rHeight < 0) \|\| (rWidth > width && rHeight > height)){
			return -1;
			}

			setGPUDevice(gpu_index);

			if (!in){

			init_resize();

			NppiSize oDstImageSize;
			oDstImageSize.width = std::max(1, rWidth);
			oDstImageSize.height = std::max(1, rHeight);
			for (int i = 0; i < 3; ++i)
			{
			NppiSize oBlocksPerMCU = { aSamplingFactors[i] & 0x0f, aSamplingFactors[i] >> 4};
			NppiSize oSrcImageSize = {(width * oBlocksPerMCU.width) / nMCUBlocksH, (height * oBlocksPerMCU.height)/nMCUBlocksV};
			NppiRect oSrcImageROI = {0,0,oSrcImageSize.width, oSrcImageSize.height};
			NppiRect oDstImageROI;
			NppiInterpolationMode eInterploationMode = NPPI_INTER_SUPER;
			NPP_CHECK_NPP(nppiGetResizeRect(oSrcImageROI, &oDstImageROI,
			fx,
			fy,
			0.0, 0.0, eInterploationMode));
			NPP_CHECK_NPP(nppiResizeSqrPixel_8u_C1R(apSrcImage[i], oSrcImageSize, aSrcImageStep[i], oSrcImageROI,
			apDstImage[i], aDstImageStep[i], oDstImageROI ,
			fx,
			fy,
			0.0, 0.0, eInterploationMode));
			}
			NPP_CHECK_NPP(nppiYUV420ToBGR_8u_P3C3R(apDstImage, aDstImageStep, imgResize, pitchResize, sizeResize));
			NPP_CHECK_CUDA(cudaMemcpy2D(bgrScale, bgrScalePitch, imgResize, pitchResize, bgrScalePitch, rHeight, cudaMemcpyDeviceToHost));
			*data = bgrScale;
			*data_len = bgrScaleLen;
			}else{

			init_resize_bgr();

			NppiSize oSrcSize;
			oSrcSize.width = width;
			oSrcSize.height = height;

			NPP_CHECK_CUDA(cudaMemcpy2D(originBGR, pitchOriginBGR, in, width3, width3, height, cudaMemcpyHostToDevice));

			NppiRect oSrcROI;
			oSrcROI.x = 0;
			oSrcROI.y = 0;
			oSrcROI.width = width;
			oSrcROI.height = height;


			NppiRect oDstROI;
			oDstROI.x = 0;
			oDstROI.y = 0;
			oDstROI.width = rWidth;
			oDstROI.height = rHeight;

			// Scale Factor
			double nXFactor = double(oDstROI.width) / double(oSrcROI.width);
			double nYFactor = double(oDstROI.height) / double(oSrcROI.height);

			// Scaled X/Y Shift
			double nXShift = - oSrcROI.x * nXFactor ;
			double nYShift = - oSrcROI.y * nYFactor;
			int eInterpolation = NPPI_INTER_SUPER;
			if (nXFactor >= 1.f \|\| nYFactor >= 1.f)
			eInterpolation = NPPI_INTER_LANCZOS;

			NppStatus ret = nppiResizeSqrPixel_8u_C3R(originBGR, oSrcSize, pitchOriginBGR, oSrcROI,
			resizedBGR, pitchResizedBGR, oDstROI, nXFactor, nYFactor, nXShift, nYShift, eInterpolation );

			if(ret != NPP_SUCCESS) {
			printf("imageResize_8u_C3R failed %d.\n", ret);
			return -2;
			}
			size_t pitch = rWidth * 3;
			data_len = pitch rHeight;
			NPP_CHECK_CUDA(cudaMemcpy2D(hostResizedBGR, pitch, resizedBGR, pitchResizedBGR, pitch, rHeight, cudaMemcpyDeviceToHost));
			*data = hostResizedBGR;
			}
			return 0;
			}

			// Nearest-neighbour scale of a packed frame laid out as srcWidth*srcHeight
			// luma bytes followed by srcHeight/2 rows of interleaved chroma byte pairs
			// (row stride = srcWidth). `dst` uses the same layout with the destination
			// dimensions and must hold dstWidth*dstHeight*3/2 bytes.
			//
			// Source coordinates are computed in 16.16 fixed point. Every destination
			// luma sample is written; chroma pairs are written for every complete pair
			// that fits the destination layout.
			//
			// Returns 0 on success, -1 for NULL buffers or unusable dimensions.
			static int nv12_nearest_scale(uint8_t* __restrict src, uint8_t* __restrict dst,
			                              int srcWidth, int srcHeight, int dstWidth, int dstHeight)
			{
			    if (!src || !dst || srcWidth < 2 || srcHeight < 2 || dstWidth < 1 || dstHeight < 1)
			    {
			        return -1;
			    }

			    // 64-bit arithmetic so that `size << 16` cannot overflow.
			    const uint64_t sw = (uint64_t)srcWidth;
			    const uint64_t sh = (uint64_t)srcHeight;
			    const uint64_t dw = (uint64_t)dstWidth;
			    const uint64_t dh = (uint64_t)dstHeight;

			    // Source step per destination pixel, 16.16 fixed point.
			    const uint64_t xrIntFloat_16 = (sw << 16) / dw + 1;
			    const uint64_t yrIntFloat_16 = (sh << 16) / dh + 1;

			    const uint8_t* src_uv = src + sh * sw; // start of the source chroma rows
			    uint8_t* dst_uv = dst + dh * dw;       // start of the destination chroma rows

			    const uint64_t srcUVRows = sh / 2;
			    const uint64_t dstUVRows = dh / 2;
			    const uint64_t srcPairMax = (sw - 2) & ~(uint64_t)1; // last even offset with a full pair

			    for (uint64_t y = 0; y < dh; ++y)
			    {
			        uint64_t srcy = (y * yrIntFloat_16) >> 16;
			        if (srcy >= sh) srcy = sh - 1;

			        const uint8_t* src_y_row = src + srcy * sw;
			        uint8_t* dst_y_row = dst + y * dw;

			        for (uint64_t x = 0; x < dw; ++x)
			        {
			            uint64_t srcx = (x * xrIntFloat_16) >> 16;
			            if (srcx >= sw) srcx = sw - 1;
			            dst_y_row[x] = src_y_row[srcx];
			        }

			        // One chroma row per two luma rows: handle it on even rows only,
			        // and only while a destination chroma row exists.
			        if ((y & 1) != 0 || (y / 2) >= dstUVRows)
			        {
			            continue;
			        }

			        uint64_t srcuvy = srcy / 2;
			        if (srcuvy >= srcUVRows) srcuvy = srcUVRows - 1;

			        const uint8_t* src_uv_row = src_uv + srcuvy * sw;
			        uint8_t* dst_uv_row = dst_uv + (y / 2) * dw;

			        for (uint64_t x = 0; x + 1 < dw; x += 2)
			        {
			            uint64_t srcx = (x * xrIntFloat_16) >> 16;
			            uint64_t src_index = srcx & ~(uint64_t)1; // start of the source pair
			            if (src_index > srcPairMax) src_index = srcPairMax;

			            dst_uv_row[x]     = src_uv_row[src_index];
			            dst_uv_row[x + 1] = src_uv_row[src_index + 1];
			        }
			    }
			    return 0;
			}

			// Scale the packed frame `in` (width x height) to rWidth x rHeight on the
			// host. On success *data points at the object's internal output buffer (not
			// owned by the caller) and *data_len holds rWidth*rHeight*3/2.
			// Returns 0 on success, -1 on NULL arguments or when no output buffer is
			// available, otherwise the scaler's error code.
			int convertor::resizeyuv(unsigned char *in, unsigned char **data, int *data_len){

			    if (!in || !data || !data_len) return -1;

			    *data = NULL;
			    *data_len = 0;

			    init_resize_yuv();
			    if (!nv12) return -1;

			    int ret = nv12_nearest_scale(in, nv12, width, height, rWidth, rHeight);
			    if (ret != 0) return ret;

			    *data_len = rWidth*rHeight*3/2;
			    *data = nv12;

			    return 0;
			}

			// Create a conversion context for the given source/target sizes on GPU `gpu`.
			// Returns NULL when `gpu` is negative.
			convHandle conv_create(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu){
			    convertor *created = NULL;
			    if (gpu >= 0){
			        created = new convertor(srcW, srcH, dstW, dstH, gpu);
			    }
			    return created;
			}

			// Destroy a context created by conv_create(); NULL is ignored.
			void conv_destroy(convHandle h){
			    if (!h) return;
			    convertor *conv = (convertor*)h;
			    delete conv;
			}

			// Upload `yuv`, convert it to full-size BGR and also produce the resized BGR.
			// The output pointers refer to buffers owned by the handle.
			// Returns 0 on success, -2 for a NULL handle, -3 when a CUDA/NPP check threw,
			// otherwise the failing step's error code.
			int yuv2bgrandresize(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen, unsigned char **scaleBGR, int *scaleBGRLen){
			    if (!h) return -2;
			    convertor *conv = (convertor*)h;
			    // This is a C entry point: a C++ exception must not unwind into the caller.
			    try{
			        int ret = conv->fill_yuv((unsigned char*)yuv);
			        if (ret != 0) return ret;
			        ret = conv->yuv2bgr(bgr, bgrLen);
			        if (ret != 0) return ret;
			        ret = conv->resize2bgr(NULL, scaleBGR, scaleBGRLen);
			        return ret;
			    }catch(...){
			        return -3;
			    }
			}

			// Upload `yuv` and convert it to full-size BGR. The output pointer refers to
			// a buffer owned by the handle.
			// Returns 0 on success, -2 for a NULL handle, -3 when a CUDA/NPP check threw,
			// otherwise the failing step's error code.
			int yuv2bgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen){
			    if (!h) return -2;
			    convertor *conv = (convertor*)h;
			    // This is a C entry point: a C++ exception must not unwind into the caller.
			    try{
			        int ret = conv->fill_yuv((unsigned char*)yuv);
			        if (ret != 0) return ret;
			        return conv->yuv2bgr(bgr, bgrLen);
			    }catch(...){
			        return -3;
			    }
			}

			// Upload `yuv` and produce only the resized BGR image. The output pointer
			// refers to a buffer owned by the handle.
			// Returns 0 on success, -2 for a NULL handle, -3 when a CUDA/NPP check threw,
			// otherwise the failing step's error code.
			int yuv2resizedbgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen){
			    if (!h) return -2;
			    convertor *conv = (convertor*)h;
			    // This is a C entry point: a C++ exception must not unwind into the caller.
			    try{
			        int ret = conv->fill_yuv((unsigned char*)yuv);
			        if (ret != 0) return ret;
			        ret = conv->resize2bgr(NULL, bgr, bgrLen);
			        return ret;
			    }catch(...){
			        return -3;
			    }
			}

			// Resize the packed BGR image `data`. The output pointer refers to a buffer
			// owned by the handle.
			// Returns 0 on success, -2 for a NULL handle, -3 when a CUDA/NPP check threw,
			// otherwise the resize step's error code.
			int resizebgr(convHandle h, void *data, unsigned char **resized, int *len){
			    if (!h) return -2;
			    convertor *conv = (convertor*)h;
			    // This is a C entry point: a C++ exception must not unwind into the caller.
			    try{
			        return conv->resize2bgr((unsigned char*)data, resized, len);
			    }catch(...){
			        return -3;
			    }
			}

			// Resize the packed YUV frame `data` on the host. The output pointer refers
			// to a buffer owned by the handle.
			// Returns 0 on success, -2 for a NULL handle, -3 if an exception was thrown,
			// otherwise the resize step's error code.
			int resizeyuv(convHandle h, void *data, unsigned char **resized, int *len){
			    if (!h) return -2;
			    convertor *conv = (convertor*)h;
			    // This is a C entry point: a C++ exception must not unwind into the caller.
			    try{
			        return conv->resizeyuv((unsigned char*)data, resized, len);
			    }catch(...){
			        return -3;
			    }
			}

New file
			@@ -0,0 +1,23 @@
			#ifndef __RESIZE_NPP_H__
			#define __RESIZE_NPP_H__

			#ifdef __cplusplus
			extern "C"{
			#endif

			/* Opaque handle to a conversion context. */
			typedef void* convHandle;
			/* Create a context for srcW x srcH input and dstW x dstH resized output on GPU `gpu`. */
			convHandle conv_create(const int srcW, const int srcH, const int dstW, const int dstH, const int gpu);

			/* Destroy a context created by conv_create(). */
			void conv_destroy(convHandle h);
			/*
			 * Conversion entry points. Each returns 0 on success and a non-zero code on
			 * failure. Output pointers receive buffers owned by the handle; callers must
			 * not free them.
			 */
			int yuv2bgrandresize(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen, unsigned char **scaleBGR, int *scaleBGRLen);
			int yuv2bgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen);
			int yuv2resizedbgr(convHandle h, void *yuv, unsigned char **bgr, int *bgrLen);

			int resizebgr(convHandle h, void *data, unsigned char **resized, int *len);
			int resizeyuv(convHandle h, void *data, unsigned char **resized, int *len);

			#ifdef __cplusplus
			}
			#endif

			#endif //__RESIZE_NPP_H__

New file
			@@ -0,0 +1,258 @@
			package goconv

			/*
			#cgo CFLAGS: -I./ -I./inc -I/usr/local/cuda/include
			#cgo CXXFLAGS: -I./ -I./inc -I/usr/local/cuda/include -std=c++11
			#cgo LDFLAGS: -L/usr/local/cuda/lib64 -lnppig -lnppicc -lnppial -lnppisu -lcudart -ldl
			#include <stdlib.h>
			#include "conv.h"
			*/
			import "C"
			import (
			"unsafe"

			"basic.com/valib/godraw.git"
			"basic.com/valib/gogpu.git"
			"github.com/disintegration/imaging"
			)

			// Thresholds passed to gogpu.SatisfyGPU when choosing a GPU.
			// NOTE(review): units are not visible here (presumably MB of GPU memory) - confirm against gogpu.
			const (
			need = 200
			reserved = 512
			)

			// gpuIndex returns the first GPU reported by gogpu.RankGPU, other than
			// lastIndex, that satisfies (need, need/2). If none does, lastIndex itself is
			// returned when it satisfies (need, reserved). Otherwise, or when no GPU is
			// reported at all, it returns -1.
			func gpuIndex(lastIndex int) int {
				ranked := gogpu.RankGPU()
				if len(ranked) == 0 {
					return -1
				}

				for _, candidate := range ranked {
					if candidate == lastIndex {
						continue
					}
					if gogpu.SatisfyGPU(candidate, need, need/2) {
						return candidate
					}
				}

				if !gogpu.SatisfyGPU(lastIndex, need, reserved) {
					return -1
				}
				return lastIndex
			}

			// convertor pairs a native conversion handle with the source and target
			// dimensions it was created for.
			type convertor struct {
			width int
			height int
			rWidth int
			rHeight int
			conv C.convHandle
			}

			// convts caches every convertor created by find, keyed by its dimensions.
			var convts []*convertor

			// find returns the cached convertor for the given source (w, h) and target
			// (rw, rh) sizes, creating and caching a new one when none exists. It returns
			// nil when no GPU is available or the native context cannot be created.
			func find(w, h, rw, rh int) *convertor {
				for i := range convts {
					cached := convts[i]
					if cached.width != w || cached.height != h || cached.rWidth != rw || cached.rHeight != rh {
						continue
					}
					return cached
				}

				gpu := gpuIndex(0)
				if gpu < 0 {
					return nil
				}

				handle := C.conv_create(C.int(w), C.int(h), C.int(rw), C.int(rh), C.int(gpu))
				if handle == nil {
					return nil
				}

				created := &convertor{width: w, height: h, rWidth: rw, rHeight: rh, conv: handle}
				convts = append(convts, created)
				return created
			}

			// YUV2BGR converts a packed w x h YUV frame (w*h luma bytes followed by the
			// interleaved chroma rows) to packed BGR.
			//
			// On the GPU path the returned slice aliases a buffer owned by the native
			// convertor; no copy is made here. It returns nil when the input is too
			// short for the given dimensions or the conversion fails.
			func YUV2BGR(yuv []byte, w, h int) []byte {
				// The native code reads w*h*3/2 bytes from &yuv[0]; reject anything shorter
				// (including an empty slice, where &yuv[0] would panic).
				if w <= 0 || h <= 0 || len(yuv) < w*h*3/2 {
					return nil
				}

				cw := find(w, h, -1, -1)
				if cw == nil {
					return yuv2bgr(yuv, w, h)
				}
				var bgr *C.uchar
				var bgrLen C.int
				ret := C.yuv2bgr(cw.conv, unsafe.Pointer(&yuv[0]), &bgr, &bgrLen)
				if ret != 0 {
					return nil
				}
				const maxLen = 0x7fffffff
				goBGRLen := int(bgrLen)
				if goBGRLen > 0 && bgr != nil {
					return (*[maxLen]byte)(unsafe.Pointer(bgr))[:goBGRLen:goBGRLen]
				}
				return nil
			}

			// YUV2ResizedBGR converts a packed w x h YUV frame to a packed BGR image
			// resized to rw x rh.
			//
			// On the GPU path the returned slice aliases a buffer owned by the native
			// convertor; no copy is made here. It returns nil when the input is too
			// short for the given dimensions or the conversion fails.
			func YUV2ResizedBGR(yuv []byte, w, h, rw, rh int) []byte {
				// The native code reads w*h*3/2 bytes from &yuv[0]; reject anything shorter
				// (including an empty slice, where &yuv[0] would panic).
				if w <= 0 || h <= 0 || len(yuv) < w*h*3/2 {
					return nil
				}

				cw := find(w, h, rw, rh)
				if cw == nil {
					bgr := yuv2bgr(yuv, w, h)
					return bgresize(bgr, w, h, rw, rh)
				}
				var bgr *C.uchar
				var bgrLen C.int
				ret := C.yuv2resizedbgr(cw.conv, unsafe.Pointer(&yuv[0]), &bgr, &bgrLen)
				if ret != 0 {
					return nil
				}
				const maxLen = 0x7fffffff
				goBGRLen := int(bgrLen)
				if goBGRLen > 0 && bgr != nil {
					return (*[maxLen]byte)(unsafe.Pointer(bgr))[:goBGRLen:goBGRLen]
				}
				return nil

			}

			// ResizeBGR resize
			func ResizeBGR(bgrO []byte, w, h, rw, rh int) []byte {
			if (rw < 0 && rh < 0) \|\| (rw > w && rh > h) {
			return bgrO
			}

			cw := find(w, h, rw, rh)
			if cw == nil {
			return bgresize(bgrO, w, h, rw, rh)
			}

			var bgr *C.uchar
			var bgrLen C.int
			ret := C.resizebgr(cw.conv, unsafe.Pointer(&bgrO[0]), &bgr, &bgrLen)
			if ret != 0 {
			return nil
			}
			const maxLen = 0x7fffffff
			goBGRLen := int(bgrLen)
			if goBGRLen > 0 {
			return (*[maxLen]byte)(unsafe.Pointer(bgr))[:goBGRLen:goBGRLen]
			}
			return nil
			}

			// ResizeYUV yuv
			func ResizeYUV(yuv []byte, w, h, rw, rh int) []byte {
			if (rw < 0 && rh < 0) \|\| (rw > w && rh > h) {
			return yuv
			}

			cw := find(w, h, rw, rh)
			if cw == nil {
			return yuv
			}

			var resized *C.uchar
			var resizedLen C.int
			ret := C.resizeyuv(cw.conv, unsafe.Pointer(&yuv[0]), &resized, &resizedLen)
			if ret != 0 {
			return nil
			}

			const maxLen = 0x7fffffff
			goResizedLen := int(resizedLen)
			if goResizedLen > 0 {
			return (*[maxLen]byte)(unsafe.Pointer(resized))[:goResizedLen:goResizedLen]
			}
			return nil
			}

			// YUV2BGRandResize converts a packed w x h YUV frame to BGR and returns both
			// the full-size image and a copy resized to rw x rh.
			//
			// On the GPU path both slices alias buffers owned by the native convertor;
			// no copy is made here. It returns (nil, nil) when the input is too short
			// for the given dimensions or the conversion fails.
			func YUV2BGRandResize(yuv []byte, w, h, rw, rh int) ([]byte, []byte) {
				// The native code reads w*h*3/2 bytes from &yuv[0]; reject anything shorter
				// (including an empty slice, where &yuv[0] would panic).
				if w <= 0 || h <= 0 || len(yuv) < w*h*3/2 {
					return nil, nil
				}

				cw := find(w, h, rw, rh)
				if cw == nil {
					origin := yuv2bgr(yuv, w, h)
					resized := bgresize(origin, w, h, rw, rh)
					return origin, resized
				}

				var bgr *C.uchar
				var bgrLen C.int
				var scale *C.uchar
				var scaleLen C.int

				ret := C.yuv2bgrandresize(cw.conv, unsafe.Pointer(&yuv[0]), &bgr, &bgrLen, &scale, &scaleLen)

				if ret != 0 {
					return nil, nil
				}
				var out, resized []byte

				const maxLen = 0x7fffffff
				goBGRLen, goScaleLen := int(bgrLen), int(scaleLen)
				if goBGRLen > 0 && bgr != nil {
					out = (*[maxLen]byte)(unsafe.Pointer(bgr))[:goBGRLen:goBGRLen]
				}
				if goScaleLen > 0 && scale != nil {
					resized = (*[maxLen]byte)(unsafe.Pointer(scale))[:goScaleLen:goScaleLen]
				}
				return out, resized

			}

			// Free destroys every cached native convertor and empties the cache, so
			// that later lookups create fresh contexts instead of reusing destroyed
			// handles. Slices previously returned from the GPU paths alias the
			// destroyed buffers and must not be used afterwards.
			func Free() {
				for _, v := range convts {
					if v.conv != nil {
						C.conv_destroy(v.conv)
						v.conv = nil
					}
				}
				convts = nil
			}

			// yuv2bgr is the pure-Go fallback for the GPU conversion. It expects w*h
			// luma bytes followed by rows of interleaved chroma pairs (U first, then V;
			// one pair per 2x2 pixel block, row stride w), matching the layout the
			// native path unpacks, and returns packed B,G,R bytes.
			// It returns nil when the dimensions are not positive or yuv is too short.
			func yuv2bgr(yuv []byte, w, h int) []byte {
				if w <= 0 || h <= 0 {
					return nil
				}

				start := w * h
				// Highest chroma offset read below: the pair of the last pixel in the last row.
				lastPair := (h-1)/2*w + (w - 1) - ((w - 1) & 0x01)
				if len(yuv) < start+lastPair+2 {
					return nil
				}

				clamp := func(c int32) byte {
					if c > 255 {
						return 255
					}
					if c < 0 {
						return 0
					}
					return byte(c)
				}

				data := make([]byte, 0, w*h*3)
				for i := 0; i < h; i++ {
					for j := 0; j < w; j++ {

						// Offset of the chroma pair shared by this pixel's 2x2 block.
						index := i/2*w + j - (j & 0x01)

						y := int32(yuv[j+i*w])
						u := int32(yuv[start+index])
						v := int32(yuv[start+index+1])

						r := y + (140*(v-128))/100
						g := y - (34*(u-128)+71*(v-128))/100
						b := y + (177*(u-128))/100

						data = append(data, clamp(b), clamp(g), clamp(r))
					}
				}
				return data
			}

			// bgresize is the CPU fallback resize: it wraps the w x h BGR bytes in an
			// image, resizes it to rw x rh with nearest-neighbour sampling and converts
			// the result back to BGR bytes. It returns nil if the image cannot be built.
			func bgresize(bgr []byte, w, h, rw, rh int) []byte {
				src, err := godraw.ToImage(bgr, w, h)
				if err == nil {
					scaled := imaging.Resize(src, rw, rh, imaging.NearestNeighbor)
					return godraw.Image2BGR(scaled)
				}
				return nil
			}

New file
			@@ -0,0 +1,181 @@
			/**
			* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
			*
			* Please refer to the NVIDIA end user license agreement (EULA) associated
			* with this source code for terms and conditions that govern your use of
			* this software. Any use, reproduction, disclosure, or distribution of
			* this software and related documentation outside the terms of the EULA
			* is strictly prohibited.
			*
			*/

			#ifndef NV_UTIL_NPP_EXCEPTIONS_H
			#define NV_UTIL_NPP_EXCEPTIONS_H


			#include <string>
			#include <sstream>
			#include <iostream>

			/// All npp related C++ classes are put into the npp namespace.
			namespace npp
			{

			/// Base class for exceptions thrown by the NPP helper code.
			/// An instance carries a message plus the file name and line number of the
			/// throw site; the assert macros in this header fill those in automatically.
			class Exception
			{
			public:
			    /// Build an exception.
			    /// \param rMessage Why the exception was thrown.
			    /// \param rFileName File in which it was thrown.
			    /// \param nLineNumber Line at which it was thrown.
			    explicit Exception(const std::string &rMessage = "", const std::string &rFileName = "", unsigned int nLineNumber = 0)
			        : sMessage_(rMessage), sFileName_(rFileName), nLineNumber_(nLineNumber)
			    {
			    }

			    /// Copy all three fields from another exception.
			    Exception(const Exception &rException)
			        : sMessage_(rException.sMessage_), sFileName_(rException.sFileName_), nLineNumber_(rException.nLineNumber_)
			    {
			    }

			    virtual ~Exception()
			    {
			    }

			    /// The message given at construction.
			    const std::string &message() const
			    {
			        return sMessage_;
			    }

			    /// The file name given at construction.
			    const std::string &fileName() const
			    {
			        return sFileName_;
			    }

			    /// The line number given at construction.
			    unsigned int lineNumber() const
			    {
			        return nLineNumber_;
			    }

			    /// Heap-allocate a copy of this exception; the caller must `delete` it.
			    virtual Exception *clone() const
			    {
			        return new Exception(*this);
			    }

			    /// Format as "<file>:<line>: <message>".
			    /// Virtual so that operator<<() prints the full information of derived
			    /// exceptions even through a base-class reference.
			    virtual std::string toString() const
			    {
			        std::ostringstream formatted;
			        formatted << fileName() << ":" << lineNumber() << ": " << message();
			        return formatted.str();
			    }

			private:
			    std::string sMessage_;     ///< Why the exception was thrown.
			    std::string sFileName_;    ///< File in which it was thrown.
			    unsigned int nLineNumber_; ///< Line at which it was thrown.
			};

			/// Stream inserter: writes rException.toString() to the stream.
			/// \param rOutputStream Destination stream.
			/// \param rException Exception to print.
			/// \return The same stream, to allow chaining.
			static std::ostream &
			operator << (std::ostream &rOutputStream, const Exception &rException)
			{
			    return rOutputStream << rException.toString();
			}

			/// Basic assert macro.
			/// Use it to enforce pre- and post-conditions. Unlike the C runtime assert,
			/// it does not abort execution but throws an npp::Exception carrying the text
			/// of the failing condition plus the file name and line number of the call site.
			/// \note The do { ... } while(false) wrapper makes the trailing semicolon
			/// mandatory. Invoke it as:
			/// NPP_ASSERT(n < MAX);
			#define NPP_ASSERT(C) do {if (!(C)) throw npp::Exception(#C " assertion faild!", __FILE__, __LINE__);} while(false)

			// ASSERT macro with message.
			// Same as NPP_ASSERT, with an extra message M appended to the exception text.
			// M must be a string literal (it is concatenated at compile time).
			// Note: avoid side effects inside the condition. As defined here the macro is
			// always active, but assert macros may get compiled out in release mode.
			#define NPP_ASSERT_MSG(C, M) do {if (!(C)) throw npp::Exception(#C " assertion faild! Message: " M, __FILE__, __LINE__);} while(false)

			#ifdef _DEBUG
			/// Basic debug assert macro.
			/// Identical to NPP_ASSERT(C) when _DEBUG is defined; otherwise it expands to
			/// nothing. It is therefore essential not to put statements with side effects
			/// that the program depends on inside this macro.
			#define NPP_DEBUG_ASSERT(C) do {if (!(C)) throw npp::Exception(#C " debug assertion faild!", __FILE__, __LINE__);} while(false)
			#else
			#define NPP_DEBUG_ASSERT(C)
			#endif

			/// ASSERT for null-pointer test.
			/// Throws when P compares equal to 0. This macro is never compiled out, so
			/// code with side effects (e.g. an allocation call) may be passed as P.
			#define NPP_ASSERT_NOT_NULL(P) do {if ((P) == 0) throw npp::Exception(#P " not null assertion faild!", __FILE__, __LINE__);} while(false)

			/// Macro for flagging methods as not implemented.
			/// Unconditionally throws an exception saying that an implementation is missing.
			#define NPP_NOT_IMPLEMENTED() do {throw npp::Exception("Implementation missing!", __FILE__, __LINE__);} while(false)

			/// Macro for checking the return code of CUDA (runtime) calls.
			/// Evaluates S once; on failure prints the numeric error to std::cout and
			/// then throws through NPP_ASSERT. This macro never gets disabled.
			#define NPP_CHECK_CUDA(S) do {cudaError_t eCUDAResult; \
			eCUDAResult = S; \
			if (eCUDAResult != cudaSuccess) std::cout << "NPP_CHECK_CUDA - eCUDAResult = " << eCUDAResult << std::endl; \
			NPP_ASSERT(eCUDAResult == cudaSuccess);} while (false)

			/// Macro for checking the return code of NPP calls.
			/// Evaluates S once; on failure prints the status name and number to
			/// std::cout and then throws through NPP_ASSERT.
			#define NPP_CHECK_NPP(S) do {NppStatus eStatusNPP; \
			eStatusNPP = S; \
			if (eStatusNPP != NPP_SUCCESS) std::cout << "NPP_CHECK_NPP - eStatusNPP = " << _cudaGetErrorEnum(eStatusNPP) << "("<< eStatusNPP << ")" << std::endl; \
			NPP_ASSERT(eStatusNPP == NPP_SUCCESS);} while (false)

			/// Macro for checking the return code of cuFFT calls.
			/// Evaluates S once; on failure prints the numeric result to std::cout and
			/// then throws through NPP_ASSERT. The result is compared against
			/// CUFFT_SUCCESS in both places (not against the unrelated NPP status enum).
			#define NPP_CHECK_CUFFT(S) do {cufftResult eCUFFTResult; \
			eCUFFTResult = S; \
			if (eCUFFTResult != CUFFT_SUCCESS) std::cout << "NPP_CHECK_CUFFT - eCUFFTResult = " << eCUFFTResult << std::endl; \
			NPP_ASSERT(eCUFFTResult == CUFFT_SUCCESS);} while (false)

			} // npp namespace

			#endif // NV_UTIL_NPP_EXCEPTIONS_H

New file
			@@ -0,0 +1,526 @@
			/**
			* Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
			*
			* Please refer to the NVIDIA end user license agreement (EULA) associated
			* with this source code for terms and conditions that govern your use of
			* this software. Any use, reproduction, disclosure, or distribution of
			* this software and related documentation outside the terms of the EULA
			* is strictly prohibited.
			*
			*/

			// These are helper functions for the SDK samples (string parsing, timers, etc)
			#ifndef STRING_HELPER_H
			#define STRING_HELPER_H

			#include <stdio.h>
			#include <stdlib.h>
			#include <fstream>
			#include <string>

			// Portable wrappers for string/file functions whose names differ between
			// the Windows CRT and POSIX. Each macro is only defined if the including
			// code has not already provided it.
			#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
			#ifndef _CRT_SECURE_NO_DEPRECATE
			#define _CRT_SECURE_NO_DEPRECATE
			#endif
			#ifndef STRCASECMP
			#define STRCASECMP _stricmp
			#endif
			#ifndef STRNCASECMP
			#define STRNCASECMP _strnicmp
			#endif
			#ifndef STRCPY
			#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
			#endif

			#ifndef FOPEN
			#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode)
			#endif
			#ifndef FOPEN_FAIL
			#define FOPEN_FAIL(result) (result != 0)
			#endif
			#ifndef SSCANF
			#define SSCANF sscanf_s
			#endif
			#ifndef SPRINTF
			#define SPRINTF sprintf_s
			#endif
			#else // Linux Includes
			#include <string.h>
			#include <strings.h>

			#ifndef STRCASECMP
			#define STRCASECMP strcasecmp
			#endif
			#ifndef STRNCASECMP
			#define STRNCASECMP strncasecmp
			#endif
			#ifndef STRCPY
			// nLength is accepted for signature parity with the Windows variant but unused.
			#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
			#endif

			#ifndef FOPEN
			#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode))
			#endif
			#ifndef FOPEN_FAIL
			#define FOPEN_FAIL(result) (result == NULL)
			#endif
			#ifndef SSCANF
			#define SSCANF sscanf
			#endif
			#ifndef SPRINTF
			#define SPRINTF sprintf
			#endif
			#endif

			#ifndef EXIT_WAIVED
			#define EXIT_WAIVED 2
			#endif

			// CUDA Utility Helper Functions

			// Count the leading `delimiter` characters of `string` and return that count
			// as the offset of the remaining text. Returns 0 instead when that offset is
			// at or beyond the last character (i.e. at most one character would remain).
			inline int stringRemoveDelimiter(char delimiter, const char *string)
			{
			    const char *cursor = string;

			    while (*cursor == delimiter)
			    {
			        ++cursor;
			    }

			    const int skipped = (int)(cursor - string);
			    const int lastIndex = (int)strlen(string) - 1;

			    return (skipped >= lastIndex) ? 0 : skipped;
			}

			// Locate the extension of `filename`: the text after its last '.'.
			//   filename  : NUL-terminated name to inspect
			//   extension : receives a pointer into `filename` just past the last '.',
			//               or NULL when there is no extension
			// Returns the index of the first extension character, or 0 when there is no
			// '.' or the only '.' is the first character (e.g. ".bashrc").
			inline int getFileExtension(char *filename, char **extension)
			{
			    *extension = NULL;

			    if (filename == NULL)
			    {
			        return 0;
			    }

			    // strrchr finds the last dot directly, including one at index 1
			    // (e.g. "a.b"), which the hand-rolled backwards scan used to miss.
			    char *dot = strrchr(filename, '.');

			    if (dot == NULL || dot == filename)
			    {
			        return 0;
			    }

			    *extension = dot + 1;
			    return (int)(dot + 1 - filename);
			}


			// Reports whether the flag `string_ref` appears on the command line.
			//
			// Each argv[i] (i >= 1) has its leading '-' characters skipped, and the
			// text up to an optional '=' is compared case-insensitively against
			// `string_ref`; the lengths must match exactly.
			//
			// @param argc        number of entries in argv
			// @param argv        command-line arguments; argv[0] is skipped
			// @param string_ref  flag name without leading dashes
			// @return true when a matching flag is present
			inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
			{
			    bool bFound = false;

			    if (argc >= 1)
			    {
			        const int length = (int)strlen(string_ref);

			        for (int i = 1; i < argc; i++)
			        {
			            int string_start = stringRemoveDelimiter('-', argv[i]);
			            const char *string_argv = &argv[i][string_start];

			            // Only the part before '=' is the flag name.
			            const char *equal_pos = strchr(string_argv, '=');
			            int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);

			            if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
			            {
			                bFound = true;
			                break;   // the answer cannot change once a match is seen
			            }
			        }
			    }

			    return bFound;
			}

			// Looks up the first command-line argument whose name starts with
			// `string_ref` and stores its integer value, converted to T, in *value.
			//
			// The name comparison is case-insensitive and covers only the first
			// strlen(string_ref) characters of the argument (after leading '-').
			// An optional '=' between name and value is skipped. The value is parsed
			// with atoi(), so non-integer text yields 0.
			//
			// @param argc        number of entries in argv
			// @param argv        command-line arguments; argv[0] is skipped
			// @param string_ref  argument name without leading dashes
			// @param value       receives the parsed value; left untouched when the
			//                    argument is absent or carries no text after its name
			// @return true when a matching argument is present
			template <class T>
			inline bool getCmdLineArgumentValue(const int argc, const char **argv, const char *string_ref, T *value)
			{
			    bool bFound = false;

			    if (argc >= 1)
			    {
			        const int length = (int)strlen(string_ref);

			        for (int i = 1; i < argc; i++)
			        {
			            int string_start = stringRemoveDelimiter('-', argv[i]);
			            const char *string_argv = &argv[i][string_start];

			            if (!STRNCASECMP(string_argv, string_ref, length))
			            {
			                if (length+1 <= (int)strlen(string_argv))
			                {
			                    int auto_inc = (string_argv[length] == '=') ? 1 : 0;
			                    *value = (T)atoi(&string_argv[length + auto_inc]);
			                }

			                bFound = true;
			                break;   // first match wins
			            }
			        }
			    }

			    return bFound;
			}

			// Returns the integer value of the command-line argument named `string_ref`.
			//
			// The name comparison is case-insensitive and covers only the first
			// strlen(string_ref) characters of each argument (after leading '-').
			// An optional '=' between name and value is skipped and the rest is parsed
			// with atoi(). When several arguments match, the last one wins.
			//
			// @param argc        number of entries in argv
			// @param argv        command-line arguments; argv[0] is skipped
			// @param string_ref  argument name without leading dashes
			// @return the parsed value; 0 when the argument is absent or has no value
			inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
			{
			    bool bFound = false;
			    int value = -1;

			    if (argc >= 1)
			    {
			        const int length = (int)strlen(string_ref);

			        for (int i = 1; i < argc; i++)
			        {
			            int string_start = stringRemoveDelimiter('-', argv[i]);
			            const char *string_argv = &argv[i][string_start];

			            if (STRNCASECMP(string_argv, string_ref, length) != 0)
			            {
			                continue;
			            }

			            if (length+1 <= (int)strlen(string_argv))
			            {
			                int auto_inc = (string_argv[length] == '=') ? 1 : 0;
			                value = atoi(&string_argv[length + auto_inc]);
			            }
			            else
			            {
			                // Name present without any value text.
			                value = 0;
			            }

			            bFound = true;
			        }
			    }

			    return bFound ? value : 0;
			}

			// Returns the floating-point value of the command-line argument named
			// `string_ref`.
			//
			// The name comparison is case-insensitive and covers only the first
			// strlen(string_ref) characters of each argument (after leading '-').
			// An optional '=' between name and value is skipped and the rest is parsed
			// with atof(). When several arguments match, the last one wins.
			//
			// @param argc        number of entries in argv
			// @param argv        command-line arguments; argv[0] is skipped
			// @param string_ref  argument name without leading dashes
			// @return the parsed value; 0 when the argument is absent or has no value
			inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref)
			{
			    bool bFound = false;
			    float value = -1;

			    if (argc >= 1)
			    {
			        const int length = (int)strlen(string_ref);

			        for (int i = 1; i < argc; i++)
			        {
			            int string_start = stringRemoveDelimiter('-', argv[i]);
			            const char *string_argv = &argv[i][string_start];

			            if (STRNCASECMP(string_argv, string_ref, length) != 0)
			            {
			                continue;
			            }

			            if (length+1 <= (int)strlen(string_argv))
			            {
			                int auto_inc = (string_argv[length] == '=') ? 1 : 0;
			                value = (float)atof(&string_argv[length + auto_inc]);
			            }
			            else
			            {
			                // Name present without any value text.
			                value = 0.f;
			            }

			            bFound = true;
			        }
			    }

			    return bFound ? value : 0;
			}

			// Retrieves the string value of the command-line argument named
			// `string_ref`.
			//
			// The name comparison is case-insensitive and covers only the first
			// strlen(string_ref) characters of each argument (after leading '-').
			// The character following the name (normally '=') is skipped. When
			// several arguments match, the last one wins.
			//
			// @param argc           number of entries in argv
			// @param argv           command-line arguments; argv[0] is skipped
			// @param string_ref     argument name without leading dashes
			// @param string_retval  receives a pointer into argv (not a copy) at the
			//                       start of the value, an empty string when the
			//                       argument has no value, or NULL when absent
			// @return true when a matching argument is present
			inline bool getCmdLineArgumentString(const int argc, const char **argv,
			                                     const char *string_ref, char **string_retval)
			{
			    bool bFound = false;

			    if (argc >= 1)
			    {
			        const int length = (int)strlen(string_ref);

			        for (int i = 1; i < argc; i++)
			        {
			            int string_start = stringRemoveDelimiter('-', argv[i]);
			            char *string_argv = (char *)&argv[i][string_start];

			            if (!STRNCASECMP(string_argv, string_ref, length))
			            {
			                // If the argument ends right after the name there is no
			                // separator to skip; stepping over it would point past the
			                // terminator, so return the empty string instead.
			                const int skip = (string_argv[length] == '\0') ? 0 : 1;
			                *string_retval = &string_argv[length + skip];
			                bFound = true;
			            }
			        }
			    }

			    if (!bFound)
			    {
			        *string_retval = NULL;
			    }

			    return bFound;
			}

			//////////////////////////////////////////////////////////////////////////////
			//! Find the path for a file assuming that
			//! files are found in the searchPath.
			//!
			//! @return the path if succeeded, otherwise 0
			//! @param filename name of the file
			//! @param executable_path optional absolute path of the executable
			//////////////////////////////////////////////////////////////////////////////
			inline char sdkFindFilePath(const char filename, const char *executable_path)
			{
			// <executable_name> defines a variable that is replaced with the name of the executable

			// Typical relative search paths to locate needed companion files (e.g. sample input data, or JIT source files)
			// The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching the .exe or .bat, etc
			const char *searchPath[] =
			{
			"./", // same dir
			"./common/", // "/common/" subdir
			"./common/data/", // "/common/data/" subdir
			"./data/", // "/data/" subdir
			"./src/", // "/src/" subdir
			"./src/<executable_name>/data/", // "/src/<executable_name>/data/" subdir
			"./inc/", // "/inc/" subdir
			"./0_Simple/", // "/0_Simple/" subdir
			"./1_Utilities/", // "/1_Utilities/" subdir
			"./2_Graphics/", // "/2_Graphics/" subdir
			"./3_Imaging/", // "/3_Imaging/" subdir
			"./4_Finance/", // "/4_Finance/" subdir
			"./5_Simulations/", // "/5_Simulations/" subdir
			"./6_Advanced/", // "/6_Advanced/" subdir
			"./7_CUDALibraries/", // "/7_CUDALibraries/" subdir
			"./8_Android/", // "/8_Android/" subdir
			"./samples/", // "/samples/" subdir

			"./0_Simple/<executable_name>/data/", // "/0_Simple/<executable_name>/data/" subdir
			"./1_Utilities/<executable_name>/data/", // "/1_Utilities/<executable_name>/data/" subdir
			"./2_Graphics/<executable_name>/data/", // "/2_Graphics/<executable_name>/data/" subdir
			"./3_Imaging/<executable_name>/data/", // "/3_Imaging/<executable_name>/data/" subdir
			"./4_Finance/<executable_name>/data/", // "/4_Finance/<executable_name>/data/" subdir
			"./5_Simulations/<executable_name>/data/", // "/5_Simulations/<executable_name>/data/" subdir
			"./6_Advanced/<executable_name>/data/", // "/6_Advanced/<executable_name>/data/" subdir
			"./7_CUDALibraries/<executable_name>/", // "/7_CUDALibraries/<executable_name>/" subdir
			"./7_CUDALibraries/<executable_name>/data/", // "/7_CUDALibraries/<executable_name>/data/" subdir

			"../", // up 1 in tree
			"../common/", // up 1 in tree, "/common/" subdir
			"../common/data/", // up 1 in tree, "/common/data/" subdir
			"../data/", // up 1 in tree, "/data/" subdir
			"../src/", // up 1 in tree, "/src/" subdir
			"../inc/", // up 1 in tree, "/inc/" subdir

			"../0_Simple/<executable_name>/data/", // up 1 in tree, "/0_Simple/<executable_name>/" subdir
			"../1_Utilities/<executable_name>/data/", // up 1 in tree, "/1_Utilities/<executable_name>/" subdir
			"../2_Graphics/<executable_name>/data/", // up 1 in tree, "/2_Graphics/<executable_name>/" subdir
			"../3_Imaging/<executable_name>/data/", // up 1 in tree, "/3_Imaging/<executable_name>/" subdir
			"../4_Finance/<executable_name>/data/", // up 1 in tree, "/4_Finance/<executable_name>/" subdir
			"../5_Simulations/<executable_name>/data/", // up 1 in tree, "/5_Simulations/<executable_name>/" subdir
			"../6_Advanced/<executable_name>/data/", // up 1 in tree, "/6_Advanced/<executable_name>/" subdir
			"../7_CUDALibraries/<executable_name>/data/",// up 1 in tree, "/7_CUDALibraries/<executable_name>/" subdir
			"../8_Android/<executable_name>/data/", // up 1 in tree, "/8_Android/<executable_name>/" subdir
			"../samples/<executable_name>/data/", // up 1 in tree, "/samples/<executable_name>/" subdir
			"../../", // up 2 in tree
			"../../common/", // up 2 in tree, "/common/" subdir
			"../../common/data/", // up 2 in tree, "/common/data/" subdir
			"../../data/", // up 2 in tree, "/data/" subdir
			"../../src/", // up 2 in tree, "/src/" subdir
			"../../inc/", // up 2 in tree, "/inc/" subdir
			"../../sandbox/<executable_name>/data/", // up 2 in tree, "/sandbox/<executable_name>/" subdir
			"../../0_Simple/<executable_name>/data/", // up 2 in tree, "/0_Simple/<executable_name>/" subdir
			"../../1_Utilities/<executable_name>/data/", // up 2 in tree, "/1_Utilities/<executable_name>/" subdir
			"../../2_Graphics/<executable_name>/data/", // up 2 in tree, "/2_Graphics/<executable_name>/" subdir
			"../../3_Imaging/<executable_name>/data/", // up 2 in tree, "/3_Imaging/<executable_name>/" subdir
			"../../4_Finance/<executable_name>/data/", // up 2 in tree, "/4_Finance/<executable_name>/" subdir
			"../../5_Simulations/<executable_name>/data/", // up 2 in tree, "/5_Simulations/<executable_name>/" subdir
			"../../6_Advanced/<executable_name>/data/", // up 2 in tree, "/6_Advanced/<executable_name>/" subdir
			"../../7_CUDALibraries/<executable_name>/data/", // up 2 in tree, "/7_CUDALibraries/<executable_name>/" subdir
			"../../8_Android/<executable_name>/data/", // up 2 in tree, "/8_Android/<executable_name>/" subdir
			"../../samples/<executable_name>/data/", // up 2 in tree, "/samples/<executable_name>/" subdir
			"../../../", // up 3 in tree
			"../../../src/<executable_name>/", // up 3 in tree, "/src/<executable_name>/" subdir
			"../../../src/<executable_name>/data/", // up 3 in tree, "/src/<executable_name>/data/" subdir
			"../../../src/<executable_name>/src/", // up 3 in tree, "/src/<executable_name>/src/" subdir
			"../../../src/<executable_name>/inc/", // up 3 in tree, "/src/<executable_name>/inc/" subdir
			"../../../sandbox/<executable_name>/", // up 3 in tree, "/sandbox/<executable_name>/" subdir
			"../../../sandbox/<executable_name>/data/", // up 3 in tree, "/sandbox/<executable_name>/data/" subdir
			"../../../sandbox/<executable_name>/src/", // up 3 in tree, "/sandbox/<executable_name>/src/" subdir
			"../../../sandbox/<executable_name>/inc/", // up 3 in tree, "/sandbox/<executable_name>/inc/" subdir
			"../../../0_Simple/<executable_name>/data/", // up 3 in tree, "/0_Simple/<executable_name>/" subdir
			"../../../1_Utilities/<executable_name>/data/", // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
			"../../../2_Graphics/<executable_name>/data/", // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
			"../../../3_Imaging/<executable_name>/data/", // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
			"../../../4_Finance/<executable_name>/data/", // up 3 in tree, "/4_Finance/<executable_name>/" subdir
			"../../../5_Simulations/<executable_name>/data/", // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
			"../../../6_Advanced/<executable_name>/data/", // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
			"../../../7_CUDALibraries/<executable_name>/data/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/" subdir
			"../../../8_Android/<executable_name>/data/", // up 3 in tree, "/8_Android/<executable_name>/" subdir
			"../../../0_Simple/<executable_name>/", // up 3 in tree, "/0_Simple/<executable_name>/" subdir
			"../../../1_Utilities/<executable_name>/", // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
			"../../../2_Graphics/<executable_name>/", // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
			"../../../3_Imaging/<executable_name>/", // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
			"../../../4_Finance/<executable_name>/", // up 3 in tree, "/4_Finance/<executable_name>/" subdir
			"../../../5_Simulations/<executable_name>/", // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
			"../../../6_Advanced/<executable_name>/", // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
			"../../../7_CUDALibraries/<executable_name>/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/" subdir
			"../../../8_Android/<executable_name>/", // up 3 in tree, "/8_Android/<executable_name>/" subdir
			"../../../samples/<executable_name>/data/", // up 3 in tree, "/samples/<executable_name>/" subdir
			"../../../common/", // up 3 in tree, "../../../common/" subdir
			"../../../common/data/", // up 3 in tree, "../../../common/data/" subdir
			"../../../data/", // up 3 in tree, "../../../data/" subdir
			"../../../../", // up 4 in tree
			"../../../../src/<executable_name>/", // up 4 in tree, "/src/<executable_name>/" subdir
			"../../../../src/<executable_name>/data/", // up 4 in tree, "/src/<executable_name>/data/" subdir
			"../../../../src/<executable_name>/src/", // up 4 in tree, "/src/<executable_name>/src/" subdir
			"../../../../src/<executable_name>/inc/", // up 4 in tree, "/src/<executable_name>/inc/" subdir
			"../../../../sandbox/<executable_name>/", // up 4 in tree, "/sandbox/<executable_name>/" subdir
			"../../../../sandbox/<executable_name>/data/", // up 4 in tree, "/sandbox/<executable_name>/data/" subdir
			"../../../../sandbox/<executable_name>/src/", // up 4 in tree, "/sandbox/<executable_name>/src/" subdir
			"../../../../sandbox/<executable_name>/inc/", // up 4 in tree, "/sandbox/<executable_name>/inc/" subdir
			"../../../../0_Simple/<executable_name>/data/", // up 4 in tree, "/0_Simple/<executable_name>/" subdir
			"../../../../1_Utilities/<executable_name>/data/", // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
			"../../../../2_Graphics/<executable_name>/data/", // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
			"../../../../3_Imaging/<executable_name>/data/", // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
			"../../../../4_Finance/<executable_name>/data/", // up 4 in tree, "/4_Finance/<executable_name>/" subdir
			"../../../../5_Simulations/<executable_name>/data/",// up 4 in tree, "/5_Simulations/<executable_name>/" subdir
			"../../../../6_Advanced/<executable_name>/data/", // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
			"../../../../7_CUDALibraries/<executable_name>/data/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/" subdir
			"../../../../8_Android/<executable_name>/data/", // up 4 in tree, "/8_Android/<executable_name>/" subdir
			"../../../../0_Simple/<executable_name>/", // up 4 in tree, "/0_Simple/<executable_name>/" subdir
			"../../../../1_Utilities/<executable_name>/", // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
			"../../../../2_Graphics/<executable_name>/", // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
			"../../../../3_Imaging/<executable_name>/", // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
			"../../../../4_Finance/<executable_name>/", // up 4 in tree, "/4_Finance/<executable_name>/" subdir
			"../../../../5_Simulations/<executable_name>/",// up 4 in tree, "/5_Simulations/<executable_name>/" subdir
			"../../../../6_Advanced/<executable_name>/", // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
			"../../../../7_CUDALibraries/<executable_name>/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/" subdir
			"../../../../8_Android/<executable_name>/", // up 4 in tree, "/8_Android/<executable_name>/" subdir
			"../../../../samples/<executable_name>/data/", // up 4 in tree, "/samples/<executable_name>/" subdir
			"../../../../common/", // up 4 in tree, "../../../common/" subdir
			"../../../../common/data/", // up 4 in tree, "../../../common/data/" subdir
			"../../../../data/", // up 4 in tree, "../../../data/" subdir
			"../../../../../", // up 5 in tree
			"../../../../../src/<executable_name>/", // up 5 in tree, "/src/<executable_name>/" subdir
			"../../../../../src/<executable_name>/data/", // up 5 in tree, "/src/<executable_name>/data/" subdir
			"../../../../../src/<executable_name>/src/", // up 5 in tree, "/src/<executable_name>/src/" subdir
			"../../../../../src/<executable_name>/inc/", // up 5 in tree, "/src/<executable_name>/inc/" subdir
			"../../../../../sandbox/<executable_name>/", // up 5 in tree, "/sandbox/<executable_name>/" subdir
			"../../../../../sandbox/<executable_name>/data/", // up 5 in tree, "/sandbox/<executable_name>/data/" subdir
			"../../../../../sandbox/<executable_name>/src/", // up 5 in tree, "/sandbox/<executable_name>/src/" subdir
			"../../../../../sandbox/<executable_name>/inc/", // up 5 in tree, "/sandbox/<executable_name>/inc/" subdir
			"../../../../../0_Simple/<executable_name>/data/", // up 5 in tree, "/0_Simple/<executable_name>/" subdir
			"../../../../../1_Utilities/<executable_name>/data/", // up 5 in tree, "/1_Utilities/<executable_name>/" subdir
			"../../../../../2_Graphics/<executable_name>/data/", // up 5 in tree, "/2_Graphics/<executable_name>/" subdir
			"../../../../../3_Imaging/<executable_name>/data/", // up 5 in tree, "/3_Imaging/<executable_name>/" subdir
			"../../../../../4_Finance/<executable_name>/data/", // up 5 in tree, "/4_Finance/<executable_name>/" subdir
			"../../../../../5_Simulations/<executable_name>/data/",// up 5 in tree, "/5_Simulations/<executable_name>/" subdir
			"../../../../../6_Advanced/<executable_name>/data/", // up 5 in tree, "/6_Advanced/<executable_name>/" subdir
			"../../../../../7_CUDALibraries/<executable_name>/data/", // up 5 in tree, "/7_CUDALibraries/<executable_name>/" subdir
			"../../../../../8_Android/<executable_name>/data/", // up 5 in tree, "/8_Android/<executable_name>/" subdir
			"../../../../../samples/<executable_name>/data/", // up 5 in tree, "/samples/<executable_name>/" subdir
			"../../../../../common/", // up 5 in tree, "../../../common/" subdir
			"../../../../../common/data/", // up 5 in tree, "../../../common/data/" subdir
			};

			// Extract the executable name
			std::string executable_name;

			if (executable_path != 0)
			{
			executable_name = std::string(executable_path);

			#if defined(WIN32) \|\| defined(_WIN32) \|\| defined(WIN64) \|\| defined(_WIN64)
			// Windows path delimiter
			size_t delimiter_pos = executable_name.find_last_of('\\');
			executable_name.erase(0, delimiter_pos + 1);

			if (executable_name.rfind(".exe") != std::string::npos)
			{
			// we strip .exe, only if the .exe is found
			executable_name.resize(executable_name.size() - 4);
			}

			#else
			// Linux & OSX path delimiter
			size_t delimiter_pos = executable_name.find_last_of('/');
			executable_name.erase(0,delimiter_pos+1);
			#endif
			}

			// Loop over all search paths and return the first hit
			for (unsigned int i = 0; i < sizeof(searchPath)/sizeof(char *); ++i)
			{
			std::string path(searchPath[i]);
			size_t executable_name_pos = path.find("<executable_name>");

			// If there is executable_name variable in the searchPath
			// replace it with the value
			if (executable_name_pos != std::string::npos)
			{
			if (executable_path != 0)
			{
			path.replace(executable_name_pos, strlen("<executable_name>"), executable_name);
			}
			else
			{
			// Skip this path entry if no executable argument is given
			continue;
			}
			}

			#ifdef _DEBUG
			printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
			#endif

			// Test if the file exists
			path.append(filename);
			FILE *fp;
			FOPEN(fp, path.c_str(), "rb");

			if (fp != NULL)
			{
			fclose(fp);
			// File found
			// returning an allocated array here for backwards compatibility reasons
			char file_path = (char ) malloc(path.length() + 1);
			STRCPY(file_path, path.length() + 1, path.c_str());
			return file_path;
			}

			if (fp)
			{
			fclose(fp);
			}
			}

			// File not found
			return 0;
			}

			#endif