valib/goffmpeg.git

/*******************************************************************
*   CUDALERP.cu
*   CUDALERP
*
*    Author: Kareem Omar
*    kareem.omar@uah.edu
*    https://github.com/komrad36
*
*    Last updated Jan 7, 2016
*******************************************************************/
//
// The file CUDALERP.h exposes two extremely high performance GPU
// resize operations,
// CUDALERP (bilinear interpolation), and 
// CUDANERP (nearest neighbor interpolation), for 8-bit unsigned
// integer (i.e. grayscale) data.
//
// For 32-bit float data, see the CUDAFLERP project instead.
//
// CUDALERP offers superior accuracy to CUDA's built-in texture
// interpolator at comparable performance. The accuracy if compiled
// with -use-fast-math off is nearly equivalent to my CPU interpolator,
// KLERP, while still being as fast as the built-in interpolation.
// 
// Particularly for large images, CUDALERP dramatically outperforms
// even the highly tuned CPU AVX2 versions.
// 
// All functionality is contained in the header 'CUDALERP.h' and
// the source file 'CUDALERP.cu' and has no external dependencies at all.
// 
// Note that these are intended for computer vision use(hence the speed)
// and are designed for grayscale images.
// 
// The file 'main.cpp' is an example and speed test driver.
//
 
#include "CUDALERP.h"
 
__global__ void
#ifndef __INTELLISENSE__
__launch_bounds__(256, 0)
#endif
CUDANERP_kernel(const cudaTextureObject_t d_img_tex, const float gxs, const float gys, uint8_t* __restrict const d_out, const int neww) {
    uint32_t x = (blockIdx.x << 9) + (threadIdx.x << 1);
    const uint32_t y = blockIdx.y;
    const float fy = y*gys;
#pragma unroll
    for (int i = 0; i < 2; ++i, ++x) {
        const float fx = x*gxs;
        float res = 255.0f*tex2D<float>(d_img_tex, fx, fy);
        if (x < neww) d_out[y*neww + x] = res;
    }
}
 
__global__ void
#ifndef __INTELLISENSE__
__launch_bounds__(256, 0)
#endif
CUDALERP_kernel(const cudaTextureObject_t d_img_tex, const float gxs, const float gys, uint8_t* __restrict const d_out, const int neww) {
    uint32_t x = (blockIdx.x << 9) + (threadIdx.x << 1);
    const uint32_t y = blockIdx.y;
    const float fy = (y + 0.5f)*gys - 0.5f;
    const float wt_y = fy - floor(fy);
    const float invwt_y = 1.0f - wt_y;
#pragma unroll
    for (int i = 0; i < 2; ++i, ++x) {
        const float fx = (x + 0.5f)*gxs - 0.5f;
        // less accurate and not really much (or any) faster
        // -----------------
        // const float res = tex2D<float>(d_img_tex, fx, fy);
        // -----------------
        const float4 f = tex2Dgather<float4>(d_img_tex, fx + 0.5f, fy + 0.5f);
        const float wt_x = fx - floor(fx);
        const float invwt_x = 1.0f - wt_x;
        const float xa = invwt_x*f.w + wt_x*f.z;
        const float xb = invwt_x*f.x + wt_x*f.y;
        const float res = 255.0f*(invwt_y*xa + wt_y*xb) + 0.5f;
        // -----------------
        if (x < neww) d_out[y*neww + x] = res;
    }
}
 
void CUDANERP(const cudaTextureObject_t d_img_tex, const int oldw, const int oldh, uint8_t* __restrict const d_out, const uint32_t neww, const uint32_t newh) {
    const float gxs = static_cast<float>(oldw) / static_cast<float>(neww);
    const float gys = static_cast<float>(oldh) / static_cast<float>(newh);
    CUDANERP_kernel<<<{((neww - 1) >> 9) + 1, newh}, 256>>>(d_img_tex, gxs, gys, d_out, neww);
    cudaDeviceSynchronize();
}
 
void CUDALERP(const cudaTextureObject_t d_img_tex, const int oldw, const int oldh, uint8_t* __restrict const d_out, const uint32_t neww, const uint32_t newh) {
    const float gxs = static_cast<float>(oldw) / static_cast<float>(neww);
    const float gys = static_cast<float>(oldh) / static_cast<float>(newh);
    CUDALERP_kernel<<<{((neww - 1) >> 9) + 1, newh}, 256>>>(d_img_tex, gxs, gys, d_out, neww);
    cudaDeviceSynchronize();
}