~chenshijun/analysis.git

New file
			@@ -0,0 +1,1261 @@
			/**
			* Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
			*
			* Please refer to the NVIDIA end user license agreement (EULA) associated
			* with this source code for terms and conditions that govern your use of
			* this software. Any use, reproduction, disclosure, or distribution of
			* this software and related documentation outside the terms of the EULA
			* is strictly prohibited.
			*
			*/

			////////////////////////////////////////////////////////////////////////////////
			// These are CUDA Helper functions for initialization and error checking

			#ifndef HELPER_CUDA_H
			#define HELPER_CUDA_H

			#pragma once

			#include <stdlib.h>
			#include <stdio.h>
			#include <string.h>

			#include <helper_string.h>

			#ifndef EXIT_WAIVED
			#define EXIT_WAIVED 2
			#endif

			// Note: your SDK sample is required to include the proper CUDA header files. Please
			// refer to the CUDA examples for the needed CUDA headers, which may change depending
			// on which CUDA functions are used.

			// CUDA Runtime error messages
			#ifdef __DRIVER_TYPES_H__
			// Returns the enumerator spelling of a CUDA Runtime API status code as a
			// string literal, or "<unknown>" when the value is not one of the codes
			// listed below.
			static const char *_cudaGetErrorEnum(cudaError_t error)
			{
			    // Each entry expands to: case <name>: return "<name>";
			    // The macro is local to this function and removed again after the switch.
			#define HELPER_CUDA_ENUM_CASE(name) case name: return #name;

			    switch (error)
			    {
			        HELPER_CUDA_ENUM_CASE(cudaSuccess)
			        HELPER_CUDA_ENUM_CASE(cudaErrorMissingConfiguration)
			        HELPER_CUDA_ENUM_CASE(cudaErrorMemoryAllocation)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInitializationError)
			        HELPER_CUDA_ENUM_CASE(cudaErrorLaunchFailure)
			        HELPER_CUDA_ENUM_CASE(cudaErrorPriorLaunchFailure)
			        HELPER_CUDA_ENUM_CASE(cudaErrorLaunchTimeout)
			        HELPER_CUDA_ENUM_CASE(cudaErrorLaunchOutOfResources)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidDeviceFunction)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidConfiguration)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidDevice)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidValue)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidPitchValue)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidSymbol)
			        HELPER_CUDA_ENUM_CASE(cudaErrorMapBufferObjectFailed)
			        HELPER_CUDA_ENUM_CASE(cudaErrorUnmapBufferObjectFailed)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidHostPointer)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidDevicePointer)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidTexture)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidTextureBinding)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidChannelDescriptor)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidMemcpyDirection)
			        HELPER_CUDA_ENUM_CASE(cudaErrorAddressOfConstant)
			        HELPER_CUDA_ENUM_CASE(cudaErrorTextureFetchFailed)
			        HELPER_CUDA_ENUM_CASE(cudaErrorTextureNotBound)
			        HELPER_CUDA_ENUM_CASE(cudaErrorSynchronizationError)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidFilterSetting)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidNormSetting)
			        HELPER_CUDA_ENUM_CASE(cudaErrorMixedDeviceExecution)
			        HELPER_CUDA_ENUM_CASE(cudaErrorCudartUnloading)
			        HELPER_CUDA_ENUM_CASE(cudaErrorUnknown)
			        HELPER_CUDA_ENUM_CASE(cudaErrorNotYetImplemented)
			        HELPER_CUDA_ENUM_CASE(cudaErrorMemoryValueTooLarge)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidResourceHandle)
			        HELPER_CUDA_ENUM_CASE(cudaErrorNotReady)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInsufficientDriver)
			        HELPER_CUDA_ENUM_CASE(cudaErrorSetOnActiveProcess)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidSurface)
			        HELPER_CUDA_ENUM_CASE(cudaErrorNoDevice)
			        HELPER_CUDA_ENUM_CASE(cudaErrorECCUncorrectable)
			        HELPER_CUDA_ENUM_CASE(cudaErrorSharedObjectSymbolNotFound)
			        HELPER_CUDA_ENUM_CASE(cudaErrorSharedObjectInitFailed)
			        HELPER_CUDA_ENUM_CASE(cudaErrorUnsupportedLimit)
			        HELPER_CUDA_ENUM_CASE(cudaErrorDuplicateVariableName)
			        HELPER_CUDA_ENUM_CASE(cudaErrorDuplicateTextureName)
			        HELPER_CUDA_ENUM_CASE(cudaErrorDuplicateSurfaceName)
			        HELPER_CUDA_ENUM_CASE(cudaErrorDevicesUnavailable)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidKernelImage)
			        HELPER_CUDA_ENUM_CASE(cudaErrorNoKernelImageForDevice)
			        HELPER_CUDA_ENUM_CASE(cudaErrorIncompatibleDriverContext)
			        HELPER_CUDA_ENUM_CASE(cudaErrorPeerAccessAlreadyEnabled)
			        HELPER_CUDA_ENUM_CASE(cudaErrorPeerAccessNotEnabled)
			        HELPER_CUDA_ENUM_CASE(cudaErrorDeviceAlreadyInUse)
			        HELPER_CUDA_ENUM_CASE(cudaErrorProfilerDisabled)
			        HELPER_CUDA_ENUM_CASE(cudaErrorProfilerNotInitialized)
			        HELPER_CUDA_ENUM_CASE(cudaErrorProfilerAlreadyStarted)
			        HELPER_CUDA_ENUM_CASE(cudaErrorProfilerAlreadyStopped)

			        /* Since CUDA 4.0 */
			        HELPER_CUDA_ENUM_CASE(cudaErrorAssert)
			        HELPER_CUDA_ENUM_CASE(cudaErrorTooManyPeers)
			        HELPER_CUDA_ENUM_CASE(cudaErrorHostMemoryAlreadyRegistered)
			        HELPER_CUDA_ENUM_CASE(cudaErrorHostMemoryNotRegistered)

			        /* Since CUDA 5.0 */
			        HELPER_CUDA_ENUM_CASE(cudaErrorOperatingSystem)
			        HELPER_CUDA_ENUM_CASE(cudaErrorPeerAccessUnsupported)
			        HELPER_CUDA_ENUM_CASE(cudaErrorLaunchMaxDepthExceeded)
			        HELPER_CUDA_ENUM_CASE(cudaErrorLaunchFileScopedTex)
			        HELPER_CUDA_ENUM_CASE(cudaErrorLaunchFileScopedSurf)
			        HELPER_CUDA_ENUM_CASE(cudaErrorSyncDepthExceeded)
			        HELPER_CUDA_ENUM_CASE(cudaErrorLaunchPendingCountExceeded)
			        HELPER_CUDA_ENUM_CASE(cudaErrorNotPermitted)
			        HELPER_CUDA_ENUM_CASE(cudaErrorNotSupported)

			        /* Since CUDA 6.0 */
			        HELPER_CUDA_ENUM_CASE(cudaErrorHardwareStackError)
			        HELPER_CUDA_ENUM_CASE(cudaErrorIllegalInstruction)
			        HELPER_CUDA_ENUM_CASE(cudaErrorMisalignedAddress)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidAddressSpace)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidPc)
			        HELPER_CUDA_ENUM_CASE(cudaErrorIllegalAddress)

			        /* Since CUDA 6.5 */
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidPtx)
			        HELPER_CUDA_ENUM_CASE(cudaErrorInvalidGraphicsContext)
			        HELPER_CUDA_ENUM_CASE(cudaErrorStartupFailure)
			        HELPER_CUDA_ENUM_CASE(cudaErrorApiFailureBase)
			    }

			#undef HELPER_CUDA_ENUM_CASE

			    // Value not covered by any case above.
			    return "<unknown>";
			}
			#endif

			#ifdef __cuda_cuda_h__
			// CUDA Driver API errors
			//
			// Returns the enumerator spelling of a CUresult status code as a string
			// literal, or "<unknown>" when the value is not one of the codes listed below.
			static const char *_cudaGetErrorEnum(CUresult error)
			{
			    // Each entry expands to: case <name>: return "<name>";
			    // The macro is local to this function and removed again after the switch.
			#define HELPER_CUDA_ENUM_CASE(name) case name: return #name;

			    switch (error)
			    {
			        HELPER_CUDA_ENUM_CASE(CUDA_SUCCESS)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_INVALID_VALUE)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_OUT_OF_MEMORY)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_NOT_INITIALIZED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_DEINITIALIZED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_PROFILER_DISABLED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_PROFILER_NOT_INITIALIZED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_PROFILER_ALREADY_STARTED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_PROFILER_ALREADY_STOPPED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_NO_DEVICE)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_INVALID_DEVICE)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_INVALID_IMAGE)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_INVALID_CONTEXT)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_CONTEXT_ALREADY_CURRENT)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_MAP_FAILED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_UNMAP_FAILED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_ARRAY_IS_MAPPED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_ALREADY_MAPPED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_NO_BINARY_FOR_GPU)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_ALREADY_ACQUIRED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_NOT_MAPPED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_NOT_MAPPED_AS_ARRAY)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_NOT_MAPPED_AS_POINTER)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_ECC_UNCORRECTABLE)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_UNSUPPORTED_LIMIT)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_CONTEXT_ALREADY_IN_USE)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_PEER_ACCESS_UNSUPPORTED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_INVALID_PTX)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_INVALID_GRAPHICS_CONTEXT)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_INVALID_SOURCE)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_FILE_NOT_FOUND)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_OPERATING_SYSTEM)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_INVALID_HANDLE)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_NOT_FOUND)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_NOT_READY)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_ILLEGAL_ADDRESS)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_LAUNCH_FAILED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_LAUNCH_TIMEOUT)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_PEER_ACCESS_NOT_ENABLED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_CONTEXT_IS_DESTROYED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_ASSERT)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_TOO_MANY_PEERS)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_HARDWARE_STACK_ERROR)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_ILLEGAL_INSTRUCTION)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_MISALIGNED_ADDRESS)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_INVALID_ADDRESS_SPACE)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_INVALID_PC)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_NOT_PERMITTED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_NOT_SUPPORTED)
			        HELPER_CUDA_ENUM_CASE(CUDA_ERROR_UNKNOWN)
			    }

			#undef HELPER_CUDA_ENUM_CASE

			    // Value not covered by any case above.
			    return "<unknown>";
			}
			#endif

			#ifdef CUBLAS_API_H_
			// cuBLAS API errors
			//
			// Returns the enumerator spelling of a cublasStatus_t value as a string
			// literal, or "<unknown>" when the value is not one of the codes listed below.
			static const char *_cudaGetErrorEnum(cublasStatus_t error)
			{
			    // Each entry expands to: case <name>: return "<name>";
			    // The macro is local to this function and removed again after the switch.
			#define HELPER_CUDA_ENUM_CASE(name) case name: return #name;

			    switch (error)
			    {
			        HELPER_CUDA_ENUM_CASE(CUBLAS_STATUS_SUCCESS)
			        HELPER_CUDA_ENUM_CASE(CUBLAS_STATUS_NOT_INITIALIZED)
			        HELPER_CUDA_ENUM_CASE(CUBLAS_STATUS_ALLOC_FAILED)
			        HELPER_CUDA_ENUM_CASE(CUBLAS_STATUS_INVALID_VALUE)
			        HELPER_CUDA_ENUM_CASE(CUBLAS_STATUS_ARCH_MISMATCH)
			        HELPER_CUDA_ENUM_CASE(CUBLAS_STATUS_MAPPING_ERROR)
			        HELPER_CUDA_ENUM_CASE(CUBLAS_STATUS_EXECUTION_FAILED)
			        HELPER_CUDA_ENUM_CASE(CUBLAS_STATUS_INTERNAL_ERROR)
			        HELPER_CUDA_ENUM_CASE(CUBLAS_STATUS_NOT_SUPPORTED)
			        HELPER_CUDA_ENUM_CASE(CUBLAS_STATUS_LICENSE_ERROR)
			    }

			#undef HELPER_CUDA_ENUM_CASE

			    // Value not covered by any case above.
			    return "<unknown>";
			}
			#endif

			#ifdef _CUFFT_H_
			// cuFFT API errors
			//
			// Returns the enumerator spelling of a cufftResult value as a string
			// literal, or "<unknown>" when the value is not one of the codes listed below.
			static const char *_cudaGetErrorEnum(cufftResult error)
			{
			    // Each entry expands to: case <name>: return "<name>";
			    // The macro is local to this function and removed again after the switch.
			#define HELPER_CUDA_ENUM_CASE(name) case name: return #name;

			    switch (error)
			    {
			        HELPER_CUDA_ENUM_CASE(CUFFT_SUCCESS)
			        HELPER_CUDA_ENUM_CASE(CUFFT_INVALID_PLAN)
			        HELPER_CUDA_ENUM_CASE(CUFFT_ALLOC_FAILED)
			        HELPER_CUDA_ENUM_CASE(CUFFT_INVALID_TYPE)
			        HELPER_CUDA_ENUM_CASE(CUFFT_INVALID_VALUE)
			        HELPER_CUDA_ENUM_CASE(CUFFT_INTERNAL_ERROR)
			        HELPER_CUDA_ENUM_CASE(CUFFT_EXEC_FAILED)
			        HELPER_CUDA_ENUM_CASE(CUFFT_SETUP_FAILED)
			        HELPER_CUDA_ENUM_CASE(CUFFT_INVALID_SIZE)
			        HELPER_CUDA_ENUM_CASE(CUFFT_UNALIGNED_DATA)
			        HELPER_CUDA_ENUM_CASE(CUFFT_INCOMPLETE_PARAMETER_LIST)
			        HELPER_CUDA_ENUM_CASE(CUFFT_INVALID_DEVICE)
			        HELPER_CUDA_ENUM_CASE(CUFFT_PARSE_ERROR)
			        HELPER_CUDA_ENUM_CASE(CUFFT_NO_WORKSPACE)
			        HELPER_CUDA_ENUM_CASE(CUFFT_NOT_IMPLEMENTED)
			        HELPER_CUDA_ENUM_CASE(CUFFT_LICENSE_ERROR)
			    }

			#undef HELPER_CUDA_ENUM_CASE

			    // Value not covered by any case above.
			    return "<unknown>";
			}
			#endif


			#ifdef CUSPARSEAPI
			// cuSPARSE API errors
			//
			// Returns the enumerator spelling of a cusparseStatus_t value as a string
			// literal, or "<unknown>" when the value is not one of the codes listed below.
			static const char *_cudaGetErrorEnum(cusparseStatus_t error)
			{
			    // Each entry expands to: case <name>: return "<name>";
			    // The macro is local to this function and removed again after the switch.
			#define HELPER_CUDA_ENUM_CASE(name) case name: return #name;

			    switch (error)
			    {
			        HELPER_CUDA_ENUM_CASE(CUSPARSE_STATUS_SUCCESS)
			        HELPER_CUDA_ENUM_CASE(CUSPARSE_STATUS_NOT_INITIALIZED)
			        HELPER_CUDA_ENUM_CASE(CUSPARSE_STATUS_ALLOC_FAILED)
			        HELPER_CUDA_ENUM_CASE(CUSPARSE_STATUS_INVALID_VALUE)
			        HELPER_CUDA_ENUM_CASE(CUSPARSE_STATUS_ARCH_MISMATCH)
			        HELPER_CUDA_ENUM_CASE(CUSPARSE_STATUS_MAPPING_ERROR)
			        HELPER_CUDA_ENUM_CASE(CUSPARSE_STATUS_EXECUTION_FAILED)
			        HELPER_CUDA_ENUM_CASE(CUSPARSE_STATUS_INTERNAL_ERROR)
			        HELPER_CUDA_ENUM_CASE(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED)
			    }

			#undef HELPER_CUDA_ENUM_CASE

			    // Value not covered by any case above.
			    return "<unknown>";
			}
			#endif

			#ifdef CUSOLVER_COMMON_H_
			// cuSOLVER API errors
			//
			// Returns the enumerator spelling of a cusolverStatus_t value as a string
			// literal, or "<unknown>" when the value is not one of the codes listed below.
			static const char *_cudaGetErrorEnum(cusolverStatus_t error)
			{
			    // Each entry expands to: case <name>: return "<name>";
			    // Generating the text from the enumerator keeps the two in sync; the
			    // hand-written string for CUSOLVER_STATUS_NOT_SUPPORTED previously carried
			    // a stray trailing space.
			#define HELPER_CUDA_ENUM_CASE(name) case name: return #name;

			    switch (error)
			    {
			        HELPER_CUDA_ENUM_CASE(CUSOLVER_STATUS_SUCCESS)
			        HELPER_CUDA_ENUM_CASE(CUSOLVER_STATUS_NOT_INITIALIZED)
			        HELPER_CUDA_ENUM_CASE(CUSOLVER_STATUS_ALLOC_FAILED)
			        HELPER_CUDA_ENUM_CASE(CUSOLVER_STATUS_INVALID_VALUE)
			        HELPER_CUDA_ENUM_CASE(CUSOLVER_STATUS_ARCH_MISMATCH)
			        HELPER_CUDA_ENUM_CASE(CUSOLVER_STATUS_MAPPING_ERROR)
			        HELPER_CUDA_ENUM_CASE(CUSOLVER_STATUS_EXECUTION_FAILED)
			        HELPER_CUDA_ENUM_CASE(CUSOLVER_STATUS_INTERNAL_ERROR)
			        HELPER_CUDA_ENUM_CASE(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED)
			        HELPER_CUDA_ENUM_CASE(CUSOLVER_STATUS_NOT_SUPPORTED)
			        HELPER_CUDA_ENUM_CASE(CUSOLVER_STATUS_ZERO_PIVOT)
			        HELPER_CUDA_ENUM_CASE(CUSOLVER_STATUS_INVALID_LICENSE)
			    }

			#undef HELPER_CUDA_ENUM_CASE

			    // Value not covered by any case above.
			    return "<unknown>";
			}
			#endif

			#ifdef CURAND_H_
			// cuRAND API errors
			//
			// Returns the enumerator spelling of a curandStatus_t value as a string
			// literal, or "<unknown>" when the value is not one of the codes listed below.
			static const char *_cudaGetErrorEnum(curandStatus_t error)
			{
			    // Each entry expands to: case <name>: return "<name>";
			    // The macro is local to this function and removed again after the switch.
			#define HELPER_CUDA_ENUM_CASE(name) case name: return #name;

			    switch (error)
			    {
			        HELPER_CUDA_ENUM_CASE(CURAND_STATUS_SUCCESS)
			        HELPER_CUDA_ENUM_CASE(CURAND_STATUS_VERSION_MISMATCH)
			        HELPER_CUDA_ENUM_CASE(CURAND_STATUS_NOT_INITIALIZED)
			        HELPER_CUDA_ENUM_CASE(CURAND_STATUS_ALLOCATION_FAILED)
			        HELPER_CUDA_ENUM_CASE(CURAND_STATUS_TYPE_ERROR)
			        HELPER_CUDA_ENUM_CASE(CURAND_STATUS_OUT_OF_RANGE)
			        HELPER_CUDA_ENUM_CASE(CURAND_STATUS_LENGTH_NOT_MULTIPLE)
			        HELPER_CUDA_ENUM_CASE(CURAND_STATUS_DOUBLE_PRECISION_REQUIRED)
			        HELPER_CUDA_ENUM_CASE(CURAND_STATUS_LAUNCH_FAILURE)
			        HELPER_CUDA_ENUM_CASE(CURAND_STATUS_PREEXISTING_FAILURE)
			        HELPER_CUDA_ENUM_CASE(CURAND_STATUS_INITIALIZATION_FAILED)
			        HELPER_CUDA_ENUM_CASE(CURAND_STATUS_ARCH_MISMATCH)
			        HELPER_CUDA_ENUM_CASE(CURAND_STATUS_INTERNAL_ERROR)
			    }

			#undef HELPER_CUDA_ENUM_CASE

			    // Value not covered by any case above.
			    return "<unknown>";
			}
			#endif

			#ifdef NV_NPPIDEFS_H
			// NPP API errors
			//
			// Returns a string literal naming an NppStatus value, or "<unknown>" when the
			// value is not one of the codes listed below. Which enumerators are compiled
			// in depends on NPP_VERSION_MAJOR / NPP_VERSION_MINOR.
			static const char *_cudaGetErrorEnum(NppStatus error)
			{
			    // HELPER_CUDA_ENUM_CASE(name)          -> case name: return "name";
			    // HELPER_CUDA_ENUM_CASE_AS(name, text) -> case name: return text;
			    // The second form is used where the returned text differs from the
			    // enumerator's own spelling. Both macros are removed after the switch.
			#define HELPER_CUDA_ENUM_CASE(name) case name: return #name;
			#define HELPER_CUDA_ENUM_CASE_AS(name, text) case name: return text;

			    switch (error)
			    {
			        HELPER_CUDA_ENUM_CASE(NPP_NOT_SUPPORTED_MODE_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_ROUND_MODE_NOT_SUPPORTED_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_RESIZE_NO_OPERATION_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY)

			#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000

			        // Older, abbreviated enumerators; most are reported with the longer text.
			        HELPER_CUDA_ENUM_CASE_AS(NPP_BAD_ARG_ERROR, "NPP_BAD_ARGUMENT_ERROR")
			        HELPER_CUDA_ENUM_CASE_AS(NPP_COEFF_ERROR, "NPP_COEFFICIENT_ERROR")
			        HELPER_CUDA_ENUM_CASE_AS(NPP_RECT_ERROR, "NPP_RECTANGLE_ERROR")
			        HELPER_CUDA_ENUM_CASE_AS(NPP_QUAD_ERROR, "NPP_QUADRANGLE_ERROR")
			        HELPER_CUDA_ENUM_CASE_AS(NPP_MEM_ALLOC_ERR, "NPP_MEMORY_ALLOCATION_ERROR")
			        HELPER_CUDA_ENUM_CASE_AS(NPP_HISTO_NUMBER_OF_LEVELS_ERROR, "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR")
			        HELPER_CUDA_ENUM_CASE(NPP_INVALID_INPUT)
			        HELPER_CUDA_ENUM_CASE(NPP_POINTER_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_WARNING)
			        HELPER_CUDA_ENUM_CASE(NPP_ODD_ROI_WARNING)
			#else

			        // These are for CUDA 5.5 or higher
			        HELPER_CUDA_ENUM_CASE(NPP_BAD_ARGUMENT_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_COEFFICIENT_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_RECTANGLE_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_QUADRANGLE_ERROR)
			        HELPER_CUDA_ENUM_CASE_AS(NPP_MEMORY_ALLOCATION_ERR, "NPP_MEMORY_ALLOCATION_ERROR")
			        HELPER_CUDA_ENUM_CASE(NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_INVALID_HOST_POINTER_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_INVALID_DEVICE_POINTER_ERROR)
			#endif

			        HELPER_CUDA_ENUM_CASE(NPP_LUT_NUMBER_OF_LEVELS_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_TEXTURE_BIND_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_WRONG_INTERSECTION_ROI_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_NOT_EVEN_STEP_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_INTERPOLATION_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_RESIZE_FACTOR_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR)

			#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000

			        HELPER_CUDA_ENUM_CASE(NPP_MEMFREE_ERR)
			        HELPER_CUDA_ENUM_CASE(NPP_MEMSET_ERR)
			        HELPER_CUDA_ENUM_CASE_AS(NPP_MEMCPY_ERR, "NPP_MEMCPY_ERROR")
			        HELPER_CUDA_ENUM_CASE(NPP_MIRROR_FLIP_ERR)
			#else

			        HELPER_CUDA_ENUM_CASE(NPP_MEMFREE_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_MEMSET_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_MEMCPY_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_MIRROR_FLIP_ERROR)
			#endif

			        HELPER_CUDA_ENUM_CASE(NPP_ALIGNMENT_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_STEP_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_SIZE_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_NULL_POINTER_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_CUDA_KERNEL_EXECUTION_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_NOT_IMPLEMENTED_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_SUCCESS)
			        HELPER_CUDA_ENUM_CASE(NPP_WRONG_INTERSECTION_QUAD_WARNING)
			        HELPER_CUDA_ENUM_CASE(NPP_MISALIGNED_DST_ROI_WARNING)
			        HELPER_CUDA_ENUM_CASE(NPP_AFFINE_QUAD_INCORRECT_WARNING)
			        HELPER_CUDA_ENUM_CASE(NPP_DOUBLE_SIZE_WARNING)
			        HELPER_CUDA_ENUM_CASE(NPP_WRONG_INTERSECTION_ROI_WARNING)

			#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
			        /* These are 6.0 or higher */
			        HELPER_CUDA_ENUM_CASE(NPP_LUT_PALETTE_BITSIZE_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_ZC_MODE_NOT_SUPPORTED_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_QUALITY_INDEX_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_CHANNEL_ORDER_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_ZERO_MASK_VALUE_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_NUMBER_OF_CHANNELS_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_COI_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_DIVISOR_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_CHANNEL_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_STRIDE_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_ANCHOR_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_MASK_SIZE_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_MOMENT_00_ZERO_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_THRESHOLD_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_CONTEXT_MATCH_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_FFT_FLAG_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_FFT_ORDER_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_SCALE_RANGE_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_DATA_TYPE_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_OUT_OFF_RANGE_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_DIVIDE_BY_ZERO_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_RANGE_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_NO_MEMORY_ERROR)
			        HELPER_CUDA_ENUM_CASE(NPP_ERROR_RESERVED)
			        HELPER_CUDA_ENUM_CASE(NPP_NO_OPERATION_WARNING)
			        HELPER_CUDA_ENUM_CASE(NPP_DIVIDE_BY_ZERO_WARNING)
			#endif

			    }

			#undef HELPER_CUDA_ENUM_CASE_AS
			#undef HELPER_CUDA_ENUM_CASE

			    // Value not covered by any case above.
			    return "<unknown>";
			}
			#endif

			// DEVICE_RESET is the statement the error helpers in this header emit just
			// before calling exit(). A definition supplied before this point is kept
			// as-is (both branches are guarded by #ifndef).
			#ifdef __DRIVER_TYPES_H__
			#ifndef DEVICE_RESET
			// __DRIVER_TYPES_H__ is defined: expand to a cudaDeviceReset() call.
			// The trailing semicolon is part of the macro, so use sites write it bare.
			#define DEVICE_RESET cudaDeviceReset();
			#endif
			#else
			#ifndef DEVICE_RESET
			// __DRIVER_TYPES_H__ is not defined: expand to nothing.
			#define DEVICE_RESET
			#endif
			#endif

			// Reports a failed API call and terminates the process.
			//
			// result : status value returned by the call; any value that converts to
			//          true is treated as a failure.
			// func   : text of the expression that produced `result` (printed verbatim).
			// file   : source file name of the call site.
			// line   : source line number of the call site.
			//
			// On failure this prints file, line, the numeric code, the name from the
			// matching _cudaGetErrorEnum() overload and `func` to stderr, expands
			// DEVICE_RESET, and calls exit(EXIT_FAILURE). It returns normally otherwise.
			template< typename T >
			void check(T result, char const *const func, const char *const file, int const line)
			{
			    if (result)
			    {
			        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
			                file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
			        // Make sure we call CUDA Device Reset before exiting
			        DEVICE_RESET
			        exit(EXIT_FAILURE);
			    }
			}

			#ifdef __DRIVER_TYPES_H__
			// checkCudaErrors(val): evaluates `val` and forwards it to check() together
			// with the stringized expression (#val) and the call site's __FILE__ and
			// __LINE__, so a failing call is reported with its own source text and location.
			#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ )

			// getLastCudaError(msg): forwards `msg` plus the call site's __FILE__ and
			// __LINE__ to __getLastCudaError(), which queries cudaGetLastError().
			#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)

			// Queries cudaGetLastError() and terminates the process if it reports an error.
			//
			// errorMessage : caller-supplied context text included in the report.
			// file         : source file name of the call site.
			// line         : source line number of the call site.
			//
			// When the returned status is not cudaSuccess, this prints file, line,
			// errorMessage, the numeric code and cudaGetErrorString() text to stderr,
			// expands DEVICE_RESET, and calls exit(EXIT_FAILURE).
			inline void __getLastCudaError(const char *errorMessage, const char *file, const int line)
			{
			    cudaError_t err = cudaGetLastError();

			    if (cudaSuccess != err)
			    {
			        fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
			                file, line, errorMessage, (int)err, cudaGetErrorString(err));
			        DEVICE_RESET
			        exit(EXIT_FAILURE);
			    }
			}
			#endif

			// MAX(a, b): yields the larger of the two arguments. Every use of an
			// argument is parenthesized so operands containing lower-precedence
			// operators (e.g. `x & y`) are compared as a whole. Arguments are still
			// evaluated more than once, so avoid passing expressions with side effects.
			#ifndef MAX
			#define MAX(a,b) (((a) > (b)) ? (a) : (b))
			#endif

			// Float To Int conversion
			//
			// Rounds `value` to the nearest integer with halves going away from zero:
			// 0.5 is added for non-negative inputs and subtracted for negative ones,
			// then the result is truncated by the cast to int.
			inline int ftoi(float value)
			{
			    // The offset is a double literal, so the addition is carried out in double.
			    const double halfTowardSign = (value >= 0) ? 0.5 : -0.5;
			    return (int)(value + halfTowardSign);
			}

			// Beginning of GPU Architecture definitions
			//
			// Returns the number of CUDA cores per multiprocessor for the given SM
			// (compute capability) version.
			//
			// major, minor : SM version, e.g. (3, 5) for SM 3.5.
			//
			// If the version is not in the table, a notice is printed to stdout and the
			// core count of the last table entry is returned so callers can keep running.
			inline int _ConvertSMVer2Cores(int major, int minor)
			{
			    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
			    typedef struct
			    {
			        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
			        int Cores;
			    } sSMtoCores;

			    // Built once; the { -1, -1 } entry terminates the list.
			    static const sSMtoCores nGpuArchCoresPerSM[] =
			    {
			        { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
			        { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
			        { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
			        { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
			        { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
			        { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
			        { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
			        { 0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class
			        { 0x53, 128}, // Maxwell Generation (SM 5.3)
			        { 0x60, 64 }, // Pascal Generation (SM 6.0)
			        { 0x61, 128}, // Pascal Generation (SM 6.1)
			        { 0x62, 128}, // Pascal Generation (SM 6.2)
			        { 0x70, 64 }, // Volta Generation (SM 7.0)
			        { 0x72, 64 }, // Volta Generation (SM 7.2)
			        { 0x75, 64 }, // Turing Generation (SM 7.5)
			        { 0x80, 64 }, // Ampere Generation (SM 8.0)
			        { 0x86, 128}, // Ampere Generation (SM 8.6)
			        { 0x87, 128}, // Ampere Generation (SM 8.7)
			        { 0x89, 128}, // Ada Generation (SM 8.9)
			        { 0x90, 128}, // Hopper Generation (SM 9.0)
			        { -1, -1 }
			    };

			    // Same 0xMm packing as the table keys.
			    const int smKey = (major << 4) + minor;
			    int index = 0;

			    while (nGpuArchCoresPerSM[index].SM != -1)
			    {
			        if (nGpuArchCoresPerSM[index].SM == smKey)
			        {
			            return nGpuArchCoresPerSM[index].Cores;
			        }

			        index++;
			    }

			    // If we don't find the values, we default to the last real entry
			    // (index now points at the terminator, so index-1 is the newest architecture).
			    printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores);
			    return nGpuArchCoresPerSM[index-1].Cores;
			}
			// end of GPU Architecture definitions

			#ifdef __CUDA_RUNTIME_H__
			// General GPU Device CUDA Initialization
			//
			// Validates `devID` against the devices reported by cudaGetDeviceCount() and
			// makes it the current device with cudaSetDevice().
			//
			// devID : requested device ordinal; a negative value is treated as 0.
			//
			// Returns the selected device ordinal on success, -devID when the ordinal is
			// beyond the last device, or -1 when the device is in prohibited compute mode.
			// The process exits if there are no CUDA devices, if the device reports a
			// major compute capability below 1, or if any checked CUDA call fails.
			inline int gpuDeviceInit(int devID)
			{
			    int device_count;
			    checkCudaErrors(cudaGetDeviceCount(&device_count));

			    if (device_count == 0)
			    {
			        fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
			        exit(EXIT_FAILURE);
			    }

			    if (devID < 0)
			    {
			        devID = 0;
			    }

			    if (devID > device_count-1)
			    {
			        fprintf(stderr, "\n");
			        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count);
			        fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID);
			        fprintf(stderr, "\n");
			        // devID >= 1 here, so the negated value signals failure to the caller.
			        return -devID;
			    }

			    cudaDeviceProp deviceProp;
			    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

			    if (deviceProp.computeMode == cudaComputeModeProhibited)
			    {
			        fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
			        return -1;
			    }

			    if (deviceProp.major < 1)
			    {
			        fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
			        exit(EXIT_FAILURE);
			    }

			    checkCudaErrors(cudaSetDevice(devID));
			    // The device name is printed inside a balanced pair of double quotes.
			    printf("gpuDeviceInit() CUDA Device [%d]: \"%s\"\n", devID, deviceProp.name);

			    return devID;
			}

			// This function returns the best GPU (with maximum GFLOPS)
			//
			// Scores each device that is not in prohibited compute mode as
			//   multiProcessorCount * cores-per-SM * clockRate
			// and returns the ordinal of the highest-scoring one. When the highest major
			// SM version seen is above 2, only devices with that major version are
			// eligible. Returns 0 if no device ever beats the initial score of 0.
			//
			// The process exits if there are no CUDA devices, if every device is in
			// prohibited compute mode, or if any checked CUDA call fails.
			inline int gpuGetMaxGflopsDeviceId()
			{
			    int current_device = 0, sm_per_multiproc = 0;
			    int max_perf_device = 0;
			    int device_count = 0, best_SM_arch = 0;
			    int devices_prohibited = 0;

			    unsigned long long max_compute_perf = 0;
			    cudaDeviceProp deviceProp;

			    checkCudaErrors(cudaGetDeviceCount(&device_count));

			    if (device_count == 0)
			    {
			        fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n");
			        exit(EXIT_FAILURE);
			    }

			    // Find the best major SM Architecture GPU device
			    while (current_device < device_count)
			    {
			        // Checked so a failed query cannot leave stale data in deviceProp.
			        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device));

			        // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
			        if (deviceProp.computeMode != cudaComputeModeProhibited)
			        {
			            if (deviceProp.major > 0 && deviceProp.major < 9999)
			            {
			                if (deviceProp.major > best_SM_arch)
			                {
			                    best_SM_arch = deviceProp.major;
			                }
			            }
			        }
			        else
			        {
			            devices_prohibited++;
			        }

			        current_device++;
			    }

			    if (devices_prohibited == device_count)
			    {
			        fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have compute mode prohibited.\n");
			        exit(EXIT_FAILURE);
			    }

			    // Find the best CUDA capable GPU device
			    current_device = 0;

			    while (current_device < device_count)
			    {
			        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device));

			        // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
			        if (deviceProp.computeMode != cudaComputeModeProhibited)
			        {
			            // A device reporting version 9999.9999 is scored with one core per SM.
			            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
			            {
			                sm_per_multiproc = 1;
			            }
			            else
			            {
			                sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
			            }

			            // Widen before multiplying so the product is computed in 64 bits.
			            unsigned long long compute_perf = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;

			            if (compute_perf > max_compute_perf)
			            {
			                // If we find GPU with SM major > 2, search only these
			                if (best_SM_arch > 2)
			                {
			                    // If our device==best_SM_arch, choose this, or else pass
			                    if (deviceProp.major == best_SM_arch)
			                    {
			                        max_compute_perf = compute_perf;
			                        max_perf_device = current_device;
			                    }
			                }
			                else
			                {
			                    max_compute_perf = compute_perf;
			                    max_perf_device = current_device;
			                }
			            }
			        }

			        ++current_device;
			    }

			    return max_perf_device;
			}


			// Initialization code to find the best CUDA Device
			inline int findCudaDevice(int argc, const char **argv)
			{
			cudaDeviceProp deviceProp;
			int devID = 0;

			// If the command-line has a device number specified, use it
			if (checkCmdLineFlag(argc, argv, "device"))
			{
			devID = getCmdLineArgumentInt(argc, argv, "device=");

			if (devID < 0)
			{
			printf("Invalid command line parameter\n ");
			exit(EXIT_FAILURE);
			}
			else
			{
			devID = gpuDeviceInit(devID);

			if (devID < 0)
			{
			printf("exiting...\n");
			exit(EXIT_FAILURE);
			}
			}
			}
			else
			{
			// Otherwise pick the device with highest Gflops/s
			devID = gpuGetMaxGflopsDeviceId();
			checkCudaErrors(cudaSetDevice(devID));
			checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
			printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
			}

			return devID;
			}

			// General check for CUDA GPU SM Capabilities
			//
			// major_version, minor_version : minimum compute capability required.
			//
			// Queries the current device (cudaGetDevice) and returns true when its
			// compute capability is at least major_version.minor_version, false
			// otherwise. A one-line summary is printed to stdout in both cases. The
			// process exits if either checked CUDA call fails.
			inline bool checkCudaCapabilities(int major_version, int minor_version)
			{
			    cudaDeviceProp deviceProp;
			    deviceProp.major = 0;
			    deviceProp.minor = 0;
			    int dev;

			    checkCudaErrors(cudaGetDevice(&dev));
			    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));

			    // Sufficient if the major version is newer, or equal with a minor version
			    // that is at least the requested one.
			    if ((deviceProp.major > major_version) ||
			        (deviceProp.major == major_version && deviceProp.minor >= minor_version))
			    {
			        printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
			        return true;
			    }
			    else
			    {
			        printf(" No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version);
			        return false;
			    }
			}
			#endif

			// end of CUDA Helper Functions


			#endif