/tomo/pyhst

To get this branch, use:
bzr branch http://darksoft.org/webbzr/tomo/pyhst
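
For example, to branch into a local directory and keep it updated later (assuming bzr/Breezy is installed):

bzr branch http://darksoft.org/webbzr/tomo/pyhst pyhst
cd pyhst
bzr pull
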
//#define HST_AUTOGEN
#define HST_HEATUP
//#define HST_MEASURE_GPUBOOST

#ifdef HST_AUTOGEN
# include "hst_cuda_defines_gen.h"
#else
//#define HST_TEX_KERNEL			//!< Enforce ppt1 tex-based kernel
//#define HST_BASE_KERNEL			//!< Enforce simplest kernel
//#define HST_HYBRID				//!< Use Tex & Linear kernels in parallel. Doesn't help on Fermi
//#define HST_HALF_MODE				//!< Careful: only NN interpolation using the tex engine (i.e. no oversampling, and NN in tex mode)
#endif


#define HST_CUDA_ARRAY				//!< Use cudaArray instead of binding textures

#define HST_BASE_REMAP
#define HST_BASE_PPT 2
#define HST_LINEAR_KERNEL			//!< Enforce linear kernel
//#define HST_LINEAR_BASE
#define HST_LINEAR_MPLINEAR
#define HST_HALF_CACHE


/*
#if (__CUDA_ARCH__ >= 500)
# undef HST_OPTIMIZE_KEPLER
# define HST_OPTIMIZE_KEPLER 5
#elif (__CUDA_ARCH__ >= 300)
# undef HST_OPTIMIZE_KEPLER
# define HST_OPTIMIZE_KEPLER 3
#elif (__CUDA_ARCH__ >= 100)
# undef HST_OPTIMIZE_KEPLER
#else
#endif
*/

#if (SLICE_BLOCK < 4)||!defined(HST_LINEAR_MPLINEAR)||defined(HST_HYBRID)
# undef HST_HALF_MODE
#endif

#ifndef HST_HALF_MODE
# undef HST_HALF_CACHE
#endif

#ifdef HST_HALF_CACHE
# define HST_FANCY_ROUND			//!< Good on slice1, but bad on slice4 (without it, things somehow work very badly if 4+ unrolling of the p-loop is requested)
#endif

#if defined(PYHST_RECON_BENCHMARK)||!defined(PYHST_ASTRA_SCALING)
# define HST_FILTER2				//!< YES: Filter 2 projections at once (insignificantly affects the scaling, so we keep it off for quality benchmarking)
#endif
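
/* A plausible reading of HST_FILTER2 (an assumption, not taken from the kernels):
   the classic trick of filtering two real projection rows with one complex FFT.
   Because the ramp filter H[k] is real and even, filtering is linear and real, so
   two rows packed as real/imaginary parts stay independent:
       IFFT(H * FFT(x + i*y)) = filter(x) + i*filter(y).
   A minimal sketch; filter_two_rows, apply_filter and d_filter are hypothetical. */
#if 0
#include <cufft.h>

__global__ void apply_filter(cufftComplex *spec, const float *d_filter, int n) {
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k < n) {
        spec[k].x *= d_filter[k];	/* real, even filter scales both parts */
        spec[k].y *= d_filter[k];
    }
}

static void filter_two_rows(cufftHandle plan /* C2C, size n */, cufftComplex *d_rows,
                            const float *d_filter, int n) {
    cufftExecC2C(plan, d_rows, d_rows, CUFFT_FORWARD);
    apply_filter<<<(n + 255) / 256, 256>>>(d_rows, d_filter, n);
    cufftExecC2C(plan, d_rows, d_rows, CUFFT_INVERSE);	/* caller rescales by 1/n */
}
#endif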


#define HST_SET_BOUNDS
#define HST_LINEAR_BLOCK 16			//! 16! 8 is slow (other numbers are not supported; enforce SQUARE_PPT)

#if defined(HST_LINEAR_BASE)
# define HST_LINEAR_PPT 2			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
# define HST_OVERS_PPT 2			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
# define HST_NN_PPT 2
#elif defined(HST_LINEAR_MPLINEAR)&&(HST_OPTIMIZE_KEPLER > 4)
# if ((SLICE_BLOCK > 1)&&(defined(HST_HYBRID)))||(SLICE_BLOCK > 2)
#  define HST_LINEAR_PPT 2			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
# else
#  define HST_LINEAR_PPT 4			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
# endif
# if SLICE_BLOCK < 4
#  define HST_OVERS_PPT 4			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
#  define HST_NN_PPT 4
# else
#  define HST_OVERS_PPT 2			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
#  define HST_NN_PPT 2
# endif
#elif defined(HST_LINEAR_MPLINEAR)&&defined(HST_OPTIMIZE_KEPLER)
# define HST_LINEAR_PPT 2			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
# if SLICE_BLOCK == 1
#  define HST_OVERS_PPT 4			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
#  define HST_NN_PPT 4
# else
#  define HST_OVERS_PPT 2			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
#  define HST_NN_PPT 2
# endif
#else
# define HST_LINEAR_PPT 2			//! 2! The default PPT for Linear kernels (possible: 1, but would be slower for sure). 4 is not possible in SIMD mode
# define HST_NN_PPT 2
#endif
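
/* What the PPT knobs above appear to control (an assumption based on the
   comments): each thread accumulates a PPT x PPT register tile of output
   pixels, so one sin/cos fetch per projection is amortized over PPT*PPT
   updates. A minimal sketch; c_sin, c_cos, c_ofst and tex_fetch are
   hypothetical stand-ins for the real constant tables and texture read. */
#if 0
__global__ void bp_ppt_sketch(float *slice, int num_x, int num_proj) {
    const int bx = (blockIdx.x * blockDim.x + threadIdx.x) * HST_LINEAR_PPT;
    const int by = (blockIdx.y * blockDim.y + threadIdx.y) * HST_LINEAR_PPT;
    float acc[HST_LINEAR_PPT][HST_LINEAR_PPT] = {{0.f}};

    for (int p = 0; p < num_proj; p++) {
        const float cs = c_cos[p], sn = c_sin[p], ofst = c_ofst[p];
#pragma unroll
        for (int i = 0; i < HST_LINEAR_PPT; i++)
#pragma unroll
            for (int j = 0; j < HST_LINEAR_PPT; j++) {
                const float h = ofst + (bx + j) * cs - (by + i) * sn;
                acc[i][j] += tex_fetch(p, h);	/* linear interpolation along h */
            }
    }
    for (int i = 0; i < HST_LINEAR_PPT; i++)
        for (int j = 0; j < HST_LINEAR_PPT; j++)
            slice[(by + i) * num_x + (bx + j)] = acc[i][j];
}
#endif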

#if defined(HST_LINEAR_BASE)
# define HST_LINEAR_ASSYMETRY 1			//! 1!
#elif defined(HST_HYBRID)
# define HST_LINEAR_ASSYMETRY 1			//! 1!
#elif defined(HST_OPTIMIZE_KEPLER)&&defined(HST_LINEAR_MPLINEAR)
# if ((SLICE_BLOCK <= 2)&&(HST_LINEAR_PPT < 4))
#  define HST_LINEAR_ASSYMETRY 2			//! 1!
# else
#  define HST_LINEAR_ASSYMETRY 1			//! 1!
# endif
#else
# define HST_LINEAR_ASSYMETRY 1			//! 1!
#endif

#if defined(HST_LINEAR_BASE)
# define HST_OVERS_ASSYMETRY 1			//! 1!
# define HST_NN_ASSYMETRY 1		        //! 1!
#elif (HST_OPTIMIZE_KEPLER)&&(SLICE_BLOCK == 2)
# if HST_OVERS_PPT < 4
#  define HST_OVERS_ASSYMETRY 2			//! 1!
# else
#  define HST_OVERS_ASSYMETRY 1			//! 1!
# endif
# if HST_NN_PPT < 4
#  define HST_NN_ASSYMETRY 2			//! 1!
# else
#  define HST_NN_ASSYMETRY 1		        //! 1!
# endif
#else 
# define HST_OVERS_ASSYMETRY 1			//! 1!
# define HST_NN_ASSYMETRY 1			//! 1!
#endif



#define HST_NEWTEX4                          	//!< Asymmetric: process 4 projections and 64 bins during each step
#define HST_NEWTEX_PROJ_MAJOR			//!< YES. Seems a good idea for better locality and non-conflicting const memory loads
//#define HST_NEWTEX_REUSE_BUFS			//!< YES. Re-use the coalescing shared memory buffer for CACHE_CONST
//#define HST_NEWTEX_DIRECT_WRITE		//!< YES if we are short on shmem. Generally, only useful for slice4 mode on Kepler and later...
//#define HST_FLOAT_LOOPS			//!< On sm_35 only. sm_30 - bad. Use float loops to optimize register usage (mostly bad)
//#define HST_SQUARE_PPT			//!< NO. Little effect on performance (slightly reduces register usage, but doesn't affect computations as the loops are unrolled anyway)

#ifdef HST_HYBRID
//# define HST_HYBRID_BALANCE_ALU 4
//# define HST_HYBRID_NEWTEX
# define HST_CACHE_SIN			        //!< YES. It is a good idea on all NVIDIA as shmem is faster than cmem. Even on shmem-bound kernels
//# define HST_C_TRIG				//!< Slight speedup due to less fragmented cmem in main part (but mostly useless)
# ifdef HST_HYBRID_NEWTEX
#  define HST_NEWTEX4                             //!< Asymmetric: process 4 projections and 64 bins during each step.
#  define HST_CACHE_CONST			//!< YES! In NewTex, cache constants in shared memory
#  if SLICE_BLOCK == 1
#   define HST_NEWTEX4_PPT HST_LINEAR_PPT	//!< NO. Do partial PPT directly in the Hybrid kernel.
#  endif
//#  define HST_NO_OFFSETS				//!< IGNORE THIRD c_all parameter (assume 0, no offsets)
//#  define HST_L1_PROJ_BLOCK 4096		//!< UNUSED
# endif
#endif

#if (SLICE_BLOCK > 1)||((HST_OPTIMIZE_KEPLER > 4)&&(SLICE_BLOCK > 1))
# define HST_NEWTEX4                           //!< Asymmetric: process 4 projections and 64 bins during each step.
#endif
#if defined(HST_NEWTEX4)&&((!defined(HST_OPTIMIZE_KEPLER))||(HST_OPTIMIZE_KEPLER < 5))
# define HST_CACHE_CONST			//!< YES! At least for Hybrid mode (but not on Pascal)
#endif
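
/* A minimal sketch of what HST_CACHE_CONST suggests (an assumption: staging
   per-projection constants from __constant__ into __shared__ once per block).
   Constant memory broadcasts are only fast when a warp reads one address, so
   when threads index different projections a shared-memory copy avoids the
   serialized cmem replays. Table names and sizes are hypothetical (128 matches
   HST_NEWTEX_CACHE_BLOCK below, 4096 matches MAXNPROJECTIONS below). */
#if 0
__constant__ float c_sin[4096], c_cos[4096];

__global__ void cache_const_sketch(int proj0, int nproj) {
    __shared__ float s_sin[128], s_cos[128];

    for (int i = threadIdx.y * blockDim.x + threadIdx.x;
         i < nproj; i += blockDim.x * blockDim.y) {
        s_sin[i] = c_sin[proj0 + i];
        s_cos[i] = c_cos[proj0 + i];
    }
    __syncthreads();
    /* ... the main loop now reads s_sin/s_cos instead of c_sin/c_cos ... */
}
#endif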


    // Linear Kernel overrides
//#define HST_LMAP                                //!< Just for testing, linear mapping.
#define HST_ZMAP                                //!< On Pascal/float2 (and other platforms benefiting from 4-thread shmem locality)
#define HST_NEWCACHE				//!< YES! New way of caching that reduces the number of shmem reads/writes
//#define HST_NEWCACHE_UNPAD			//!< Add if and remove padding in the shmem cache (low efficiency)
#define HST_GMEM_CONST				//!< YES! Store constants in global memory (used only in mplinear with MINH caching)
//#define HST_INNER_GMEM				//!< NO! Also access GMem from inner loop (Slow)
//# define HST_PRECOMPUTE_OFFSETS		//!< IRRELEVANT! Just move some index computations out of the inner loop (used only with MINH mode on)
//# define HST_MEM_FETCHES			//!< NO! Use sinogram array instead of textures in Linear kernels. Doesn't help in practice even for shmem-bound kernels
//# define HST_TUNE_SHMEM			//!< NO! Buggy! Try to optimize shmem at the cost of extra computations (we are compute-bound for float1 and it will be enforced for float2-4 anyway; not needed)
#define HST_XCACHE_EARLY_Y                      //!< YES! Must. The other mode is not unrolled properly and causes large register overhead
#define HST_XCACHE_LD128 2                      //!< Store SUBH_X cache as groups of float2/float4 numbers to reduce the number of memory instructions
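
/* A minimal sketch of the HST_XCACHE_LD128 idea (an assumption about the exact
   cache layout): reading the shared-memory cache through float4 issues one
   128-bit load where four 32-bit loads would otherwise be needed. */
#if 0
__global__ void ld128_sketch(float *out) {
    __shared__ __align__(16) float s_cache[16 * 64];
    /* ... fill s_cache cooperatively, then __syncthreads() ... */
    float acc = 0.f;
    for (int k = 0; k < 16; k++) {
        const float4 v = *(const float4 *)&s_cache[threadIdx.y * 64 + 4 * k];
        acc += v.x + v.y + v.z + v.w;	/* one LD.128 instead of four LD.32 */
    }
    out[threadIdx.y * blockDim.x + threadIdx.x] = acc;
}
#endif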


#if defined(HST_LINEAR_BASE)
# if defined(HST_OPTIMIZE_KEPLER)
#  define HST_CREATE_TEXTURE			//!< Create a texture object instead of binding a texture reference. Seems faster, but only supported since Kepler
#  define HST_SET_KEPLER_BOUNDS
#  if HST_OPTIMIZE_KEPLER > 4
#   define HST_WARPLINE				//!< Seems better without it on Fermi
#  else
#   define HST_SHMEM64				//!< Optimize caching in shared memory (avoid bank conflicts) in case of oversampling
#  endif
# else
#  undef HST_SHMEM64				//!< Optimize caching in shared memory (avoid bank conflicts) in case of oversampling
# endif
# define HST_CACHE_SUBH				//!< YES! Must on NVIDIA (this is what makes it different from AMD)
# define HST_CACHE_Y				//!< YES! Must. If disabled: extra computations, but also shmem bank conflicts
#elif defined(HST_OPTIMIZE_KEPLER)
# define HST_CREATE_TEXTURE			//!< Create a texture object instead of binding a texture reference. Seems faster, but only supported since Kepler
# define HST_SHFL_SUM				//!< Important as ShMem is shared with L1 (in case of spillage); see the warp-shuffle sketch after this #if ladder
# define HST_SET_KEPLER_BOUNDS

# if defined(HST_CACHE_CONST)&&defined(HST_NEWTEX4)&&(SLICE_BLOCK < 4)
#  if  HST_OPTIMIZE_KEPLER < 4
#   define HST_CCACHE_KEPLER			//!< KEPLER ONLY (NOT NEWER)! Optimizes 64-bit shmem accesses without adding too much shared memory
//#   define HST_CCACHE_LD128			//!< KEPLER ONLY (NOT NEWER)! Optimizes 64-bit shmem accesses with HST_CACHE_CONST & HST_NEWTEX4
#  endif
# endif
# if (defined(HST_TEX_KERNEL)&&defined(HST_SET_BOUNDS)&&(SLICE_BLOCK > 1)&&!defined(HST_NEWTEX4))
#  define HST_KEPLER_PREFER_L1
//#  define HST_KEPLER_PREFER_L1_EQUAL
# endif

	// Linear Kernel overrides

# if HST_OPTIMIZE_KEPLER > 4
#  define HST_WARPLINE				//!< Seems better without it on Fermi
# else
#  define HST_SHMEM64				//!< Optimize caching in shared memory (avoid bank conflicts) in case of oversampling
# endif

# if SLICE_BLOCK == 1
#  if HST_OPTIMIZE_KEPLER > 4
#  else
#   define HST_FANCY_ROUND			//!< Good on slice1, but bad on slice4 (without it, things somehow work very badly if 4+ unrolling of the p-loop is requested)
#   define HST_FANCY_INDEX			//!< Optimizes out the integer multiplication required to convert an array index to an address (idx * sizeof(type))
#  endif
#  define HST_SHFL_CACHE			//!< YES! SIGNIFICANT! Prevents ShMem bank conflicts and also halves caching accesses
#  define HST_CACHE_Y				//!< YES! Must. If disabled: extra computations, but also shmem bank conflicts
#  define HST_CACHE_SIN				//!< YES. It is a good idea on all NVIDIA as shmem is faster than cmem. Even on shmem-bound kernels
# elif SLICE_BLOCK == 2
#  if HST_OPTIMIZE_KEPLER > 4
#  else
#   define HST_FANCY_ROUND			//!< Good on slice1, but bad on slice4 (without it, things somehow work very badly if 4+ unrolling of the p-loop is requested)
#  endif
//#  define HST_SHFL_CACHE			//!< YES! SIGNIFICANT! Prevents ShMem bank conflicts and also halves caching accesses
//#  define HST_CACHE_Y				//!< YES! Must. If disabled: extra computations, but also shmem bank conflicts
//#  define HST_CACHE_SIN				//!< YES. It is a good idea on all NVIDIA as shmem is faster than cmem. Even on shmem-bound kernels
# else
#  if HST_OPTIMIZE_KEPLER > 4
//#   define HST_SHMEM64
#  endif
# endif

# define HST_CACHE_SUBH				//!< YES! Must on NVIDIA (this is what makes it different from AMD)
//#define HST_SUBH_DIRECT				//!< NO! AMD direct mode with double-interpolation, i.e. reduced quality
//# define HST_C_TRIG				//!< Slight speedup due to less fragmented cmem in main part (but mostly useless)
//# define HST_CACHE_MINH				//!< NO! Minimal effect and actually slows things down slightly
//# define HST_SHFL_MINH			//!< NO! Seems better without it

//#define HST_CACHE_MINH
//# define HST_CACHE_SUBH_X			//!< Saves 1 operation in the main loop at the price of extra shmem (but no additional writes)
//# define HST_C_SIN
#else /* ! HST_OPTIMIZE_KEPLER */
	// Linear Kernel overrides
# undef HST_SHMEM64				//!< Optimize caching in shared memory (avoid bank conflicts) in case of oversampling
//#define HST_WARPLINE				//!< Seems better without it on Fermi
#if SLICE_BLOCK > 1
# define HST_WARPLINE				//!< Seems better without it on Fermi
#endif
# undef HST_FANCY_ROUND

# define HST_CACHE_Y				//!< YES! Must. If disabled: extra computations, but also shmem bank conflicts
# define HST_CACHE_SUBH				//!< YES! Must on NVIDIA (this is what makes it different from AMD)
//# define HST_C_SIN
# define HST_CACHE_SIN				//!< YES. It is a good idea on all NVIDIA as shmem is faster than cmem. Even on shmem-bound kernels
//# define HST_C_TRIG				//!< Slight speedup due to less fragmented cmem in main part (but mostly useless)
# if SLICE_BLOCK == 4
//#  define HST_CACHE_MINH				//!< NO! Minimal effect and actually slows things down slightly (register pressure)
# else
//#  define HST_CACHE_MINH				//!< NO! Minimal effect and actually slows things down slightly (register pressure)
#  define HST_CACHE_SUBH_X			//!< Saves 1 operation in the main loop at the price of extra shmem (but no additional writes)
# endif
#endif /* !  HST_OPTIMIZE_KEPLER */
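
/* A minimal sketch of the HST_SHFL_SUM idea referenced above: reduce partial
   sums inside a warp with register shuffles instead of shared memory (which on
   Kepler competes with L1 when registers spill). Kepler-era code would use
   __shfl_down(); the _sync form below is what CUDA 9+ requires. */
#if 0
__device__ float warp_sum(float v) {
    for (int delta = 16; delta > 0; delta >>= 1)
        v += __shfl_down_sync(0xffffffffu, v, delta);
    return v;	/* lane 0 ends up holding the warp total */
}
#endif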

#ifndef HST_OVERS_PPT 
# define HST_OVERS_PPT HST_LINEAR_PPT
#endif
#ifndef HST_NN_PPT 
# define HST_NN_PPT HST_OVERS_PPT
#endif

#define HST_NEWTEX_CACHE_BLOCK 128
#define HST_LINEAR_CACHE_BLOCK 32		// tried: 256, 32, 64 (64 for linear)

#if HST_LINEAR_ASSYMETRY > 2
# define HST_LINEAR_PROJ_BLOCK 4
#elif SLICE_BLOCK > 1
#  define HST_LINEAR_PROJ_BLOCK 8
/*#elif SLICE_BLOCK == 2
# define HST_LINEAR_PROJ_BLOCK HST_LINEAR_BLOCK*/
#elif (HST_LINEAR_ASSYMETRY * HST_LINEAR_PPT * HST_LINEAR_PPT) < 8
# define HST_LINEAR_PROJ_BLOCK HST_LINEAR_BLOCK
#else
# define HST_LINEAR_PROJ_BLOCK 8
#endif

#if SLICE_BLOCK > 1
# define HST_NN_PROJ_BLOCK 8
#else
# define HST_NN_PROJ_BLOCK 16
#endif

#if HST_OVERS_ASSYMETRY > 2
# define HST_OVERS_PROJ_BLOCK 4
#elif SLICE_BLOCK > 2
# define HST_OVERS_PROJ_BLOCK 4
#elif SLICE_BLOCK > 1
# if defined(HST_OPTIMIZE_KEPLER)
#  define HST_OVERS_PROJ_BLOCK 4
# else
#  define HST_OVERS_PROJ_BLOCK 8
# endif
/*#elif SLICE_BLOCK == 2
# define HST_OVERS_PROJ_BLOCK 8*/
#else  // SLICE_BLOCK == 1 
# if (HST_OVERS_ASSYMETRY * HST_OVERS_PPT * HST_OVERS_PPT) < 8
#  ifdef HST_OPTIMIZE_KEPLER
#   define HST_OVERS_PROJ_BLOCK 8			//! Set below 8 to enable shmem optimizations on Kepler and later
#  else // Fermi
#   ifdef HST_WARPLINE
#    define HST_OVERS_PROJ_BLOCK 8
#   else
#    define HST_OVERS_PROJ_BLOCK 16 			// SUBHX caching!
#   endif
#  endif
# else // PPT == 4
#  if HST_OPTIMIZE_KEPLER > 4
#   define HST_OVERS_PROJ_BLOCK 8 //4			//! Set below 8 to enable shmem optimizations on Kepler and later
#  else
#   define HST_OVERS_PROJ_BLOCK 4			//! Set below 8 to enable shmem optimizations on Kepler and later
#  endif
# endif
#endif

#if HST_LINEAR_BLOCK < 16
# undef HST_SHMEM64
#endif

#ifdef HST_SHMEM64
# define HST_WARPLINE
#endif

#ifdef HST_CACHE_MINH
# define HST_CACHE_SIN
# define HST_CACHE_SUBH
#endif

#ifdef HST_SET_BOUNDS
# ifdef HST_OPTIMIZE_KEPLER
#  if SLICE_BLOCK > 2
#   ifdef HST_NEWTEX4
#    define HST_NEWTEX_MIN_BLOCKS 6
#   else
#    define HST_NEWTEX_MIN_BLOCKS 4
#   endif
#   define HST_OVERS_MIN_BLOCKS 4 //3
#   define HST_LINEAR_MIN_BLOCKS 4 //4
#   define HST_NN_MIN_BLOCKS 4
#  elif SLICE_BLOCK > 1
#   ifdef HST_NEWTEX4
#    ifdef HST_NEWTEX4_PPT
#     define HST_NEWTEX_MIN_BLOCKS 6
#    else
#     define HST_NEWTEX_MIN_BLOCKS 8 // 6 - 8
#    endif
#   else
#    define HST_NEWTEX_MIN_BLOCKS 4
#   endif
#   if HST_OVERS_PPT > 2
#    if HST_OVERS_ASSYMETRY > 1
#     define HST_OVERS_MIN_BLOCKS 4//6
#    else
#     define HST_OVERS_MIN_BLOCKS 4//6
#    endif
#   else 
#    if HST_OVERS_ASSYMETRY > 1
#     define HST_OVERS_MIN_BLOCKS 8//8//6
#    else
#     define HST_OVERS_MIN_BLOCKS 6//6
#    endif
#   endif
#   if HST_LINEAR_ASSYMETRY > 1
#    define HST_LINEAR_MIN_BLOCKS 8 // 6 - 10
#   elif HST_LINEAR_PPT > 2
#    define HST_LINEAR_MIN_BLOCKS 4 // 6
#   else
#    define HST_LINEAR_MIN_BLOCKS 6 // 6
#   endif
#   if HST_NN_PPT > 2
#    define HST_NN_MIN_BLOCKS 4
#   else 
#    if HST_NN_ASSYMETRY > 1
#     define HST_NN_MIN_BLOCKS 8
#    else
#     define HST_NN_MIN_BLOCKS 6
#    endif
#   endif
#  else /* SLICE_BLOCK == 1 */
#   ifdef HST_NEWTEX4
#    define HST_NEWTEX_MIN_BLOCKS 8
#   else
#    define HST_NEWTEX_MIN_BLOCKS 6//8
#   endif
#   if HST_LINEAR_ASSYMETRY > 2
#    define HST_LINEAR_MIN_BLOCKS 14 // or 12-16?
#   elif HST_LINEAR_ASSYMETRY > 1
#    define HST_LINEAR_MIN_BLOCKS 8 // or 6-10?
#   elif HST_LINEAR_PPT > 2
#    if HST_OPTIMIZE_KEPLER > 4
#     define HST_LINEAR_MIN_BLOCKS 8 //4 // 6
#    else
#     define HST_LINEAR_MIN_BLOCKS 4 // 6
#    endif
#   else
#    define HST_LINEAR_MIN_BLOCKS 8 //6
#   endif

#   if HST_NN_PPT < 4
#    define HST_NN_MIN_BLOCKS 6 // 6
#   else
#    define HST_NN_MIN_BLOCKS 4 // 6
#   endif
#   if HST_OVERS_PPT < 4
#    define HST_OVERS_MIN_BLOCKS 6
#   else /* PPT == 4 */
#    define HST_OVERS_MIN_BLOCKS 4 // with PPT(Y) mode, we can increase it to 6, but no speed-up
#   endif
#  endif
# else /* ! HST_OPTIMIZE_KEPLER */
#  define HST_NEWTEX_MIN_BLOCKS 3
#  if (SLICE_BLOCK > 2)||defined(HST_CACHE_MINH)
#   define HST_OVERS_MIN_BLOCKS 3
#   define HST_LINEAR_MIN_BLOCKS 3
#  elif SLICE_BLOCK > 1
#   define HST_LINEAR_MIN_BLOCKS 3
#   define HST_OVERS_MIN_BLOCKS 3 //4
#  else
#   define HST_LINEAR_MIN_BLOCKS 4
#   ifdef HST_WARPLINE
#    define HST_OVERS_MIN_BLOCKS 4
#   else
#    define HST_OVERS_MIN_BLOCKS 3//4
#   endif
#  endif
#  define HST_NN_MIN_BLOCKS 0//0
# endif
#endif
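
/* A sketch of how the *_MIN_BLOCKS values above are typically consumed (an
   assumption about the exact pairing; the kernel name is hypothetical):
   __launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor) caps
   register usage so the scheduler can keep at least that many blocks
   resident per SM. */
#if 0
__global__ void
__launch_bounds__(HST_LINEAR_BLOCK * HST_LINEAR_BLOCK, HST_LINEAR_MIN_BLOCKS)
hst_linear_sketch(float *slice) {
    /* ... */
}
#endif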

#ifdef HST_HYBRID
# ifdef HST_HYBRID_NEWTEX
#  define HST_HYBRID_MIN_BLOCKS HST_NEWTEX_MIN_BLOCKS
# else
#  if SLICE_BLOCK > 1
#   if HST_LINEAR_PPT > 2
#    warning "Inefficient hybrid configuration"
#    define HST_HYBRID_MIN_BLOCKS 4
#   else
#    define HST_HYBRID_MIN_BLOCKS 8
#   endif
#  else
#   if HST_LINEAR_PPT > 2
#    define HST_HYBRID_MIN_BLOCKS 8 // 6
#   else
#    define HST_HYBRID_MIN_BLOCKS 8
#   endif
#  endif
# endif

# ifndef HST_HYBRID_BALANCE_ALU
#  if SLICE_BLOCK == 1
#   if HST_HYBRID_MIN_BLOCKS == 8
#    define HST_HYBRID_BALANCE_ALU 5
#   endif
#  endif
# endif
#endif



#ifdef HST_FANCY_ROUND
# define HST_LINEAR_FANCY_FLOOR
# define HST_OVERS_FANCY_ROUND
# define HST_MP_FANCY_ROUND        // Enable fancy rounding mode (faster)
#endif
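
/* What "fancy rounding" commonly means on this hardware generation (an
   assumption; the real HST_FANCY_ROUND may differ): round a float to int via
   the magic-constant trick instead of the slower cvt path. Adding 1.5*2^23
   forces the rounded integer into the mantissa bits, so it can be recovered by
   subtracting the bit pattern of the constant. Valid for |x| < 2^22. */
#if 0
__device__ int fancy_round(float x) {
    const float y = x + 12582912.0f;	/* 1.5 * 2^23, bit pattern 0x4B400000 */
    return __float_as_int(y) - 0x4B400000;	/* round-to-nearest-even */
}
#endif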

#ifdef HST_FANCY_INDEX
//# define HST_LINEAR_FANCY_INDEX
# define HST_OVERS_FANCY_INDEX
#endif

#ifndef HST_LINEAR_FANCY_FLOOR
# undef  HST_LINEAR_FANCY_INDEX
#endif

#ifndef HST_OVERS_FANCY_ROUND
# undef  HST_OVERS_FANCY_INDEX
#endif


#if SLICE_BLOCK > 1
# define HST_CUDA_ARRAY
#endif /* SLICE_BLOCK */

#ifdef HST_BASE_KERNEL
# define HST_TEX_KERNEL
# undef HST_LINEAR_KERNEL
# undef HST_MPLINEAR_KERNEL
#endif

#ifdef HST_CACHE_CONST
# undef HST_FLOAT_LOOPS
#endif

/*
#ifdef HST_MPLINEAR_KERNEL
# if SLICE_BLOCK > 1
#  define HST_LINEAR_KERNEL
#  undef HST_MPLINEAR_KERNEL
# endif
#endif
*/

#if /*defined(HST_CACHE_MINH) ||*/ !defined(HST_CACHE_SUBH)
# undef HST_CACHE_SUBH_X
#endif

#ifdef HST_CACHE_SUBH_X
# ifdef HST_C_TRIG
#  undef HST_C_TRIG
#  define HST_C_SIN
# endif
#endif


    // Non-asymmetric mode is barely tested and slower on the measured configurations
#define HST_MP_ASYMMETRIC         // Run mplinear with 8x16 thread block (and 8x4 actual PPT)
#ifndef HST_MP_ASYMMETRIC 
# define HST_MP_MINIMAL          // Minimize the number of registers at the price of more computations
#endif /* ! HST_MP_ASYMMETRIC */



#if defined(HST_C_TRIG) || defined(HST_C_SIN)
# define MAXNPROJECTIONS 2048			//!< Maximum number of supported projections per slice
#else
# define MAXNPROJECTIONS 4096			//!< Maximum number of supported projections per slice
#endif
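
/* A plausible accounting for the split above (an assumption, not taken from
   the source): the per-projection tables must fit the 64 KB constant bank.
   One float table of 4096 entries is 16 KB; with HST_C_TRIG/HST_C_SIN the
   precomputed sin/cos (and offset) tables live in cmem too, e.g.
       4096 proj * 3 tables * 4 B = 48 KB  (little headroom for other consts)
       2048 proj * 3 tables * 4 B = 24 KB  (fits comfortably) */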
#define HST_CUDA_MAX_DEVICES 16			//!< Maximum CUDA-devices supported by the module (check tex_projes)

#define BLOCK_SIZE 16				//!< CUDA block size (for filtering)
#define BLOCK_SIZE_X BLOCK_SIZE			//!< Check kernels, there are some expectations
#define BLOCK_SIZE_Y BLOCK_SIZE			//!< Check kernels, there are some expectations

#define CUDA_HEATUP_MAX_CHANGE 0.05
#define CUDA_HEATUP_MIN_ITERATIONS 200
#define CUDA_HEATUP_MAX_ITERATIONS 250
#define CUDA_HEATUP_KERNEL_ITERATIONS 4096
#define CUDA_HEATUP_ELEMENTS 4
#define CUDA_HEATUP_GRID 4480 // 16 * 5 * 7 (covers SM numbers) * 8 (covers warps per SM)
#define CUDA_MAX_STATS 16384
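
/* A sketch of how the heat-up constants above could drive the HST_HEATUP
   warm-up (an assumption about the control flow; heatup_kernel is
   hypothetical): launch a dummy kernel repeatedly until the run-to-run timing
   drift falls below CUDA_HEATUP_MAX_CHANGE, bounded by the min/max counts, so
   clock boost has settled before benchmarking. */
#if 0
#include <cuda_runtime.h>
#include <math.h>

static void heatup(void) {
    float prev = 0.f;
    for (int i = 0; i < CUDA_HEATUP_MAX_ITERATIONS; i++) {
        cudaEvent_t t0, t1; float ms;
        cudaEventCreate(&t0); cudaEventCreate(&t1);
        cudaEventRecord(t0, 0);
        heatup_kernel<<<CUDA_HEATUP_GRID, BLOCK_SIZE * BLOCK_SIZE>>>(
            CUDA_HEATUP_KERNEL_ITERATIONS);
        cudaEventRecord(t1, 0);
        cudaEventSynchronize(t1);
        cudaEventElapsedTime(&ms, t0, t1);
        cudaEventDestroy(t0); cudaEventDestroy(t1);
        if ((i >= CUDA_HEATUP_MIN_ITERATIONS) &&
            (fabsf(ms - prev) < CUDA_HEATUP_MAX_CHANGE * prev))
            break;	/* run-to-run drift below 5%: clocks have settled */
        prev = ms;
    }
}
#endif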

    /* We should be careful here: in many cases it is faster to perform everything at once,
    probably due to the texture caching */
//#ifdef HST_OPTIMIZE_KEPLER
//# define BP_BATCH_SIZE 128			//!< Batch size (in blocks) for Back Projection  (should be even)
//#else /* HW_OPTIMIZE_KEPLER */
//# ifdef  HW_IGNORE_OLD_HARDWARE
//#  define BP_BATCH_SIZE 128			//!< Batch size (in blocks) for Back Projection  (should be even)
//# else /*  HW_IGNORE_OLD_HARDWARE */
//#  define BP_BATCH_SIZE 8			//!< Batch size (in blocks) for Back Projection  (should be even)
//# endif /*  HW_IGNORE_OLD_HARDWARE */
//#endif /*  HW_OPTIMIZE_KEPLER */
//#define FFT_BATCH_SIZE 8			//!< Batch size (in blocks) for FFT transformations (should be even)


#define BP_BATCH_SIZE 512
#define FFT_BATCH_SIZE 256                      //!< Limited by the size of constant memory anyway
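
/* A sketch of batching 1D row filters with a single cuFFT plan,
   FFT_BATCH_SIZE rows at a time; the constant-memory note above is why the
   batch cannot grow without bound. num_bins and the function name are
   hypothetical. */
#if 0
#include <cufft.h>

static cufftHandle make_batched_plan(int num_bins) {
    cufftHandle plan;
    int n[1] = { num_bins };
    cufftPlanMany(&plan, 1, n,
                  NULL, 1, num_bins,	/* input: contiguous rows */
                  NULL, 1, num_bins,	/* output: contiguous rows */
                  CUFFT_C2C, FFT_BATCH_SIZE);
    return plan;
}
#endif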