/tomo/pyhst : revision 228

To get this branch, use:

bzr branch
http://darksoft.org/webbzr/tomo/pyhst

« back to all changes in this revision

Viewing changes to hst_cuda/hst_cuda_defines.h

Committer: Suren A. Chilingaryan
Date: 2017-09-28 10:34:47 UTC
Revision ID: csa@suren.me-20170928103447-dggjgnuxmymgew2a

Quality fixes (tex-based)

files modified:
CMakeLists.txt

PyHST.py

PyHST_c.c

hst.c

hst_cuda/CMakeLists.txt

hst_cuda/hst_cuda.cu

hst_cuda/hst_cuda_bp_kepler.h

hst_cuda/hst_cuda_bp_kernels.h

hst_cuda/hst_cuda_bp_mplinear.h

hst_cuda/hst_cuda_defines.h

hst_cuda/hst_cuda_kernels.h

setup.py

Show diffs side-by-side

added added

removed removed

hst_cuda/hst_cuda_defines.h

//#define HST_BASE_KERNEL //!< Enforce simplest kernel

#define HST_LINEAR_KERNEL //!< Enforce linear kernel

#define HST_HYBRID //!< Use Tex & Linear kernels in parallel. Doesn't help on Fermi

#if defined(PYHST_RECON_BENCHMARK)||!defined(PYHST_ASTRA_SCALING)

# define HST_FILTER2 //!< YES: Filter 2 projections at once (it affects the scaling, though insignificantly, so we keep it off for quality benchmarking)

#endif

//#define HST_HYBRID //!< Use Tex & Linear kernels in parallel. Doesn't help on Fermi

#define HST_LINEAR_MPLINEAR

#define HST_SET_BOUNDS

#define HST_LINEAR_BLOCK 16 //! 16! 8 is slow (other numbers are not supported, enforce SQUARE_PPT)

//#define HST_SQUARE_PPT //! NO. Little effect on performance (slightly reduces register usage, but doesn't affect computations as the loops are unrolled anyway)

#if defined(HST_LINEAR_MPLINEAR)&&(HST_OPTIMIZE_KEPLER > 4)

# if defined(HST_HYBRID)

#endif

#ifdef HST_HYBRID

// Const memory problem!

# define HST_HYBRID_NEWTEX

# define HST_NEWTEX4 //!< Asymmetric: process 4 projections and 64 bins during each step.

# define HST_FLOAT_LOOPS //! Use float loops to optimize register usage (mostly bad)

//# define HST_HYBRID_NEWTEX

//# define HST_NEWTEX4 //!< Asymmetric: process 4 projections and 64 bins during each step.

# define HST_CACHE_SIN //!< YES. It is good idea on all NVIDIA as shmem faster than cmem. Even on shmem-bound kernels

#endif

//#define HST_SQUARE_PPT //! NO. Little effect on performance (slightly reduces register usage, but doesn't affect computations as the loops are unrolled anyway)

//#define HST_FLOAT_LOOPS //! Use float loops to optimize register usage (mostly bad)

//#define HST_NEWTEX4 //!< Asymmetric: process 4 projections and 64 bins during each step.

#define HST_L1_PROJ_BLOCK 128 //!< UNUSED now

// Tex Kernel overrides

//#define HST_FLOAT_LOOPS //! Use float loops to optimize register usage (mostly bad)

#define HST_NEWCACHE //!< New way of caching reducing number of shmem reads/writes

Older »