/tomo/pyhst

To get this branch, use:
bzr branch http://darksoft.org/webbzr/tomo/pyhst
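
For a local working copy, something along these lines should work (the target directory name "pyhst" is just an example):

bzr branch http://darksoft.org/webbzr/tomo/pyhst pyhst
cd pyhst
bzr pull    # later, to fetch new revisions
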
//#define DEBUG_OFFSETS 1	// set to 0 to still allow interpolation initially, 1 to disable it


#undef KERNEL_NAME
#undef SHMEM_PADDING
#undef SHMEM_SHIFTED_PADDING
#undef SHMEM_ARRAYS
#undef SHMEM_SAMPLING
#undef SIMPLE_SHMEM
#undef SPLIT_SHMEM
#undef REMAP_THREADS
#undef SHMEM_LOAD_ITERATIONS
#undef OVERSAMPLE_MULTIPLIER
#undef GET_CACHE
#undef INTERPOLATE_FROM_CACHE
#undef MIN_BLOCKS
#undef CACHE
#undef FAST_CACHE
#undef FAST_CACHE2
#undef FANCY_ROUND
#undef FANCY_INDEX
#undef FANCY_MINH_CORRECTION
#undef FANCY_CACHE
#undef FANCY_CACHE_MULTIPLIER
#undef SHFL_CACHE
#undef SHMEM_LOAD_FULLWARP
#undef SHMEM_LOAD2
#undef CACHE_POS
#undef CACHE_BLOCK
#undef GET_SINO
#undef BSX
#undef BSY
#undef ASSYMETRY
#undef PSTEP
#undef PTYPE
#undef sincos
#undef P
#undef NEWCACHE

#define BSX BS
#define BSY BS

#if defined(HST_CACHE_Y)&&!defined(HST_HALF_MODE)
# define CACHE_Y
#endif

#define MK_KERNEL_NAME(name, version) HST_MACRO_JOIN(name, version)
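// HST_MACRO_JOIN presumably token-pastes its two arguments, so with e.g. OVERSAMPLE=2 the
// oversampling kernel below ends up being named hst_cuda_alu_oversample2.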

#ifdef OVERSAMPLE
# if OVERSAMPLE == 1
#  define KERNEL_NAME hst_cuda_alu_nn
#  define MIN_BLOCKS HST_NN_MIN_BLOCKS
#  define ASSYMETRY HST_NN_ASSYMETRY
# else
#  define KERNEL_NAME MK_KERNEL_NAME(hst_cuda_alu_oversample, OVERSAMPLE)
#  define MIN_BLOCKS HST_OVERS_MIN_BLOCKS
#  define ASSYMETRY HST_OVERS_ASSYMETRY
# endif
# define SHMEM_SAMPLING OVERSAMPLE			/**< Oversampling coefficient: how many points to sample per cell */
# undef CACHE_Y
#else
# define KERNEL_NAME hst_cuda_alu_linear
# define SHMEM_SAMPLING 1
# define MIN_BLOCKS HST_LINEAR_MIN_BLOCKS
# define ASSYMETRY HST_LINEAR_ASSYMETRY
#endif
#define SHMEM_SHIFTED_PADDING 0

#ifdef CACHE_Y
# define SHMEM_ARRAYS 2
# define SHMEM_PADDING 1
#else
# define SIMPLE_SHMEM					/**< Use a simple ShMem array instead of float2 with 'x' and 'y-x' */
# define SHMEM_ARRAYS 1					/**< Number of shmem arrays (in non-SIMPLE mode). For interpolation we may need 2 */
# define SHMEM_PADDING 0				/**< We need an extra element at the end of the shmem array if we compute (y-x) */
#endif

#if SHMEM_SAMPLING > 1
# define REMAP_THREADS					/**< Localise accesses (needed for wide oversampling kernels to avoid cache misses) */
#endif

#if SHMEM_SAMPLING == 4 
# define OVERSAMPLE_MULTIPLIER 0.25f
#elif SHMEM_SAMPLING > 1
# define OVERSAMPLE_MULTIPLIER (1.f / OVERSAMPLE)
#else
# define OVERSAMPLE_MULTIPLIER 1
#endif
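// OVERSAMPLE_MULTIPLIER converts a position in the (oversampled) shmem cache back to
// sinogram-bin units for the texture fetch; the SHMEM_SAMPLING == 4 case is spelled out
// as the exact constant 0.25f instead of the generic 1.f / OVERSAMPLE.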


#ifdef HST_TUNE_SHMEM
	/* This is pretty much useless. We optimize access to the ShMem cache during the writing phase, but
	this costs us extra instructions (we need to compute 2 addresses instead of 1).
	- On Fermi (and likely all other architectures) float1 is compute bound. For float* we enforce it
	anyway.
	- Besides, using 3D arrays is significantly more efficient than addressing a linear array manually,
	so the clever tricks will not work either.
	- Shifted padding extends the array at a different (non-last) dimension to ensure no bank conflicts
	in 32-bit mode. */
# ifndef SIMPLE_SHMEM
#  if SLICE_BLOCK == 1
#    undef SHMEM_SHIFTED_PADDING
#    define SHMEM_SHIFTED_PADDING SHMEM_PADDING
#    undef SHMEM_PADDING
#    define SHMEM_PADDING 0
#  endif
#  define SIMPLE_SHMEM
# endif /* ! SIMPLE_SHMEM */
#endif

#if SLICE_BLOCK > 1
# define SIMPLE_SHMEM
# if SLICE_BLOCK == 4
#  if  ((OVERSAMPLE > 2)||!defined(HST_OPTIMIZE_KEPLER))&&!defined(HST_HALF_MODE)
#   define SPLIT_SHMEM
#  endif
# endif
#endif

#ifndef HST_SHMEM64
# if (OVERSAMPLE > 2)&&(SLICE_BLOCK > 1)&&!defined(HST_HALF_MODE)
#  define SPLIT_SHMEM32
# endif
#endif


    // We don't need padding if shfl is used to compute (x+1)-x
#if defined(CACHE_Y)&&!defined(SIMPLE_SHMEM)
# if defined(HST_NEWCACHE)
#  define NEWCACHE
#  if defined(HST_SHFL_CACHE)||defined(HST_NEWCACHE_UNPAD)
#   undef SHMEM_PADDING
#   define SHMEM_PADDING 0
#   define REMAP_THREADS
#  endif
#  if defined(HST_SHFL_CACHE)
#   define SHFL_CACHE
#  endif
# endif
#endif




#if PPT == 1
# if SHMEM_SAMPLING == 4
#  define SHMEM_LOADS (6)
# else
#  define SHMEM_LOADS (2 * SHMEM_SAMPLING)
# endif
# define SHMEM_LOAD_ITERATIONS SHMEM_LOADS
#else
# define SHMEM_LOADS (3 * SHMEM_SAMPLING * (PPT / 2))

# if defined(HST_WARPLINE)&&!defined(HST_HALF_MODE)&&((SHMEM_SAMPLING * PPT) >= 4)/*&&(PROJ_BLOCK <= 8)*/
#  if defined(HST_SHMEM64)&&((SHMEM_SAMPLING * PPT) >= 8)&&(SLICE_BLOCK == 1)&&defined(SIMPLE_SHMEM)
#    define SHMEM_LOAD2						/* Combine two texture fetches to 1 64-bit shmem store */
#    define SHMEM_LOAD_FULLWARP ((SHMEM_SAMPLING * PPT) / 4)	/* Counts number of rows we can load simultaneously */
#    define SHMEM_LOAD_ITERATIONS (SHMEM_LOADS  / SHMEM_LOAD_FULLWARP / 2)  /* Use 32 threads (or more) for loading data; this limits projections to 8 */
#  else
#    if defined(SHFL_CACHE)||defined(HST_NEWCACHE_UNPAD)
#     define SHMEM_LOAD_FULLWARP 2				/* Counts number of rows we can load simultaneously */
#    elif defined(HST_SHMEM64)
#     define SHMEM_LOAD_FULLWARP ((SHMEM_SAMPLING * PPT) / 2)	/* Counts number of rows we can load simultaneously */
#    else
#     define SHMEM_LOAD_FULLWARP 2				// Should be adaptive, set to 4 if only 4 proj (prevents shfl!)
#    endif
#    define SHMEM_LOAD_ITERATIONS (SHMEM_LOADS / SHMEM_LOAD_FULLWARP)
#  endif
# else
#  define SHMEM_LOAD_ITERATIONS (SHMEM_LOADS)
# endif
#endif




#ifdef SIMPLE_SHMEM
# define CACHE(proj, bin, c) cache[c][proj][bin]
# define FANCY_CACHE(proj, bin, c) (*(vfloat*)(((char*)(&(cache[c][proj][0]))) + bin))
# define FANCY_CACHE_MULTIPLIER (sizeof(vfloat))
#else 
# define CACHE(proj, bin, c) cache[proj][bin].comp(c)
# define CACHE2(proj, bin) cache[proj][bin]
# define FANCY_CACHE(proj, bin, c) (*(vfloat*)(((char*)(&(cache[proj]))) + bin + c))
# define FANCY_CACHE2(proj, bin) (*(float2*)(((char*)(&(cache[proj]))) + bin))
# define FANCY_CACHE_MULTIPLIER (2 * sizeof(float))
#endif

#define FANCY_MINH_CORRECTION(x) (x)
#ifdef OVERSAMPLE
# ifdef HST_OVERS_FANCY_ROUND
#  ifdef HST_OVERS_FANCY_INDEX
#   define FANCY_INDEX
//#  define FANCY_ROUND(idx, h) ({idx = FANCY_CACHE_MULTIPLIER * (int)(h); })
#   define FANCY_ROUND(idx, h) ({float fh = FANCY_CACHE_MULTIPLIER * ((h) + exp23) - (FANCY_CACHE_MULTIPLIER - 1) * exp23; idx = __float_as_int(fh) - 0x4B000000; })
#  else /* NO FANCY_INDEX */
#   define FANCY_ROUND(idx, h) ({float fh = (h) + exp23; idx = __float_as_int(fh) - 0x4B000000; })
#  endif /* NO FANCY_INDEX */
# endif /* FANCY_ROUND */
#else /* NOT OVERSAMPLE */
# ifdef HST_LINEAR_FANCY_FLOOR
#  ifdef HST_LINEAR_FANCY_INDEX
#   define FANCY_INDEX
#   define FANCY_ROUND(idx, diff, h) ({ float newh = h - 0.499999f; float fh = newh + exp23; diff = (fh - exp23); fh = FANCY_CACHE_MULTIPLIER * fh - (FANCY_CACHE_MULTIPLIER - 1) * exp23; idx = (*(int*)(&fh)) - (0x4B000000 + FANCY_CACHE_MULTIPLIER); })
#  else /* NO FANCY_INDEX */
#   define FANCY_ROUND(idx, diff, h) ({ float newh = h - 0.499999f; float fh = newh + exp23; diff = (fh - exp23); idx = (*(int*)(&fh)) - 0x4B000001; })
#  endif
	// For proper rounding we need values above 1. So, we artificially add 1 to the array offset and remove it again during the rounding.
#  undef FANCY_MINH_CORRECTION
#  define FANCY_MINH_CORRECTION(x) ((x) - 1.f)
# endif
#endif
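// A note on the FANCY_ROUND variants above (a reading of the code, not authoritative):
// adding exp23 = 2^23 to a non-negative float below 2^23 pushes its integer part into the
// low mantissa bits, so reinterpreting the sum as an int and subtracting the bit pattern of
// 2^23 (0x4B000000) recovers the rounded value without an explicit float-to-int conversion.
// The linear variant first subtracts 0.499999f so the result behaves like floor(), and the
// FANCY_INDEX forms additionally pre-scale by FANCY_CACHE_MULTIPLIER so that idx is already
// a byte offset into the cache.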


#ifdef FANCY_INDEX
# define FAST_CACHE FANCY_CACHE
# define FAST_CACHE2 FANCY_CACHE2
#else
# define FAST_CACHE CACHE
# define FAST_CACHE2 CACHE2
#endif
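// FAST_CACHE thus matches whatever FANCY_ROUND produced: a pre-scaled byte offset when
// FANCY_INDEX is active (served by FANCY_CACHE), and a plain element index otherwise.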





#ifdef SIMPLE_SHMEM
# ifdef CACHE_Y
#  ifdef SPLIT_SHMEM
#   define GET_CACHE(v1, v2, minh, i) ({ int bin = (minh); float2 vxy, vzw; vxy = FAST_CACHE(i, bin, 0); vzw = FAST_CACHE(i, bin, 2); v1 = mkf4(vxy, vzw);  vxy = FAST_CACHE(i, bin, 1); vzw = FAST_CACHE(i, bin, 3); v2 = mkf4(vxy, vzw); })
#  else /* SPLIT_SHMEM */
#   define GET_CACHE(v1, v2, minh, i) ({ int bin = (minh); v1 = FAST_CACHE(i, bin, 0); v2 = FAST_CACHE(i, bin, 1); })
#  endif /* SPLIT_SHMEM */
# else
#  if defined(SPLIT_SHMEM32)
#   if SLICE_BLOCK == 4
#    define GET_CACHE(v, minh, i) ({ int bin = (minh); float vx, vy, vz, vw; vx = FAST_CACHE(i, bin, 0); vy = FAST_CACHE(i, bin, 1); vz = FAST_CACHE(i, bin, 2); vw = FAST_CACHE(i, bin, 3); v = make_float4(vx, vy, vz, vw); })
#   elif SLICE_BLOCK == 2
#    define GET_CACHE(v, minh, i) ({ int bin = (minh); float vx, vy; vx = FAST_CACHE(i, bin, 0); vy = FAST_CACHE(i, bin, 1); v = make_float2(vx, vy); })
#   else
#    error "SHMEM32 should only be enabled in multi-slice modes"
#   endif
#  elif defined(SPLIT_SHMEM)
#   define GET_CACHE(v, minh, i) ({ int bin = (minh); float2 vxy, vzw; vxy = FAST_CACHE(i, bin, 0); vzw = FAST_CACHE(i, bin, 1); v = mkf4(vxy, vzw); })
#  else /* SPLIT_SHMEM */
#   ifdef HST_HALF_CACHE
#    define GET_CACHE(v, minh, i) ({ int bin = (minh); union { sfloat f; half2 h[2]; } vdata; vdata.f = FAST_CACHE(i, bin, 0); float4 data = mkf4(__half22float2(vdata.h[0]), __half22float2(vdata.h[1])); v = data; })
#   else /* ! HALF_CACHE */
#    define GET_CACHE(v, minh, i) ({ int bin = (minh); v = FAST_CACHE(i, bin, 0); })
#   endif /* ! HALF_CACHE */
#  endif /* SPLIT_SHMEM */
# endif
#else /* SIMPLE_SHMEM */
# define GET_CACHE(v1, v2, minh, i) ({ int bin = (minh); float2 c = FAST_CACHE2(i, bin); v1 = c.x; v2 = c.y; })
#endif /* SIMPLE_SHMEM */
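// GET_CACHE reassembles one vfloat sample from however the components were split over the
// shmem arrays: one float array per component for SPLIT_SHMEM32, two float2 arrays for
// SPLIT_SHMEM, half2 unpacking for HST_HALF_CACHE, and a single (x, y-x) float2 in the
// non-SIMPLE layout.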

#if defined(HST_MEM_FETCHES)&&(SHMEM_SAMPLING == 1)&&!defined(HST_HALF_MODE)
# define HST_FETCH_VAR(var, val, tex05) const int var = (val); 	// floor?
# define GET_SINO(bin, proj) sino[(proj)*bin_pitch + (((bin)<num_bins)?(bin):0)]
#else  /* HST_MEM_FETCHES */
# define HST_FETCH_VAR(var, val, tex05) const float var = (val) + tex05;
# ifdef HST_HALF_CACHE
#  define GET_SINO(bin, proj) hst_real_tex(texptr, bin, proj);
# else
#  define GET_SINO(bin, proj) hst_tex(texptr, bin, proj);
# endif /* HST_HALF_CACHE */
#endif /* HST_MEM_FETCHES */

#ifdef DEBUG_OFFSETS
# undef GET_SINO
# define GET_SINO(bin, proj)  {0.f}
#endif

#if defined(CACHE_Y)
# define INTERPOLATE_FROM_CACHE(i, idx) ({ vfloat v1, v2; GET_CACHE(v1, v2, idx, i); (v1 + diff * v2); })
#elif defined(OVERSAMPLE)
# define INTERPOLATE_FROM_CACHE(i, idx) ({ vfloat v; GET_CACHE(v, idx, i); v; })
#else  /* UNCACHED_Y */
# define INTERPOLATE_FROM_CACHE(i, idx) ({ vfloat v1, v2; int _idx = idx; GET_CACHE(v1, _idx, i); GET_CACHE(v2, _idx + 1, i); (v1 + diff * (v2 - v1)); })
#endif /* OVERSAMPLE */
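// With CACHE_Y the (y - x) difference is pre-stored as the second component, so interpolation
// is a single multiply-add with the fractional part `diff` computed at the call site; the
// oversampling kernels take the nearest sample directly; otherwise two neighbouring bins are
// fetched and the difference is formed on the fly.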


#ifdef SHMEM_LOAD_FULLWARP
# ifdef SHMEM_LOAD2
#  define CACHE_POS(i) (2 * SHMEM_LOAD_FULLWARP * i * BSX + 2 * cache_tidx)
# else
#  define CACHE_POS(i) (SHMEM_LOAD_FULLWARP * i * BSX + cache_tidx)
# endif
#else
# define CACHE_POS(i) (i * BSX + cache_tidx)
#endif

#if HST_LINEAR_CACHE_BLOCK > (BSX * BSY / ASSYMETRY)
# define CACHE_BLOCK (BSX * BSY / ASSYMETRY)
#else 
# define CACHE_BLOCK HST_LINEAR_CACHE_BLOCK
#endif
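// CACHE_BLOCK is capped at the number of threads per block (BSX * BSY / ASSYMETRY) so that
// the per-projection minh/subh preload in the kernel can be filled one value per thread in a
// single pass.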

#if defined(HST_SUBH_DIRECT)&&!defined(HST_CACHE_SUBH)
# define FLOOR(x) (x)
#else
//# define FLOOR(x) (floor(x))
# define FLOOR(x) (floor(x - 0.5f) + 0.5f)
#endif
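// The default FLOOR snaps its argument down to the nearest half-integer, matching the
// 0.5-shifted texture coordinates; with HST_SUBH_DIRECT (and no SUBH cache) the value is
// passed through unchanged.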

#if (ASSYMETRY > 1)&&((BSX != 16)||(BSY != 16))
# error "Can't use non standard block size together with assymetry"
#elif (ASSYMETRY > 1)
# undef HST_SQUARE_PPT
#elif (BSX != 16)||(BSY != 16)
# define HST_SQUARE_PPT
#endif

#if defined(HST_CACHE_SUBH_X)//&&!defined(OVERSAMPLE)
# define CACHE_SUBH_X
#else
# undef CACHE_SUBH_X
#endif


#ifdef CACHE_SUBH_X
# ifdef HST_CACHE_MINH
#  ifdef HST_XCACHE_LD128
#   define P_BLOCK HST_XCACHE_LD128
#  else
        // 2 seems optimal, 4 on Kepler?
#   define P_BLOCK 1
#  endif
#  undef HST_SQUARE_PPT
#  if ASSYMETRY > 1
#   error "ASSYMETRY has to be implemented for SQUARE_PPT"
#  endif
# else 
#  define P_BLOCK PPT
#  define HST_SQUARE_PPT
# endif
#endif

#if P_BLOCK == 4
# define PTYPE float4
#elif P_BLOCK == 2
# define PTYPE float2
#else
# define PTYPE float
#endif

#if defined(HST_CACHE_MINH)
# ifdef HST_PRECOMPUTE_OFFSETS
#  define CACHE_IDX(p) (proj_offset + (p))
# else
#  define CACHE_IDX(p) (subproj * PROJ_BLOCK + (p))
# endif
#else
# define CACHE_IDX(p) (p)
#endif




#ifdef SHMEM_LOAD_FULLWARP
# define PSTEP (BSY/ASSYMETRY/SHMEM_LOAD_FULLWARP)
#else
# define PSTEP (BSY/ASSYMETRY)
#endif



#if defined(HST_HYBRID) && (!defined(OVERSAMPLE) || (OVERSAMPLE == 1))
 __device__
 static 
#else
__global__
static 
# if MIN_BLOCKS > 0
__launch_bounds__(BSX * BSY / ASSYMETRY, MIN_BLOCKS)
# endif
#endif
void KERNEL_NAME(cudaTextureObject_t texptr, const float * __restrict__ g_all, const tfloat* __restrict__ sino, int num_proj, int num_bins, int bin_pitch, vfloat *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
#ifdef HST_SQUARE_PPT
    vfloat res[PPT][PPT] = {0.f};
#else
# define YSIZE (ASSYMETRY * PPT * PPT)
# define YSTEP (BSY / (ASSYMETRY * PPT))
    vfloat res[YSIZE] = {0.f};
#endif

    const float exp23 = exp2(23.f);

    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;

#ifdef SHMEM_LOAD_FULLWARP
	// Should not happen for 8x8 blocks (should be no conflicts with ppt=2)
    const int cache_tidx = (tidy%SHMEM_LOAD_FULLWARP)*16 + tidx;
    const int cache_tidy = tidy/SHMEM_LOAD_FULLWARP;
#else
    const int cache_tidx = tidx;
    const int cache_tidy = tidy;
#endif
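	// With SHMEM_LOAD_FULLWARP, SHMEM_LOAD_FULLWARP rows of (presumably 16-wide) threads are
	// fused into one wider row for the shmem load phase: cache_tidx then spans the fused row
	// while cache_tidy counts the fused rows.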

    const int bidx = PPT * blockIdx.x * BSX;
    const int bidy = batch + PPT * blockIdx.y * BSY;

	// The centers are offset by 0.5f so that we sample at pixel centers, as expected by the texture engine
    const float off_x = apos_off_x;
    const float off_y = apos_off_y;

    const float bx = bidx + off_x;
    const float by = bidy + off_y;

#if defined(HST_SQUARE_PPT)
# ifdef REMAP_THREADS
	// Also should not happen for 8x8 blocks (but we can reconsider)
#  if (BSX == 16)&&(BSY == 16)
    const int sidx = 4 * (threadIdx.y % 4) + (threadIdx.x % 4);
    const int sidy = 4 * (threadIdx.y / 4) + (threadIdx.x / 4);
#  elif (BSX == 8)&&(BSY == 8)
    const int sidx16 = 4 * (threadIdx.y % 4) + (threadIdx.x % 4);
    const int sidx = sidx16 % 8;
    const int sidy = 4 * (sidx16 / 8) + 2 * (threadIdx.y / 4) + (threadIdx.x / 4);
#  else
#   error "Specified block size is not supported"
#  endif
# else
    const int sidx = tidx;
    const int sidy = tidy;
# endif
#else // standard mode
# if PPT == 4
#  if defined(HST_ZMAP)&&(ASSYMETRY < 2)
    const int xy_minor = threadIdx.x % 4;
    const int y_minor = xy_minor / 2;
    const int x_minor = xy_minor % 2;

    const int xy_major = threadIdx.x / 4;
    const int y_major = xy_major / 2;
    const int x_major = xy_major % 2;

    const int sidx = 4 * ASSYMETRY * threadIdx.y + x_major * 2 + x_minor;
    const int sidy = y_major * 2 + y_minor;
#  elif defined(HST_LMAP)&&(ASSYMETRY < 2)
    const int sidx = 16 * (threadIdx.y % 4) + threadIdx.x;
    const int sidy = threadIdx.y / 4;
#  else
    const int sidx = 4 * ASSYMETRY * threadIdx.y + (threadIdx.x % (4 * ASSYMETRY));
    const int sidy = (threadIdx.x / (4 * ASSYMETRY));
#  endif
# elif PPT == 2
#  if ASSYMETRY == 4
    const int sidx = 8 * threadIdx.y  + (threadIdx.x % 8);
    const int sidy = (threadIdx.x / 8);
#  elif defined(HST_ZMAP)&&(ASSYMETRY < 2)
    const int xy_minor = threadIdx.x % 4;
    const int y_minor = xy_minor / 2;
    const int x_minor = xy_minor % 2;

    const int xy_major = threadIdx.x / 4;
    const int y_major = xy_major / 2;
    const int x_major = xy_major % 2;

    const int sidx = 4 * (threadIdx.y % 8) + x_major * 2 + x_minor;
    const int sidy = 4 * (threadIdx.y / 8) + y_major * 2 + y_minor;
#  elif defined(HST_LMAP)&&(ASSYMETRY < 2)
    const int sidx = 16 * (threadIdx.y % 2) + threadIdx.x;
    const int sidy = threadIdx.y / 8;
#  else
    const int sidx = 4 * (threadIdx.y % 8) + (threadIdx.x % 4);
    const int sidy = 4 * (threadIdx.y / 8) + (threadIdx.x / 4);
#  endif
# else
#   error "Specified PPT is not supported"
# endif
#endif
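    // sidx/sidy remap each thread to the output pixel it accumulates so that the threads of a
    // warp work on neighbouring pixels and therefore hit nearby (cached) sinogram bins.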

    const int idx = bidx + sidx; 
    const int idy = bidy + sidy; 

#if defined(CACHE_SUBH_X)&&!defined(HST_CACHE_MINH)
    const float _x = SHMEM_SAMPLING * (bx + cache_tidx);
#else
    const float x = SHMEM_SAMPLING * (idx + off_x);
#endif
    const float y = SHMEM_SAMPLING * (idy + off_y);


#ifdef DEBUG_OFFSETS
    int debug_run = !DEBUG_OFFSETS;
#endif 

#ifdef HST_CACHE_MINH
# ifdef HST_SHFL_MINH 
     const int wrapid = (tidy&1) * 16 + tidx;
# endif
    __shared__ float  cache_minh[CACHE_BLOCK];		// For mem-fetches we actually can store in unsigned char (but that would involve back-conversion for texture fetches)
    __shared__ float  cache_subh[CACHE_BLOCK];
# ifdef CACHE_SUBH_X
#  if P_BLOCK == 2
	// may be different on SHMEM64
    __shared__ float  cache_subhx[PROJ_BLOCK/P_BLOCK][PPT * BSX + 8][P_BLOCK];
#  elif P_BLOCK > 1
    __shared__ float  cache_subhx[PROJ_BLOCK/P_BLOCK][PPT * BSX][P_BLOCK];
#  else 
    __shared__ float  cache_subhx[PROJ_BLOCK][PPT * BSX + 8];
#  endif
    __shared__ float  cache_cos[CACHE_BLOCK];
    __shared__ float  cache_sin[CACHE_BLOCK];
# else
    __shared__ float2 cache_sin[CACHE_BLOCK];
# endif

    int tminhx = tidy * BSX + tidx;
#else
# ifdef HST_CACHE_SUBH
#  ifdef CACHE_SUBH_X
//    __shared__ float  cache_subh[PROJ_BLOCK/P_BLOCK][PPT * BSX][P_BLOCK];
    __shared__ PTYPE  cache_subhx[PROJ_BLOCK][BSX];
#  else
    __shared__  float   cache_subh[PROJ_BLOCK];
#  endif
# endif

# ifdef HST_CACHE_SIN
#  ifdef CACHE_SUBH_X
    __shared__  float  cache_sin[PROJ_BLOCK];
#  else
    __shared__  float2  cache_sin[PROJ_BLOCK];
#  endif
# endif
#endif

#ifdef SIMPLE_SHMEM
# if defined(SPLIT_SHMEM32)
    __shared__  float   cache[SLICE_BLOCK * SHMEM_ARRAYS][PROJ_BLOCK + SHMEM_SHIFTED_PADDING][SHMEM_LOADS * BSX + SHMEM_PADDING];
# elif defined(SPLIT_SHMEM)
    __shared__  float2   cache[2 * SHMEM_ARRAYS][PROJ_BLOCK + SHMEM_SHIFTED_PADDING][SHMEM_LOADS * BSX + SHMEM_PADDING];
# else
    __shared__  sfloat   cache[SHMEM_ARRAYS][PROJ_BLOCK + SHMEM_SHIFTED_PADDING][SHMEM_LOADS * BSX + SHMEM_PADDING];
# endif
#else 
    __shared__  float2   cache[PROJ_BLOCK][SHMEM_LOADS * BSX + SHMEM_PADDING];
#endif
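    // The sinogram cache holds PROJ_BLOCK projection rows of SHMEM_LOADS * BSX bins (plus
    // padding); with SPLIT_SHMEM / SPLIT_SHMEM32 the components of a multi-slice value are
    // spread over separate sub-arrays (see the selection logic above).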


    HST_FETCH_VAR(projf, cache_tidy, 0.5f);

#ifdef HST_CACHE_MINH
    for (int proje=0; proje<num_proj; /*proje += PROJ_BLOCK*/) {

	    // Load2 on SHMEM64
      if (tminhx < CACHE_BLOCK) {
        int g_proj = proje + tminhx;
# if defined(HST_GMEM_CONST)
	const float4 all = { g_all[g_proj], g_all[MAXNPROJECTIONS + g_proj], g_all[2 * MAXNPROJECTIONS + g_proj], g_all[3 * MAXNPROJECTIONS + g_proj] };
# elif defined(HST_C_TRIG)
	const float4 all = (float4){c_trig[g_proj].x, c_trig[g_proj].y, c_ofst[g_proj].x, c_ofst[g_proj].y };
# else
	const float4 all = c_all[g_proj];
# endif

# if !defined(HST_GMEM_CONST)&&defined(HST_C_TRIG)
//	const float g_minh = floor(__fmaf_rz(-by, all.y, __fmaf_rz(bx, all.x, all.w)));
	const float g_minh = floor(bx * all.x - by * all.y + all.w);
	float g_subh = all.z - SHMEM_SAMPLING * g_minh;
# else
	const float g_minh = floor(all.z + bx * all.x - by * all.y + all.w);
	float g_subh = SHMEM_SAMPLING * (all.z - FANCY_MINH_CORRECTION(g_minh));
# endif

# if defined(OVERSAMPLE)&&!defined(FANCY_ROUND)
	g_subh += 0.5f;
# endif
	cache_subh[tminhx] = g_subh;
	cache_minh[tminhx] = g_minh;
# ifdef CACHE_SUBH_X
	cache_cos[tminhx] = all.x;
	cache_sin[tminhx] = all.y;
# else
	cache_sin[tminhx] = (float2){ all.x, all.y};
# endif
      }

      __syncthreads();

# ifdef HST_SHFL_MINH 
      float regcache_subh[CACHE_BLOCK / 32];
#  pragma unroll
      for (int i = 0; i < (CACHE_BLOCK / 32); i ++) {
        regcache_subh[i] = cache_subh[32 * i + wrapid];
      }
# endif
# if CACHE_BLOCK > 32
      const int last_proj = min(num_proj, proje + CACHE_BLOCK);
      for (int subproj = 0; proje < last_proj; subproj++, proje += PROJ_BLOCK) {
# else
#pragma unroll
      for (int subproj = 0; subproj < (32 / PROJ_BLOCK); subproj++, proje += PROJ_BLOCK) {
#endif
# ifdef HST_PRECOMPUTE_OFFSETS
	 const int proj_offset = subproj * PROJ_BLOCK;
	 const int proj_var = (proj_offset) >> 5;
	 const int proj_idx = (proj_offset)&0x1F;
# endif

#else
    for (int proje=0; proje < num_proj; proje += PROJ_BLOCK) {
#endif
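	// With HST_CACHE_MINH the projections are processed in CACHE_BLOCK-sized chunks whose
	// per-projection parameters were preloaded above, each chunk swept in PROJ_BLOCK steps;
	// otherwise there is only the single PROJ_BLOCK loop.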

	if (cache_tidy < PROJ_BLOCK) {
#if PSTEP < PROJ_BLOCK
#warning "multi-step caching..."
# define P(proje) (proje + p)
#pragma unroll
	  for (int p = 0; p < PROJ_BLOCK; p += PSTEP) {
#else
# define P(proje) (proje)
	  {
#endif
#ifdef HST_CACHE_MINH
	    const float minh = cache_minh[subproj * PROJ_BLOCK + P(cache_tidy)];
#else // ! HST_CACHE_MINH


# ifdef HST_C_TRIG
	    const float4 all = (float4){c_trig[P(proje + cache_tidy)].x, c_trig[P(proje + cache_tidy)].y, c_ofst[P(proje + cache_tidy)].x, c_ofst[P(proje + cache_tidy)].y };
//	    const float minh = FLOOR(__fmaf_rz(-by, all.y, __fmaf_rz(bx, all.x, all.w)));
	    const float minh = FLOOR(bx * all.x - by * all.y + all.w);
# else
		// cache_tidy is either equal to tidy or to tidy/2
	    float4 all = c_all[P(proje + cache_tidy)];
	    float minh = FLOOR(all.z + bx * all.x - by * all.y + all.w);
# endif

# ifdef HST_CACHE_SUBH
#  ifdef HST_C_TRIG
	    float subh = all.z - SHMEM_SAMPLING * minh;
#  else
	    float subh = SHMEM_SAMPLING * (all.z - FANCY_MINH_CORRECTION(minh));
#  endif
#  if defined(OVERSAMPLE)&&!defined(FANCY_ROUND)
	    subh += 0.5f;
#  endif

#  ifdef CACHE_SUBH_X
#   if (defined(SHMEM_LOAD_FULLWARP))
/*	if (cache_tidx < BSX) 
	    cache_subhx[P(cache_tidy)][cache_tidx].x = subh + _x * all.x;
	else
	    cache_subhx[P(cache_tidy)][cache_tidx - BSX].y = subh + _x * all.x;
*/
#if PPT == 2
	const int hxpos = (cache_tidx < BSX)?(2*cache_tidx):(2*(cache_tidx - BSX) + 1);
	*(((float*)(&cache_subhx[P(cache_tidy)][0])) + hxpos) = subh + _x * all.x;
#else
	const int hxhalf = (cache_tidx < BSX)?0:1;
	const int hxpos = (cache_tidx < BSX)?(2*cache_tidx):(2*(cache_tidx - BSX) + 1);
	*(((float2*)(&cache_subhx[P(cache_tidy)][0])) + hxpos) = (float2){
		subh + (_x + (2 * hxhalf    ) * BSX * SHMEM_SAMPLING) * all.x,
		subh + (_x + (2 * hxhalf + 1) * BSX * SHMEM_SAMPLING) * all.x
	};
#endif

#    ifdef HST_CACHE_SIN
	if (cache_tidx == 0)
	    cache_sin[P(cache_tidy)] = all.y;
#    endif

	{
#   else
#    ifdef SHMEM_LOAD_FULLWARP
	    if (cache_tidx < BSX) 
#    endif
	    {


	    cache_subhx[P(cache_tidy)][cache_tidx] = (PTYPE){
		subh + _x * all.x
		, subh + (_x + BSX * SHMEM_SAMPLING) * all.x
#    if PPT > 2
		, subh + (_x + 2 * BSX * SHMEM_SAMPLING) * all.x
		, subh + (_x + 3 * BSX * SHMEM_SAMPLING) * all.x
#    endif
	    };


#    ifdef HST_CACHE_SIN
	    if (cache_tidx == 0)
	        cache_sin[P(cache_tidy)] = all.y;
#    endif
#   endif // SHMEM_LOAD_FULLWARP
#  else
	    if (cache_tidx == 0) {
		cache_subh[P(cache_tidy)] = subh;
#   ifdef HST_CACHE_SIN
		cache_sin[P(cache_tidy)] = (float2){all.x, all.y};
#   endif
#  endif
	    }
# endif // HST_CACHE_SUBH
#endif // ! HST_CACHE_MINH


	    HST_FETCH_VAR(binf, minh, 0.0f);
#if defined(NEWCACHE)
	    const int bin_start = SHMEM_LOAD_ITERATIONS * cache_tidx;
	    float v1 = GET_SINO(binf + bin_start, P(proje) + projf);
	    float v = v1;
#pragma unroll
	    for (int i = 0; i < (SHMEM_LOAD_ITERATIONS - 1); i++) {
		const int pos = bin_start + i;
		const float vnext = GET_SINO(binf + pos + 1, P(proje) + projf);
		CACHE2(P(cache_tidy), pos) = (float2){ v, vnext - v };
		v = vnext;
	    }
//	    const int last_bin = bin_start + SHMEM_LOAD_ITERATIONS;
# if defined(SHFL_CACHE)
#  ifdef SHMEM_LOAD_FULLWARP
	    v1 = __shfl_down(v1, 1, 32);
#  else
	    v1 = __shfl_down(v1, 1, BSX);
#  endif
# else
	    __threadfence_block();
#  ifdef HST_NEWCACHE_UNPAD
#   ifdef SHMEM_LOAD_FULLWARP
	    v1 = (cache_tidx<31)?CACHE(P(cache_tidy),  bin_start + SHMEM_LOAD_ITERATIONS, 0):v;
#   else
	    v1 = (cache_tidx<(BSX-1))?CACHE(P(cache_tidy),  bin_start + SHMEM_LOAD_ITERATIONS, 0):v;
#   endif
#  else
	    v1 = CACHE(P(cache_tidy),  bin_start + SHMEM_LOAD_ITERATIONS, 0);
#  endif
# endif
	    CACHE2(P(cache_tidy), bin_start + SHMEM_LOAD_ITERATIONS - 1) = (float2){ v, (v1 - v) };
#else
# pragma unroll
	    for (int i = 0; i < SHMEM_LOAD_ITERATIONS; i++) {
		int pos = CACHE_POS(i);

		sfloat v = GET_SINO(binf + OVERSAMPLE_MULTIPLIER * pos, P(proje) + projf);
# if defined(SPLIT_SHMEM32)
		CACHE(P(cache_tidy), pos, 0) = v.x;
		CACHE(P(cache_tidy), pos, SHMEM_ARRAYS) = v.y;
#  if SLICE_BLOCK > 2
		CACHE(P(cache_tidy), pos, 2 * SHMEM_ARRAYS) = v.z;
		CACHE(P(cache_tidy), pos, 3 * SHMEM_ARRAYS) = v.w;
#  endif
# elif defined(SPLIT_SHMEM)
		CACHE(P(cache_tidy), pos, 0) = (float2){v.x, v.y};
		CACHE(P(cache_tidy), pos, SHMEM_ARRAYS) = (float2){v.z, v.w};
# elif defined(SHMEM_LOAD2)
		float v2 = GET_SINO(binf + OVERSAMPLE_MULTIPLIER * (pos + 1), P(proje) + projf);
		*(float2*)&CACHE(P(cache_tidy), pos, 0) = (float2){v, v2};
# else
		CACHE(P(cache_tidy), pos, 0) = v;
# endif
	    }

# ifdef CACHE_Y
	    // We don't need to sync all groups here, only to keep the optimizer from reshuffling the
	    // instructions; __syncthreads() would have to be executed by all threads and is slightly slower.
	__threadfence_block();

# pragma unroll
	    for (int i = 0; i < SHMEM_LOAD_ITERATIONS; i++) {
		int pos = CACHE_POS(i);
		CACHE(P(cache_tidy), pos, 1) = CACHE(P(cache_tidy), pos + 1, 0) - CACHE(P(cache_tidy), pos, 0);
#  ifdef SPLIT_SHMEM
		CACHE(P(cache_tidy), pos, 3) = CACHE(P(cache_tidy), pos + 1, 2) - CACHE(P(cache_tidy), pos, 2);
#  endif
	    }
# endif // CACHE_Y
#endif // NEWCACHE
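	    // NEWCACHE path above: each thread fills a contiguous run of SHMEM_LOAD_ITERATIONS bins
	    // with (value, next - value) pairs so interpolation needs one float2 read; the last pair
	    // borrows the neighbouring thread's first value, either via __shfl_down or by re-reading
	    // it through shared memory. The classic path strides the row via CACHE_POS and, with
	    // CACHE_Y, forms the differences in a second pass.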

	  } // p
	} //cache_tidy in range

	__syncthreads();
	

#if defined(HST_CACHE_MINH)&&defined(CACHE_SUBH_X)
	const int pid = CACHE_IDX(sidy);

# if PROJ_BLOCK < YSTEP
#  if P_BLOCK > 1
	if (sidy < PROJ_BLOCK) {
	    cache_subhx[sidy/P_BLOCK][sidx][sidy%P_BLOCK] = cache_subh[pid] + x * cache_cos[pid];
	}
#  else
	if (sidy < PROJ_BLOCK) {
	    cache_subhx[sidy][sidx] = cache_subh[pid] + x * cache_cos[pid];
	}
#  endif
# else
#  if P_BLOCK > 1
#   pragma unroll
	for (int i = 0; i < (PROJ_BLOCK / YSTEP); i++) {
	    cache_subhx[(i * YSTEP)/P_BLOCK + sidy/P_BLOCK][sidx][sidy%P_BLOCK] = cache_subh[pid + i * YSTEP] + x * cache_cos[pid + i * YSTEP];
	}
#  else /* P_BLOCK == 1 */
#   pragma unroll
	for (int i = 0; i < (PROJ_BLOCK / YSTEP); i++) {
	    cache_subhx[(i * YSTEP) + sidy][sidx] = cache_subh[pid + i * YSTEP] + x * cache_cos[pid + i * YSTEP];
	}
#  endif /* P_BLOCK == 1 */
# endif
	__syncthreads(); //?
#endif
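	// With CACHE_SUBH_X the per-projection offset subh + x * cos has just been expanded to one
	// entry per output column, so the accumulation loop below only needs to subtract the y term.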



#if defined(HST_CACHE_MINH)&&defined(CACHE_SUBH_X)
# if P_BLOCK > 1
//#  pragma unroll
	for (int pblock = 0; pblock < (PROJ_BLOCK/P_BLOCK); pblock ++) {
/*	    PTYPE _subh = *(PTYPE*)(&cache_subhx[pblock][sidx][0]);
	    PTYPE _sincos = *(PTYPE*)(&cache_sin[CACHE_IDX(P_BLOCK * pblock)]);
	    float *pb_subh = (float*)&_subh;
	    float *pb_sincos = (float*)&_sincos;*/


	    float pb_subh[P_BLOCK];
	    float pb_sincos[P_BLOCK];

#  pragma unroll
	    for (int i = 0; i < P_BLOCK; i++) {
		pb_sincos[i] = cache_sin[CACHE_IDX(P_BLOCK * pblock + i)];
#  ifdef HST_XCACHE_EARLY_Y
		pb_subh[i] = cache_subhx[pblock][sidx][i] - y * pb_sincos[i];
#  else
		pb_subh[i] = cache_subhx[pblock][sidx][i];
#  endif
	    }

#  pragma unroll
	    for (int pstep = 0; pstep < P_BLOCK; pstep++) {
		const int p = pblock * P_BLOCK + pstep;
		const float subh = pb_subh[pstep];
		const float sincos = pb_sincos[pstep];

# else

            {
# pragma unroll
	      for (int p = 0; p < PROJ_BLOCK; p++) {
		float sincos = cache_sin[CACHE_IDX(p)];
#  ifdef HST_XCACHE_EARLY_Y
		float subh = cache_subhx[p][sidx] - y * sincos;
#  else
		float subh = cache_subhx[p][sidx];
#  endif
# endif

#else /* ! CACHE_MINH && CAHE SUBH_X */
	    {
# if !defined(HST_SHMEM64)
#  if SLICE_BLOCK > 2
#   pragma unroll 2
#  else
#   pragma unroll	// over-unrolling hurts performance (but 4 is needed to allow joining accesses to subh_cache)
#  endif
# elif (ASSYMETRY > 2)||(SLICE_BLOCK > 1)||(!defined(HST_FANCY_ROUND))
#  pragma unroll 2	// over-unrolling hurts performance (but 4 is needed to allow joining accesses to subh_cache)
# else
#  pragma unroll 4
	// over-unrolling hurts performance (but 4 is needed to allow joining accesses to subh_cache)
# endif
	    for (int p = 0; p < PROJ_BLOCK; p++) {

# if !defined(HST_CACHE_SUBH)
#  ifdef HST_C_TRIG
		const float4 all = (float4){c_trig[proje + p].x, c_trig[proje + p].y, c_ofst[proje + p].x, c_ofst[proje + p].y };
#  else
		const float4 all = c_all[proje + p];
#  endif
# elif !defined(HST_CACHE_SIN)
#  if defined(CACHE_SUBH_X)&&defined(HST_C_SIN)
		const float all = c_sin[proje + p];
#  elif defined(HST_C_TRIG)
		const float2 all = c_trig[proje + p];
#  else
		const float4 all = c_all[proje + p];
#  endif
# endif

# if defined(HST_CACHE_SUBH)&&defined(HST_CACHE_SIN)
#  ifdef CACHE_SUBH_X
 		const float sincos = cache_sin[CACHE_IDX(p)];
#  else
		const float2 sincos = cache_sin[CACHE_IDX(p)];
#  endif
# elif defined(HST_C_SIN)
#  define sincos all
# elif defined(CACHE_SUBH_X)
#  define sincos all.y
# else
#  define sincos all
# endif

# if defined(HST_CACHE_MINH)
#  if defined(HST_SHFL_MINH)
#   ifdef HST_PRECOMPUTE_OFFSETS
		float subh = __shfl(regcache_subh[proj_var], proj_idx + p, 32);
#   else
		float subh = __shfl(regcache_subh[subproj * PROJ_BLOCK >> 5], (subproj * PROJ_BLOCK + p)&0x1F, 32);
#   endif // HST_PRECOMPUTE_OFFSETS
#  else
		float subh = cache_subh[CACHE_IDX(p)];
#  endif // CACHE_SUBH_X
# elif defined(HST_CACHE_SUBH)
#  ifdef CACHE_SUBH_X
//	    	const int pidx = CACHE_IDX(p);
//	    	const float subh = cache_subh[pidx/4][sidx][pidx%4];
//	    	float subh = cache_subh[CACHE_IDX(pblock * P_BLOCK + p)];
		const float *subh = (float*)&cache_subhx[CACHE_IDX(p)][sidx];
#  else
		float subh = cache_subh[CACHE_IDX(p)];
#  endif // ! CACHE_SUBH_X
# else // DIRECT MODE
#  ifdef HST_C_TRIG
		const float minh = FLOOR(__fmaf_rz(-by, all.y, __fmaf_rz(bx, all.x, all.w)));
//	    	float minh = FLOOR(bx * all.x - by * all.y + all.w);
		const float subh = all.z - SHMEM_SAMPLING * minh;
#  else
		float minh = FLOOR(all.z + bx * all.x - by * all.y + all.w);
		float subh = SHMEM_SAMPLING * (all.z - FANCY_MINH_CORRECTION (minh));
#  endif
#  if defined(OVERSAMPLE)&&!defined(FANCY_ROUND)
		subh += 0.5f; // bringing to center of the pixel to avoid adding 0.5f before interpolation
#  endif
# endif // ! HST_CACHE_MINH

# ifndef CACHE_SUBH_X
//	    	subh += SHMEM_SAMPLING_MULTIPLIER * (x * sincos.x - y * sincos.y);
//	    	subh = (subh + x * sincos.x) - y * sincos.y;
		subh = __fmaf_rz(-y, sincos.y, __fmaf_rz(x, sincos.x, subh));
# endif
#endif /* No p-blocks */

#ifdef HST_SQUARE_PPT
# pragma unroll
		for (int i = 0; i < PPT; i++) {
# ifdef CACHE_SUBH_X
	    	  float hsquare[PPT];
#  pragma unroll
	    	  for (int j = 0; j < PPT; j++) {
	    	    hsquare[j] = subh[j] - (y + i * BSY * SHMEM_SAMPLING) * sincos;
	    	  }
# endif
# pragma unroll
	    	  for (int j = 0; j < PPT; j++) {
# ifdef CACHE_SUBH_X
		    // This is faster for some reason on GTX580
		    const float h = hsquare[j];//subh[j] - (y + i * BSY * SHMEM_SAMPLING) * sincos;

# else
		    const float h = subh + (j * BSX * SHMEM_SAMPLING) * sincos.x - (i * BSY * SHMEM_SAMPLING) * sincos.y;
# endif
#else // ! HST_SQUARE_PPT
# pragma unroll
        	  for (int i = 0; i < YSIZE; i++) {
# ifdef CACHE_SUBH_X
#  ifdef HST_XCACHE_EARLY_Y
            	    const float h = subh - (i * YSTEP * SHMEM_SAMPLING) * sincos;
#  else
            	    const float h = subh - (y + i * YSTEP * SHMEM_SAMPLING) * sincos;
#  endif
# else
            	    const float h = subh  -    (i * YSTEP * SHMEM_SAMPLING) * sincos.y;
# endif
#endif // ! HST_SQUARE_PPT

#ifdef OVERSAMPLE
# ifdef FANCY_ROUND
		    int idx;
		    FANCY_ROUND(idx, h);
# else
		    int idx = (int)h;
		    //int idx = (int)(h + 0.5f);	// DS!!! 0.5f can go to subh [OVERSAMPLE NON FANCY]
		    //int idx = (int)rint(h);	// DS!!! We now add 0.5f, so this should not be used
# endif
#else
# ifdef FANCY_ROUND
		    int idx; float diff;
		    FANCY_ROUND(idx, diff, h);
# else
		    float diff = floorf(h);
		    int idx = (int)diff;
# endif
		    diff = h - diff;
#endif


#ifdef DEBUG_OFFSETS
		    if (debug_run > 0) {
#endif

#ifdef HST_SQUARE_PPT
			res[i][j] += INTERPOLATE_FROM_CACHE(p, idx); 
#else
			res[i] += INTERPOLATE_FROM_CACHE(p, idx); 
#endif

#ifdef DEBUG_OFFSETS
		    }

		    if ((debug_run >=0 )&&((proje + p)==0)) { 
# ifdef HST_SQUARE_PPT
			*(float*)(&res[i][j]) = idx;
# else
			*(float*)(&res[i]) = idx;
# endif
			debug_run = -1;
		    }
#endif
#ifdef HST_SQUARE_PPT
	        } // j
#endif
    	      } // i
    	    } // p
    	} // pblock

	__syncthreads();
#ifdef HST_CACHE_MINH
      }
#endif
    }
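    // Write-back: each thread stores its PPT x PPT (or YSIZE) accumulators to the slice; the
    // row pitch is BSX * PPT * gridDim.x, i.e. the slice width covered by the whole grid.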


#ifdef HST_SQUARE_PPT
# pragma unroll
    for (int i = 0; i < PPT; i++) {
# pragma unroll
	for (int j = 0; j < PPT; j++) {
	    d_SLICE[BSX * PPT * gridDim.x * (idy + i * BSY) + idx + j * BSX] = res[i][j];
	}
    }
#else
# pragma unroll
    for (int i = 0; i < YSIZE; i++) {
	d_SLICE[BSX * PPT * gridDim.x * (idy + i * YSTEP) + idx] = res[i];
    }
#endif

}