/tomo/pyhst

To get this branch, use:
bzr branch http://darksoft.org/webbzr/tomo/pyhst
//#define DEBUG_OFFSETS 1	// set to 0 to still run the interpolation before dumping offsets, 1 to disable it

#undef KERNEL_NAME
#undef SHMEM_PADDING
#undef SHMEM_SHIFTED_PADDING
#undef SHMEM_ARRAYS
#undef SHMEM_SAMPLING
#undef SIMPLE_SHMEM
#undef SPLIT_SHMEM
#undef REMAP_THREADS
#undef SHMEM_LOAD_ITERATIONS
#undef OVERSAMPLE_MULTIPLIER
#undef GET_CACHE
#undef INTERPOLATE_FROM_CACHE
#undef MIN_BLOCKS
#undef CACHE
#undef FANCY_ROUND
#undef FANCY_MINH_CORRECTION
#undef SHMEM_LOAD_FULLWARP
#undef SHMEM_LOAD2
#undef CACHE_POS
#undef GET_SINO
#undef BSX
#undef BSY

#define BSX BS
#define BSY BS

#ifdef HST_CACHE_Y
# define CACHE_Y
#endif

#define HST_MACRO_JOIN(a, b) a ## b
#define MK_KERNEL_NAME(name, version) HST_MACRO_JOIN(name, version)
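/* Two-stage token pasting: MK_KERNEL_NAME expands its arguments (e.g. OVERSAMPLE) before
   HST_MACRO_JOIN concatenates them, so the result is hst_cuda_alu_oversample4 rather than
   hst_cuda_alu_oversampleOVERSAMPLE. */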

#ifdef OVERSAMPLE
# if OVERSAMPLE == 1
#  define KERNEL_NAME hst_cuda_alu_nn
# else
#  define KERNEL_NAME MK_KERNEL_NAME(hst_cuda_alu_oversample, OVERSAMPLE)
# endif
# define SHMEM_SAMPLING OVERSAMPLE			/**< Oversampling coefficient: how many points to sample per cell */
# undef CACHE_Y
# define MIN_BLOCKS HST_OVERS_MIN_BLOCKS
#else
# define KERNEL_NAME hst_cuda_alu_linear
# define SHMEM_SAMPLING 1
# define MIN_BLOCKS HST_LINEAR_MIN_BLOCKS
#endif
#define SHMEM_SHIFTED_PADDING 0

#ifdef CACHE_Y
# define SHMEM_ARRAYS 2
# define SHMEM_PADDING 1
#else
# define SIMPLE_SHMEM					/**< Use a simple ShMem array instead of float2 with 'x' and 'y-x' */
# define SHMEM_ARRAYS 1					/**< Number of shmem arrays (in non-SIMPLE mode); for interpolation we may need 2 */
# define SHMEM_PADDING 0				/**< We need an extra element at the end of the shmem array if we compute (y-x) */
#endif

#if SHMEM_SAMPLING > 1
# define REMAP_THREADS					/**< Localise accesses (needed for wide oversampling kernels to avoid cache misses) */
#endif

#if SHMEM_SAMPLING == 4 
# define OVERSAMPLE_MULTIPLIER 0.25f
#elif SHMEM_SAMPLING > 1
# define OVERSAMPLE_MULTIPLIER (1.f / OVERSAMPLE)
#else
# define OVERSAMPLE_MULTIPLIER 1
#endif
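/* OVERSAMPLE_MULTIPLIER converts a cache/sample index back into detector-bin units: with
   N-fold oversampling, N consecutive cache entries cover one bin, so positions are scaled
   by 1/N when fetching from the sinogram (0.25f is simply the explicit constant for the
   common 4x case). */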


#if PPT == 1
# if SHMEM_SAMPLING == 4
#  define SHMEM_LOADS (6)
# else
#  define SHMEM_LOADS (2 * SHMEM_SAMPLING)
# endif
# define SHMEM_LOAD_ITERATIONS SHMEM_LOADS
#else /* PPT = 4 */
# define SHMEM_LOADS (3 * SHMEM_SAMPLING)
# if defined(HST_SHMEM64)&&(SHMEM_SAMPLING >= 2)&&(PROJ_BLOCK <= 8)
#  if (SHMEM_SAMPLING >= 4)&&(SLICE_BLOCK == 1)
#    define SHMEM_LOAD_ITERATIONS (SHMEM_LOADS / 4)
#    define SHMEM_LOAD_FULLWARP					/* Use 32-threads for loading data, this limits projections to 8 */
#    define SHMEM_LOAD2						/* Combine two texture fetches to 1 64-bit shmem store */
#  else
#    define SHMEM_LOAD_ITERATIONS (SHMEM_LOADS / 2)
#    define SHMEM_LOAD_FULLWARP
#  endif
# else
#  define SHMEM_LOAD_ITERATIONS (SHMEM_LOADS)
# endif
#endif
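/* SHMEM_LOADS is the number of BSX-sized chunks of sinogram cached per projection row:
   the pixel tile handled by one block projects onto roughly (|cos|+|sin|) * PPT * BS
   detector bins, scaled by the oversampling factor (hence 6 rather than 8 for the 4x,
   PPT=1 case). SHMEM_LOAD_ITERATIONS is the resulting number of shared-memory stores per
   thread once full-warp loading and combined 64-bit stores (SHMEM_LOAD2) are accounted for. */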


#ifdef HST_TUNE_SHMEM
	/* This is pretty much useless. We optimize access to the ShMem cache during the writing phase, but
	this costs extra instructions (2 addresses to compute instead of 1).
	- On Fermi (and likely all other architectures) float1 is compute bound; for float* we enforce it
	anyway.
	- Besides, using 3D arrays is significantly faster than manual addressing in a linear array,
	so the clever tricks would not help either.
	- Shifted padding extends the array in a different (non-last) dimension to ensure no bank conflicts
	in 32-bit mode. */
# ifndef SIMPLE_SHMEM
#  if SLICE_BLOCK == 1
#    undef SHMEM_SHIFTED_PADDING
#    define SHMEM_SHIFTED_PADDING SHMEM_PADDING
#    undef SHMEM_PADDING
#    define SHMEM_PADDING 0
#  endif
#  define SIMPLE_SHMEM
# endif /* ! SIMPLE_SHMEM */
#endif

#if SLICE_BLOCK > 1
# define SIMPLE_SHMEM
# if SLICE_BLOCK == 4
#  ifndef  HST_OPTIMIZE_KEPLER
#   define SPLIT_SHMEM
#  endif
# endif
#endif


#ifdef SIMPLE_SHMEM
# define CACHE(proj, bin, c) cache[c][proj][bin]
#else 
# define CACHE(proj, bin, c) cache[proj][bin].comp(c)
#endif

#ifdef SIMPLE_SHMEM
# ifdef CACHE_Y
#  ifdef SPLIT_SHMEM
#   define GET_CACHE(v1, v2, minh, i) ({ int bin = (minh); float2 vxy, vzw; vxy = CACHE(i, bin, 0); vzw = CACHE(i, bin, 2); v1 = mkf4(vxy, vzw);  vxy = CACHE(i, bin, 1); vzw = CACHE(i, bin, 3); v2 = mkf4(vxy, vzw); })
#  else /* SPLIT_SHMEM */
#   define GET_CACHE(v1, v2, minh, i) ({ int bin = (minh); v1 = CACHE(i, bin, 0); v2 = CACHE(i, bin, 1); })
#  endif /* SPLIT_SHMEM */
# else
#  ifdef SPLIT_SHMEM
#   define GET_CACHE(v, minh, i) ({ int bin = (minh); float2 vxy, vzw; vxy = CACHE(i, bin, 0); vzw = CACHE(i, bin, 1); v = mkf4(vxy, vzw); })
#  else /* SPLIT_SHMEM */
#   define GET_CACHE(v, minh, i) ({ int bin = (minh); v = CACHE(i, bin, 0); })
#  endif /* SPLIT_SHMEM */
# endif
#else /* SIMPLE_SHMEM */
# define GET_CACHE(v1, v2, minh, i) ({ float2 c = cache[i][minh]; v1 = c.x; v2 = c.y; })
#endif /* SIMPLE_SHMEM */
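/* Cache layout: CACHE(proj, bin, c) selects shmem array 'c' for projection row 'proj' and
   (oversampled) bin 'bin'. In CACHE_Y mode array 1 holds the forward difference
   (value[bin+1] - value[bin]) so that linear interpolation later reduces to a single FMA.
   SPLIT_SHMEM keeps float4 slice data as two float2 arrays, presumably to avoid wide
   shared-memory transactions on pre-Kepler hardware. */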

#if defined(HST_MEM_FETCHES)&&(SHMEM_SAMPLING == 1)
# define HST_FETCH_VAR(var, val) const int var = (val); 	// floor?
# define GET_SINO(bin, proj) sino[(proj)*bin_pitch + (((bin)<num_bins)?(bin):0)]
#else  /* HST_MEM_FETCHES */
# define HST_FETCH_VAR(var, val) const float var = (val) + 0.5f;
# define GET_SINO(bin, proj) hst_tex(texptr, bin, proj)
#endif /* HST_MEM_FETCHES */

#ifdef DEBUG_OFFSETS
# undef GET_SINO
# define GET_SINO(bin, proj)  {0.f}
#endif
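/* GET_SINO fetches one sinogram value per (bin, projection): either directly from global
   memory (HST_MEM_FETCHES, with out-of-range bins redirected to bin 0), or through the
   texture unit, where the +0.5f added by HST_FETCH_VAR addresses texel centers.
   Under DEBUG_OFFSETS the fetch is replaced by zero so only the computed offsets remain. */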

#if defined(CACHE_Y)
# define INTERPOLATE_FROM_CACHE(res, idx, vc, i) ({ vfloat v1, v2; GET_CACHE(v1, v2, idx.comp(vc), i); res[vc] += v1 + diff.comp(vc) * v2; })
#elif defined(OVERSAMPLE)
# define INTERPOLATE_FROM_CACHE(res, idx, vc, i) ({ vfloat v; GET_CACHE(v, idx.comp(vc), i); res[vc] += v; })
#else  /* UNCACHED_Y */
# define INTERPOLATE_FROM_CACHE(res, idx, vc, i) ({ vfloat v1, v2; int _idx = idx.comp(vc); GET_CACHE(v1, _idx, i); GET_CACHE(v2, _idx + 1, i); res[vc] += v1 + diff.comp(vc) * (v2 - v1); })
#endif /* OVERSAMPLE */
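/* Three accumulation variants:
   - CACHE_Y: v1 holds the sample and v2 the precomputed difference, so the linear
     interpolation is res += v1 + diff * v2 (one FMA per component);
   - OVERSAMPLE: nearest-neighbour pick from the oversampled cache, no interpolation;
   - default: fetch both neighbours from the cache and interpolate in place. */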

#define FANCY_MINH_CORRECTION(x) (x)
#ifdef OVERSAMPLE
# ifdef HST_OVERS_FANCY_ROUND
#  define FANCY_ROUND(idx, h) ({float4 fh = h + (float4){exp23, exp23, exp23, exp23}; idx = (*(int4*)(&fh)) - (int4){0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000}; })
# endif
#else
# ifdef HST_LINEAR_FANCY_FLOOR
//#  define FANCY_ROUND(idx, diff, h) ({ float4 fh = h + (float4){exp23, exp23, exp23, exp23}; idx = (*(int4*)(&fh)) - (int4){0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000}; diff = h - (fh - (float4){exp23, exp23, exp23, exp23});})
//#  define FANCY_ROUND(idx, diff, h) ({ float4 newh = h + (float4){0.500001f, 0.500001f, 0.500001f, 0.500001f} ; float4 fh = newh + (float4){exp23, exp23, exp23, exp23}; idx = (*(int4*)(&fh)) - (int4){0x4B000001, 0x4B000001, 0x4B000001, 0x4B000001}; diff = (float4){1.f, 1.f, 1.f, 1.f} + (h - (fh - (float4){exp23, exp23, exp23, exp23}));})
#  define FANCY_ROUND(idx, diff, h) ({ float4 newh = h -(float4){0.499999f, 0.499999f, 0.499999f, 0.499999f} ; float4 fh = newh + (float4){exp23, exp23, exp23, exp23}; idx = (*(int4*)(&fh)) - (int4){0x4B000001, 0x4B000001, 0x4B000001, 0x4B000001}; diff = (h - (fh - (float4){exp23, exp23, exp23, exp23}));})
	// For proper rounding we need values above 1, so we artificially add 1 to the array offset and remove it again during the rounding.
#  undef FANCY_MINH_CORRECTION
#  define FANCY_MINH_CORRECTION(x) ((x) - 1.f)
# endif
#endif
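/* The "fancy" rounding exploits IEEE-754 single precision: for 0 <= h < 2^23, adding
   exp2(23.f) rounds the value to an integer stored in the low mantissa bits, and the bit
   pattern is exactly 0x4B000000 + round(h). Example: h = 5.3 -> h + 2^23 rounds to
   8388613.0, whose bits are 0x4B000005, so subtracting 0x4B000000 yields 5 = round(5.3).
   The linear variant first subtracts ~0.5 to turn round-to-nearest into floor, subtracts
   0x4B000001 to also undo the +1 bias introduced by FANCY_MINH_CORRECTION, and recovers
   the fractional part for the interpolation from the difference. */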


#ifdef SHMEM_LOAD_FULLWARP
# ifdef SHMEM_LOAD2
#  define CACHE_POS(i) (4 * i * BSX + 2 * cache_tidx)
# else
#  define CACHE_POS(i) (2 * i * BSX + cache_tidx)
# endif
#else
# define CACHE_POS(i) (i * BSX + cache_tidx)
#endif
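/* CACHE_POS maps a thread's loop iteration to the shmem bin it fills:
   - default: each of the BSX threads of a loading row stores one bin per iteration;
   - SHMEM_LOAD_FULLWARP: 32 threads (two block rows) store consecutive bins, halving the
     iteration count;
   - SHMEM_LOAD2: each thread additionally writes two adjacent bins as one 64-bit store. */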


#if defined(HST_HYBRID) && !defined(OVERSAMPLE)
 __device__
#else
__global__
#endif
static 
#if MIN_BLOCKS > 0
__launch_bounds__(256, MIN_BLOCKS)
#endif
void KERNEL_NAME(cudaTextureObject_t texptr, const float * __restrict__ g_all, const tfloat * __restrict__ sino, int num_proj, int num_bins, int bin_pitch, vfloat *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
    vfloat res[PPT * PPT] = {0.f};

    const float exp23 = exp2(23.f);

    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;

#ifdef SHMEM_LOAD_FULLWARP
	// Should not be enabled for 8x8 blocks (there should be no conflicts with ppt=2)
    const int cache_tidx = (tidy%2)*16 + tidx;
    const int cache_tidy = tidy/2;
#else
    const int cache_tidx = tidx;
    const int cache_tidy = tidy;
#endif
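    /* With full-warp loading, two 16-thread rows of the block are fused into one warp of 32
       consecutive cache_tidx values (coalesced sinogram reads; assumes 16-wide blocks), and
       cache_tidy addresses the correspondingly halved set of projection rows. Otherwise the
       load indices are just the thread indices. */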

    const int bidx = PPT * blockIdx.x * BSX;
    const int bidy = batch + PPT * blockIdx.y * BSY;

	// Coordinates are offset by 0.5f so that we hit the pixel centers, as expected by the texture engine
    const float off_x = apos_off_x - 0.5f;
    const float off_y = apos_off_y - 0.5f;

    const float bx = bidx + off_x;
    const float by = bidy + off_y;

#ifdef HST_CACHE_SUBH
    const float _x = bx + tidx;
//    const float _y = by + tidy;
#endif

#ifdef REMAP_THREADS
	// Also should not happen for 8x8 blocks (but we can reconsider)
# if (BSX == 16)&&(BSY == 16)
    const int sidx = 4 * (threadIdx.y % 4) + (threadIdx.x % 4);
    const int sidy = 4 * (threadIdx.y / 4) + (threadIdx.x / 4);
# elif (BSX == 8)&&(BSY == 8)
    const int sidx16 = 4 * (threadIdx.y % 4) + (threadIdx.x % 4);
    const int sidx = sidx16 % 8;
    const int sidy = 4 * (sidx16 / 8) + 2 * (threadIdx.y / 4) + (threadIdx.x / 4);
# else
#  error "Specified block size is not supported"
# endif
#else
    const int sidx = tidx;
    const int sidy = tidy;
#endif
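    /* REMAP_THREADS regroups threads into 4x4 tiles (sidx/sidy) so that threads of one warp
       work on neighbouring pixels and therefore access neighbouring bins in the shared-memory
       cache; this localisation matters for the wide oversampling kernels (see REMAP_THREADS above). */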

    const int idx = bidx + sidx; 
    const int idy = bidy + sidy; 

#ifndef HST_CACHE_SUBH
    const float x = idx + off_x;
#endif
    const float y = idy + off_y;

#ifdef HST_CACHE_SUBH
    __shared__  float2   cache_subh[PROJ_BLOCK][BSX];
# ifdef HST_CACHE_SIN
    __shared__  float   cache_sin[PROJ_BLOCK];
# endif 
#endif

#ifdef DEBUG_OFFSETS
    int debug_run = !DEBUG_OFFSETS;
#endif 

#ifdef SIMPLE_SHMEM
# ifdef SPLIT_SHMEM
    __shared__  float2   cache[2 * SHMEM_ARRAYS][PROJ_BLOCK + SHMEM_SHIFTED_PADDING][SHMEM_LOADS * BSX + SHMEM_PADDING];
# else
    __shared__  vfloat   cache[SHMEM_ARRAYS][PROJ_BLOCK + SHMEM_SHIFTED_PADDING][SHMEM_LOADS * BSX + SHMEM_PADDING];
# endif
#else 
    __shared__  float2   cache[PROJ_BLOCK][SHMEM_LOADS * BSX + SHMEM_PADDING];
#endif

//#pragma unroll 8

    HST_FETCH_VAR(projf, cache_tidy);
    for (int proje=0; proje<num_proj; proje += PROJ_BLOCK) {

	if (cache_tidy < PROJ_BLOCK) {
		// cache_tidy is either equal to tidy or to tidy/2
	    float4 all = c_all[proje + cache_tidy];
	    float minh = FANCY_MINH_CORRECTION(floor(all.z + bx * all.x - by * all.y + all.w));

#ifdef SHMEM_LOAD_FULLWARP
	    if ((tidy&1) == 0)
#endif
	    {
#ifdef HST_CACHE_SUBH
		float subh = SHMEM_SAMPLING * (all.z + _x * all.x - minh);
		cache_subh[cache_tidy][tidx].x = subh;
		cache_subh[cache_tidy][tidx].y = subh + SHMEM_SAMPLING * BSX * all.x;
# ifdef HST_CACHE_SIN
		if (tidx == 0) cache_sin[cache_tidy] = SHMEM_SAMPLING * all.y;
# endif
#endif
	    }
	
	    HST_FETCH_VAR(binf, minh);
#pragma unroll
	    for (int i = 0; i < SHMEM_LOAD_ITERATIONS; i++) {
		int pos = CACHE_POS(i);

		vfloat v = GET_SINO(binf + OVERSAMPLE_MULTIPLIER * pos, proje + projf);
#if defined(SPLIT_SHMEM)
		CACHE(cache_tidy, pos, 0) = (float2){v.x, v.y};
		CACHE(cache_tidy, pos, SHMEM_ARRAYS) = (float2){v.z, v.w};
#elif defined(SHMEM_LOAD2)
		float v2 = GET_SINO(binf + OVERSAMPLE_MULTIPLIER * (pos + 1), proje + projf);
		*(float2*)&CACHE(cache_tidy, pos, 0) = (float2){v, v2};
#else
		CACHE(cache_tidy, pos, 0) = v;
#endif
	    }

#ifdef CACHE_Y
	    // We don't need to sync all thread groups here, just prevent the optimizer from reshuffling the
	    // instructions; __syncthreads() would have to be moved out of the conditional (it must be executed
	    // by all threads) and is slightly slower
	__threadfence_block();

#pragma unroll
	    for (int i = 0; i < SHMEM_LOAD_ITERATIONS; i++) {
		int pos = CACHE_POS(i);
		CACHE(cache_tidy, pos, 1) = CACHE(cache_tidy, pos + 1, 0) - CACHE(cache_tidy, pos, 0);
# ifdef SPLIT_SHMEM
		CACHE(cache_tidy, pos, 3) = CACHE(cache_tidy, pos + 1, 2) - CACHE(cache_tidy, pos, 2);
# endif
	    }
#endif // CACHE_Y

	}


	__syncthreads();

	    // Automatic unroll factor is good here, otherwise use 16
#pragma unroll
	for (int i = 0; i < PROJ_BLOCK; i++) {
#ifdef HST_CACHE_SUBH
	    float2 subh = cache_subh[i][sidx];
# ifdef HST_CACHE_SIN
	    float csin = cache_sin[i];
# else
	    float csin = SHMEM_SAMPLING * c_all[proje + i].y;
# endif
#else
	    float4 all = c_all[proje + i];
	    const float _minh = FANCY_MINH_CORRECTION(floor(all.z + bx * all.x - by * all.y + all.w));

	    float2 subh;
	    subh.x = SHMEM_SAMPLING * (all.z + x * all.x - _minh);
	    subh.y = subh.x + SHMEM_SAMPLING * BSX * all.x;
	    float csin = SHMEM_SAMPLING * all.y;
#endif
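	    // Detector coordinates (in SHMEM_SAMPLING units, relative to this block's minh) for the
	    // up to 2x2 output pixels of this thread: (x,y), (x+BSX,y), (x,y+BSY), (x+BSX,y+BSY)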

	    float4 minh = {
		subh.x - y * csin,
		subh.y - y * csin,
		subh.x - (y + BSY) * csin,
		subh.y - (y + BSY) * csin
	    };

#ifdef FANCY_ROUND
	    int4 idx;
# ifdef OVERSAMPLE
	    FANCY_ROUND(idx, minh);
# else
	    float4 diff;
	    FANCY_ROUND(idx, diff, minh);
# endif
#else

# ifdef OVERSAMPLE
		// Rounding
//	    int4 idx = (int4){(int)rint(minh.x), (int)rint(minh.y), (int)rint(minh.z), (int)rint(minh.w)};
	    minh += (float4){0.5f, 0.5f, 0.5f, 0.5f};
	    int4 idx = (int4){(int)minh.x, (int)minh.y, (int)minh.z, (int)minh.w};
# else
		// Interpolating between floor and floor + 1
	    float4 diff = floorf(minh);
	    int4 idx = (int4){(int)diff.x, (int)diff.y, (int)diff.z, (int)diff.w};
	    diff = minh - diff;
# endif
#endif


#ifdef DEBUG_OFFSETS
	    if (debug_run > 0) {
#endif
		INTERPOLATE_FROM_CACHE(res, idx, 0, i);
#if PPT > 1
		INTERPOLATE_FROM_CACHE(res, idx, 1, i);
		INTERPOLATE_FROM_CACHE(res, idx, 2, i);
		INTERPOLATE_FROM_CACHE(res, idx, 3, i);
#endif

		// Read floats: od -tf4 reco.vol | head
#ifdef DEBUG_OFFSETS
	    }

	    if ((debug_run >=0 )&&((proje + i)==0)) { //(res[0].x != 0) {
		*(float*)(&res[0]) = idx.x;
		*(float*)(&res[1]) = idx.y;
		*(float*)(&res[2]) = idx.z;
		*(float*)(&res[3]) = idx.w;
		debug_run = -1;
	    }
#endif


	}

	__syncthreads();
    }


    d_SLICE[BSX * PPT * gridDim.x * (idy      ) + idx      ] = res[0];
#if PPT > 1
    d_SLICE[BSX * PPT * gridDim.x * (idy      ) + idx + BSX] = res[1];
    d_SLICE[BSX * PPT * gridDim.x * (idy + BSY) + idx      ] = res[2];
    d_SLICE[BSX * PPT * gridDim.x * (idy + BSY) + idx + BSX] = res[3];
#endif
}