/*
 * tomo/pyhst: contents of hst_cuda/hst_cuda_bp (revision 276).
 *
 * To get this branch, use:
 *
 *     bzr branch http://darksoft.org/webbzr/tomo/pyhst
 */

/*
 * Tile / projection geometry used by the kernel in this file.
 *
 *   BIN_DIM    - default number of rows of the shared reduction buffer `buf`
 *                (may be enlarged through RCACHE_BIN_DIM).
 *   BIN_STEPS  - number of sub-results a thread accumulates per axis
 *                (dimension of `res`, trip count of the i/j loops).
 *   BIN_STEP   - pixel distance between two sub-results of the same thread
 *                (multiplier of i/j in the `subh`, `rx` and `ry` expressions).
 *   PROJ_DIM   - number of columns of `buf` holding per-projection-lane
 *                partial sums; the reduction starts at PROJ_DIM / 2.
 *   PROJ_STEP  - increment of the projection loop; a float literal when
 *                HST_FLOAT_LOOPS is set because that loop then counts in float.
 *   SHFL_STEP  - width argument passed to the warp shuffle in shfl_sum().
 *   PAD_BUF    - extra columns appended to every row of `buf`
 *                (presumably to avoid shared-memory bank conflicts - TODO confirm).
 */
#if defined(HST_NEWTEX4)
/* 4 projection lanes, 64 buffer rows */
# define BIN_DIM 64
# define BIN_STEPS 2
# define BIN_STEP 8
# define PROJ_DIM 4
# if defined(HST_FLOAT_LOOPS)
#  define PROJ_STEP 4.f
# else
#  define PROJ_STEP 4
# endif
# define SHFL_STEP 4
# define PAD_BUF 0
#else /* 16 projections */
# define BIN_DIM 16
# define BIN_STEPS 4
# define BIN_STEP 4
# define PROJ_DIM 16
# if defined(HST_FLOAT_LOOPS)
#  define PROJ_STEP 16.f
# else
#  define PROJ_STEP 16
# endif
# define SHFL_STEP 16
/* Padding is only enabled for the shuffle reduction with a single slice per thread. */
# if  (defined(HST_SHFL_SUM) && (SLICE_BLOCK == 1))
#  define PAD_BUF 4
# else
#  define PAD_BUF 0
# endif
#endif


/*
 * shfl_sum(val, k): one butterfly step of a shuffle-based reduction.
 * Adds to every component of `val` the matching component held by the lane
 * whose index differs by `k` (XOR) inside a sub-group of SHFL_STEP lanes.
 * The number of components touched follows SLICE_BLOCK (4 -> x,y,z,w;
 * 2 -> x,y; otherwise `val` is treated as a scalar).
 *
 * HST_SHFL_XOR hides the intrinsic that is used: the mask-less __shfl_xor()
 * was deprecated in CUDA 9 and is not available for compute capability 7.0+,
 * so __shfl_xor_sync() is used whenever the toolkit provides it.
 * The full mask assumes that all 32 lanes of the warp reach the call together,
 * i.e. shfl_sum() is invoked from non-divergent code in a block whose size is
 * a multiple of the warp size.
 */
#ifndef HST_SHFL_XOR
# if defined(CUDART_VERSION) && (CUDART_VERSION >= 9000)
#  define HST_SHFL_XOR(v, k) __shfl_xor_sync(0xffffffffu, (v), (k), SHFL_STEP)
# else
#  define HST_SHFL_XOR(v, k) __shfl_xor((v), (k), SHFL_STEP)
# endif
#endif

#if SLICE_BLOCK == 4
# define shfl_sum(val, k) ({ \
    (val).x += HST_SHFL_XOR((val).x, k); \
    (val).y += HST_SHFL_XOR((val).y, k); \
    (val).z += HST_SHFL_XOR((val).z, k); \
    (val).w += HST_SHFL_XOR((val).w, k); \
})
#elif SLICE_BLOCK == 2
# define shfl_sum(val, k) ({ \
    (val).x += HST_SHFL_XOR((val).x, k); \
    (val).y += HST_SHFL_XOR((val).y, k); \
})
#else
# define shfl_sum(val, k) ({ \
    (val) += HST_SHFL_XOR((val), k); \
})
#endif


/*
 * L1_MODE: qualifier inserted into the declaration of the kernel's `g_all`
 * pointer parameter. It expands to __restrict__ when HST_INNER_GMEM is
 * defined and to nothing otherwise.
 * (Presumably meant to let the compiler route these loads through the
 * read-only / L1 path - TODO confirm.)
 */
#ifdef HST_INNER_GMEM
# define L1_MODE __restrict__
#else
# define L1_MODE
#endif

/*
 * CACHE_BLOCK: number of projections whose coefficients are staged in shared
 * memory per pass of the kernel's `pblock` loop. Any earlier definition is
 * dropped and replaced by HST_NEWTEX_CACHE_BLOCK.
 */
#undef CACHE_BLOCK
#define CACHE_BLOCK HST_NEWTEX_CACHE_BLOCK

/*
 * With HST_CCACHE_KEPLER the coefficient cache lives inside the reduction
 * buffer `buf`, so `buf` has to be large enough for whichever use is bigger.
 *
 *   CCACHE_LINE     - stride (in floats) between two lines of the w_ofst cache.
 *   CCACHE_SIZE     - size needed by the coefficient cache; presumably bytes:
 *                     CACHE_BLOCK float2 entries plus 4 lines of CCACHE_LINE floats.
 *   RCACHE_SIZE     - size needed by the reduction buffer with BIN_DIM rows.
 *   RCACHE_BIN_DIM  - resulting number of rows of `buf`.
 *
 * NOTE(review): the line count 4 matches the HST_NEWTEX4 layout, where the
 * kernel's ridx is tidx % 4; confirm HST_CCACHE_KEPLER is never combined with
 * the 16-projection layout, where ridx is tidx.
 * NOTE(review): `vsize` is not defined in this chunk; it has to be a macro
 * expanding to an integer constant for the #if below to compare real sizes.
 */
#if defined(HST_CCACHE_KEPLER)
# define CCACHE_LINE (CACHE_BLOCK/PROJ_STEP + PROJ_STEP)
# define CCACHE_SIZE (CACHE_BLOCK * 8 + 4 * CCACHE_LINE * 4)
# define RCACHE_SIZE (BIN_DIM * (PROJ_DIM + PAD_BUF) * vsize)
# if CCACHE_SIZE > RCACHE_SIZE
/* Round up to enough whole rows of `buf` to hold the coefficient cache. */
#  define RCACHE_BIN_DIM (1 + CCACHE_SIZE / ((PROJ_DIM + PAD_BUF) * vsize))
//#  define KEPLER_CACHE_SIZE CCACHE_SIZE
# else
#  define RCACHE_BIN_DIM BIN_DIM
//#  define KEPLER_CACHE_SIZE RCACHE_SIZE
# endif
#else
# define RCACHE_BIN_DIM BIN_DIM
#endif


/*
 * Texture-based accumulation kernel (the file name suggests back-projection).
 * Compiled as the __global__ kernel hst_kepler_kernel, or, when HYBRID_KEPLER
 * is defined, as the __device__ helper hst_cuda_linear_companion.
 *
 * For every projection p visited by a thread the code evaluates
 *     h = all.z + x * all.x - y * all.y
 * and adds hst_tex(texptr, h + per-pixel offset, p + 0.5f) to the per-thread
 * accumulators `res`. Afterwards the partial sums of the PROJ_DIM threads
 * that worked on the same pixel are combined through the shared buffer `buf`
 * and the result is stored to d_SLICE (directly or via the staging tile `fin`).
 *
 * Parameters:
 *   texptr       texture object passed to hst_tex() for every sample.
 *   g_all        per-projection coefficients read as three planes of
 *                MAXNPROJECTIONS floats (they become all.x, all.y, all.z).
 *                Only read when HST_CACHE_CONST, or HST_GMEM_CONST together
 *                with HST_INNER_GMEM, is defined; otherwise the coefficients
 *                come from c_trig / c_ofst / c_all.
 *   num_proj     exclusive upper bound of the projection loop (declared float
 *                when HST_FLOAT_LOOPS is defined, int otherwise).
 *   num_bins     not referenced in this body.
 *   d_SLICE      output array, written with a row pitch of
 *                BLOCK_SIZE_X * gridDim.x elements (times PPT with HYBRID_KEPLER).
 *   apos_off_x,
 *   apos_off_y   offsets added to the pixel coordinates x and y.
 *   batch        (kernel only) added to the y origin of the block.
 *   bidx, bidy   (HYBRID_KEPLER only) block origin supplied by the caller.
 *
 * The indexing (tid = 16 * tidy + tidx, fin[..][16] addressed by [tidy][tidx],
 * __launch_bounds__(256, ...)) assumes a 16 x 16 thread block -
 * NOTE(review): confirm against the launch site.
 */
#ifdef HYBRID_KEPLER
__device__
#else
__global__ 
#endif
static 
#if defined(HST_SET_BOUNDS) && !defined(HYBRID_KEPLER)
__launch_bounds__(256, HST_NEWTEX_MIN_BLOCKS)
#endif /* HST_SET_BOUNDS */
void 
#ifdef HYBRID_KEPLER
hst_cuda_linear_companion
#else 
hst_kepler_kernel
#endif
#if defined(HST_FLOAT_LOOPS)
(cudaTextureObject_t texptr, const float * L1_MODE g_all, float num_proj, int num_bins, vfloat *d_SLICE, float apos_off_x, float apos_off_y,
#else
(cudaTextureObject_t texptr, const float * L1_MODE g_all, int num_proj, int num_bins, vfloat *d_SLICE, float apos_off_x, float apos_off_y,
#endif
#ifdef HYBRID_KEPLER
const int bidx, const int bidy) {
#else
int batch) {
#endif
    float h;

    // Per-thread accumulators: one entry for each output pixel this thread samples.
#if defined(HST_SQUARE_PPT)||!defined(HST_NEWTEX4)
# define SQUARE_PPT
    vfloat res[BIN_STEPS][BIN_STEPS] = {0};
#else
# ifdef HST_NEWTEX4_PPT
#  define YNTEXSIZE (BIN_STEPS * BIN_STEPS * HST_NEWTEX4_PPT)
# else 
#  define YNTEXSIZE (BIN_STEPS * BIN_STEPS)
# endif
# define YNTEXSTEP (BIN_STEP / BIN_STEPS)
    vfloat res[YNTEXSIZE] = {0};
#endif

    // buf: shared staging area of the cross-thread reduction, addressed as
    // [pixel inside the tile][projection lane]. Some configurations below also
    // alias the coefficient cache (w_trig / w_ofst) onto it.
    __shared__ vfloat buf[RCACHE_BIN_DIM][PROJ_DIM + PAD_BUF]; 
    // fin: reduced tile, staged here before the final store unless the
    // results are written to d_SLICE directly.
#ifndef HST_NEWTEX_DIRECT_WRITE
# ifdef HST_NEWTEX4_PPT
    __shared__ vfloat fin[HST_NEWTEX4_PPT * 16][16];
# else
    __shared__ vfloat fin[16][16];
# endif 
#endif
    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;

    // Origin of the tile handled by this block (passed in by the caller in hybrid mode).
#ifndef HYBRID_KEPLER
    const int bidx = blockIdx.x * BLOCK_SIZE_X;
    const int bidy = batch + blockIdx.y * BLOCK_SIZE_Y;
#endif


    // Split the thread index into a projection lane (proj), an optional
    // 4x4-pixel sub-block selector (block, HST_NEWTEX4 only) and a pixel inside
    // a 4x4 group (x/y_major select the 2x2 quadrant, x/y_minor the pixel in it).
    // HST_NEWTEX_PROJ_MAJOR only changes which index bits feed which part.
#ifdef HST_NEWTEX_PROJ_MAJOR
# ifdef HST_NEWTEX4
    const int proj = tidy / 4;
    const int block = tidy % 4;
# else
    const int proj = tidy;
# endif
    const int xy_minor = tidx % 4;
    const int y_minor = xy_minor / 2;
    const int x_minor = xy_minor % 2;

    const int xy_major = tidx / 4;
    const int y_major = xy_major / 2;
    const int x_major = xy_major % 2;
#else
# ifdef HST_NEWTEX4
    const int block = tidy / 4;
    const int proj = tidx / 4;
# else
    const int proj = 4 * (tidy / 4) + (tidx / 4);
# endif 
    const int xy_minor = tidx % 4;
    const int y_minor = xy_minor / 2;
    const int x_minor = xy_minor % 2;

    const int xy_major = tidy % 4;
    const int y_major = xy_major / 2;
    const int x_major = xy_major % 2;
#endif


    // ridx / ridy: column (projection lane) and row (pixel) of buf this thread
    // reads back during the reduction.
    // sidx / sidy: pixel offset inside the block tile this thread samples.
#if defined(HST_NEWTEX4)
# ifdef SQUARE_PPT
    const int blocky = block/2;
    const int blockx = block%2;
# else
    const int blocky = 0;
    const int blockx = block;
# endif

    const int ridy = 4 * tidy + tidx / 4;
    const int ridx = tidx % 4;

    const int sidy = blocky * 4 + y_major * 2 + y_minor;
    const int sidx = blockx * 4 + x_major * 2 + x_minor;
#else
    const int ridy = tidy;
    const int ridx = tidx;

    const int sidy = y_major * 2 + y_minor;
    const int sidx = x_major * 2 + x_minor;
#endif

    // Pixel coordinates entering the projection formula.
    const float x = bidx + sidx + apos_off_x;
    const float y = bidy + sidy + apos_off_y;

    // With HST_NO_OFFSETS a single value replaces the per-projection all.z term.
#ifdef HST_NO_OFFSETS
# if defined(HST_C_TRIG)
    const float axis = c_ofst[0].x;
# else
    const float axis = c_all[0].z;
# endif
#endif



#ifdef HST_CACHE_CONST
    // Coefficient cache: CACHE_BLOCK projections are copied from g_all into
    // shared memory per pass, then consumed by the projection loop.
    // tid is the linear thread index for a block that is 16 threads wide.
    const int tid = 16 * tidy + tidx;
# if defined(HST_CCACHE_KEPLER)
#  define CCACHE_LINE (CACHE_BLOCK/PROJ_STEP + PROJ_STEP)
    // Both caches alias buf: w_trig at its start, w_ofst 2 * CACHE_BLOCK floats further.
    float2 *w_trig; w_trig = (float2*)(&buf);
    float *w_ofst; w_ofst = (float*)(&buf) + 2 * CACHE_BLOCK;
# elif defined(HST_CCACHE_LD128)
    // Caches transposed to [projection lane][index inside the lane], each row
    // PROJ_STEP entries longer than strictly needed.
#  if defined(HST_C_TRIG)
    __shared__ float2 w_trig[PROJ_STEP][CACHE_BLOCK/PROJ_STEP + PROJ_STEP];
#   ifndef HST_NO_OFFSETS
    __shared__ float  w_ofst[PROJ_STEP][CACHE_BLOCK/PROJ_STEP + PROJ_STEP];
#   endif
#  else
#   ifdef HST_NO_OFFSETS
    __shared__ float  w_all[2 * PROJ_STEP][CACHE_BLOCK/PROJ_STEP + PROJ_STEP];
#   else
    __shared__ float  w_all[3 * PROJ_STEP][CACHE_BLOCK/PROJ_STEP + PROJ_STEP];
#   endif
#  endif
# else
    // Linear caches; aliased onto buf when HST_NEWTEX_REUSE_BUFS is set and the
    // size conditions below hold, separate shared arrays otherwise.
#  if defined(HST_NEWTEX_REUSE_BUFS)&&((SLICE_BLOCK > 1)||(CACHE_BLOCK <= 64))
    float2 *w_trig; w_trig = (float2*)(&buf);
#   ifndef HST_NO_OFFSETS
#    if ((SLICE_BLOCK > 2)||(CACHE_BLOCK <= 128))
	float *w_ofst; w_ofst = (float*)(&buf) + 2 * CACHE_BLOCK;
#    else
	__shared__ float  w_ofst[CACHE_BLOCK];
#    endif
#   endif
#  else
    __shared__ float2 w_trig[CACHE_BLOCK];
#   ifndef HST_NO_OFFSETS
    __shared__ float  w_ofst[CACHE_BLOCK];
#   endif
#  endif
# endif

    // Outer loop: one pass per group of CACHE_BLOCK projections.
    for (int pblock = 0; pblock < num_proj; pblock += CACHE_BLOCK) {
	// Number of valid projections in this pass (the last pass may be shorter).
	const int pblock_end = min(CACHE_BLOCK, num_proj - pblock);

	// The first CACHE_BLOCK threads each load the coefficients of one projection.
	// The load is not limited to pblock_end, so g_all is read up to
	// pblock + CACHE_BLOCK - 1 in every pass.
	if (tid < CACHE_BLOCK) {
	    const int g_proj = pblock + tid;
# if defined(HST_CCACHE_KEPLER)
	    w_trig[tid] = (float2){ g_all[g_proj], g_all[MAXNPROJECTIONS + g_proj] };
	    w_ofst[ridx * CCACHE_LINE + ridy] = g_all[2 * MAXNPROJECTIONS + g_proj];
# elif defined(HST_CCACHE_LD128)
#  if defined(HST_C_TRIG)
	    w_trig[ridx][ridy] = (float2){ g_all[g_proj], g_all[MAXNPROJECTIONS + g_proj] };
#   ifndef HST_NO_OFFSETS
    	    w_ofst[ridx][ridy] = g_all[2 * MAXNPROJECTIONS + g_proj];
#   endif
#  else
	    w_all[ridx][ridy] = g_all[g_proj];
	    w_all[PROJ_STEP + ridx][ridy] = g_all[MAXNPROJECTIONS + g_proj];
#   ifndef HST_NO_OFFSETS
	    w_all[2 * PROJ_STEP + ridx][ridy] = g_all[2 * MAXNPROJECTIONS + g_proj];
#   endif
#  endif
# else /* HST_CCACHE_LD128 */
	    w_trig[tid] = (float2){ g_all[g_proj], g_all[MAXNPROJECTIONS + g_proj] };
#  ifndef HST_NO_OFFSETS
    	    w_ofst[tid] = g_all[2 * MAXNPROJECTIONS + g_proj];
#  endif
# endif /* ! HST_CCACHE_LD128 */
	}
	
	
	// Make the freshly written cache visible to the whole block.
	__syncthreads();

#pragma unroll
# if defined(HST_CCACHE_KEPLER)
	for (int p = 0, proje = proj; proje < pblock_end; p++, proje += PROJ_STEP) {
	    const float4 all = (float4){w_trig[proje].x, w_trig[proje].y, w_ofst[proj * CCACHE_LINE + p], 0};
# elif defined(HST_CCACHE_LD128)
	for (int p = 0, proje = proj; proje < pblock_end; p++, proje += PROJ_STEP) {
#  if defined(HST_C_TRIG)
#   ifdef HST_NO_OFFSETS
	    const float4 all = (float4){w_trig[proj][p].x, w_trig[proj][p].y, axis, 0};
#   else
	    const float4 all = (float4){w_trig[proj][p].x, w_trig[proj][p].y, w_ofst[proj][p], 0};
#   endif
#  else
#   ifdef HST_NO_OFFSETS
	    const float4 all = (float4){w_all[proj][p], w_all[PROJ_STEP + proj][p], axis, 0};
#   else
	    const float4 all = (float4){w_all[proj][p], w_all[PROJ_STEP + proj][p], w_all[2 * PROJ_STEP + proj][p], 0};
#   endif
#  endif 
# else /* HST_CCACHE_LD128 */
	for (int proje = proj; proje < pblock_end; proje += PROJ_STEP) {
#  ifdef HST_NO_OFFSETS
	    const float4 all = (float4){w_trig[proje].x, w_trig[proje].y, axis, 0};
#  else
//	    const float4 all = { g_all[pblock + proje], g_all[MAXNPROJECTIONS + pblock + proje], g_all[2 * MAXNPROJECTIONS + pblock + proje], 0 } ;
	    const float4 all = (float4){w_trig[proje].x, w_trig[proje].y, w_ofst[proje], 0};
/*	    if (w_trig[proje].x !=  g_all[pblock + proje]) {
		if (tid < 4)
		    printf("Problems: %i %i (%f != %f)\n", pblock, proje, w_trig[proje].x, g_all[pblock + proje]);
	    }
*/
#  endif
# endif /* ! HST_CCACHE_LD128 */

	    // Second texture coordinate: global projection index plus 0.5
	    // (presumably the texel centre - TODO confirm against hst_tex).
	    const float projf = pblock + proje + 0.5f;
#else // ! HST_CACHE_CONST 
	// No shared cache: every thread walks its projections proj, proj + PROJ_STEP, ...
	// and fetches the coefficients directly.
# if defined(HST_FLOAT_LOOPS)
//#pragma unroll 2
	for (float projf =  proj + 0.5f; projf < num_proj; projf += PROJ_STEP) {
//	    float fidx = projf + exp2(23.f); const int idx = (*(int*)(&fidx)) - 0x4B000000;
    	    const int proje = (int)projf;
# else
#if SLICE_BLOCK < 4
# pragma unroll 16
#endif
	for (int proje = proj; proje < num_proj; proje += PROJ_STEP) {
	    const float projf = proje + 0.5f;
# endif
# if defined(HST_GMEM_CONST)&&defined(HST_INNER_GMEM)
#  ifdef HST_NO_OFFSETS
	    const float4 all = { g_all[proje], g_all[MAXNPROJECTIONS + proje], axis, 0 };
#  else
	    const float4 all = { g_all[proje], g_all[MAXNPROJECTIONS + proje], g_all[2 * MAXNPROJECTIONS + proje], 0 };
#  endif

# elif defined(HST_C_TRIG)
#  ifdef HST_NO_OFFSETS
	    const float4 all = (float4){c_trig[proje].x, c_trig[proje].y, axis, 0 };
#  else
	    const float4 all = (float4){c_trig[proje].x, c_trig[proje].y, c_ofst[proje].x, 0 };
#  endif
# else
	    const float4 all = c_all[proje];		
# endif
#endif
	    // Position of the base pixel (x, y) along the first texture axis for this projection.
	    h = all.z + x * all.x - y * all.y;

#ifdef SQUARE_PPT
	    // Sample a BIN_STEPS x BIN_STEPS grid of pixels spaced BIN_STEP apart in x (j) and y (i).
# pragma unroll
    	    for (int i = 0; i < BIN_STEPS; i++) {
# pragma unroll
		for (int j = 0; j < BIN_STEPS; j++) {
		    float subh = h + BIN_STEP * j * all.x - BIN_STEP * i * all.y;
		    res[i][j] += hst_tex(texptr, subh, projf);
		}
	    }
#else
	    // Sample YNTEXSIZE pixels spaced YNTEXSTEP apart in y only.
# pragma unroll
	    for (int i = 0; i < YNTEXSIZE; i++) {
		float subh = h - YNTEXSTEP * i * all.y;
		res[i] += hst_tex(texptr, subh, projf);
	    }
#endif
    }
    // (the brace above closes the projection loop)

#ifdef HST_CACHE_CONST
	// All threads must be done with the cache before the next pass overwrites it
	// (and before buf, which may alias the cache, is reused for the reduction).
	__syncthreads();
    }
#endif

    // Reduction: for each accumulator, every thread publishes its partial sum in
    // buf[pixel][proj]; the PROJ_DIM entries of a row are then summed and the
    // thread with ridx == 0 keeps the total for that pixel.
#ifdef SQUARE_PPT
# pragma unroll
    for (int i = 0; i < BIN_STEPS; i++) {
# pragma unroll
	for (int j = 0; j < BIN_STEPS; j++) {
            buf[BIN_STEP * sidy + sidx][proj] = res[i][j];
#else
# pragma unroll
    for (int i = 0; i < YNTEXSIZE; i++) {
            buf[16 * sidy + sidx][proj] = res[i]; //?
#endif

            // Partial sums must be complete before they are read with the (ridy, ridx) mapping.
            __syncthreads();

#ifdef HST_SHFL_SUM
            // Butterfly reduction in registers over groups of SHFL_STEP lanes.
            vfloat val = buf[ridy][ridx];
# pragma unroll
            for (int k=(PROJ_DIM/2); k>=1; k/=2) 
        	shfl_sum(val, k);
#else // HST_SHFL_SUM
            // Tree reduction in shared memory, halving the number of active columns each step.
            // NOTE(review): the steps are separated only by __threadfence_block(),
            // not by a barrier, so this appears to rely on the threads of one
            // buf row executing in lockstep; that may be related to the crashes
            // mentioned below - confirm on the target architecture.
# pragma unroll
            for (int k=(PROJ_DIM/2); k>=1; k/=2) {
        	if (ridx < k) // No data corruption, but arbitrary crashes...
		    buf[ridy][ridx] += buf[ridy][ridx+k];

            	    // We need it instead of volatile. Could be faster if buf declared volatile
//                __syncthreads();
                __threadfence_block();
            }
            vfloat val = buf[ridy][0];
#endif //  HST_SHFL_SUM

            // Map the reduced buf row (ridy) and the accumulator index (i, j)
            // back to the pixel position (rx, ry) inside the output tile.
#ifdef SQUARE_PPT
            const int rx = BIN_STEP * j + ridy%BIN_STEP;
            const int ry = BIN_STEP * i + ridy/BIN_STEP;
#else
            const int rx = ridy % 16;
            const int ry = YNTEXSTEP * i + ridy / 16;
#endif

            // Only one thread per buf row stores the total.
            if (!ridx) {
#ifdef HST_NEWTEX_DIRECT_WRITE
		d_SLICE[BLOCK_SIZE_X * gridDim.x * (bidy + ry) + (bidx + rx)] = val;
#else
                fin[ry][rx] = val;
#endif
            }

	    // buf is rewritten in the next iteration; wait until every thread has read it.
	    __syncthreads();
#ifdef SQUARE_PPT
        }
#endif
    }

    // Final store: each thread copies one element per 16-row slab of fin to
    // d_SLICE, using tidx as the fastest-varying output index.
#ifndef HST_NEWTEX_DIRECT_WRITE
    const int idx = bidx + tidx;
    const int idy = bidy + tidy;

# ifdef HYBRID_KEPLER
#  ifdef HST_NEWTEX4_PPT
    for (int i = 0; i < HST_NEWTEX4_PPT; i++) {
	d_SLICE[PPT * BLOCK_SIZE_X * gridDim.x * (idy + i * BLOCK_SIZE_Y) + idx] = fin[16 * i + tidy][tidx];
    }
#  else
    d_SLICE[PPT * BLOCK_SIZE_X * gridDim.x * idy + idx] = fin[tidy][tidx];
#  endif
# else
    d_SLICE[BLOCK_SIZE_X * gridDim.x * idy + idx] = fin[tidy][tidx];
# endif
#endif
}