/tomo/pyhst : contents of hst_cuda/hst_cuda_bp

: (revision 276)

To get this branch, use:

bzr branch
http://darksoft.org/webbzr/tomo/pyhst

/*
 * The PyHST program is Copyright (C) 2002-2011 of the
 * European Synchrotron Radiation Facility (ESRF) and
 * Karlsruhe Institute of Technology (KIT).
 *
 * PyHST is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * hst is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifdef HST_C_SIN
__constant__ float  c_sin[ MAXNPROJECTIONS  ] ;	//!< (cosinuses, sinuses, axis position, min bin
#endif
#ifdef HST_C_TRIG
__constant__ float2  c_trig[ MAXNPROJECTIONS  ] ;	//!< (cosinuses, sinuses, axis position, min bin
__constant__ float2  c_ofst[ MAXNPROJECTIONS  ] ;	//!< (cosinuses, sinuses, axis position, min bin
#endif
__constant__ float4  c_all[ MAXNPROJECTIONS  ] ;	//!< (cosinuses, sinuses, axis position, min bin
texture<float, 2, cudaReadModeElementType> tex_projes;


#include "hst_cuda_fai.h"



#if SLICE_BLOCK == 4
# define vsize 16
# define vfloat float4
# define hfloat float2
#elif SLICE_BLOCK == 2
# define vsize 8
# define vfloat float2
# define hfloat float1
#else 
# define vsize 4
# define vfloat float
#endif

    // Texture data type
#ifdef HST_HALF_MODE
# define tfloat hfloat
#else
# define tfloat vfloat
#endif

    // Shared memory data type
#ifdef HST_HALF_CACHE
# define sfloat hfloat
#else
# define sfloat vfloat
#endif


#ifndef HST_CREATE_TEXTURE
# ifdef HST_HALF_MODE
texture<float2, 2, cudaReadModeElementType> array_projes;
# else
texture<vfloat, 2, cudaReadModeElementType> array_projes;
# endif
#endif /* !HST_CREATE_TEXTURE */

#if defined(HST_CUDA_ARRAY)
# ifdef HST_CREATE_TEXTURE
#  ifdef HST_HALF_MODE
#   define hst_real_tex(t, x, y) tex2D<tfloat>(t, x, y)
#  else
#   define hst_real_tex(t, x, y) tex2D<vfloat>(t, x, y)
#  endif
# else 
#  define hst_real_tex(t, x, y) tex2D(array_projes, x, y)
# endif
#else
# define hst_real_tex(t, x, y) tex2D(tex_projes, x, y)
#endif

#ifdef HST_HALF_MODE
# define hst_tex(t, x, y) ({ \
		union { \
		    tfloat f; \
		    half2 h[2]; \
		} tdata; \
		tdata.f = hst_real_tex(t, x, y); \
		float4 data = mkf4(__half22float2(tdata.h[0]), __half22float2(tdata.h[1])); \
		data; \
	})
#else 
# define hst_tex(t, x, y) hst_real_tex(t, x, y)
#endif



#define comp0 x
#define comp1 y
#define comp2 z
#define comp3 w
#define comp(i) comp##i

#define mkf4(xy, zw) make_float4(xy.x, xy.y, zw.x, zw.y)

#define HST_MACRO_JOIN(a, b) a ## b

#include <math_constants.h>
#define M_PI_F CUDART_PI_F



__global__ static void hst_cuda_kernel(cudaTextureObject_t texptr, int num_proj, int num_bins, vfloat *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
#ifdef HST_BASE_REMAP
    const int q2 = threadIdx.x % 4;
    const int q2x = q2 % 2, q2y = q2 / 2;
    const int q4 = threadIdx.x / 4;
    const int q4x = q4 % 2, q4y = q4 / 2;
    const int q16 = threadIdx.y;
    const int q16x = q16 % 4, q16y = q16 / 4;

    const int tidx = 4 * q16x + 2 * q4x + q2x;
    const int tidy = 4 * q16y + 2 * q4y + q2y;

    const int idx = blockIdx.x * BLOCK_SIZE_X + tidx;
    const int idy = blockIdx.y * BLOCK_SIZE_Y + tidy + batch;
#else
    const int idx = blockIdx.x * BLOCK_SIZE_X + threadIdx.x;
    const int idy = blockIdx.y * BLOCK_SIZE_Y + threadIdx.y + batch;
#endif

    float h;
    vfloat res={0};

    const float bx = idx + apos_off_x;
    const float by = idy + apos_off_y;

    for (int proje=0; proje<num_proj; proje++) {
        const float4 all = c_all[proje];
        h = all.z + bx * all.x - by * all.y;

        res=res + hst_tex(texptr, h, proje + 0.5f) ;
    }

    d_SLICE[ BLOCK_SIZE_X*gridDim.x*idy + idx ] = res;
}


#include "hst_cuda_bp_kepler.h"


#define PROJ_BLOCK HST_LINEAR_PROJ_BLOCK
#define PPT HST_LINEAR_PPT
#define BS HST_LINEAR_BLOCK
#ifdef HST_LINEAR_MPLINEAR
# include "hst_cuda_bp_mplinear.h"
#else
# include "hst_cuda_bp_simdlinear.h"
#endif
#undef BS
#undef PPT
#undef PROJ_BLOCK


#define OVERSAMPLE 4
#define PROJ_BLOCK HST_OVERS_PROJ_BLOCK
#define PPT HST_OVERS_PPT
#define BS HST_LINEAR_BLOCK
#ifdef HST_LINEAR_MPLINEAR
# include "hst_cuda_bp_mplinear.h"
#else
# include "hst_cuda_bp_simdlinear.h"
#endif
#undef BS
#undef PPT
#undef PROJ_BLOCK
#undef OVERSAMPLE


#define OVERSAMPLE 1
#define PROJ_BLOCK HST_NN_PROJ_BLOCK
#define PPT HST_NN_PPT
#define BS HST_LINEAR_BLOCK
#ifdef HST_LINEAR_MPLINEAR
# include "hst_cuda_bp_mplinear.h"
#else
# include "hst_cuda_bp_simdlinear.h"
#endif
#undef BS
#undef PPT
#undef PROJ_BLOCK
#undef OVERSAMPLE



#ifdef HST_HYBRID
# define PPT HST_LINEAR_PPT
# ifdef HST_HYBRID_NEWTEX
#  define HYBRID_KEPLER
#  include "hst_cuda_bp_kepler.h"
#  undef HYBRID_KEPLER
# endif
#endif


#if defined(HST_HYBRID)
__device__ static
#elif defined(HST_BASE_REMAP)&&defined(HST_BASE_PPT)
# define PPT HST_BASE_PPT
__global__
#endif

#if defined(HST_HYBRID)||(defined(HST_BASE_REMAP)&&defined(HST_BASE_PPT))
    // We more-or-less bound here to PPT2 (PPT1 is not efficient for Linear and PPT4 uses to much registers causing latency problems)
# ifdef HST_FLOAT_LOOPS
void hst_cuda_linear_companion(cudaTextureObject_t texptr, const float * __restrict__ g_all, float num_proj, int num_bins, vfloat *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
# else
void hst_cuda_linear_companion(cudaTextureObject_t texptr, const float * __restrict__ g_all, int num_proj, int num_bins, vfloat *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
# endif
# ifdef HST_SQUARE_PPT
    vfloat res[PPT][PPT] = {0};

    const int sidx = 4 * (threadIdx.y % 4) + (threadIdx.x % 4);
    const int sidy = 4 * (threadIdx.y / 4) + (threadIdx.x / 4);
# else
#  define YTEXSIZE (PPT * PPT)
#  define YTEXSTEP (BLOCK_SIZE_Y / PPT)

    vfloat res[PPT * PPT] = {0};

#  ifdef HST_BASE_REMAP
    const int q2 = threadIdx.x % 4;
    const int q2x = q2 % 2, q2y = q2 / 2;
    const int q4 = threadIdx.x / 4;
    const int q4x = q4 % 2, q4y = q4 / 2;

    const int sidx = 4 * (threadIdx.y % 8) + 2*q4x + q2x;
    const int sidy = 4 * (threadIdx.y / 8) + 2*q4y + q2y;
#  else
	// PPT2 only
    const int sidx = 4 * (threadIdx.y % 8) + (threadIdx.x % 4);
    const int sidy = 4 * (threadIdx.y / 8) + (threadIdx.x / 4);
#  endif

# endif


//    const int sidx = threadIdx.x;
//    const int sidy = threadIdx.y;

    const int idx = PPT * blockIdx.x * BLOCK_SIZE_X + sidx;
    const int idy = PPT * blockIdx.y * BLOCK_SIZE_Y + sidy + batch;

    float h;
    const float bx = idx + apos_off_x;
    const float by = idy + apos_off_y;

# ifdef HST_FLOAT_LOOPS
    for (float projf = 0.5f; projf < num_proj; projf++) {
	const int proje = (int)projf;
# else
    for (int proje=0; proje<num_proj; proje++) {
	const float projf = proje + 0.5f;
# endif

# if defined(HST_C_TRIG)
#  ifdef HST_NO_OFFSETS
	const float4 all = (float4){c_trig[proje].x, c_trig[proje].y, 0, 0 };
#  else
	const float4 all = (float4){c_trig[proje].x, c_trig[proje].y, c_ofst[proje].x, 0 };
#  endif
# else
        const float4 all = c_all[proje];
# endif

        h = all.z + bx * all.x - by * all.y;

# ifdef HST_SQUARE_PPT
#  pragma unroll
        for (int i = 0; i < PPT; i++) {
#  pragma unroll
            for (int j = 0; j < PPT; j++) {
                res[i][j] += hst_tex(texptr, h + j * BLOCK_SIZE_X * all.x - i * BLOCK_SIZE_Y * all.y, projf) ;
            }
        }
    }


#  pragma unroll
    for (int i = 0; i < PPT; i++) {
#  pragma unroll
        for (int j = 0; j < PPT; j++) {
            d_SLICE[PPT * BLOCK_SIZE_X * gridDim.x * (idy + i * BLOCK_SIZE_Y) + idx + j * BLOCK_SIZE_X] = res[i][j];
        }
    }
# else
#  pragma unroll
	for (int i = 0; i < YTEXSIZE; i++) {
	    res[i] += hst_tex(texptr, h - i * YTEXSTEP * all.y, projf) ;
	}
    }	
#  pragma unroll
    for (int i = 0; i < YTEXSIZE; i++) {
        d_SLICE[PPT * BLOCK_SIZE_X * gridDim.x * (idy + i * YTEXSTEP) + idx] = res[i];
    }
# endif

}
#endif

#ifdef HST_HYBRID

__device__ uint get_smid(void) {
     uint ret;
     asm("mov.u32 %0, %smid;" : "=r"(ret) );
     return ret;

}

__device__ uint smblocks[128] = {0};


# if defined(HST_SET_BOUNDS)
__launch_bounds__(256, HST_HYBRID_MIN_BLOCKS)
# endif /* HST_SET_BOUNDS */
__global__ static void hst_cuda_linear_hybrid(cudaTextureObject_t texptr, const float * __restrict__ g_all, const tfloat * __restrict__ sino, int num_proj, int num_bins, int bin_pitch, vfloat *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
//    int block = (blockIdx.y * gridDim.x +  blockIdx.x)/16;


    __shared__ uint block;
    if ((threadIdx.x == 0)&&(threadIdx.y == 0)) {
	uint smid = get_smid();
	block = atomicAdd(&smblocks[smid], 1);
    }
    __syncthreads();

#ifdef HST_HYBRID_BALANCE_ALU
    if ((block%HST_NEWTEX_MIN_BLOCKS) < HST_HYBRID_BALANCE_ALU) {
#else
    if (block&1) {
#endif
        hst_cuda_alu_linear(texptr, g_all, sino, num_proj, num_bins, bin_pitch, d_SLICE, apos_off_x, apos_off_y, batch);
    } else {
#ifdef HST_HYBRID_NEWTEX
# ifdef HST_NEWTEX4_PPT
	{
	    int i = 0;
# else
	for (int i = 0; i < PPT; i++) {
# endif
	    for (int j = 0; j < PPT; j++) {
		const int bidx = (PPT * blockIdx.x + j) * HST_LINEAR_BLOCK;
		const int bidy = batch + (PPT * blockIdx.y + i) * HST_LINEAR_BLOCK;

		hst_cuda_linear_companion(texptr, g_all, num_proj, num_bins, d_SLICE, apos_off_x, apos_off_y, bidx, bidy);
	    }
	}
#else
	hst_cuda_linear_companion(texptr, g_all, num_proj, num_bins, d_SLICE, apos_off_x, apos_off_y, batch);
#endif
    }

}

# if defined(HST_SET_BOUNDS)
__launch_bounds__(256, HST_HYBRID_MIN_BLOCKS)
# endif /* HST_SET_BOUNDS */
__global__ static void hst_cuda_nn_hybrid(cudaTextureObject_t texptr, const float * __restrict__ g_all, const tfloat * __restrict__ sino, int num_proj, int num_bins, int bin_pitch, vfloat *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
//    int block = (blockIdx.y * gridDim.x +  blockIdx.x)/16;


    __shared__ uint block;
    if ((threadIdx.x == 0)&&(threadIdx.y == 0)) {
	uint smid = get_smid();
	block = atomicAdd(&smblocks[smid], 1);
    }
    __syncthreads();

#ifdef HST_HYBRID_BALANCE_ALU
    if ((block%HST_NEWTEX_MIN_BLOCKS) < HST_HYBRID_BALANCE_ALU) {
#else
    if (block&1) {
#endif
        hst_cuda_alu_nn(texptr, g_all, sino, num_proj, num_bins, bin_pitch, d_SLICE, apos_off_x, apos_off_y, batch);
    } else {
#ifdef HST_HYBRID_NEWTEX
# ifdef HST_NEWTEX4_PPT
	{
	    int i = 0;
# else
	for (int i = 0; i < PPT; i++) {
# endif
	    for (int j = 0; j < PPT; j++) {
		const int bidx = (PPT * blockIdx.x + j) * HST_LINEAR_BLOCK;
		const int bidy = batch + (PPT * blockIdx.y + i) * HST_LINEAR_BLOCK;

		hst_cuda_linear_companion(texptr, g_all, num_proj, num_bins, d_SLICE, apos_off_x, apos_off_y, bidx, bidy);
	    }
	}
#else
	hst_cuda_linear_companion(texptr, g_all, num_proj, num_bins, d_SLICE, apos_off_x, apos_off_y, batch);
#endif
    }

}
# undef PPT
#endif