/tomo/pyhst : revision 228

To get this branch, use:

bzr branch
http://darksoft.org/webbzr/tomo/pyhst

« back to all changes in this revision

Viewing changes to hst_cuda/hst_cuda_bp_kepler.h

Committer: Suren A. Chilingaryan
Date: 2017-09-28 10:34:47 UTC
Revision ID: csa@suren.me-20170928103447-dggjgnuxmymgew2a

Quality fixes (tex-based)

files modified:
CMakeLists.txt

PyHST.py

PyHST_c.c

hst.c

hst_cuda/CMakeLists.txt

hst_cuda/hst_cuda.cu

hst_cuda/hst_cuda_bp_kepler.h

hst_cuda/hst_cuda_bp_kernels.h

hst_cuda/hst_cuda_bp_mplinear.h

hst_cuda/hst_cuda_defines.h

hst_cuda/hst_cuda_kernels.h

setup.py

Show diffs side-by-side

added added

removed removed

hst_cuda/hst_cuda_bp_kepler.h

/*
 * shfl_sum(val, k) -- one step of an XOR-pattern shuffle sum; this variant
 * passes a hard-coded width of 16 to the shuffle.
 *
 * Every selected component of `val` is incremented in place by the value that
 * __shfl_xor() returns for that same component with lane mask `k`.  Per the
 * CUDA warp-shuffle documentation, __shfl_xor(var, laneMask, width) yields
 * `var` as held by the lane whose ID is the caller's lane ID XOR laneMask,
 * with the exchange confined to groups of `width` lanes.
 *
 * Components touched, selected at preprocessing time by SLICE_BLOCK:
 *   SLICE_BLOCK == 4 : val.x, val.y, val.z, val.w
 *   SLICE_BLOCK == 2 : val.x, val.y
 *   anything else    : val itself (used as a scalar)
 *
 * The body is a GNU-style statement expression, ({ ... }).  Both arguments
 * are substituted textually and appear more than once in the vector
 * variants, so `val` must be a plain lvalue and `k` must be free of side
 * effects.
 *
 * NOTE(review): an otherwise identical definition that uses PROJ_STEP in
 * place of 16 appears further down this listing; this one looks like the
 * pre-change side of the diff -- confirm before relying on it.
 * NOTE(review): __shfl_xor is the legacy mask-less intrinsic; builds that
 * target Volta or newer need __shfl_xor_sync with an explicit lane mask.
 */
#if SLICE_BLOCK == 4

# define shfl_sum(val, k) ({ \

val.x += __shfl_xor(val.x, k, 16); \

val.y += __shfl_xor(val.y, k, 16); \

val.z += __shfl_xor(val.z, k, 16); \

val.w += __shfl_xor(val.w, k, 16); \

})

/* Two-component variant: only .x and .y are exchanged and accumulated. */
#elif SLICE_BLOCK == 2

# define shfl_sum(val, k) ({ \

val.x += __shfl_xor(val.x, k, 16); \

val.y += __shfl_xor(val.y, k, 16); \

})

/* Fallback for any other SLICE_BLOCK value: val is exchanged as a whole. */
#else

# define shfl_sum(val, k) ({ \

val += __shfl_xor(val, k, 16); \

})

#endif

#if defined(HST_NEWTEX4)

# define BIN_DIM 64

# define BIN_STEPS 2

# endif

#endif

/*
 * shfl_sum(val, k) -- one step of an XOR-pattern shuffle sum; this variant
 * passes PROJ_STEP as the shuffle width.
 *
 * Every selected component of `val` is incremented in place by the value that
 * __shfl_xor() returns for that same component with lane mask `k`.  Per the
 * CUDA warp-shuffle documentation, __shfl_xor(var, laneMask, width) yields
 * `var` as held by the lane whose ID is the caller's lane ID XOR laneMask,
 * with the exchange confined to groups of `width` lanes.
 *
 * Components touched, selected at preprocessing time by SLICE_BLOCK:
 *   SLICE_BLOCK == 4 : val.x, val.y, val.z, val.w
 *   SLICE_BLOCK == 2 : val.x, val.y
 *   anything else    : val itself (used as a scalar)
 *
 * The body is a GNU-style statement expression, ({ ... }).  Both arguments
 * are substituted textually and appear more than once in the vector
 * variants, so `val` must be a plain lvalue and `k` must be free of side
 * effects.
 *
 * NOTE(review): PROJ_STEP is not defined in the visible part of the file.
 * The shuffle width is documented as a power of two no larger than the warp
 * size, so PROJ_STEP presumably satisfies that -- confirm at its definition.
 * NOTE(review): __shfl_xor is the legacy mask-less intrinsic; builds that
 * target Volta or newer need __shfl_xor_sync with an explicit lane mask.
 */
#if SLICE_BLOCK == 4

# define shfl_sum(val, k) ({ \

val.x += __shfl_xor(val.x, k, PROJ_STEP); \

val.y += __shfl_xor(val.y, k, PROJ_STEP); \

val.z += __shfl_xor(val.z, k, PROJ_STEP); \

val.w += __shfl_xor(val.w, k, PROJ_STEP); \

})

/* Two-component variant: only .x and .y are exchanged and accumulated. */
#elif SLICE_BLOCK == 2

# define shfl_sum(val, k) ({ \

val.x += __shfl_xor(val.x, k, PROJ_STEP); \

val.y += __shfl_xor(val.y, k, PROJ_STEP); \

})

/* Fallback for any other SLICE_BLOCK value: val is exchanged as a whole. */
#else

# define shfl_sum(val, k) ({ \

val += __shfl_xor(val, k, PROJ_STEP); \

})

#endif

#ifdef HYBRID_KEPLER

__device__

#else

hst_kepler_kernel

#endif

#if defined(HST_FLOAT_LOOPS)

(cudaTextureObject_t texptr, float num_proj, int num_bins, vfloat *d_SLICE, float apos_off_x, float apos_off_y,

(cudaTextureObject_t texptr, const float * __restrict__ g_all, float num_proj, int num_bins, vfloat *d_SLICE, float apos_off_x, float apos_off_y,

#else

(cudaTextureObject_t texptr, int num_proj, int num_bins, vfloat *d_SLICE, float apos_off_x, float apos_off_y,

(cudaTextureObject_t texptr, const float * __restrict__ g_all, int num_proj, int num_bins, vfloat *d_SLICE, float apos_off_x, float apos_off_y,

#endif

#ifdef HYBRID_KEPLER

const int bidx, const int bidy) {

118

119

const float x = bidx + sidx + apos_off_x;

119

120

const float y = bidy + sidy + apos_off_y;

120

121

122

121

123

#if defined(HST_FLOAT_LOOPS)

122

124

for (float projf = proj + 0.5f; projf < num_proj; projf += PROJ_STEP) {

123

125

// float fidx = projf + exp2(23.f); const int idx = (*(int*)(&fidx)) - 0x4B000000;

150

152

#ifdef HST_SHFL_SUM

151

153

vfloat val = buf[ridy][ridx];

152

154

# pragma unroll

153

for (int k=PROJ_DIM; k>=1; k/=2)

155

for (int k=(PROJ_DIM/2); k>=1; k/=2)

154

156

shfl_sum(val, k);

155

157

#else // HST_SHFL_SUM

156

158

# pragma unroll

Older »