/tomo/pyhst : revision 267

To get this branch, use:

bzr branch
http://darksoft.org/webbzr/tomo/pyhst

« back to all changes in this revision

Viewing changes to hst_opencl/alu.cl

Committer: Suren A. Chilingaryan
Date: 2017-12-20 15:28:19 UTC
Revision ID: csa@suren.me-20171220152819-byiryz4mmovcg8u2

New remapping, small adjustment of configuration

files modified:
PyHST.py

cmake/FindOpenCL.cmake

hst_opencl/alu.cl

hst_opencl/hst_opencl_kernel.h

hst_opencl/hst_opencl_kernels.cl

hst_opencl/newtex.cl

Show diffs side-by-side

added added

removed removed

hst_opencl/alu.cl

// For use with AMD CodeXL

#define SLICE_BLOCK 1

#define PPT 4

#define BP_BLOCK 16

#define GPU

#define GPU_AMD

#define GPU_VLIW5

#define REMAP

#define BLOCK_SIZE 16

#define BP_BLOCK 16

#define ASSYMETRY 1

//#define OVERSAMPLE 4

#define MAXNPROJECTIONS 4096

#define HST_LINEAR_PPT PPT

#define HST_OVERS_PPT PPT

#define HST_NN_PPT PPT

# define PROJ_BLOCK 16

#elif defined(GPU_VLIW5)

# if SLICE_BLOCK > 2

# define PROJ_BLOCK 8

# if OVERSAMPLE > 1

# define PROJ_BLOCK 4

# else

# define PROJ_BLOCK 8

# endif

# elif (SLICE_BLOCK > 1)&&(OVERSAMPLE > 1)

# define PROJ_BLOCK 4

# else

# define PROJ_BLOCK 16

# define PROJ_BLOCK 8//16

# endif

#else

# if SLICE_BLOCK > 2

# elif (OVERSAMPLE > 1)||((SLICE_BLOCK > 1)&&(OVERSAMPLE == 1))

# define PROJ_BLOCK 8

# else

# define PROJ_BLOCK 16

# define PROJ_BLOCK 8//16

# endif

#endif

//#undef PROJ_BLOCK

//#define PROJ_BLOCK 16

111

#define HST_XCACHE_LD128 2 //!< Store SUBH_X cache as groups of float2/float4 numbers to reduce number of memory instructions

112

//#define HST_SQUARE_PPT //!< NO. Little effect on performance (slightly reduces register usage, but doesn't affect computations as the loops are unrolled anyway)

113

114

//#undef PROJ_BLOCK

115

//#define PROJ_BLOCK 8

116

117

118

#define HST_NEWCACHE //!< YES! New way of caching reducing number of shmem reads/writes

//# define HST_WARPLINE //!< Seems better without on Fermi

#if defined(GPU_GCN)&&((OVERSAMPLE > 1)||(SLICE_BLOCK > 1))

# define HST_SHMEM64 //!< Optimize caching in shared memory (avoid bank conflicts) in case of oversampling

#endif

119

#define HST_WARPLINE //!< Seems better without on Fermi

120

#define HST_SHMEM64 //!< Optimize caching in shared memory (avoid bank conflicts) in case of oversampling

121

//#if defined(GPU_GCN)&&((OVERSAMPLE > 1)||(SLICE_BLOCK > 1))

122

//# define HST_SHMEM64 //!< Optimize caching in shared memory (avoid bank conflicts) in case of oversampling

123

//#endif

100

124

101

125

102

126

//#define HST_NEWCACHE_UNPAD //!< Add if and remove padding in the shmem cache (low efficiency)

338

362

# define SHMEM_LOADS (3 * SHMEM_SAMPLING * (PPT / 2))

339

363

340

364

# if defined(HST_WARPLINE)&&!defined(HST_HALF_MODE)&&((SHMEM_SAMPLING * PPT) >= 4)/*&&(PROJ_BLOCK <= 8)*/

341

# if defined(HST_SHMEM64)&&((SHMEM_SAMPLING * PPT) >= 8)&&(SLICE_BLOCK == 1)&&defined(SIMPLE_SHMEM)

365

# if defined(HST_SHMEM64)&&((SHMEM_SAMPLING * PPT) >= 4)&&(SLICE_BLOCK == 1)&&defined(SIMPLE_SHMEM)

342

366

# define SHMEM_LOAD2 /* Combine two texture fetches to 1 64-bit shmem store */

343

367

# define SHMEM_LOAD_FULLWARP ((SHMEM_SAMPLING * PPT) / 4) /* Counts number of rows we can load simultaneously */

344

368

# define SHMEM_LOAD_ITERATIONS (SHMEM_LOADS / SHMEM_LOAD_FULLWARP / 2) /* Use 32-threads (or more) for loading data, this limits projections to 8 */

1007

1031

// const int pidx = CACHE_IDX(p);

1008

1032

// const float subh = cache_subh[pidx/4][sidx][pidx%4];

1009

1033

// float subh = cache_subh[CACHE_IDX(pblock * P_BLOCK + p)];

1010

const float *subh = (float*)&cache_subhx[CACHE_IDX(p)][sidx];

1034

__local const float *subh = (__local float*)(&cache_subhx[CACHE_IDX(p)][sidx]);

1011

1035

# else

1012

1036

float subh = cache_subh[CACHE_IDX(p)];

1013

1037

# endif // ! CACHE_SUBH_X

Older »