/tomo/pyhst : revision 188

To get this branch, use:

bzr branch
http://darksoft.org/webbzr/tomo/pyhst

« back to all changes in this revision

Viewing changes to hst_cuda/hst_cuda_test_kernels.h

Committer: Suren A. Chilingaryan
Date: 2017-08-28 01:51:13 UTC
Revision ID: csa@suren.me-20170828015113-5doek365s2330y4r

NewTex kernel

files added:
hst_cuda/hst_cuda_test_kernels.h

files modified:
debug.c

debug.h

hst_cuda/CMakeLists.txt

hst_cuda/hst_cuda.cu

hst_cuda/hst_cuda_bp_kernels.h

hst_cuda/hst_cuda_defines.h

hst_cuda/hst_cuda_kernels.h

Show diffs side-by-side

added added

removed removed

hst_cuda/hst_cuda_test_kernels.h

#if SLICE_BLOCK == 1

/*
 * Test back-projection kernel.
 *
 * Each thread accumulates a private 4x4 tile of texture samples over the
 * projections, the partial sums are then reduced across the block through
 * shared memory, and every thread finally writes one value to d_SLICE.
 *
 * Launch layout: the indexing below (tidx%4, tidx/4, buf[tidx][tidy],
 * fin[tidy][tidx] with 16-entry dimensions) assumes a 16x16 thread block --
 * TODO confirm against the launch site.  blockIdx.x / blockIdx.y are scaled by
 * BLOCK_SIZE_X / BLOCK_SIZE_Y (defined elsewhere).
 *
 * Parameters:
 *   texptr      texture object forwarded to hst_tex() (helper defined elsewhere)
 *   num_proj    number of projections; the loop advances in steps of 16 and
 *               reads c_all[proje + tidy], so num_proj is presumably expected
 *               to be a multiple of 16 (or c_all padded) -- verify against caller
 *   num_bins    unused in this kernel
 *   d_SLICE     output buffer, addressed with a row length of
 *               BLOCK_SIZE_X * gridDim.x; there is no bounds check on the store
 *   apos_off_x  offset added to the x coordinate
 *   apos_off_y  offset added to the y coordinate
 *   batch       offset added to the block's starting row
 *
 * c_all is an array of float4 declared elsewhere; only its .x, .y and .z
 * members are used here.
 */
__global__ static void hst_kepler_orig_kernel(cudaTextureObject_t texptr, int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
    float h;
    float res[4][4] = {0};

#ifdef HST_OPTIMIZE_KEPLER
    __shared__ float buf[16][18];   // 64b for kepler
//  __shared__ float fill[48];
    __shared__ float fin[16][18];
#else // HST_OPTIMIZE_KEPLER
    __shared__ float buf[16][17];   // 32b for Fermi & GT200
//  __shared__ float fill[56];
    __shared__ float fin[16][17];
#endif // HST_OPTIMIZE_KEPLER

    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;

    const int bidx = blockIdx.x * BLOCK_SIZE_X;
    const int bidy = batch + blockIdx.y * BLOCK_SIZE_Y;

    // threadIdx.x is split into a 4x4 position inside each sub-tile.
    const int sidx = tidx % 4;
    const int sidy = tidx / 4;

    const float x = bidx + sidx + apos_off_x;
    const float y = bidy + sidy + apos_off_y;

    // threadIdx.y selects the projection within the current group of 16;
    // +0.5f addresses the texel centre of that row.
    const float projf = tidy + 0.5f;

    // Accumulation: every thread sums its 4x4 tile for "its" projection of
    // each group of 16 projections.
    for (int proje = 0; proje < num_proj; proje += 16) {
        const float4 all = c_all[proje + tidy];
        h = all.z + x * all.x - y * all.y;

#pragma unroll 4
        for (int i = 0; i < 4; i++) {
#pragma unroll 4
            for (int j = 0; j < 4; j++) {
                // Position for the pixel shifted by (4*j, 4*i) from (x, y).
                float subh = h + 4 * j * all.x - 4 * i * all.y;
                res[i][j] += hst_tex(texptr, subh, proje + projf);
            }
        }
    }

    // Reduction: for each tile entry, transpose through shared memory so that
    // the 16 per-projection partial sums of one pixel sit in 16 consecutive
    // threadIdx.x lanes, then add them up.
#pragma unroll 4
    for (int i = 0; i < 4; i++) {
#pragma unroll 4
        for (int j = 0; j < 4; j++) {
            buf[tidx][tidy] = res[i][j];
            __syncthreads();

#ifdef HST_OPTIMIZE_KEPLER
            float val = buf[tidy][tidx];
            // Butterfly reduction inside 16-lane segments.  The first step
            // must be 8: with a segment width of 16, an XOR mask of 16 points
            // outside the segment, which makes lanes 0-15 add their own value
            // and lanes 16-31 add values belonging to the other segment.
            for (int k = 8; k >= 1; k /= 2) {
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 9000)
                // All 32 lanes of the warp reach this point, hence the full mask.
                val += __shfl_xor_sync(0xffffffffu, val, k, 16);
#else
                val += __shfl_xor(val, k, 16);
#endif
            }
#else // HST_OPTIMIZE_KEPLER
            // Shared-memory tree reduction over one row of buf; relies on the
            // threads of a row executing in lock-step (no barrier in between).
            volatile float *ptr = &buf[tidy][0];
            for (int k = 8; k >= 1; k /= 2) {
                // Only the lower k lanes take part, so ptr[tidx + k] never
                // leaves the 16 valid entries of the row.
                if (tidx < k)
                    ptr[tidx] += ptr[tidx + k];
            }
            float val = ptr[0];
#endif // HST_OPTIMIZE_KEPLER

            // After the transpose, tidy identifies the thread that owned the
            // partial sums; map it back to its pixel inside the 16x16 tile.
            const int rx = 4 * j + tidy % 4;
            const int ry = 4 * i + tidy / 4;

            if (!tidx) {
                fin[ry][rx] = val;
            }

            // Also protects buf against being overwritten by the next iteration.
            __syncthreads();
        }
    }

    const int idx = bidx + tidx;
    const int idy = bidy + tidy;

    d_SLICE[BLOCK_SIZE_X * gridDim.x * idy + idx] = fin[tidy][tidx];
}

#endif

\ No newline at end of file

Older »