/tomo/pyhst : revision 148

To get this branch, use:

bzr branch
http://darksoft.org/webbzr/tomo/pyhst

« back to all changes in this revision

Viewing changes to docs/optimizations/kepler/hst_tex_uniform.h

Committer: Suren A. Chilingaryan
Date: 2012-05-10 15:06:33 UTC
Revision ID: csa@dside.dyndns.org-20120510150633-56gdy6t3tflz2gab

OpenCL clean-up

files added:
docs/optimizations/gcn/hst_opencl_dma_8x8_6ppt.cl

docs/optimizations/gcn/subblocks.txt

docs/optimizations/kepler

docs/optimizations/kepler/approximation.txt

docs/optimizations/kepler/hst_linear_and_tex.h

docs/optimizations/kepler/hst_linear_art.h

docs/optimizations/kepler/hst_linear_good.h

docs/optimizations/kepler/hst_linear_multivar.h

docs/optimizations/kepler/hst_tex_uniform.h

tools

tools/gen.sh

files renamed:
docs/optimizations/sources/ => docs/optimizations/fermi/

hst_opencl/docs/ => docs/optimizations/gcn/

files modified:
hst_opencl/hst_opencl.c

hst_opencl/hst_opencl_kernel.h

hst_opencl/hst_opencl_kernels.cl

Show diffs side-by-side

added added

removed removed

docs/optimizations/kepler/hst_tex_uniform.h

We are trying to fetch along both the bin and proj axes of the projection within each block (assuming

the GPU is optimized for this use case). Minor modification: going by 8 projections only and using

32 threads for acquiring spatial data 8x4. Not finished, some artifacts present.

// Experimental kernel: for groups of 8 projections it samples the 2D texture
// `tex_projes`, accumulates the samples in per-thread registers, exchanges the
// partial sums through shared memory plus a warp shuffle, and stores values
// from the shared array `fin` into d_SLICE.
//
// Parameters:
//   num_proj   - upper bound of the projection loop (stepped by 8).
//   num_bins   - not referenced anywhere in the visible body.
//   d_SLICE    - output buffer; written at BLOCK_SIZE_X * gridDim.x * tidy + tidx.
//   apos_off_x - added to the integer x index before it is used in the
//                texture coordinate.
//   apos_off_y - same as apos_off_x, for the y index.
//   batch      - added to the y block origin (bidy).
//
// Relies on names defined outside this block: PPT, BLOCK_SIZE_X, BLOCK_SIZE_Y,
// c_all (indexed as an array of float4) and tex_projes.
//
// NOTE(review): as shown, the braces of this kernel are not balanced (see the
// notes further down) and bare numbers appear between statements, so this text
// does not compile as-is. The surrounding description calls it "not finished".
__global__ static void hst_cuda_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {

float h;

// Per-thread accumulators, zero-initialised: 4 rows x 2 columns.
float res[4][2] = {0};

// Shared-memory layout depends on the target: the inner dimension of `buf`
// and `fin` is one element wider when HST_OPTIMIZE_KEPLER is defined.
// `fill` is declared but never referenced in the visible body.
#ifdef HST_OPTIMIZE_KEPLER

__shared__ float buf[8][34]; // 64b for Kepler

__shared__ float fill[48];

__shared__ float fin[16][18];

#else /* HST_OPTIMIZE_KEPLER */

__shared__ float buf[8][33]; // 32b for Fermi & GT200

__shared__ float fill[56];

__shared__ float fin[16][17];

#endif /* HST_OPTIMIZE_KEPLER */

const int tidx = threadIdx.x;

const int tidy = threadIdx.y;

// Origin of this block's output area; each block spans PPT * BLOCK_SIZE_* in
// each direction, and `batch` shifts the y origin.
const int bidx = PPT * blockIdx.x * BLOCK_SIZE_X;

const int bidy = batch + PPT * blockIdx.y * BLOCK_SIZE_Y;

// Position of this thread inside the block's area: sidx is 0..7, sidy is an
// even offset built from the low bit of tidy and from tidx / 8.
const int sidx = tidx % 8;

const int sidy = 2 * (2 * (tidy%2) + tidx / 8);

const int idx = bidx + sidx;

const int idy = bidy + sidy;

const float x = idx + apos_off_x;

const float y = idy + apos_off_y;

// Row coordinate inside the current group of projections; the 0.5f offset
// presumably targets the texel centre - TODO confirm against the texture setup.
const float projf = tidy + 0.5f;

// const int idx = blockIdx.x * BLOCK_SIZE_X + threadIdx.x;

// const int idy = blockIdx.y * BLOCK_SIZE_Y + threadIdx.y + batch;

// Walk over the projections in groups of 8; threadIdx.y selects the entry of
// c_all used within the group.
// NOTE(review): c_all[proje + tidy] stays inside the group only if tidy < 8,
// while later expressions (tidy / 8, fin[tidy][...]) suggest a larger
// blockDim.y - confirm the launch configuration.
for (int proje=0; proje<num_proj; proje+=8) {

const float4 all = c_all[proje+tidy];

// Base texture x-coordinate for this thread's (x, y) under this projection.
h = all.z + x * all.x - y * all.y;

#pragma unroll 2

for (int i = 0; i < 2; i++) {

#pragma unroll 2

for (int j = 0; j < 2; j++) {

// Shift the base coordinate by 8 steps in x (j) and 8 steps in y (i).
float subh = h + 8 * j * all.x - 8 * i * all.y;

res[2 * i][j] += tex2D(tex_projes, subh, proje + projf);

// Second sample is one further step in y (subtracts all.y once more).
subh -= all.y;

res[2 * i + 1][j] += tex2D(tex_projes, subh, proje + projf);

// d_SLICE[BLOCK_SIZE_X * gridDim.x * (idy + i * 4) + idx + j * 4] = res[i][j];

}

// NOTE(review): only the j-loop is closed above; the i-loop opened earlier has
// no matching '}' before the declarations below - confirm the intended scope.

// Index pairs used to stage values through `buf`: (iny, inx) is where this
// thread writes, (outy, outx) is where it reads back after the barrier.
const int inx = 16 * (tidy%2) + tidx;

const int iny = tidy / 2;

const int outx = iny + 8 * (inx%4);

const int outy = inx/4;

// Offsets combined with i and j below to form the destination cell in `fin`.
const int finx = (tidy%2);

const int finy = (2 * (tidx%2) + tidy/8);

#pragma unroll 4

for (int i = 0; i < 4; i++) {

// NOTE(review): the unroll factor is 4 but this loop runs for j = 0..1 only.
#pragma unroll 4

for (int j = 0; j < 2; j++) {

buf[iny][inx] = res[i][j];

// Barrier between the shared-memory write above and the read below.
__syncthreads();

float val = buf[outy][outx];

// Butterfly sum with XOR offsets 8, 4, 2, 1 and a shuffle width of 8. This
// inner `i` shadows the outer loop index only for the single statement below.
// NOTE(review): the first offset (8) equals the width (8), and __shfl_xor is
// the legacy mask-less intrinsic - verify both against the CUDA shuffle docs.
for (int i=8; i>=1; i/=2)

val += __shfl_xor(val, i, 8);

// Destination cell in `fin`; `i` here is the outer loop index again.
const int rx = 8 * j + finx;

const int ry = 4 * i + finy;

// Only threads with threadIdx.x == 0 store the value.
if (!tidx) {

fin[ry][rx] = val;

}

// Second barrier: `buf` is overwritten again on the next iteration.
__syncthreads();

}

// NOTE(review): the outer i-loop of the exchange stage is not closed here
// either.

#pragma unroll 4

for (int i = 0; i < 4; i++) {

#pragma unroll 4

for (int j = 0; j < 4; j++) {

// NOTE(review): neither i nor j appears in this statement, so every iteration
// performs the same store - presumably unfinished; confirm the intended
// indexing.
d_SLICE[BLOCK_SIZE_X * gridDim.x * tidy + tidx] = fin[tidy][tidx];

}

// NOTE(review): the bare numbers 100-103 below look like line-number residue
// from the web diff view this text was taken from, not code.
100

}

101

102

// d_SLICE[ BLOCK_SIZE_X*gridDim.x*idy + idx ] = res;

103

}

Older »