2
We are trying to fetch along the both bin & proj axes of the projection within each block (assuming
3
the GPU is optimized for this use case). Minor modification going by 8 projections only and using
4
32 threads for acquring spatial data 8x4. Not finished, some artifcats present.
8
__global__ static void hst_cuda_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
10
float res[4][2] = {0};
12
#ifdef HST_OPTIMIZE_KEPLER
13
__shared__ float buf[8][34]; // 64b for Kepler
14
__shared__ float fill[48];
15
__shared__ float fin[16][18];
16
#else /* HST_OPTIMIZE_KEPLER */
17
__shared__ float buf[8][33]; // 32b for Fermi & GT200
18
__shared__ float fill[56];
19
__shared__ float fin[16][17];
20
#endif /* HST_OPTIMIZE_KEPLER */
22
const int tidx = threadIdx.x;
23
const int tidy = threadIdx.y;
25
const int bidx = PPT * blockIdx.x * BLOCK_SIZE_X;
26
const int bidy = batch + PPT * blockIdx.y * BLOCK_SIZE_Y;
28
const int sidx = tidx % 8;
29
const int sidy = 2 * (2 * (tidy%2) + tidx / 8);
31
const int idx = bidx + sidx;
32
const int idy = bidy + sidy;
34
const float x = idx + apos_off_x;
35
const float y = idy + apos_off_y;
37
const float projf = tidy + 0.5f;
39
// const int idx = blockIdx.x * BLOCK_SIZE_X + threadIdx.x;
40
// const int idy = blockIdx.y * BLOCK_SIZE_Y + threadIdx.y + batch;
42
for (int proje=0; proje<num_proj; proje+=8) {
43
const float4 all = c_all[proje+tidy];
44
h = all.z + x * all.x - y * all.y;
47
for (int i = 0; i < 2; i++) {
49
for (int j = 0; j < 2; j++) {
50
float subh = h + 8 * j * all.x - 8 * i * all.y;
51
res[2 * i][j] += tex2D(tex_projes, subh, proje + projf);
53
res[2 * i + 1][j] += tex2D(tex_projes, subh, proje + projf);
54
// d_SLICE[BLOCK_SIZE_X * gridDim.x * (idy + i * 4) + idx + j * 4] = res[i][j];
60
const int inx = 16 * (tidy%2) + tidx;
61
const int iny = tidy / 2;
63
const int outx = iny + 8 * (inx%4);
64
const int outy = inx/4;
66
const int finx = (tidy%2);
67
const int finy = (2 * (tidx%2) + tidy/8);
70
for (int i = 0; i < 4; i++) {
72
for (int j = 0; j < 2; j++) {
73
buf[iny][inx] = res[i][j];
77
float val = buf[outy][outx];
79
for (int i=8; i>=1; i/=2)
80
val += __shfl_xor(val, i, 8);
82
const int rx = 8 * j + finx;
83
const int ry = 4 * i + finy;
95
for (int i = 0; i < 4; i++) {
97
for (int j = 0; j < 4; j++) {
98
d_SLICE[BLOCK_SIZE_X * gridDim.x * tidy + tidx] = fin[tidy][tidx];
102
// d_SLICE[ BLOCK_SIZE_X*gridDim.x*idy + idx ] = res;