4
//#define PROJ_LINE (2 * IPT * PPT * 3)
5
#define PROJ_LINE (PPT * BLOCK_SIZE_X * 3 / 2)
7
__global__ static void hst_cuda_mplinear_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
8
float res1[PPT][PPT] = {0.f};
9
float res2[PPT][PPT] = {0.f};
11
const int tidx = threadIdx.x;
12
const int tidy = threadIdx.y;
14
const int bidx = PPT * blockIdx.x * BLOCK_SIZE_X;
15
const int bidy = batch + PPT * blockIdx.y * BLOCK_SIZE_Y;
18
const float bx = bidx + apos_off_x;
19
const float by = bidy + apos_off_y;
21
__shared__ float f_minh[128];
22
__shared__ float2 cache[IPT * PROJ_LINE + 1];
24
const int tbidy = tidy/(PPT/2);
25
const int tbidx = tidy%(PPT/2);
27
const int wrapid = (tidy&1) * 16 + tidx;
29
const int sbidx = 8 * ((tidy / 2) % 2);
30
const int sbidy = 8 * ((tidy / 2) / 2);
32
const int stidx = tidx % 8;
33
const int stidy = 2 * (2 * (tidy&1) + tidx / 8);
35
const int sidx = (sbidx + stidx);
36
const int sidy = (sbidy + stidy);
38
const int idx = bidx + sidx;
39
const int idy = bidy + sidy;
41
const float x = idx + apos_off_x;
42
const float y = idy + apos_off_y;
44
const float exp23 = exp2(23.f);
46
const int ttidx = 16 * tidy + tidx;
48
float projf = tbidy + 0.5f;
50
const int num_proj_blocks = num_proj / 32;
52
for (int proj_block=0; proj_block<num_proj_blocks; proj_block += 4) {
53
const int proj = proj_block * 32;
55
float4 all = c_all[proj + ttidx];
56
float minh = floor(all.z + bx * all.x - by * all.y + all.w);
64
for (int i = 0; i < 4; i++) {
65
fminh[i] = f_minh[32 * i + wrapid];
68
int max_proj = min(num_proj_blocks - proj_block, 4);
71
for (int subproj32 = 0; subproj32 < max_proj; subproj32++) {
73
for (int subproj = 0; subproj < (32/IPT); subproj++) {
74
const int loop_proj = 32 * subproj32 + IPT * subproj;
75
const int proje = proj + loop_proj;
77
float4 all = c_all[proje + tbidy];
79
#ifdef HST_OPTIMIZE_KEPLER
80
int minh = __shfl(fminh[subproj32], IPT * subproj + tbidy, 32);
81
#else // HST_OPTIMIZE_KEPLER
82
int minh = i_minh[loop_proj + tbidy];
83
#endif // HST_OPTIMIZE_KEPLER
86
for (int i = 0; i < 3; i++) {
87
int pos = (i * (PPT/2) + tbidx) * BLOCK_SIZE_X + tidx;
89
cache[PROJ_LINE * tbidy + pos].x = tex2D(tex_projes, minh + pos, proje + projf);
92
// we may use fence instead
96
for (int i = 0; i < 3; i++) {
97
int pos = (i * (PPT/2) + tbidx) * BLOCK_SIZE_X + tidx;
98
cache[tbidy * PROJ_LINE + pos].y = cache[tbidy * PROJ_LINE + pos + 1].x - cache[tbidy * PROJ_LINE + pos].x;
103
#pragma unroll 4 // IPT
104
for (int p = 0; p < IPT; p++) {
105
float4 all = c_all[proje + p];
107
#ifdef HST_OPTIMIZE_KEPLER
108
float minh = __shfl(fminh[subproj32], IPT * subproj + p, 32);
109
#else // HST_OPTIMIZE_KEPLER
110
float minh = f_minh[loop_proj + p];
111
#endif // HST_OPTIMIZE_KEPLER
113
float h = all.z + x * all.x - y * all.y - minh;
116
char *cache_row = ((char*)(&cache)) + p * PROJ_LINE * sizeof(float2);
119
#pragma unroll 4 // PPT
120
for (int i = 0; i < PPT; i++) {
123
#pragma unroll 4 // PPT
124
for (int j = 0; j < PPT; j++) {
126
float subh2 = subh - all.y;
129
float fsubh1 = subh1 + exp23;
130
int idx1 = (*(int*)(&fsubh1)) - 0x4B000000;
131
fsubh1 = subh1 - (fsubh1 - exp23);
133
float fsubh2 = subh2 + exp23;
134
int idx2 = (*(int*)(&fsubh2)) - 0x4B000000;
135
fsubh2 = subh2 - (fsubh2 - exp23);
138
float2 c1 = cache[p * PROJ_LINE + idx1];
139
res1[i][j] += c1.x + fsubh1 * c1.y;
141
float2 c2 = cache[p * PROJ_LINE + idx2];
142
res2[i][j] += c2.x + fsubh2 * c2.y;
144
subh += 16.f * all.x;
155
for (int i = 0; i < PPT; i++) {
157
for (int j = 0; j < PPT; j++) {
158
d_SLICE[BLOCK_SIZE_X * PPT * gridDim.x * (idy + i * BLOCK_SIZE_Y) + idx + j * BLOCK_SIZE_X] = res1[i][j];
159
d_SLICE[BLOCK_SIZE_X * PPT * gridDim.x * (idy + i * BLOCK_SIZE_Y + 1) + idx + j * BLOCK_SIZE_X] = res2[i][j];