2
#define subh_cache(subh) cache[(int)(subh)]
6
__kernel void hst_backproject_test_oversample4_1(const int num_proj,
10
__read_only image2d_t sinogram,
11
__global float *slice,
12
__constant float4 *c_all,
16
const int tidx = get_local_id(0);
17
const int tidy = get_local_id(1);
19
const int bidx = PPT * get_group_id(0) * BLOCK_SIZE;
20
const int bidy = PPT * (get_global_id(1) - get_local_id(1) - get_group_id(1) * BLOCK_SIZE);
22
const float bx = bidx + off_x;
23
const float by = bidy + off_y;
26
const int sbidx = tidy % 4;
27
const int sbidy = tidy / 4;
29
const int stidx = tidx % 4;
30
const int stidy = tidx / 4;
32
const int sidx = (sbidx * 4 + stidx);
33
const int sidy = (sbidy * 4 + stidy);
36
const int sidx = tidx;
37
const int sidy = tidy - BLOCK_SIZE;
40
const float sx = sidx;
41
const float sy = sidy;
43
const int idx = bidx + sidx;
44
const int idy = bidy + sidy;
46
const int slice_width = PPT * get_global_size(0);
49
int offset = 5 * BLOCK_SIZE * BLOCK_SIZE;
50
float4 all = c_all[0];
52
if (tidy < BLOCK_SIZE) {
53
float minh = floor(all.z + bx * all.x - by * all.y + all.w);
56
for (int i = 0; i < 5; i++) {
57
int pos = i * BLOCK_SIZE * BLOCK_SIZE + tidy * BLOCK_SIZE + tidx;
58
cache[pos] = read_imagef(sinogram, volumeSampler, (float2)(minh + 0.25f*pos, .5f)).x;
62
float res[PPT][PPT] = {0};
63
for (int proj = 0; proj < num_proj; proj++) {
64
if (get_local_id(1) >= BLOCK_SIZE) {
68
float subh = mad(sx, all.x, mad(sy, all.y, all.w));
72
for (int i = 0; i < PPT; i++) {
74
for (int j = 0; j < PPT; j++) {
75
res[i][j] += subh_cache(buf * offset + subh + (i * BLOCK_SIZE) * all.y + (j * BLOCK_SIZE) * all.x);
81
if (proj == num_proj) break;
83
all = c_all[proj + 1];
84
if (tidy < BLOCK_SIZE) {
85
float minh = floor(all.z + bx * all.x - by * all.y + all.w);
88
for (int i = 0; i < 5; i++) {
89
int pos = i * BLOCK_SIZE * BLOCK_SIZE + tidy * BLOCK_SIZE + tidx;
90
cache[buf * offset + pos] = read_imagef(sinogram, volumeSampler, (float2)(minh + 0.25f*pos, proj + 1.5f)).x;
94
barrier(CLK_LOCAL_MEM_FENCE);
97
if (get_local_id(1) >= BLOCK_SIZE) {
99
for (int i = 0; i < PPT; i++) {
101
for (int j = 0; j < PPT; j++) {
102
slice[(idy + BLOCK_SIZE * i) * slice_width + idx + BLOCK_SIZE * j] = res[i][j];
109
__kernel void hst_backproject_test_oversample4_2(const int num_proj,
113
__read_only image2d_t sinogram,
114
__global float *slice,
115
__constant float4 *c_all,
116
__local float *shared
119
const int tidx = get_local_id(0);
120
const int tidy = get_local_id(1);
122
const int sidx = tidx;
123
const int sidy = tidy;
125
const int idx = PPT * get_group_id(0) * BLOCK_SIZE + sidx;
126
const int idy = PPT * (get_global_id(1) - get_local_id(1) - get_group_id(1) * BLOCK_SIZE) + sidy;
128
const float x = idx + off_x;
129
const float y = idy + off_y;
131
const int slice_width = PPT * get_global_size(0);
133
float res[PPT][PPT] = {0};
134
for (int proj = 0; proj < num_proj; proj++) {
135
float4 all = c_all[proj];
137
float subh = mad(x, all.x, mad(y, all.y, all.z));
140
for (int i = 0; i < PPT; i++) {
142
for (int j = 0; j < PPT; j++) {
143
res[i][j] += read_imagef(sinogram, volumeSampler, (float2)(subh + i * BLOCK_SIZE * all.y + j * BLOCK_SIZE * all.x, proj + .5f)).x;
149
for (int i = 0; i < PPT; i++) {
151
for (int j = 0; j < PPT; j++) {
152
slice[(idy + BLOCK_SIZE * i) * slice_width + idx + BLOCK_SIZE * j] = res[i][j];
159
__kernel void hst_backproject_test_oversample4(const int num_proj,
163
__read_only image2d_t sinogram,
164
__global float *slice,
165
__constant float4 *c_all,
166
__local float *cache) {
168
const int mode = 4;//(get_group_id(1) * get_num_groups(0) + get_group_id(0)) % 8;
170
if (get_local_id(1) < BLOCK_SIZE) {
171
hst_backproject_test_oversample4_2(num_proj, num_bins, off_x, off_y, sinogram, slice, c_all, cache);
174
hst_backproject_test_oversample4_1(num_proj, num_bins, off_x, off_y, sinogram, slice, c_all, cache);