/*
 This is an attempt to use __shfl to cache parts of the shared-memory cache in registers.

 We go by half-warps. Each half-warp is responsible for a dual area of (4x4) * (4x4)
 pixels (vertically stacked), i.e. we have a 32x16 block of pixels per half-warp. The full
 warp encloses a 32x32 square. Now we are using the full warp to cache the required part of
 the cache for each of the half-warps. A bit of doubling, but it may be expected to pay off.
 There is no way around it: 32x32 would require 48 positions and we can't store that many.
 Likewise, processing only half-warps would require 24 positions, but we would have only 16.

 A possible benefit of the approach is a reduction in LD-unit usage and a possible synergy
 with the texture-fetch algorithm.

 This code works, somehow, but it is slow and probably has some artifacts. Maybe it is
 possible to optimize the shared-memory accesses somehow...
*/
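
/*
 For illustration, a minimal standalone sketch of the register-caching idea (a
 hypothetical demo kernel, not part of the reconstruction itself): each lane of a
 warp performs a single load into a register, after which any lane can read any
 other lane's register via __shfl instead of issuing another memory load.
*/
__global__ static void shfl_cache_demo(const float *src, float *dst) {
    const int lane = threadIdx.x & 31;   // lane id within the warp
    float cached = src[lane];            // one load per lane: 32 values now live in registers
    // read the value cached by another lane; no shared/global memory traffic involved
    float other = __shfl(cached, (lane + 5) & 31, 32);
    dst[threadIdx.x] = other;
}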
__global__ static void hst_cuda_linear_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
    float4 res = {0.f, 0.f, 0.f, 0.f};

    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;

    const int bidx = 2 * blockIdx.x * BLOCK_SIZE_X;
    const int bidy = batch + 2 * blockIdx.y * BLOCK_SIZE_Y;

    const int idx = bidx + 2 * tidx;
    const int idy = bidy + 2 * tidy;

    const float bx = bidx + apos_off_x;
    const float by = bidy + apos_off_y;

    const float x = idx + apos_off_x;
    const float y = idy + apos_off_y;
    __shared__ float cache_subh[2 * BLOCK_SIZE_X][BLOCK_SIZE_Y + 1];
    __shared__ float cache_sin[BLOCK_SIZE_Y];
    __shared__ float2 cache[BLOCK_SIZE_Y][3 * BLOCK_SIZE_X + 1];
    float projf = tidy + 0.5f;
    for (int proje = 0; proje < num_proj; proje += BLOCK_SIZE_Y) {
        float4 all = c_all[proje + tidy];
        int minh = (int)floor(all.z + bx * all.x - by * all.y + all.w);

        float subh = all.z + x * all.x - minh;
        cache_subh[2 * tidx][tidy] = subh;
        cache_subh[2 * tidx + 1][tidy] = subh + all.x;
        if (tidx == 0) cache_sin[tidy] = all.y;
        for (int i = 0; i < 3; i++) {
            int pos = i * BLOCK_SIZE_X + tidx;
            cache[tidy][pos].x = tex2D(tex_projes, minh + pos, proje + projf);
        }
        __syncthreads();

        // forward differences for linear interpolation: .y = next sample - this sample
        for (int i = 0; i < 3; i++) {
            int pos = i * BLOCK_SIZE_X + tidx;
            cache[tidy][pos].y = cache[tidy][pos + 1].x - cache[tidy][pos].x;
        }
        __syncthreads();
        for (int i = 0; i < BLOCK_SIZE_Y; i++) {
            float2 c;
            float4 v1, v2;
            // fractional bin positions of the four pixels handled by this thread
            // (this float4 minh shadows the integer minh above)
            float4 minh = make_float4(
                cache_subh[2 * tidx][i] - y * cache_sin[i],
                cache_subh[2 * tidx + 1][i] - y * cache_sin[i],
                cache_subh[2 * tidx][i] - (y + 1) * cache_sin[i],
                cache_subh[2 * tidx + 1][i] - (y + 1) * cache_sin[i]);

            c = cache[i][(int)minh.x];
            v1.x = c.x; v2.x = c.y;
            c = cache[i][(int)minh.y];
            v1.y = c.x; v2.y = c.y;
            c = cache[i][(int)minh.z];
            v1.z = c.x; v2.z = c.y;
            c = cache[i][(int)minh.w];
            v1.w = c.x; v2.w = c.y;

            res += v1 + (minh - floorf(minh)) * v2;
        }
        __syncthreads();
    }
    *(float2*)(d_SLICE + BLOCK_SIZE_X * 2 * gridDim.x * (idy    ) + idx) = make_float2(res.x, res.y);
    *(float2*)(d_SLICE + BLOCK_SIZE_X * 2 * gridDim.x * (idy + 1) + idx) = make_float2(res.z, res.w);
}
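
/*
 The accumulation above is plain linear interpolation against the value /
 forward-difference pairs stored in the cache. A scalar sketch of that single
 step (a hypothetical helper, not used by the kernels):
*/
__device__ static float interp_linear(const float2 *line, float h) {
    int i = (int)h;                      // integer sample index (h is non-negative here)
    float frac = h - floorf(h);          // fractional offset within the bin
    return line[i].x + frac * line[i].y; // .y caches line[i+1].x - line[i].x
}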
__device__ static void hst_cuda_mplinear_kernel_2(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
    __shared__ float f_minh[128];
    __shared__ float2 cache_offset[2][2][2][32];   // wrapy * wrapx * halfwrap * proj
    __shared__ uchar2 cache_ioffset[2][2][2][32];  // wrapy * wrapx * halfwrap * proj
    __shared__ float2 cache[IPT * PROJ_LINE];

    float res1[2 * PPT][PPT] = {0.f};
    // float res2[PPT][PPT] = {0.f};
    const int bidx = PPT * blockIdx.x * BLOCK_SIZE_X;
    const int bidy = batch + PPT * blockIdx.y * BLOCK_SIZE_Y;

    const float bx = bidx + apos_off_x;
    const float by = bidy + apos_off_y;

    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;

    const int halfwrap = tidy & 1;
    const int wrapid = halfwrap * 16 + tidx;
    const int wrapx = (tidy / 2) % 2;
    const int wrapy = (tidy / 2) / 2;

    // block-wide minh computation
    const int ttidx = 16 * tidy + tidx;

    const int tbidx = tidy % (PPT / 2);
    const int tbidy = tidy / (PPT / 2);

    // sidx computation and offset caching
    const int sbidx = 4 * (tidy % 4);
    const int sbidy = 4 * (tidy / 4);

    // sidx computation only
    const int stidx = tidx % 4;
    const int stidy = tidx / 4;

    // idx computation only
    const int sidx = sbidx + stidx;
    const int sidy = 2 * sbidy + stidy;

    // x computation and write-out
    const int idx = bidx + PPT * sidx;
    const int idy = bidy + PPT * sidy;

    const float x = idx + apos_off_x;
    const float y = idy + apos_off_y;
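
    /*
     Index roles, as far as they can be read off the arithmetic above (assuming
     the 16x8 thread block implied by ttidx = 16 * tidy + tidx and f_minh[128]):
     two consecutive tidy rows form one 32-thread warp; halfwrap selects the
     half-warp within it and wrapid is the lane id. wrapx/wrapy address one of
     the four warps of the block, matching the first two dimensions of
     cache_offset / cache_ioffset.
    */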
    // const int sbidy = 4 * ((tidy / 2) / 2);
    // const float sbx = bx + BLOCK_SIZE_X * sbidx;
    // const float sby = by + BLOCK_SIZE_Y * sbidy;
    // const int stidy = (2 * (tidy&1) + tidx / 8);
    // const int idx = bidx + PPT * sidx;
    // const int idy = bidy + PPT * sidy;
    const float exp23 = exp2(23.f);

    float projf = tbidy + 0.5f;
    const int num_proj_blocks = num_proj / 32;

    for (int proj_block = 0; proj_block < num_proj_blocks; proj_block += 4) {
        const int proj = proj_block * 32;

        float4 all = c_all[proj + ttidx];
        float minh = floor(all.z + bx * all.x - by * all.y + all.w);
        f_minh[ttidx] = minh;

        __syncthreads();

        // every warp mirrors all 128 minh values into its registers, 4 per lane
        float fminh[4];
        for (int i = 0; i < 4; i++) {
            fminh[i] = f_minh[32 * i + wrapid];
        }

        int max_proj = min(num_proj_blocks - proj_block, 4);
        for (int subproj32 = 0; subproj32 < max_proj; subproj32++) {
            // all.w/4 to compensate smaller size
            float4 all = c_all[proj + 32 * subproj32 + wrapid];

            float lminh = __shfl(fminh[subproj32], wrapid, 32);

            float2 offsets1 = make_float2(
                floorf(all.z + (bx + 4 * PPT * (2 * wrapx + 0)) * all.x - (by + 4 * PPT * (2 * wrapy + 0)) * all.y + (0.25f * all.w) - lminh),
                floorf(all.z + (bx + 4 * PPT * (2 * wrapx + 0)) * all.x - (by + 4 * PPT * (2 * wrapy + 1)) * all.y + (0.25f * all.w) - lminh));

            float2 offsets2 = make_float2(
                floorf(all.z + (bx + 4 * PPT * (2 * wrapx + 1)) * all.x - (by + 4 * PPT * (2 * wrapy + 0)) * all.y + (0.25f * all.w) - lminh),
                floorf(all.z + (bx + 4 * PPT * (2 * wrapx + 1)) * all.x - (by + 4 * PPT * (2 * wrapy + 1)) * all.y + (0.25f * all.w) - lminh));

            cache_offset[wrapy][wrapx][0][wrapid] = offsets1;
            cache_offset[wrapy][wrapx][1][wrapid] = offsets2;

            cache_ioffset[wrapy][wrapx][0][wrapid] = make_uchar2(
                (unsigned char)floor(offsets1.x),
                (unsigned char)floor(offsets1.y));

            cache_ioffset[wrapy][wrapx][1][wrapid] = make_uchar2(
                (unsigned char)floor(offsets2.x),
                (unsigned char)floor(offsets2.y));
            __syncthreads();

            for (int subproj = 0; subproj < (32 / IPT); subproj++) {
                const int loop_proj = 32 * subproj32 + IPT * subproj;
                const int proje = proj + loop_proj;

                float4 all = c_all[proje + tbidy];

                int minh = (int)__shfl(fminh[subproj32], IPT * subproj + tbidy, 32);

                for (int i = 0; i < 3; i++) {
                    int pos = (i * (PPT / 2) + tbidx) * BLOCK_SIZE_X + tidx;
                    cache[PROJ_LINE * tbidy + pos].x = tex2D(tex_projes, minh + pos, proje + projf);
                }

                __syncthreads(); // we may use a fence instead

                // forward differences for linear interpolation: .y = next sample - this sample
                for (int i = 0; i < 3; i++) {
                    int pos = (i * (PPT / 2) + tbidx) * BLOCK_SIZE_X + tidx;
                    cache[tbidy * PROJ_LINE + pos].y = cache[tbidy * PROJ_LINE + pos + 1].x - cache[tbidy * PROJ_LINE + pos].x;
                }

                __syncthreads();
                #pragma unroll 4 // IPT
                for (int p = 0; p < IPT; p++) {
                    float4 all = c_all[proje + p];

                    float minh = __shfl(fminh[subproj32], IPT * subproj + p, 32);

                    // try to make an intermediate caching step in variables
                    float2 offsets_my      = cache_offset[wrapy][wrapx][halfwrap][IPT * subproj];
                    // kept for the __shfl_xor cross-half-warp variant (commented out below)
                    float2 offsets_foreign = cache_offset[wrapy][wrapx][!halfwrap][IPT * subproj];
                    uchar2 ioffsets1 = cache_ioffset[wrapy][wrapx][0][IPT * subproj];
                    uchar2 ioffsets2 = cache_ioffset[wrapy][wrapx][1][IPT * subproj];

                    float2 lcache1 = make_float2(
                        cache[p * PROJ_LINE + ioffsets1.x + wrapid].x,
                        cache[p * PROJ_LINE + ioffsets1.y + wrapid].x);

                    float2 lcache2 = make_float2(
                        cache[p * PROJ_LINE + ioffsets2.x + wrapid].x,
                        cache[p * PROJ_LINE + ioffsets2.y + wrapid].x);
                    float h = all.z + x * all.x - y * all.y - minh;

                    #pragma unroll 4 // PPT
                    for (int i = 0; i < PPT; i++) {
                        #pragma unroll 4 // PPT
                        for (int j = 0; j < PPT; j++) {
                            float subh1 = h - (i          ) * all.y + j * all.x - offsets_my.x;
                            float subh2 = h - (i + 4 * PPT) * all.y + j * all.x - offsets_my.y;
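
                            /*
                             The exp23 additions below are the classic float rounding
                             trick: for 0 <= subh < 2^23, subh + 2^23 has a fixed
                             exponent, so the integer part of subh lands in the low
                             mantissa bits. Subtracting 0x4B000000 (the bit pattern of
                             2^23f) from the raw bits yields that integer, and
                             subh - ((subh + 2^23) - 2^23) recovers the fractional part.
                            */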
                            float fsubh1 = subh1 + exp23;
                            int idx1 = (*(int*)(&fsubh1)) - 0x4B000000;
                            fsubh1 = subh1 - (fsubh1 - exp23);

                            float fsubh2 = subh2 + exp23;
                            int idx2 = (*(int*)(&fsubh2)) - 0x4B000000;
                            fsubh2 = subh2 - (fsubh2 - exp23);

                            // int foreign1 = __shfl_xor(idx1, 0x10, 32);
                            // int foreign2 = __shfl_xor(idx2, 0x10, 32);
                            float2 c1, c2;
                            c1.x = __shfl(lcache1.x, idx1, 32);
                            c2.x = __shfl(lcache2.x, idx1, 32);

                            c1.y = __shfl(lcache1.y, idx2, 32);
                            c2.y = __shfl(lcache2.y, idx2, 32);

                            //c = halfwrap?c2:c1;

                            res1[i][j] += (halfwrap) ? c2.x : c1.x;
                            res1[4 + i][j] += (halfwrap) ? c2.y : c1.y;
                            /*
                             Direct shared-memory variant of the same accumulation; it
                             writes the same res1 cells as the __shfl path above, so
                             only one of the two should be enabled at a time:

                            float2 c1 = cache[p * PROJ_LINE + idx1];
                            res1[i][j] += c1.x + fsubh1 * c1.y;

                            float2 c2 = cache[p * PROJ_LINE + idx2];
                            res1[4 + i][j] += c2.x + fsubh2 * c2.y;
                            */
                        }
                    }
                }
                __syncthreads();
            }
        }
    }

    float4 *result = (float4*)(d_SLICE + PPT * BLOCK_SIZE_X * gridDim.x * idy + idx);
    for (int i = 0; i < PPT; i++) {
        result[i * gridDim.x * BLOCK_SIZE_X] = *(float4*)&res1[i][0];
        result[(i + 16) * gridDim.x * BLOCK_SIZE_X] = *(float4*)&res1[4 + i][0]; // 16 = 4 * PPT rows below
    }
}
__global__ static void hst_cuda_mplinear_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
    const int mode = 0; //((gridDim.x * blockIdx.y + blockIdx.x)/8) % 2;

    if (mode)
        hst_cuda_mplinear_kernel_1(num_proj, num_bins, d_SLICE, apos_off_x, apos_off_y, batch);
    else
        hst_cuda_mplinear_kernel_2(num_proj, num_bins, d_SLICE, apos_off_x, apos_off_y, batch);
}
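
/*
 A possible host-side launch, as a sketch only. The launcher name is hypothetical,
 and the block shape (16, 8) and the per-block pixel footprint of
 (PPT * BLOCK_SIZE_X) x (PPT * BLOCK_SIZE_Y) are assumptions inferred from
 ttidx = 16 * tidy + tidx, the 128-entry f_minh array, and the indexing above.
*/
static void hst_cuda_launch_mplinear(int num_proj, int num_bins, float *d_slice,
        float off_x, float off_y, int batch, int size_x, int size_y) {
    dim3 threads(16, 8);   // assumed block shape: 128 threads
    dim3 blocks(size_x / (PPT * BLOCK_SIZE_X), size_y / (PPT * BLOCK_SIZE_Y));
    hst_cuda_mplinear_kernel<<<blocks, threads>>>(num_proj, num_bins, d_slice, off_x, off_y, batch);
}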