/tomo/pyhst

To get this branch, use:
bzr branch http://darksoft.org/webbzr/tomo/pyhst
//#define DEBUG_OFFSETS 1	// set to 0 to still run the interpolation before dumping offsets, 1 to disable it

#undef KERNEL_NAME
#undef SHMEM_PADDING
#undef SHMEM_SHIFTED_PADDING
#undef SHMEM_ARRAYS
#undef SHMEM_SAMPLING
#undef SIMPLE_SHMEM
#undef SPLIT_SHMEM
#undef REMAP_THREADS
#undef SHMEM_LOAD_ITERATIONS
#undef OVERSAMPLE_MULTIPLIER
#undef GET_CACHE
#undef INTERPOLATE_FROM_CACHE
#undef MIN_BLOCKS
#undef CACHE
#undef FANCY_ROUND
#undef FANCY_MINH_CORRECTION
#undef SHMEM_LOAD_FULLWARP
#undef SHMEM_LOAD2
#undef CACHE_POS
#undef GET_SINO
#undef BSX
#undef BSY

#define BSX BS
#define BSY BS

#ifdef HST_CACHE_Y
# define CACHE_Y
#endif

#define HST_MACRO_JOIN(a, b) a ## b
#define MK_KERNEL_NAME(name, version) HST_MACRO_JOIN(name, version)
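/* Two-stage token pasting: MK_KERNEL_NAME expands its arguments (e.g. OVERSAMPLE) before
   HST_MACRO_JOIN concatenates them, so the result is hst_cuda_alu_oversample4 rather than
   hst_cuda_alu_oversampleOVERSAMPLE. */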

#ifdef OVERSAMPLE
# if OVERSAMPLE == 1
#  define KERNEL_NAME hst_cuda_alu_nn
# else
#  define KERNEL_NAME MK_KERNEL_NAME(hst_cuda_alu_oversample, OVERSAMPLE)
# endif
# define SHMEM_SAMPLING OVERSAMPLE			/**< Oversampling coefficient: how many points to sample per cell */
# undef CACHE_Y
# define MIN_BLOCKS HST_OVERS_MIN_BLOCKS
#else
# define KERNEL_NAME hst_cuda_alu_linear
# define SHMEM_SAMPLING 1
# define MIN_BLOCKS HST_LINEAR_MIN_BLOCKS
#endif
#define SHMEM_SHIFTED_PADDING 0

#ifdef CACHE_Y
# define SHMEM_ARRAYS 2
# define SHMEM_PADDING 1
#else
# define SIMPLE_SHMEM					/**< Use a simple ShMem array instead of float2 with 'x' and 'y-x' */
# define SHMEM_ARRAYS 1					/**< Number of shmem arrays (in non-SIMPLE mode); for interpolation we may need 2 */
# define SHMEM_PADDING 0				/**< We need an extra element at the end of the shmem array if we compute (y-x) */
#endif

#if SHMEM_SAMPLING > 1
# define REMAP_THREADS					/**< Localise accesses (needed for wide oversampling kernels to avoid cache misses) */
#endif

#if SHMEM_SAMPLING == 4 
# define OVERSAMPLE_MULTIPLIER 0.25f
#elif SHMEM_SAMPLING > 1
# define OVERSAMPLE_MULTIPLIER (1.f / OVERSAMPLE)
#else
# define OVERSAMPLE_MULTIPLIER 1
#endif
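/* OVERSAMPLE_MULTIPLIER converts a cache/sample index back into detector-bin units: with
   N-fold oversampling, N consecutive cache entries cover one bin, so positions are scaled
   by 1/N when fetching from the sinogram (0.25f is simply the explicit constant for the
   common 4x case). */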


#if PPT == 1
# if SHMEM_SAMPLING == 4
#  define SHMEM_LOADS (6)
# else
#  define SHMEM_LOADS (2 * SHMEM_SAMPLING)
# endif
# define SHMEM_LOAD_ITERATIONS SHMEM_LOADS
#else /* PPT = 4 */
# define SHMEM_LOADS (3 * SHMEM_SAMPLING)
# if defined(HST_SHMEM64)&&(SHMEM_SAMPLING >= 2)&&(PROJ_BLOCK <= 8)
#  if (SHMEM_SAMPLING >= 4)&&(SLICE_BLOCK == 1)
#    define SHMEM_LOAD_ITERATIONS (SHMEM_LOADS / 4)
#    define SHMEM_LOAD_FULLWARP					/* Use 32-threads for loading data, this limits projections to 8 */
#    define SHMEM_LOAD2						/* Combine two texture fetches to 1 64-bit shmem store */
#  else
#    define SHMEM_LOAD_ITERATIONS (SHMEM_LOADS / 2)
#    define SHMEM_LOAD_FULLWARP
#  endif
# else
#  define SHMEM_LOAD_ITERATIONS (SHMEM_LOADS)
# endif
#endif
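/* SHMEM_LOADS is the number of BSX-sized chunks of sinogram cached per projection row:
   the pixel tile handled by one block projects onto roughly (|cos|+|sin|) * PPT * BS
   detector bins, scaled by the oversampling factor (hence 6 rather than 8 for the 4x,
   PPT=1 case). SHMEM_LOAD_ITERATIONS is the resulting number of shared-memory stores per
   thread once full-warp loading and combined 64-bit stores (SHMEM_LOAD2) are accounted for. */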


#ifdef HST_TUNE_SHMEM
	/* This is pretty much useless. We optimize access to the ShMem cache during the writing phase, but
	this costs extra instructions (2 addresses to compute instead of 1).
	- On Fermi (and likely all other architectures) float1 is compute bound; for float* we enforce it
	anyway.
	- Besides, using 3D arrays is significantly faster than manual addressing in a linear array,
	so the clever tricks would not help either.
	- Shifted padding extends the array in a different (non-last) dimension to ensure no bank conflicts
	in 32-bit mode. */
# ifndef SIMPLE_SHMEM
#  if SLICE_BLOCK == 1
#    undef SHMEM_SHIFTED_PADDING
#    define SHMEM_SHIFTED_PADDING SHMEM_PADDING
#    undef SHMEM_PADDING
#    define SHMEM_PADDING 0
#  endif
#  define SIMPLE_SHMEM
# endif /* ! SIMPLE_SHMEM */
#endif

#if SLICE_BLOCK > 1
# define SIMPLE_SHMEM
# if SLICE_BLOCK == 4
#  ifndef  HST_OPTIMIZE_KEPLER
#   define SPLIT_SHMEM
#  endif
# endif
#endif


#ifdef SIMPLE_SHMEM
# define CACHE(proj, bin, c) cache[c][proj][bin]
#else 
# define CACHE(proj, bin, c) cache[proj][bin].comp(c)
#endif

#ifdef SIMPLE_SHMEM
# ifdef CACHE_Y
#  ifdef SPLIT_SHMEM
#   define GET_CACHE(v1, v2, minh, i) ({ int bin = (minh); float2 vxy, vzw; vxy = CACHE(i, bin, 0); vzw = CACHE(i, bin, 2); v1 = mkf4(vxy, vzw);  vxy = CACHE(i, bin, 1); vzw = CACHE(i, bin, 3); v2 = mkf4(vxy, vzw); })
#  else /* SPLIT_SHMEM */
#   define GET_CACHE(v1, v2, minh, i) ({ int bin = (minh); v1 = CACHE(i, bin, 0); v2 = CACHE(i, bin, 1); })
#  endif /* SPLIT_SHMEM */
# else
#  ifdef SPLIT_SHMEM
#   define GET_CACHE(v, minh, i) ({ int bin = (minh); float2 vxy, vzw; vxy = CACHE(i, bin, 0); vzw = CACHE(i, bin, 1); v = mkf4(vxy, vzw); })
#  else /* SPLIT_SHMEM */
#   define GET_CACHE(v, minh, i) ({ int bin = (minh); v = CACHE(i, bin, 0); })
#  endif /* SPLIT_SHMEM */
# endif
#else /* SIMPLE_SHMEM */
# define GET_CACHE(v1, v2, minh, i) ({ float2 c = cache[i][minh]; v1 = c.x; v2 = c.y; })
#endif /* SIMPLE_SHMEM */
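/* Cache layout: CACHE(proj, bin, c) selects shmem array 'c' for projection row 'proj' and
   (oversampled) bin 'bin'. In CACHE_Y mode array 1 holds the forward difference
   (value[bin+1] - value[bin]) so that linear interpolation later reduces to a single FMA.
   SPLIT_SHMEM keeps float4 slice data as two float2 arrays, presumably to avoid wide
   shared-memory transactions on pre-Kepler hardware. */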

#if defined(HST_MEM_FETCHES)&&(SHMEM_SAMPLING == 1)
# define HST_FETCH_VAR(var, val) const int var = (val); 	// floor?
# define GET_SINO(bin, proj) sino[(proj)*bin_pitch + (((bin)<num_bins)?(bin):0)]
#else  /* HST_MEM_FETCHES */
# define HST_FETCH_VAR(var, val) const float var = (val) + 0.5f;
# define GET_SINO(bin, proj) hst_tex(texptr, bin, proj)
#endif /* HST_MEM_FETCHES */

#ifdef DEBUG_OFFSETS
# undef GET_SINO
# define GET_SINO(bin, proj)  {0.f}
#endif
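/* GET_SINO fetches one sinogram value per (bin, projection): either directly from global
   memory (HST_MEM_FETCHES, with out-of-range bins redirected to bin 0), or through the
   texture unit, where the +0.5f added by HST_FETCH_VAR addresses texel centers.
   Under DEBUG_OFFSETS the fetch is replaced by zero so only the computed offsets remain. */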

#if defined(CACHE_Y)
# define INTERPOLATE_FROM_CACHE(res, idx, vc, i) ({ vfloat v1, v2; GET_CACHE(v1, v2, idx.comp(vc), i); res[vc] += v1 + diff.comp(vc) * v2; })
#elif defined(OVERSAMPLE)
# define INTERPOLATE_FROM_CACHE(res, idx, vc, i) ({ vfloat v; GET_CACHE(v, idx.comp(vc), i); res[vc] += v; })
#else  /* UNCACHED_Y */
# define INTERPOLATE_FROM_CACHE(res, idx, vc, i) ({ vfloat v1, v2; int _idx = idx.comp(vc); GET_CACHE(v1, _idx, i); GET_CACHE(v2, _idx + 1, i); res[vc] += v1 + diff.comp(vc) * (v2 - v1); })
#endif /* OVERSAMPLE */
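/* Three accumulation variants:
   - CACHE_Y: v1 holds the sample and v2 the precomputed difference, so the linear
     interpolation is res += v1 + diff * v2 (one FMA per component);
   - OVERSAMPLE: nearest-neighbour pick from the oversampled cache, no interpolation;
   - default: fetch both neighbours from the cache and interpolate in place. */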

#define FANCY_MINH_CORRECTION(x) (x)
#ifdef OVERSAMPLE
# ifdef HST_OVERS_FANCY_ROUND
#  define FANCY_ROUND(idx, h) ({float4 fh = h + (float4){exp23, exp23, exp23, exp23}; idx = (*(int4*)(&fh)) - (int4){0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000}; })
# endif
#else
# ifdef HST_LINEAR_FANCY_FLOOR
//#  define FANCY_ROUND(idx, diff, h) ({ float4 fh = h + (float4){exp23, exp23, exp23, exp23}; idx = (*(int4*)(&fh)) - (int4){0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000}; diff = h - (fh - (float4){exp23, exp23, exp23, exp23});})
//#  define FANCY_ROUND(idx, diff, h) ({ float4 newh = h + (float4){0.500001f, 0.500001f, 0.500001f, 0.500001f} ; float4 fh = newh + (float4){exp23, exp23, exp23, exp23}; idx = (*(int4*)(&fh)) - (int4){0x4B000001, 0x4B000001, 0x4B000001, 0x4B000001}; diff = (float4){1.f, 1.f, 1.f, 1.f} + (h - (fh - (float4){exp23, exp23, exp23, exp23}));})
#  define FANCY_ROUND(idx, diff, h) ({ float4 newh = h -(float4){0.499999f, 0.499999f, 0.499999f, 0.499999f} ; float4 fh = newh + (float4){exp23, exp23, exp23, exp23}; idx = (*(int4*)(&fh)) - (int4){0x4B000001, 0x4B000001, 0x4B000001, 0x4B000001}; diff = (h - (fh - (float4){exp23, exp23, exp23, exp23}));})
	// For proper rounding we need values above 1, so we artificially add 1 to the array offset and remove it again during the rounding.
#  undef FANCY_MINH_CORRECTION
#  define FANCY_MINH_CORRECTION(x) ((x) - 1.f)
# endif
#endif
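/* The "fancy" rounding exploits IEEE-754 single precision: for 0 <= h < 2^23, adding
   exp2(23.f) rounds the value to an integer stored in the low mantissa bits, and the bit
   pattern is exactly 0x4B000000 + round(h). Example: h = 5.3 -> h + 2^23 rounds to
   8388613.0, whose bits are 0x4B000005, so subtracting 0x4B000000 yields 5 = round(5.3).
   The linear variant first subtracts ~0.5 to turn round-to-nearest into floor, subtracts
   0x4B000001 to also undo the +1 bias introduced by FANCY_MINH_CORRECTION, and recovers
   the fractional part for the interpolation from the difference. */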


#ifdef SHMEM_LOAD_FULLWARP
# ifdef SHMEM_LOAD2
#  define CACHE_POS(i) (4 * i * BSX + 2 * cache_tidx)
# else
#  define CACHE_POS(i) (2 * i * BSX + cache_tidx)
# endif
#else
# define CACHE_POS(i) (i * BSX + cache_tidx)
#endif
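/* CACHE_POS maps a thread's loop iteration to the shmem bin it fills:
   - default: each of the BSX threads of a loading row stores one bin per iteration;
   - SHMEM_LOAD_FULLWARP: 32 threads (two block rows) store consecutive bins, halving the
     iteration count;
   - SHMEM_LOAD2: each thread additionally writes two adjacent bins as one 64-bit store. */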


#if defined(HST_HYBRID) && !defined(OVERSAMPLE)
 __device__
#else
__global__
#endif
static 
#if MIN_BLOCKS > 0
__launch_bounds__(256, MIN_BLOCKS)
#endif
void KERNEL_NAME(cudaTextureObject_t texptr, const float * __restrict__ g_all, const tfloat * __restrict__ sino, int num_proj, int num_bins, int bin_pitch, vfloat *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
    vfloat res[PPT * PPT] = {0.f};

    const float exp23 = exp2(23.f);

    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;

#ifdef SHMEM_LOAD_FULLWARP
	// Should not be enabled for 8x8 blocks (there should be no conflicts with ppt=2)
    const int cache_tidx = (tidy%2)*16 + tidx;
    const int cache_tidy = tidy/2;
#else
    const int cache_tidx = tidx;
    const int cache_tidy = tidy;
#endif
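    /* With full-warp loading, two 16-thread rows of the block are fused into one warp of 32
       consecutive cache_tidx values (coalesced sinogram reads; assumes 16-wide blocks), and
       cache_tidy addresses the correspondingly halved set of projection rows. Otherwise the
       load indices are just the thread indices. */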

    const int bidx = PPT * blockIdx.x * BSX;
    const int bidy = batch + PPT * blockIdx.y * BSY;

	// Coordinates are offset by 0.5f so that we hit the pixel centers, as expected by the texture engine
    const float off_x = apos_off_x - 0.5f;
    const float off_y = apos_off_y - 0.5f;

    const float bx = bidx + off_x;
    const float by = bidy + off_y;

#ifdef HST_CACHE_SUBH
    const float _x = bx + tidx;
//    const float _y = by + tidy;
#endif

#ifdef REMAP_THREADS
	// Also should not happen for 8x8 blocks (but we can reconsider)
# if (BSX == 16)&&(BSY == 16)
    const int sidx = 4 * (threadIdx.y % 4) + (threadIdx.x % 4);
    const int sidy = 4 * (threadIdx.y / 4) + (threadIdx.x / 4);
# elif (BSX == 8)&&(BSY == 8)
    const int sidx16 = 4 * (threadIdx.y % 4) + (threadIdx.x % 4);
    const int sidx = sidx16 % 8;
    const int sidy = 4 * (sidx16 / 8) + 2 * (threadIdx.y / 4) + (threadIdx.x / 4);
# else
#  error "Specified block size is not supported"
# endif
#else
    const int sidx = tidx;
    const int sidy = tidy;
#endif
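    /* REMAP_THREADS regroups threads into 4x4 tiles (sidx/sidy) so that threads of one warp
       work on neighbouring pixels and therefore access neighbouring bins in the shared-memory
       cache; this localisation matters for the wide oversampling kernels (see REMAP_THREADS above). */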

    const int idx = bidx + sidx; 
    const int idy = bidy + sidy; 

#ifndef HST_CACHE_SUBH
    const float x = idx + off_x;
#endif
    const float y = idy + off_y;

#ifdef HST_CACHE_SUBH
    __shared__  float2   cache_subh[PROJ_BLOCK][BSX];
# ifdef HST_CACHE_SIN
    __shared__  float   cache_sin[PROJ_BLOCK];
# endif 
#endif

#ifdef DEBUG_OFFSETS
    int debug_run = !DEBUG_OFFSETS;
#endif 

#ifdef SIMPLE_SHMEM
# ifdef SPLIT_SHMEM
    __shared__  float2   cache[2 * SHMEM_ARRAYS][PROJ_BLOCK + SHMEM_SHIFTED_PADDING][SHMEM_LOADS * BSX + SHMEM_PADDING];
# else
    __shared__  vfloat   cache[SHMEM_ARRAYS][PROJ_BLOCK + SHMEM_SHIFTED_PADDING][SHMEM_LOADS * BSX + SHMEM_PADDING];
# endif
#else 
    __shared__  float2   cache[PROJ_BLOCK][SHMEM_LOADS * BSX + SHMEM_PADDING];
#endif

//#pragma unroll 8

    HST_FETCH_VAR(projf, cache_tidy);
    for (int proje=0; proje<num_proj; proje += PROJ_BLOCK) {

	if (cache_tidy < PROJ_BLOCK) {
		// cache_tidy is either equal to tidy or to tidy/2
	    float4 all = c_all[proje + cache_tidy];
	    float minh = FANCY_MINH_CORRECTION(floor(all.z + bx * all.x - by * all.y + all.w));

#ifdef SHMEM_LOAD_FULLWARP
	    if ((tidy&1) == 0)
#endif
	    {
#ifdef HST_CACHE_SUBH
		float subh = SHMEM_SAMPLING * (all.z + _x * all.x - minh);
		cache_subh[cache_tidy][tidx].x = subh;
		cache_subh[cache_tidy][tidx].y = subh + SHMEM_SAMPLING * BSX * all.x;
# ifdef HST_CACHE_SIN
		if (tidx == 0) cache_sin[cache_tidy] = SHMEM_SAMPLING * all.y;
# endif
#endif
	    }
	
	    HST_FETCH_VAR(binf, minh);
#pragma unroll
	    for (int i = 0; i < SHMEM_LOAD_ITERATIONS; i++) {
		int pos = CACHE_POS(i);

		vfloat v = GET_SINO(binf + OVERSAMPLE_MULTIPLIER * pos, proje + projf);
#if defined(SPLIT_SHMEM)
		CACHE(cache_tidy, pos, 0) = (float2){v.x, v.y};
		CACHE(cache_tidy, pos, SHMEM_ARRAYS) = (float2){v.z, v.w};
#elif defined(SHMEM_LOAD2)
		float v2 = GET_SINO(binf + OVERSAMPLE_MULTIPLIER * (pos + 1), proje + projf);
		*(float2*)&CACHE(cache_tidy, pos, 0) = (float2){v, v2};
#else
		CACHE(cache_tidy, pos, 0) = v;
#endif
	    }

#ifdef CACHE_Y
	    // We don't need to sync all thread groups here, just prevent the optimizer from reshuffling the
	    // instructions; __syncthreads() would have to be moved out of the conditional (it must be executed
	    // by all threads) and is slightly slower
	__threadfence_block();

#pragma unroll
	    for (int i = 0; i < SHMEM_LOAD_ITERATIONS; i++) {
		int pos = CACHE_POS(i);
		CACHE(cache_tidy, pos, 1) = CACHE(cache_tidy, pos + 1, 0) - CACHE(cache_tidy, pos, 0);
# ifdef SPLIT_SHMEM
		CACHE(cache_tidy, pos, 3) = CACHE(cache_tidy, pos + 1, 2) - CACHE(cache_tidy, pos, 2);
# endif
	    }
#endif // CACHE_Y

	}


	__syncthreads();

	    // Automatic unroll factor is good here, otherwise use 16
#pragma unroll
	for (int i = 0; i < PROJ_BLOCK; i++) {
#ifdef HST_CACHE_SUBH
	    float2 subh = cache_subh[i][sidx];
# ifdef HST_CACHE_SIN
	    float csin = cache_sin[i];
# else
	    float csin = SHMEM_SAMPLING * c_all[proje + i].y;
# endif
#else
	    float4 all = c_all[proje + i];
	    const float _minh = FANCY_MINH_CORRECTION(floor(all.z + bx * all.x - by * all.y + all.w));

	    float2 subh;
	    subh.x = SHMEM_SAMPLING * (all.z + x * all.x - _minh);
	    subh.y = subh.x + SHMEM_SAMPLING * BSX * all.x;
	    float csin = SHMEM_SAMPLING * all.y;
#endif
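	    // Detector coordinates (in SHMEM_SAMPLING units, relative to this block's minh) for the
	    // up to 2x2 output pixels of this thread: (x,y), (x+BSX,y), (x,y+BSY), (x+BSX,y+BSY)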

	    float4 minh = {
		subh.x - y * csin,
		subh.y - y * csin,
		subh.x - (y + BSY) * csin,
		subh.y - (y + BSY) * csin
	    };

#ifdef FANCY_ROUND
	    int4 idx;
# ifdef OVERSAMPLE
	    FANCY_ROUND(idx, minh);
# else
	    float4 diff;
	    FANCY_ROUND(idx, diff, minh);
# endif
#else

# ifdef OVERSAMPLE
		// Rounding
//	    int4 idx = (int4){(int)rint(minh.x), (int)rint(minh.y), (int)rint(minh.z), (int)rint(minh.w)};
	    minh += (float4){0.5f, 0.5f, 0.5f, 0.5f};
	    int4 idx = (int4){(int)minh.x, (int)minh.y, (int)minh.z, (int)minh.w};
# else
		// Interpolating between floor and floor + 1
	    float4 diff = floorf(minh);
	    int4 idx = (int4){(int)diff.x, (int)diff.y, (int)diff.z, (int)diff.w};
	    diff = minh - diff;
# endif
#endif


#ifdef DEBUG_OFFSETS
	    if (debug_run > 0) {
#endif
		INTERPOLATE_FROM_CACHE(res, idx, 0, i);
#if PPT > 1
		INTERPOLATE_FROM_CACHE(res, idx, 1, i);
		INTERPOLATE_FROM_CACHE(res, idx, 2, i);
		INTERPOLATE_FROM_CACHE(res, idx, 3, i);
#endif

		// Read floats: od -tf4 reco.vol | head
#ifdef DEBUG_OFFSETS
	    }

	    if ((debug_run >=0 )&&((proje + i)==0)) { //(res[0].x != 0) {
		*(float*)(&res[0]) = idx.x;
		*(float*)(&res[1]) = idx.y;
		*(float*)(&res[2]) = idx.z;
		*(float*)(&res[3]) = idx.w;
		debug_run = -1;
	    }
#endif


	}

	__syncthreads();
    }


    d_SLICE[BSX * PPT * gridDim.x * (idy      ) + idx      ] = res[0];
#if PPT > 1
    d_SLICE[BSX * PPT * gridDim.x * (idy      ) + idx + BSX] = res[1];
    d_SLICE[BSX * PPT * gridDim.x * (idy + BSY) + idx      ] = res[2];
    d_SLICE[BSX * PPT * gridDim.x * (idy + BSY) + idx + BSX] = res[3];
#endif
}