/tomo/pyhst

To get this branch, use:
bzr branch http://darksoft.org/webbzr/tomo/pyhst
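
For example, to branch into a local directory and keep it updated later (assuming bzr/Breezy is installed):

bzr branch http://darksoft.org/webbzr/tomo/pyhst pyhst
cd pyhst
bzr pull
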
//#define HST_AUTOGEN
#define HST_HEATUP
//#define HST_MEASURE_GPUBOOST

#ifdef HST_AUTOGEN
# include "hst_cuda_defines_gen.h"
#else
//#define HST_TEX_KERNEL			//!< Enforce ppt1 tex-based kernel
//#define HST_BASE_KERNEL			//!< Enforce simplest kernel
//#define HST_HYBRID				//!< Use Tex & Linear kernels in parallel. Doesn't help on Fermi
//#define HST_HALF_MODE				//!< Careful: only NN interpolation using the tex engine (i.e. no oversampling, and NN in tex mode)
#endif


#define HST_CUDA_ARRAY				//!< Use cudaArray instead of binding textures

#define HST_BASE_REMAP
#define HST_BASE_PPT 2
#define HST_LINEAR_KERNEL			//!< Enforce linear kernel
//#define HST_LINEAR_BASE
#define HST_LINEAR_MPLINEAR
#define HST_HALF_CACHE


/*
#if (__CUDA_ARCH__ >= 500)
# undef HST_OPTIMIZE_KEPLER
# define HST_OPTIMIZE_KEPLER 5
#elif (__CUDA_ARCH__ >= 300)
# undef HST_OPTIMIZE_KEPLER
# define HST_OPTIMIZE_KEPLER 3
#elif (__CUDA_ARCH__ >= 100)
# undef HST_OPTIMIZE_KEPLER
#else
#endif
*/

#if (SLICE_BLOCK < 4)||!defined(HST_LINEAR_MPLINEAR)||defined(HST_HYBRID)
# undef HST_HALF_MODE
#endif

#ifndef HST_HALF_MODE
# undef HST_HALF_CACHE
#endif

#ifdef HST_HALF_CACHE
# define HST_FANCY_ROUND			//!< Good on slice1, but bad on slice4 (without it, things somehow work very badly if 4+ unrolling of the p-loop is requested)
#endif

#if defined(PYHST_RECON_BENCHMARK)||!defined(PYHST_ASTRA_SCALING)
# define HST_FILTER2				//!< YES: Filter 2 projections at once (insignificantly affects the scaling, so we keep it off for quality benchmarking)
#endif
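
/* A plausible reading of HST_FILTER2 (an assumption, not taken from the kernels):
   the classic trick of filtering two real projection rows with one complex FFT.
   Because the ramp filter H[k] is real and even, filtering is linear and real, so
   two rows packed as real/imaginary parts stay independent:
       IFFT(H * FFT(x + i*y)) = filter(x) + i*filter(y).
   A minimal sketch; filter_two_rows, apply_filter and d_filter are hypothetical. */
#if 0
#include <cufft.h>

__global__ void apply_filter(cufftComplex *spec, const float *d_filter, int n) {
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k < n) {
        spec[k].x *= d_filter[k];	/* real, even filter scales both parts */
        spec[k].y *= d_filter[k];
    }
}

static void filter_two_rows(cufftHandle plan /* C2C, size n */, cufftComplex *d_rows,
                            const float *d_filter, int n) {
    cufftExecC2C(plan, d_rows, d_rows, CUFFT_FORWARD);
    apply_filter<<<(n + 255) / 256, 256>>>(d_rows, d_filter, n);
    cufftExecC2C(plan, d_rows, d_rows, CUFFT_INVERSE);	/* caller rescales by 1/n */
}
#endif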


#define HST_SET_BOUNDS
#define HST_LINEAR_BLOCK 16			//! 16! 8 is slow (other numbers are not supported; enforce SQUARE_PPT)

#if defined(HST_LINEAR_BASE)
# define HST_LINEAR_PPT 2			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
# define HST_OVERS_PPT 2			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
# define HST_NN_PPT 2
#elif defined(HST_LINEAR_MPLINEAR)&&(HST_OPTIMIZE_KEPLER > 4)
# if ((SLICE_BLOCK > 1)&&(defined(HST_HYBRID)))||(SLICE_BLOCK > 2)
#  define HST_LINEAR_PPT 2			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
# else
#  define HST_LINEAR_PPT 4			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
# endif
# if SLICE_BLOCK < 4
#  define HST_OVERS_PPT 4			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
#  define HST_NN_PPT 4
# else
#  define HST_OVERS_PPT 2			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
#  define HST_NN_PPT 2
# endif
#elif defined(HST_LINEAR_MPLINEAR)&&defined(HST_OPTIMIZE_KEPLER)
# define HST_LINEAR_PPT 2			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
# if SLICE_BLOCK == 1
#  define HST_OVERS_PPT 4			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
#  define HST_NN_PPT 4
# else
#  define HST_OVERS_PPT 2			//! 2 or 4! The default PPT for Linear kernels (possible: 1, but would be slower for sure)
#  define HST_NN_PPT 2
# endif
#else
# define HST_LINEAR_PPT 2			//! 2! The default PPT for Linear kernels (possible: 1, but would be slower for sure). 4 is not possible in SIMD mode
# define HST_NN_PPT 2
#endif
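
/* What the PPT knobs above appear to control (an assumption based on the
   comments): each thread accumulates a PPT x PPT register tile of output
   pixels, so one sin/cos fetch per projection is amortized over PPT*PPT
   updates. A minimal sketch; c_sin, c_cos, c_ofst and tex_fetch are
   hypothetical stand-ins for the real constant tables and texture read. */
#if 0
__global__ void bp_ppt_sketch(float *slice, int num_x, int num_proj) {
    const int bx = (blockIdx.x * blockDim.x + threadIdx.x) * HST_LINEAR_PPT;
    const int by = (blockIdx.y * blockDim.y + threadIdx.y) * HST_LINEAR_PPT;
    float acc[HST_LINEAR_PPT][HST_LINEAR_PPT] = {{0.f}};

    for (int p = 0; p < num_proj; p++) {
        const float cs = c_cos[p], sn = c_sin[p], ofst = c_ofst[p];
#pragma unroll
        for (int i = 0; i < HST_LINEAR_PPT; i++)
#pragma unroll
            for (int j = 0; j < HST_LINEAR_PPT; j++) {
                const float h = ofst + (bx + j) * cs - (by + i) * sn;
                acc[i][j] += tex_fetch(p, h);	/* linear interpolation along h */
            }
    }
    for (int i = 0; i < HST_LINEAR_PPT; i++)
        for (int j = 0; j < HST_LINEAR_PPT; j++)
            slice[(by + i) * num_x + (bx + j)] = acc[i][j];
}
#endif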

#if defined(HST_LINEAR_BASE)
# define HST_LINEAR_ASSYMETRY 1			//! 1!
#elif defined(HST_HYBRID)
# define HST_LINEAR_ASSYMETRY 1			//! 1!
#elif defined(HST_OPTIMIZE_KEPLER)&&defined(HST_LINEAR_MPLINEAR)
# if ((SLICE_BLOCK <= 2)&&(HST_LINEAR_PPT < 4))
#  define HST_LINEAR_ASSYMETRY 2			//! 1!
# else
#  define HST_LINEAR_ASSYMETRY 1			//! 1!
# endif
#else
# define HST_LINEAR_ASSYMETRY 1			//! 1!
#endif

#if defined(HST_LINEAR_BASE)
# define HST_OVERS_ASSYMETRY 1			//! 1!
# define HST_NN_ASSYMETRY 1		        //! 1!
#elif (HST_OPTIMIZE_KEPLER)&&(SLICE_BLOCK == 2)
# if HST_OVERS_PPT < 4
#  define HST_OVERS_ASSYMETRY 2			//! 1!
# else
#  define HST_OVERS_ASSYMETRY 1			//! 1!
# endif
# if HST_NN_PPT < 4
#  define HST_NN_ASSYMETRY 2			//! 1!
# else
#  define HST_NN_ASSYMETRY 1		        //! 1!
# endif
#else 
# define HST_OVERS_ASSYMETRY 1			//! 1!
# define HST_NN_ASSYMETRY 1			//! 1!
#endif



#define HST_NEWTEX4                          	//!< Asymmetric: process 4 projections and 64 bins during each step
#define HST_NEWTEX_PROJ_MAJOR			//!< YES. Seems a good idea for better locality and non-conflicting const memory loads
//#define HST_NEWTEX_REUSE_BUFS			//!< YES. Re-use the coalescing shared memory buffer for CACHE_CONST
//#define HST_NEWTEX_DIRECT_WRITE		//!< YES if we are short on shmem. Generally, only useful for slice4 mode on Kepler and later...
//#define HST_FLOAT_LOOPS			//!< On sm_35 only. sm_30 - bad. Use float loops to optimize register usage (mostly bad)
//#define HST_SQUARE_PPT			//!< NO. Little effect on performance (slightly reduces register usage, but doesn't affect computations as the loops are unrolled anyway)

#ifdef HST_HYBRID
//# define HST_HYBRID_BALANCE_ALU 4
//# define HST_HYBRID_NEWTEX
# define HST_CACHE_SIN			        //!< YES. It is a good idea on all NVIDIA as shmem is faster than cmem. Even on shmem-bound kernels
//# define HST_C_TRIG				//!< Slight speedup due to less fragmented cmem in main part (but mostly useless)
# ifdef HST_HYBRID_NEWTEX
#  define HST_NEWTEX4                             //!< Asymmetric: process 4 projections and 64 bins during each step.
#  define HST_CACHE_CONST			//!< YES! In NewTex, cache constants in shared memory
#  if SLICE_BLOCK == 1
#   define HST_NEWTEX4_PPT HST_LINEAR_PPT	//!< NO. Do partial PPT directly in the Hybrid kernel.
#  endif
//#  define HST_NO_OFFSETS				//!< IGNORE THIRD c_all parameter (assume 0, no offsets)
//#  define HST_L1_PROJ_BLOCK 4096		//!< UNUSED
# endif
#endif

#if (SLICE_BLOCK > 1)||((HST_OPTIMIZE_KEPLER > 4)&&(SLICE_BLOCK > 1))
# define HST_NEWTEX4                           //!< Asymmetric: process 4 projections and 64 bins during each step.
#endif
#if defined(HST_NEWTEX4)&&((!defined(HST_OPTIMIZE_KEPLER))||(HST_OPTIMIZE_KEPLER < 5))
# define HST_CACHE_CONST			//!< YES! At least for Hybrid mode (but not on Pascal)
#endif
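
/* A minimal sketch of what HST_CACHE_CONST suggests (an assumption: staging
   per-projection constants from __constant__ into __shared__ once per block).
   Constant memory broadcasts are only fast when a warp reads one address, so
   when threads index different projections a shared-memory copy avoids the
   serialized cmem replays. Table names and sizes are hypothetical (128 matches
   HST_NEWTEX_CACHE_BLOCK below, 4096 matches MAXNPROJECTIONS below). */
#if 0
__constant__ float c_sin[4096], c_cos[4096];

__global__ void cache_const_sketch(int proj0, int nproj) {
    __shared__ float s_sin[128], s_cos[128];

    for (int i = threadIdx.y * blockDim.x + threadIdx.x;
         i < nproj; i += blockDim.x * blockDim.y) {
        s_sin[i] = c_sin[proj0 + i];
        s_cos[i] = c_cos[proj0 + i];
    }
    __syncthreads();
    /* ... the main loop now reads s_sin/s_cos instead of c_sin/c_cos ... */
}
#endif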


    // Linear Kernel overrides
//#define HST_LMAP                                //!< Just for testing, linear mapping.
#define HST_ZMAP                                //!< On Pascal/float2 (and other platforms benefiting from 4-thread shmem locality)
#define HST_NEWCACHE				//!< YES! New way of caching that reduces the number of shmem reads/writes
//#define HST_NEWCACHE_UNPAD			//!< Add if and remove padding in the shmem cache (low efficiency)
#define HST_GMEM_CONST				//!< YES! Store constants in global memory (used only in mplinear with MINH caching)
//#define HST_INNER_GMEM				//!< NO! Also access GMem from inner loop (Slow)
//# define HST_PRECOMPUTE_OFFSETS		//!< IRRELEVANT! Just move some index computations out of the inner loop (used only with MINH mode on)
//# define HST_MEM_FETCHES			//!< NO! Use sinogram array instead of textures in Linear kernels. Doesn't help in practice even for shmem-bound kernels
//# define HST_TUNE_SHMEM			//!< NO! Buggy! Try to optimize shmem at the cost of extra computations (we are compute-bound for float1 and it will be enforced for float2-4 anyway; not needed)
#define HST_XCACHE_EARLY_Y                      //!< YES! Must. The other mode is not unrolled properly and causes large register overhead
#define HST_XCACHE_LD128 2                      //!< Store SUBH_X cache as groups of float2/float4 numbers to reduce the number of memory instructions
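
/* A minimal sketch of the HST_XCACHE_LD128 idea (an assumption about the exact
   cache layout): reading the shared-memory cache through float4 issues one
   128-bit load where four 32-bit loads would otherwise be needed. */
#if 0
__global__ void ld128_sketch(float *out) {
    __shared__ __align__(16) float s_cache[16 * 64];
    /* ... fill s_cache cooperatively, then __syncthreads() ... */
    float acc = 0.f;
    for (int k = 0; k < 16; k++) {
        const float4 v = *(const float4 *)&s_cache[threadIdx.y * 64 + 4 * k];
        acc += v.x + v.y + v.z + v.w;	/* one LD.128 instead of four LD.32 */
    }
    out[threadIdx.y * blockDim.x + threadIdx.x] = acc;
}
#endif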


#if defined(HST_LINEAR_BASE)
# if defined(HST_OPTIMIZE_KEPLER)
#  define HST_CREATE_TEXTURE			//!< Create a texture object instead of binding a texture reference. Seems faster, but only supported since Kepler
#  define HST_SET_KEPLER_BOUNDS
#  if HST_OPTIMIZE_KEPLER > 4
#   define HST_WARPLINE				//!< Seems better without it on Fermi
#  else
#   define HST_SHMEM64				//!< Optimize caching in shared memory (avoid bank conflicts) in case of oversampling
#  endif
# else
#  undef HST_SHMEM64				//!< Optimize caching in shared memory (avoid bank conflicts) in case of oversampling
# endif
# define HST_CACHE_SUBH				//!< YES! Must on NVIDIA (this is what makes it different from AMD)
# define HST_CACHE_Y				//!< YES! Must. If disabled: extra computations, but also shmem bank conflicts
#elif defined(HST_OPTIMIZE_KEPLER)
# define HST_CREATE_TEXTURE			//!< Create a texture object instead of binding a texture reference. Seems faster, but only supported since Kepler
# define HST_SHFL_SUM				//!< Important as ShMem is shared with L1 (in case of spillage); see the warp-shuffle sketch after this #if ladder
# define HST_SET_KEPLER_BOUNDS

# if defined(HST_CACHE_CONST)&&defined(HST_NEWTEX4)&&(SLICE_BLOCK < 4)
#  if  HST_OPTIMIZE_KEPLER < 4
#   define HST_CCACHE_KEPLER			//!< KEPLER ONLY (NOT NEWER)! Optimizes 64-bit shmem accesses without adding too much shared memory
//#   define HST_CCACHE_LD128			//!< KEPLER ONLY (NOT NEWER)! Optimizes 64-bit shmem accesses with HST_CACHE_CONST & HST_NEWTEX4
#  endif
# endif
# if (defined(HST_TEX_KERNEL)&&defined(HST_SET_BOUNDS)&&(SLICE_BLOCK > 1)&&!defined(HST_NEWTEX4))
#  define HST_KEPLER_PREFER_L1
//#  define HST_KEPLER_PREFER_L1_EQUAL
# endif

	// Linear Kernel overrides

# if HST_OPTIMIZE_KEPLER > 4
#  define HST_WARPLINE				//!< Seems better without it on Fermi
# else
#  define HST_SHMEM64				//!< Optimize caching in shared memory (avoid bank conflicts) in case of oversampling
# endif

# if SLICE_BLOCK == 1
#  if HST_OPTIMIZE_KEPLER > 4
#  else
#   define HST_FANCY_ROUND			//!< Good on slice1, but bad on slice4 (without it, things somehow work very badly if 4+ unrolling of the p-loop is requested)
#   define HST_FANCY_INDEX			//!< Optimizes out the integer multiplication required to convert an array index to an address (idx * sizeof(type))
#  endif
#  define HST_SHFL_CACHE			//!< YES! SIGNIFICANT! Prevents ShMem bank conflicts and also halves caching accesses
#  define HST_CACHE_Y				//!< YES! Must. If disabled: extra computations, but also shmem bank conflicts
#  define HST_CACHE_SIN				//!< YES. It is a good idea on all NVIDIA as shmem is faster than cmem. Even on shmem-bound kernels
# elif SLICE_BLOCK == 2
#  if HST_OPTIMIZE_KEPLER > 4
#  else
#   define HST_FANCY_ROUND			//!< Good on slice1, but bad on slice4 (without it, things somehow work very badly if 4+ unrolling of the p-loop is requested)
#  endif
//#  define HST_SHFL_CACHE			//!< YES! SIGNIFICANT! Prevents ShMem bank conflicts and also halves caching accesses
//#  define HST_CACHE_Y				//!< YES! Must. If disabled: extra computations, but also shmem bank conflicts
//#  define HST_CACHE_SIN				//!< YES. It is a good idea on all NVIDIA as shmem is faster than cmem. Even on shmem-bound kernels
# else
#  if HST_OPTIMIZE_KEPLER > 4
//#   define HST_SHMEM64
#  endif
# endif

# define HST_CACHE_SUBH				//!< YES! Must on NVIDIA (this is what makes it different from AMD)
//#define HST_SUBH_DIRECT				//!< NO! AMD direct mode with double-interpolation, i.e. reduced quality
//# define HST_C_TRIG				//!< Slight speedup due to less fragmented cmem in main part (but mostly useless)
//# define HST_CACHE_MINH				//!< NO! Minimal effect and actually slows things down slightly
//# define HST_SHFL_MINH			//!< NO! Seems better without it

//#define HST_CACHE_MINH
//# define HST_CACHE_SUBH_X			//!< Saves 1 operation in the main loop at the price of extra shmem (but no additional writes)
//# define HST_C_SIN
#else /* ! HST_OPTIMIZE_KEPLER */
	// Linear Kernel overrides
# undef HST_SHMEM64				//!< Optimize caching in shared memory (avoid bank conflicts) in case of oversampling
//#define HST_WARPLINE				//!< Seems better without it on Fermi
#if SLICE_BLOCK > 1
# define HST_WARPLINE				//!< Seems better without it on Fermi
#endif
# undef HST_FANCY_ROUND

# define HST_CACHE_Y				//!< YES! Must. If disabled: extra computations, but also shmem bank conflicts
# define HST_CACHE_SUBH				//!< YES! Must on NVIDIA (this is what makes it different from AMD)
//# define HST_C_SIN
# define HST_CACHE_SIN				//!< YES. It is a good idea on all NVIDIA as shmem is faster than cmem. Even on shmem-bound kernels
//# define HST_C_TRIG				//!< Slight speedup due to less fragmented cmem in main part (but mostly useless)
# if SLICE_BLOCK == 4
//#  define HST_CACHE_MINH				//!< NO! Minimal effect and actually slows things down slightly (register pressure)
# else
//#  define HST_CACHE_MINH				//!< NO! Minimal effect and actually slows things down slightly (register pressure)
#  define HST_CACHE_SUBH_X			//!< Saves 1 operation in the main loop at the price of extra shmem (but no additional writes)
# endif
#endif /* !  HST_OPTIMIZE_KEPLER */
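
/* A minimal sketch of the HST_SHFL_SUM idea referenced above: reduce partial
   sums inside a warp with register shuffles instead of shared memory (which on
   Kepler competes with L1 when registers spill). Kepler-era code would use
   __shfl_down(); the _sync form below is what CUDA 9+ requires. */
#if 0
__device__ float warp_sum(float v) {
    for (int delta = 16; delta > 0; delta >>= 1)
        v += __shfl_down_sync(0xffffffffu, v, delta);
    return v;	/* lane 0 ends up holding the warp total */
}
#endif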

#ifndef HST_OVERS_PPT 
# define HST_OVERS_PPT HST_LINEAR_PPT
#endif
#ifndef HST_NN_PPT 
# define HST_NN_PPT HST_OVERS_PPT
#endif

#define HST_NEWTEX_CACHE_BLOCK 128
#define HST_LINEAR_CACHE_BLOCK 32		// tried: 256, 32, 64 (64 for linear)

#if HST_LINEAR_ASSYMETRY > 2
# define HST_LINEAR_PROJ_BLOCK 4
#elif SLICE_BLOCK > 1
#  define HST_LINEAR_PROJ_BLOCK 8
/*#elif SLICE_BLOCK == 2
# define HST_LINEAR_PROJ_BLOCK HST_LINEAR_BLOCK*/
#elif (HST_LINEAR_ASSYMETRY * HST_LINEAR_PPT * HST_LINEAR_PPT) < 8
# define HST_LINEAR_PROJ_BLOCK HST_LINEAR_BLOCK
#else
# define HST_LINEAR_PROJ_BLOCK 8
#endif

#if SLICE_BLOCK > 1
# define HST_NN_PROJ_BLOCK 8
#else
# define HST_NN_PROJ_BLOCK 16
#endif

#if HST_OVERS_ASSYMETRY > 2
# define HST_OVERS_PROJ_BLOCK 4
#elif SLICE_BLOCK > 2
# define HST_OVERS_PROJ_BLOCK 4
#elif SLICE_BLOCK > 1
# if defined(HST_OPTIMIZE_KEPLER)
#  define HST_OVERS_PROJ_BLOCK 4
# else
#  define HST_OVERS_PROJ_BLOCK 8
# endif
/*#elif SLICE_BLOCK == 2
# define HST_OVERS_PROJ_BLOCK 8*/
#else  // SLICE_BLOCK == 1 
# if (HST_OVERS_ASSYMETRY * HST_OVERS_PPT * HST_OVERS_PPT) < 8
#  ifdef HST_OPTIMIZE_KEPLER
#   define HST_OVERS_PROJ_BLOCK 8			//! Set below 8 to enable shmem optimizations on Kepler and later
#  else // Fermi
#   ifdef HST_WARPLINE
#    define HST_OVERS_PROJ_BLOCK 8
#   else
#    define HST_OVERS_PROJ_BLOCK 16 			// SUBHX caching!
#   endif
#  endif
# else // PPT == 4
#  if HST_OPTIMIZE_KEPLER > 4
#   define HST_OVERS_PROJ_BLOCK 8 //4			//! Set below 8 to enable shmem optimizations on Kepler and later
#  else
#   define HST_OVERS_PROJ_BLOCK 4			//! Set below 8 to enable shmem optimizations on Kepler and later
#  endif
# endif
#endif

#if HST_LINEAR_BLOCK < 16
# undef HST_SHMEM64
#endif

#ifdef HST_SHMEM64
# define HST_WARPLINE
#endif

#ifdef HST_CACHE_MINH
# define HST_CACHE_SIN
# define HST_CACHE_SUBH
#endif

#ifdef HST_SET_BOUNDS
# ifdef HST_OPTIMIZE_KEPLER
#  if SLICE_BLOCK > 2
#   ifdef HST_NEWTEX4
#    define HST_NEWTEX_MIN_BLOCKS 6
#   else
#    define HST_NEWTEX_MIN_BLOCKS 4
#   endif
#   define HST_OVERS_MIN_BLOCKS 4 //3
#   define HST_LINEAR_MIN_BLOCKS 4 //4
#   define HST_NN_MIN_BLOCKS 4
#  elif SLICE_BLOCK > 1
#   ifdef HST_NEWTEX4
#    ifdef HST_NEWTEX4_PPT
#     define HST_NEWTEX_MIN_BLOCKS 6
#    else
#     define HST_NEWTEX_MIN_BLOCKS 8 // 6 - 8
#    endif
#   else
#    define HST_NEWTEX_MIN_BLOCKS 4
#   endif
#   if HST_OVERS_PPT > 2
#    if HST_OVERS_ASSYMETRY > 1
#     define HST_OVERS_MIN_BLOCKS 4//6
#    else
#     define HST_OVERS_MIN_BLOCKS 4//6
#    endif
#   else 
#    if HST_OVERS_ASSYMETRY > 1
#     define HST_OVERS_MIN_BLOCKS 8//8//6
#    else
#     define HST_OVERS_MIN_BLOCKS 6//6
#    endif
#   endif
#   if HST_LINEAR_ASSYMETRY > 1
#    define HST_LINEAR_MIN_BLOCKS 8 // 6 - 10
#   elif HST_LINEAR_PPT > 2
#    define HST_LINEAR_MIN_BLOCKS 4 // 6
#   else
#    define HST_LINEAR_MIN_BLOCKS 6 // 6
#   endif
#   if HST_NN_PPT > 2
#    define HST_NN_MIN_BLOCKS 4
#   else 
#    if HST_NN_ASSYMETRY > 1
#     define HST_NN_MIN_BLOCKS 8
#    else
#     define HST_NN_MIN_BLOCKS 6
#    endif
#   endif
#  else /* SLICE_BLOCK == 1 */
#   ifdef HST_NEWTEX4
#    define HST_NEWTEX_MIN_BLOCKS 8
#   else
#    define HST_NEWTEX_MIN_BLOCKS 6//8
#   endif
#   if HST_LINEAR_ASSYMETRY > 2
#    define HST_LINEAR_MIN_BLOCKS 14 // or 12-16?
#   elif HST_LINEAR_ASSYMETRY > 1
#    define HST_LINEAR_MIN_BLOCKS 8 // or 6-10?
#   elif HST_LINEAR_PPT > 2
#    if HST_OPTIMIZE_KEPLER > 4
#     define HST_LINEAR_MIN_BLOCKS 8 //4 // 6
#    else
#     define HST_LINEAR_MIN_BLOCKS 4 // 6
#    endif
#   else
#    define HST_LINEAR_MIN_BLOCKS 8 //6
#   endif

#   if HST_NN_PPT < 4
#    define HST_NN_MIN_BLOCKS 6 // 6
#   else
#    define HST_NN_MIN_BLOCKS 4 // 6
#   endif
#   if HST_OVERS_PPT < 4
#    define HST_OVERS_MIN_BLOCKS 6
#   else /* PPT == 4 */
#    define HST_OVERS_MIN_BLOCKS 4 // with PPT(Y) mode, we can increase it to 6, but no speed-up
#   endif
#  endif
# else /* ! HST_OPTIMIZE_KEPLER */
#  define HST_NEWTEX_MIN_BLOCKS 3
#  if (SLICE_BLOCK > 2)||defined(HST_CACHE_MINH)
#   define HST_OVERS_MIN_BLOCKS 3
#   define HST_LINEAR_MIN_BLOCKS 3
#  elif SLICE_BLOCK > 1
#   define HST_LINEAR_MIN_BLOCKS 3
#   define HST_OVERS_MIN_BLOCKS 3 //4
#  else
#   define HST_LINEAR_MIN_BLOCKS 4
#   ifdef HST_WARPLINE
#    define HST_OVERS_MIN_BLOCKS 4
#   else
#    define HST_OVERS_MIN_BLOCKS 3//4
#   endif
#  endif
#  define HST_NN_MIN_BLOCKS 0//0
# endif
#endif
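
/* A sketch of how the *_MIN_BLOCKS values above are typically consumed (an
   assumption about the exact pairing; the kernel name is hypothetical):
   __launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor) caps
   register usage so the scheduler can keep at least that many blocks
   resident per SM. */
#if 0
__global__ void
__launch_bounds__(HST_LINEAR_BLOCK * HST_LINEAR_BLOCK, HST_LINEAR_MIN_BLOCKS)
hst_linear_sketch(float *slice) {
    /* ... */
}
#endif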

#ifdef HST_HYBRID
# ifdef HST_HYBRID_NEWTEX
#  define HST_HYBRID_MIN_BLOCKS HST_NEWTEX_MIN_BLOCKS
# else
#  if SLICE_BLOCK > 1
#   if HST_LINEAR_PPT > 2
#    warning "Inefficient hybrid configuration"
#    define HST_HYBRID_MIN_BLOCKS 4
#   else
#    define HST_HYBRID_MIN_BLOCKS 8
#   endif
#  else
#   if HST_LINEAR_PPT > 2
#    define HST_HYBRID_MIN_BLOCKS 8 // 6
#   else
#    define HST_HYBRID_MIN_BLOCKS 8
#   endif
#  endif
# endif

# ifndef HST_HYBRID_BALANCE_ALU
#  if SLICE_BLOCK == 1
#   if HST_HYBRID_MIN_BLOCKS == 8
#    define HST_HYBRID_BALANCE_ALU 5
#   endif
#  endif
# endif
#endif



#ifdef HST_FANCY_ROUND
# define HST_LINEAR_FANCY_FLOOR
# define HST_OVERS_FANCY_ROUND
# define HST_MP_FANCY_ROUND        // Enable fancy rounding mode (faster)
#endif
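
/* What "fancy rounding" commonly means on this hardware generation (an
   assumption; the real HST_FANCY_ROUND may differ): round a float to int via
   the magic-constant trick instead of the slower cvt path. Adding 1.5*2^23
   forces the rounded integer into the mantissa bits, so it can be recovered by
   subtracting the bit pattern of the constant. Valid for |x| < 2^22. */
#if 0
__device__ int fancy_round(float x) {
    const float y = x + 12582912.0f;	/* 1.5 * 2^23, bit pattern 0x4B400000 */
    return __float_as_int(y) - 0x4B400000;	/* round-to-nearest-even */
}
#endif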

#ifdef HST_FANCY_INDEX
//# define HST_LINEAR_FANCY_INDEX
# define HST_OVERS_FANCY_INDEX
#endif

#ifndef HST_LINEAR_FANCY_FLOOR
# undef  HST_LINEAR_FANCY_INDEX
#endif

#ifndef HST_OVERS_FANCY_ROUND
# undef  HST_OVERS_FANCY_INDEX
#endif


#if SLICE_BLOCK > 1
# define HST_CUDA_ARRAY
#endif /* SLICE_BLOCK */

#ifdef HST_BASE_KERNEL
# define HST_TEX_KERNEL
# undef HST_LINEAR_KERNEL
# undef HST_MPLINEAR_KERNEL
#endif

#ifdef HST_CACHE_CONST
# undef HST_FLOAT_LOOPS
#endif

/*
#ifdef HST_MPLINEAR_KERNEL
# if SLICE_BLOCK > 1
#  define HST_LINEAR_KERNEL
#  undef HST_MPLINEAR_KERNEL
# endif
#endif
*/

#if /*defined(HST_CACHE_MINH) ||*/ !defined(HST_CACHE_SUBH)
# undef HST_CACHE_SUBH_X
#endif

#ifdef HST_CACHE_SUBH_X
# ifdef HST_C_TRIG
#  undef HST_C_TRIG
#  define HST_C_SIN
# endif
#endif


    // Non-asymmetric mode is barely tested and slower on the measured configurations
#define HST_MP_ASYMMETRIC         // Run mplinear with 8x16 thread block (and 8x4 actual PPT)
#ifndef HST_MP_ASYMMETRIC 
# define HST_MP_MINIMAL          // Minimize the number of registers at the price of more computations
#endif /* ! HST_MP_ASYMMETRIC */



#if defined(HST_C_TRIG) || defined(HST_C_SIN)
# define MAXNPROJECTIONS 2048			//!< Maximum number of supported projections per slice
#else
# define MAXNPROJECTIONS 4096			//!< Maximum number of supported projections per slice
#endif
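
/* A plausible accounting for the split above (an assumption, not taken from
   the source): the per-projection tables must fit the 64 KB constant bank.
   One float table of 4096 entries is 16 KB; with HST_C_TRIG/HST_C_SIN the
   precomputed sin/cos (and offset) tables live in cmem too, e.g.
       4096 proj * 3 tables * 4 B = 48 KB  (little headroom for other consts)
       2048 proj * 3 tables * 4 B = 24 KB  (fits comfortably) */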
#define HST_CUDA_MAX_DEVICES 16			//!< Maximum CUDA-devices supported by the module (check tex_projes)

#define BLOCK_SIZE 16				//!< CUDA block size (for filtering)
#define BLOCK_SIZE_X BLOCK_SIZE			//!< Check kernels, there are some expectations
#define BLOCK_SIZE_Y BLOCK_SIZE			//!< Check kernels, there are some expectations

#define CUDA_HEATUP_MAX_CHANGE 0.05
#define CUDA_HEATUP_MIN_ITERATIONS 200
#define CUDA_HEATUP_MAX_ITERATIONS 250
#define CUDA_HEATUP_KERNEL_ITERATIONS 4096
#define CUDA_HEATUP_ELEMENTS 4
#define CUDA_HEATUP_GRID 4480 // 16 * 5 * 7 (covers SM numbers) * 8 (covers warps per SM)
#define CUDA_MAX_STATS 16384
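
/* A sketch of how the heat-up constants above could drive the HST_HEATUP
   warm-up (an assumption about the control flow; heatup_kernel is
   hypothetical): launch a dummy kernel repeatedly until the run-to-run timing
   drift falls below CUDA_HEATUP_MAX_CHANGE, bounded by the min/max counts, so
   clock boost has settled before benchmarking. */
#if 0
#include <cuda_runtime.h>
#include <math.h>

static void heatup(void) {
    float prev = 0.f;
    for (int i = 0; i < CUDA_HEATUP_MAX_ITERATIONS; i++) {
        cudaEvent_t t0, t1; float ms;
        cudaEventCreate(&t0); cudaEventCreate(&t1);
        cudaEventRecord(t0, 0);
        heatup_kernel<<<CUDA_HEATUP_GRID, BLOCK_SIZE * BLOCK_SIZE>>>(
            CUDA_HEATUP_KERNEL_ITERATIONS);
        cudaEventRecord(t1, 0);
        cudaEventSynchronize(t1);
        cudaEventElapsedTime(&ms, t0, t1);
        cudaEventDestroy(t0); cudaEventDestroy(t1);
        if ((i >= CUDA_HEATUP_MIN_ITERATIONS) &&
            (fabsf(ms - prev) < CUDA_HEATUP_MAX_CHANGE * prev))
            break;	/* run-to-run drift below 5%: clocks have settled */
        prev = ms;
    }
}
#endif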

    /* We should be careful here: in many cases it is faster to perform everything at once,
    probably due to the texture caching */
//#ifdef HST_OPTIMIZE_KEPLER
//# define BP_BATCH_SIZE 128			//!< Batch size (in blocks) for Back Projection  (should be even)
//#else /* HW_OPTIMIZE_KEPLER */
//# ifdef  HW_IGNORE_OLD_HARDWARE
//#  define BP_BATCH_SIZE 128			//!< Batch size (in blocks) for Back Projection  (should be even)
//# else /*  HW_IGNORE_OLD_HARDWARE */
//#  define BP_BATCH_SIZE 8			//!< Batch size (in blocks) for Back Projection  (should be even)
//# endif /*  HW_IGNORE_OLD_HARDWARE */
//#endif /*  HW_OPTIMIZE_KEPLER */
//#define FFT_BATCH_SIZE 8			//!< Batch size (in blocks) for FFT transformations (should be even)


#define BP_BATCH_SIZE 512
#define FFT_BATCH_SIZE 256                      //!< Limited by the size of constant memory anyway
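
/* A sketch of batching 1D row filters with a single cuFFT plan,
   FFT_BATCH_SIZE rows at a time; the constant-memory note above is why the
   batch cannot grow without bound. num_bins and the function name are
   hypothetical. */
#if 0
#include <cufft.h>

static cufftHandle make_batched_plan(int num_bins) {
    cufftHandle plan;
    int n[1] = { num_bins };
    cufftPlanMany(&plan, 1, n,
                  NULL, 1, num_bins,	/* input: contiguous rows */
                  NULL, 1, num_bins,	/* output: contiguous rows */
                  CUFFT_C2C, FFT_BATCH_SIZE);
    return plan;
}
#endif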