/tomo/pyhst

To get this branch, use:
bzr branch http://darksoft.org/webbzr/tomo/pyhst
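
For a local working copy, something along these lines should work (the target directory name "pyhst" is just an example):

bzr branch http://darksoft.org/webbzr/tomo/pyhst pyhst
cd pyhst
bzr pull    # later, to fetch new revisions
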
//#define DEBUG_OFFSETS 1	// set to 0 to still allow interpolation initially, 1 to disable it


#undef KERNEL_NAME
#undef SHMEM_PADDING
#undef SHMEM_SHIFTED_PADDING
#undef SHMEM_ARRAYS
#undef SHMEM_SAMPLING
#undef SIMPLE_SHMEM
#undef SPLIT_SHMEM
#undef REMAP_THREADS
#undef SHMEM_LOAD_ITERATIONS
#undef OVERSAMPLE_MULTIPLIER
#undef GET_CACHE
#undef INTERPOLATE_FROM_CACHE
#undef MIN_BLOCKS
#undef CACHE
#undef FAST_CACHE
#undef FAST_CACHE2
#undef FANCY_ROUND
#undef FANCY_INDEX
#undef FANCY_MINH_CORRECTION
#undef FANCY_CACHE
#undef FANCY_CACHE_MULTIPLIER
#undef SHFL_CACHE
#undef SHMEM_LOAD_FULLWARP
#undef SHMEM_LOAD2
#undef CACHE_POS
#undef CACHE_BLOCK
#undef GET_SINO
#undef BSX
#undef BSY
#undef ASSYMETRY
#undef PSTEP
#undef PTYPE
#undef sincos
#undef P
#undef NEWCACHE

#define BSX BS
#define BSY BS

#if defined(HST_CACHE_Y)&&!defined(HST_HALF_MODE)
# define CACHE_Y
#endif

#define MK_KERNEL_NAME(name, version) HST_MACRO_JOIN(name, version)
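// HST_MACRO_JOIN presumably token-pastes its two arguments, so with e.g. OVERSAMPLE=2 the
// oversampling kernel below ends up being named hst_cuda_alu_oversample2.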

#ifdef OVERSAMPLE
# if OVERSAMPLE == 1
#  define KERNEL_NAME hst_cuda_alu_nn
#  define MIN_BLOCKS HST_NN_MIN_BLOCKS
#  define ASSYMETRY HST_NN_ASSYMETRY
# else
#  define KERNEL_NAME MK_KERNEL_NAME(hst_cuda_alu_oversample, OVERSAMPLE)
#  define MIN_BLOCKS HST_OVERS_MIN_BLOCKS
#  define ASSYMETRY HST_OVERS_ASSYMETRY
# endif
# define SHMEM_SAMPLING OVERSAMPLE			/**< Oversampling coefficient: how many points to sample per cell */
# undef CACHE_Y
#else
# define KERNEL_NAME hst_cuda_alu_linear
# define SHMEM_SAMPLING 1
# define MIN_BLOCKS HST_LINEAR_MIN_BLOCKS
# define ASSYMETRY HST_LINEAR_ASSYMETRY
#endif
#define SHMEM_SHIFTED_PADDING 0

#ifdef CACHE_Y
# define SHMEM_ARRAYS 2
# define SHMEM_PADDING 1
#else
# define SIMPLE_SHMEM					/**< Use a simple ShMem array instead of float2 with 'x' and 'y-x' */
# define SHMEM_ARRAYS 1					/**< Number of shmem arrays (in non-SIMPLE mode). For interpolation we may need 2 */
# define SHMEM_PADDING 0				/**< We need an extra element at the end of the shmem array if we compute (y-x) */
#endif

#if SHMEM_SAMPLING > 1
# define REMAP_THREADS					/**< Localise accesses (needed for wide oversampling kernels to avoid cache misses) */
#endif

#if SHMEM_SAMPLING == 4 
# define OVERSAMPLE_MULTIPLIER 0.25f
#elif SHMEM_SAMPLING > 1
# define OVERSAMPLE_MULTIPLIER (1.f / OVERSAMPLE)
#else
# define OVERSAMPLE_MULTIPLIER 1
#endif
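// OVERSAMPLE_MULTIPLIER converts a position in the (oversampled) shmem cache back to
// sinogram-bin units for the texture fetch; the SHMEM_SAMPLING == 4 case is spelled out
// as the exact constant 0.25f instead of the generic 1.f / OVERSAMPLE.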


#ifdef HST_TUNE_SHMEM
	/* This is pretty much useless. We optimize access to the ShMem cache during the writing phase, but
	this costs us extra instructions (we need to compute 2 addresses instead of 1).
	- On Fermi (and likely all other architectures) float1 is compute bound. For float* we enforce it
	anyway.
	- Besides, using 3D arrays is significantly more efficient than addressing a linear array manually,
	so the clever tricks will not work either.
	- Shifted padding extends the array at a different (non-last) dimension to ensure no bank conflicts
	in 32-bit mode. */
# ifndef SIMPLE_SHMEM
#  if SLICE_BLOCK == 1
#    undef SHMEM_SHIFTED_PADDING
#    define SHMEM_SHIFTED_PADDING SHMEM_PADDING
#    undef SHMEM_PADDING
#    define SHMEM_PADDING 0
#  endif
#  define SIMPLE_SHMEM
# endif /* ! SIMPLE_SHMEM */
#endif

#if SLICE_BLOCK > 1
# define SIMPLE_SHMEM
# if SLICE_BLOCK == 4
#  if  ((OVERSAMPLE > 2)||!defined(HST_OPTIMIZE_KEPLER))&&!defined(HST_HALF_MODE)
#   define SPLIT_SHMEM
#  endif
# endif
#endif

#ifndef HST_SHMEM64
# if (OVERSAMPLE > 2)&&(SLICE_BLOCK > 1)&&!defined(HST_HALF_MODE)
#  define SPLIT_SHMEM32
# endif
#endif


    // We don't need padding if shfl is used to compute (x+1)-x
#if defined(CACHE_Y)&&!defined(SIMPLE_SHMEM)
# if defined(HST_NEWCACHE)
#  define NEWCACHE
#  if defined(HST_SHFL_CACHE)||defined(HST_NEWCACHE_UNPAD)
#   undef SHMEM_PADDING
#   define SHMEM_PADDING 0
#   define REMAP_THREADS
#  endif
#  if defined(HST_SHFL_CACHE)
#   define SHFL_CACHE
#  endif
# endif
#endif




#if PPT == 1
# if SHMEM_SAMPLING == 4
#  define SHMEM_LOADS (6)
# else
#  define SHMEM_LOADS (2 * SHMEM_SAMPLING)
# endif
# define SHMEM_LOAD_ITERATIONS SHMEM_LOADS
#else
# define SHMEM_LOADS (3 * SHMEM_SAMPLING * (PPT / 2))

# if defined(HST_WARPLINE)&&!defined(HST_HALF_MODE)&&((SHMEM_SAMPLING * PPT) >= 4)/*&&(PROJ_BLOCK <= 8)*/
#  if defined(HST_SHMEM64)&&((SHMEM_SAMPLING * PPT) >= 8)&&(SLICE_BLOCK == 1)&&defined(SIMPLE_SHMEM)
#    define SHMEM_LOAD2						/* Combine two texture fetches to 1 64-bit shmem store */
#    define SHMEM_LOAD_FULLWARP ((SHMEM_SAMPLING * PPT) / 4)	/* Counts number of rows we can load simultaneously */
#    define SHMEM_LOAD_ITERATIONS (SHMEM_LOADS  / SHMEM_LOAD_FULLWARP / 2)  /* Use 32 threads (or more) for loading data; this limits projections to 8 */
#  else
#    if defined(SHFL_CACHE)||defined(HST_NEWCACHE_UNPAD)
#     define SHMEM_LOAD_FULLWARP 2				/* Counts number of rows we can load simultaneously */
#    elif defined(HST_SHMEM64)
#     define SHMEM_LOAD_FULLWARP ((SHMEM_SAMPLING * PPT) / 2)	/* Counts number of rows we can load simultaneously */
#    else
#     define SHMEM_LOAD_FULLWARP 2				// Should be adaptive, set to 4 if only 4 proj (prevents shfl!)
#    endif
#    define SHMEM_LOAD_ITERATIONS (SHMEM_LOADS / SHMEM_LOAD_FULLWARP)
#  endif
# else
#  define SHMEM_LOAD_ITERATIONS (SHMEM_LOADS)
# endif
#endif




#ifdef SIMPLE_SHMEM
# define CACHE(proj, bin, c) cache[c][proj][bin]
# define FANCY_CACHE(proj, bin, c) (*(vfloat*)(((char*)(&(cache[c][proj][0]))) + bin))
# define FANCY_CACHE_MULTIPLIER (sizeof(vfloat))
#else 
# define CACHE(proj, bin, c) cache[proj][bin].comp(c)
# define CACHE2(proj, bin) cache[proj][bin]
# define FANCY_CACHE(proj, bin, c) (*(vfloat*)(((char*)(&(cache[proj]))) + bin + c))
# define FANCY_CACHE2(proj, bin) (*(float2*)(((char*)(&(cache[proj]))) + bin))
# define FANCY_CACHE_MULTIPLIER (2 * sizeof(float))
#endif

#define FANCY_MINH_CORRECTION(x) (x)
#ifdef OVERSAMPLE
# ifdef HST_OVERS_FANCY_ROUND
#  ifdef HST_OVERS_FANCY_INDEX
#   define FANCY_INDEX
//#  define FANCY_ROUND(idx, h) ({idx = FANCY_CACHE_MULTIPLIER * (int)(h); })
#   define FANCY_ROUND(idx, h) ({float fh = FANCY_CACHE_MULTIPLIER * ((h) + exp23) - (FANCY_CACHE_MULTIPLIER - 1) * exp23; idx = __float_as_int(fh) - 0x4B000000; })
#  else /* NO FANCY_INDEX */
#   define FANCY_ROUND(idx, h) ({float fh = (h) + exp23; idx = __float_as_int(fh) - 0x4B000000; })
#  endif /* NO FANCY_INDEX */
# endif /* FANCY_ROUND */
#else /* NOT OVERSAMPLE */
# ifdef HST_LINEAR_FANCY_FLOOR
#  ifdef HST_LINEAR_FANCY_INDEX
#   define FANCY_INDEX
#   define FANCY_ROUND(idx, diff, h) ({ float newh = h - 0.499999f; float fh = newh + exp23; diff = (fh - exp23); fh = FANCY_CACHE_MULTIPLIER * fh - (FANCY_CACHE_MULTIPLIER - 1) * exp23; idx = (*(int*)(&fh)) - (0x4B000000 + FANCY_CACHE_MULTIPLIER); })
#  else /* NO FANCY_INDEX */
#   define FANCY_ROUND(idx, diff, h) ({ float newh = h - 0.499999f; float fh = newh + exp23; diff = (fh - exp23); idx = (*(int*)(&fh)) - 0x4B000001; })
#  endif
	// For proper rounding we need values above 1. So, we artificially add 1 to the array offset and remove it again during the rounding.
#  undef FANCY_MINH_CORRECTION
#  define FANCY_MINH_CORRECTION(x) ((x) - 1.f)
# endif
#endif
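// A note on the FANCY_ROUND variants above (a reading of the code, not authoritative):
// adding exp23 = 2^23 to a non-negative float below 2^23 pushes its integer part into the
// low mantissa bits, so reinterpreting the sum as an int and subtracting the bit pattern of
// 2^23 (0x4B000000) recovers the rounded value without an explicit float-to-int conversion.
// The linear variant first subtracts 0.499999f so the result behaves like floor(), and the
// FANCY_INDEX forms additionally pre-scale by FANCY_CACHE_MULTIPLIER so that idx is already
// a byte offset into the cache.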


#ifdef FANCY_INDEX
# define FAST_CACHE FANCY_CACHE
# define FAST_CACHE2 FANCY_CACHE2
#else
# define FAST_CACHE CACHE
# define FAST_CACHE2 CACHE2
#endif
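// FAST_CACHE thus matches whatever FANCY_ROUND produced: a pre-scaled byte offset when
// FANCY_INDEX is active (served by FANCY_CACHE), and a plain element index otherwise.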





#ifdef SIMPLE_SHMEM
# ifdef CACHE_Y
#  ifdef SPLIT_SHMEM
#   define GET_CACHE(v1, v2, minh, i) ({ int bin = (minh); float2 vxy, vzw; vxy = FAST_CACHE(i, bin, 0); vzw = FAST_CACHE(i, bin, 2); v1 = mkf4(vxy, vzw);  vxy = FAST_CACHE(i, bin, 1); vzw = FAST_CACHE(i, bin, 3); v2 = mkf4(vxy, vzw); })
#  else /* SPLIT_SHMEM */
#   define GET_CACHE(v1, v2, minh, i) ({ int bin = (minh); v1 = FAST_CACHE(i, bin, 0); v2 = FAST_CACHE(i, bin, 1); })
#  endif /* SPLIT_SHMEM */
# else
#  if defined(SPLIT_SHMEM32)
#   if SLICE_BLOCK == 4
#    define GET_CACHE(v, minh, i) ({ int bin = (minh); float vx, vy, vz, vw; vx = FAST_CACHE(i, bin, 0); vy = FAST_CACHE(i, bin, 1); vz = FAST_CACHE(i, bin, 2); vw = FAST_CACHE(i, bin, 3); v = make_float4(vx, vy, vz, vw); })
#   elif SLICE_BLOCK == 2
#    define GET_CACHE(v, minh, i) ({ int bin = (minh); float vx, vy; vx = FAST_CACHE(i, bin, 0); vy = FAST_CACHE(i, bin, 1); v = make_float2(vx, vy); })
#   else
#    error "SHMEM32 should only be enabled in multi-slice modes"
#   endif
#  elif defined(SPLIT_SHMEM)
#   define GET_CACHE(v, minh, i) ({ int bin = (minh); float2 vxy, vzw; vxy = FAST_CACHE(i, bin, 0); vzw = FAST_CACHE(i, bin, 1); v = mkf4(vxy, vzw); })
#  else /* SPLIT_SHMEM */
#   ifdef HST_HALF_CACHE
#    define GET_CACHE(v, minh, i) ({ int bin = (minh); union { sfloat f; half2 h[2]; } vdata; vdata.f = FAST_CACHE(i, bin, 0); float4 data = mkf4(__half22float2(vdata.h[0]), __half22float2(vdata.h[1])); v = data; })
#   else /* ! HALF_CACHE */
#    define GET_CACHE(v, minh, i) ({ int bin = (minh); v = FAST_CACHE(i, bin, 0); })
#   endif /* ! HALF_CACHE */
#  endif /* SPLIT_SHMEM */
# endif
#else /* SIMPLE_SHMEM */
# define GET_CACHE(v1, v2, minh, i) ({ int bin = (minh); float2 c = FAST_CACHE2(i, bin); v1 = c.x; v2 = c.y; })
#endif /* SIMPLE_SHMEM */
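// GET_CACHE reassembles one vfloat sample from however the components were split over the
// shmem arrays: one float array per component for SPLIT_SHMEM32, two float2 arrays for
// SPLIT_SHMEM, half2 unpacking for HST_HALF_CACHE, and a single (x, y-x) float2 in the
// non-SIMPLE layout.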

#if defined(HST_MEM_FETCHES)&&(SHMEM_SAMPLING == 1)&&!defined(HST_HALF_MODE)
# define HST_FETCH_VAR(var, val, tex05) const int var = (val); 	// floor?
# define GET_SINO(bin, proj) sino[(proj)*bin_pitch + (((bin)<num_bins)?(bin):0)]
#else  /* HST_MEM_FETCHES */
# define HST_FETCH_VAR(var, val, tex05) const float var = (val) + tex05;
# ifdef HST_HALF_CACHE
#  define GET_SINO(bin, proj) hst_real_tex(texptr, bin, proj);
# else
#  define GET_SINO(bin, proj) hst_tex(texptr, bin, proj);
# endif /* HST_HALF_CACHE */
#endif /* HST_MEM_FETCHES */

#ifdef DEBUG_OFFSETS
# undef GET_SINO
# define GET_SINO(bin, proj)  {0.f}
#endif

#if defined(CACHE_Y)
# define INTERPOLATE_FROM_CACHE(i, idx) ({ vfloat v1, v2; GET_CACHE(v1, v2, idx, i); (v1 + diff * v2); })
#elif defined(OVERSAMPLE)
# define INTERPOLATE_FROM_CACHE(i, idx) ({ vfloat v; GET_CACHE(v, idx, i); v; })
#else  /* UNCACHED_Y */
# define INTERPOLATE_FROM_CACHE(i, idx) ({ vfloat v1, v2; int _idx = idx; GET_CACHE(v1, _idx, i); GET_CACHE(v2, _idx + 1, i); (v1 + diff * (v2 - v1)); })
#endif /* OVERSAMPLE */
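// With CACHE_Y the (y - x) difference is pre-stored as the second component, so interpolation
// is a single multiply-add with the fractional part `diff` computed at the call site; the
// oversampling kernels take the nearest sample directly; otherwise two neighbouring bins are
// fetched and the difference is formed on the fly.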


#ifdef SHMEM_LOAD_FULLWARP
# ifdef SHMEM_LOAD2
#  define CACHE_POS(i) (2 * SHMEM_LOAD_FULLWARP * i * BSX + 2 * cache_tidx)
# else
#  define CACHE_POS(i) (SHMEM_LOAD_FULLWARP * i * BSX + cache_tidx)
# endif
#else
# define CACHE_POS(i) (i * BSX + cache_tidx)
#endif

#if HST_LINEAR_CACHE_BLOCK > (BSX * BSY / ASSYMETRY)
# define CACHE_BLOCK (BSX * BSY / ASSYMETRY)
#else 
# define CACHE_BLOCK HST_LINEAR_CACHE_BLOCK
#endif
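// CACHE_BLOCK is capped at the number of threads per block (BSX * BSY / ASSYMETRY) so that
// the per-projection minh/subh preload in the kernel can be filled one value per thread in a
// single pass.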

#if defined(HST_SUBH_DIRECT)&&!defined(HST_CACHE_SUBH)
# define FLOOR(x) (x)
#else
//# define FLOOR(x) (floor(x))
# define FLOOR(x) (floor(x - 0.5f) + 0.5f)
#endif
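// The default FLOOR snaps its argument down to the nearest half-integer, matching the
// 0.5-shifted texture coordinates; with HST_SUBH_DIRECT (and no SUBH cache) the value is
// passed through unchanged.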

#if (ASSYMETRY > 1)&&((BSX != 16)||(BSY != 16))
# error "Can't use non standard block size together with assymetry"
#elif (ASSYMETRY > 1)
# undef HST_SQUARE_PPT
#elif (BSX != 16)||(BSY != 16)
# define HST_SQUARE_PPT
#endif

#if defined(HST_CACHE_SUBH_X)//&&!defined(OVERSAMPLE)
# define CACHE_SUBH_X
#else
# undef CACHE_SUBH_X
#endif


#ifdef CACHE_SUBH_X
# ifdef HST_CACHE_MINH
#  ifdef HST_XCACHE_LD128
#   define P_BLOCK HST_XCACHE_LD128
#  else
        // 2 seems optimal, 4 on Kepler?
#   define P_BLOCK 1
#  endif
#  undef HST_SQUARE_PPT
#  if ASSYMETRY > 1
#   error "ASSYMETRY has to be implemented for SQUARE_PPT"
#  endif
# else 
#  define P_BLOCK PPT
#  define HST_SQUARE_PPT
# endif
#endif

#if P_BLOCK == 4
# define PTYPE float4
#elif P_BLOCK == 2
# define PTYPE float2
#else
# define PTYPE float
#endif

#if defined(HST_CACHE_MINH)
# ifdef HST_PRECOMPUTE_OFFSETS
#  define CACHE_IDX(p) (proj_offset + (p))
# else
#  define CACHE_IDX(p) (subproj * PROJ_BLOCK + (p))
# endif
#else
# define CACHE_IDX(p) (p)
#endif




#ifdef SHMEM_LOAD_FULLWARP
# define PSTEP (BSY/ASSYMETRY/SHMEM_LOAD_FULLWARP)
#else
# define PSTEP (BSY/ASSYMETRY)
#endif



#if defined(HST_HYBRID) && (!defined(OVERSAMPLE) || (OVERSAMPLE == 1))
 __device__
 static 
#else
__global__
static 
# if MIN_BLOCKS > 0
__launch_bounds__(BSX * BSY / ASSYMETRY, MIN_BLOCKS)
# endif
#endif
void KERNEL_NAME(cudaTextureObject_t texptr, const float * __restrict__ g_all, const tfloat* __restrict__ sino, int num_proj, int num_bins, int bin_pitch, vfloat *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
#ifdef HST_SQUARE_PPT
    vfloat res[PPT][PPT] = {0.f};
#else
# define YSIZE (ASSYMETRY * PPT * PPT)
# define YSTEP (BSY / (ASSYMETRY * PPT))
    vfloat res[YSIZE] = {0.f};
#endif

    const float exp23 = exp2(23.f);

    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;

#ifdef SHMEM_LOAD_FULLWARP
	// Should not happen for 8x8 blocks (should be no conflicts with ppt=2)
    const int cache_tidx = (tidy%SHMEM_LOAD_FULLWARP)*16 + tidx;
    const int cache_tidy = tidy/SHMEM_LOAD_FULLWARP;
#else
    const int cache_tidx = tidx;
    const int cache_tidy = tidy;
#endif
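	// With SHMEM_LOAD_FULLWARP, SHMEM_LOAD_FULLWARP rows of (presumably 16-wide) threads are
	// fused into one wider row for the shmem load phase: cache_tidx then spans the fused row
	// while cache_tidy counts the fused rows.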

    const int bidx = PPT * blockIdx.x * BSX;
    const int bidy = batch + PPT * blockIdx.y * BSY;

	// The centers are offset by 0.5f so that we sample at pixel centers, as expected by the texture engine
    const float off_x = apos_off_x;
    const float off_y = apos_off_y;

    const float bx = bidx + off_x;
    const float by = bidy + off_y;

#if defined(HST_SQUARE_PPT)
# ifdef REMAP_THREADS
	// Also should not happen for 8x8 blocks (but we can reconsider)
#  if (BSX == 16)&&(BSY == 16)
    const int sidx = 4 * (threadIdx.y % 4) + (threadIdx.x % 4);
    const int sidy = 4 * (threadIdx.y / 4) + (threadIdx.x / 4);
#  elif (BSX == 8)&&(BSY == 8)
    const int sidx16 = 4 * (threadIdx.y % 4) + (threadIdx.x % 4);
    const int sidx = sidx16 % 8;
    const int sidy = 4 * (sidx16 / 8) + 2 * (threadIdx.y / 4) + (threadIdx.x / 4);
#  else
#   error "Specified block size is not supported"
#  endif
# else
    const int sidx = tidx;
    const int sidy = tidy;
# endif
#else // standard mode
# if PPT == 4
#  if defined(HST_ZMAP)&&(ASSYMETRY < 2)
    const int xy_minor = threadIdx.x % 4;
    const int y_minor = xy_minor / 2;
    const int x_minor = xy_minor % 2;

    const int xy_major = threadIdx.x / 4;
    const int y_major = xy_major / 2;
    const int x_major = xy_major % 2;

    const int sidx = 4 * ASSYMETRY * threadIdx.y + x_major * 2 + x_minor;
    const int sidy = y_major * 2 + y_minor;
#  elif defined(HST_LMAP)&&(ASSYMETRY < 2)
    const int sidx = 16 * (threadIdx.y % 4) + threadIdx.x;
    const int sidy = threadIdx.y / 4;
#  else
    const int sidx = 4 * ASSYMETRY * threadIdx.y + (threadIdx.x % (4 * ASSYMETRY));
    const int sidy = (threadIdx.x / (4 * ASSYMETRY));
#  endif
# elif PPT == 2
#  if ASSYMETRY == 4
    const int sidx = 8 * threadIdx.y  + (threadIdx.x % 8);
    const int sidy = (threadIdx.x / 8);
#  elif defined(HST_ZMAP)&&(ASSYMETRY < 2)
    const int xy_minor = threadIdx.x % 4;
    const int y_minor = xy_minor / 2;
    const int x_minor = xy_minor % 2;

    const int xy_major = threadIdx.x / 4;
    const int y_major = xy_major / 2;
    const int x_major = xy_major % 2;

    const int sidx = 4 * (threadIdx.y % 8) + x_major * 2 + x_minor;
    const int sidy = 4 * (threadIdx.y / 8) + y_major * 2 + y_minor;
#  elif defined(HST_LMAP)&&(ASSYMETRY < 2)
    const int sidx = 16 * (threadIdx.y % 2) + threadIdx.x;
    const int sidy = threadIdx.y / 8;
#  else
    const int sidx = 4 * (threadIdx.y % 8) + (threadIdx.x % 4);
    const int sidy = 4 * (threadIdx.y / 8) + (threadIdx.x / 4);
#  endif
# else
#   error "Specified PPT is not supported"
# endif
#endif
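    // sidx/sidy remap each thread to the output pixel it accumulates so that the threads of a
    // warp work on neighbouring pixels and therefore hit nearby (cached) sinogram bins.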

    const int idx = bidx + sidx; 
    const int idy = bidy + sidy; 

#if defined(CACHE_SUBH_X)&&!defined(HST_CACHE_MINH)
    const float _x = SHMEM_SAMPLING * (bx + cache_tidx);
#else
    const float x = SHMEM_SAMPLING * (idx + off_x);
#endif
    const float y = SHMEM_SAMPLING * (idy + off_y);


#ifdef DEBUG_OFFSETS
    int debug_run = !DEBUG_OFFSETS;
#endif 

#ifdef HST_CACHE_MINH
# ifdef HST_SHFL_MINH 
     const int wrapid = (tidy&1) * 16 + tidx;
# endif
    __shared__ float  cache_minh[CACHE_BLOCK];		// For mem-fetches we actually can store in unsigned char (but that would involve back-conversion for texture fetches)
    __shared__ float  cache_subh[CACHE_BLOCK];
# ifdef CACHE_SUBH_X
#  if P_BLOCK == 2
	// may be different on SHMEM64
    __shared__ float  cache_subhx[PROJ_BLOCK/P_BLOCK][PPT * BSX + 8][P_BLOCK];
#  elif P_BLOCK > 1
    __shared__ float  cache_subhx[PROJ_BLOCK/P_BLOCK][PPT * BSX][P_BLOCK];
#  else 
    __shared__ float  cache_subhx[PROJ_BLOCK][PPT * BSX + 8];
#  endif
    __shared__ float  cache_cos[CACHE_BLOCK];
    __shared__ float  cache_sin[CACHE_BLOCK];
# else
    __shared__ float2 cache_sin[CACHE_BLOCK];
# endif

    int tminhx = tidy * BSX + tidx;
#else
# ifdef HST_CACHE_SUBH
#  ifdef CACHE_SUBH_X
//    __shared__ float  cache_subh[PROJ_BLOCK/P_BLOCK][PPT * BSX][P_BLOCK];
    __shared__ PTYPE  cache_subhx[PROJ_BLOCK][BSX];
#  else
    __shared__  float   cache_subh[PROJ_BLOCK];
#  endif
# endif

# ifdef HST_CACHE_SIN
#  ifdef CACHE_SUBH_X
    __shared__  float  cache_sin[PROJ_BLOCK];
#  else
    __shared__  float2  cache_sin[PROJ_BLOCK];
#  endif
# endif
#endif

#ifdef SIMPLE_SHMEM
# if defined(SPLIT_SHMEM32)
    __shared__  float   cache[SLICE_BLOCK * SHMEM_ARRAYS][PROJ_BLOCK + SHMEM_SHIFTED_PADDING][SHMEM_LOADS * BSX + SHMEM_PADDING];
# elif defined(SPLIT_SHMEM)
    __shared__  float2   cache[2 * SHMEM_ARRAYS][PROJ_BLOCK + SHMEM_SHIFTED_PADDING][SHMEM_LOADS * BSX + SHMEM_PADDING];
# else
    __shared__  sfloat   cache[SHMEM_ARRAYS][PROJ_BLOCK + SHMEM_SHIFTED_PADDING][SHMEM_LOADS * BSX + SHMEM_PADDING];
# endif
#else 
    __shared__  float2   cache[PROJ_BLOCK][SHMEM_LOADS * BSX + SHMEM_PADDING];
#endif
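    // The sinogram cache holds PROJ_BLOCK projection rows of SHMEM_LOADS * BSX bins (plus
    // padding); with SPLIT_SHMEM / SPLIT_SHMEM32 the components of a multi-slice value are
    // spread over separate sub-arrays (see the selection logic above).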


    HST_FETCH_VAR(projf, cache_tidy, 0.5f);

#ifdef HST_CACHE_MINH
    for (int proje=0; proje<num_proj; /*proje += PROJ_BLOCK*/) {

	    // Load2 on SHMEM64
      if (tminhx < CACHE_BLOCK) {
        int g_proj = proje + tminhx;
# if defined(HST_GMEM_CONST)
	const float4 all = { g_all[g_proj], g_all[MAXNPROJECTIONS + g_proj], g_all[2 * MAXNPROJECTIONS + g_proj], g_all[3 * MAXNPROJECTIONS + g_proj] };
# elif defined(HST_C_TRIG)
	const float4 all = (float4){c_trig[g_proj].x, c_trig[g_proj].y, c_ofst[g_proj].x, c_ofst[g_proj].y };
# else
	const float4 all = c_all[g_proj];
# endif

# if !defined(HST_GMEM_CONST)&&defined(HST_C_TRIG)
//	const float g_minh = floor(__fmaf_rz(-by, all.y, __fmaf_rz(bx, all.x, all.w)));
	const float g_minh = floor(bx * all.x - by * all.y + all.w);
	float g_subh = all.z - SHMEM_SAMPLING * g_minh;
# else
	const float g_minh = floor(all.z + bx * all.x - by * all.y + all.w);
	float g_subh = SHMEM_SAMPLING * (all.z - FANCY_MINH_CORRECTION(g_minh));
# endif

# if defined(OVERSAMPLE)&&!defined(FANCY_ROUND)
	g_subh += 0.5f;
# endif
	cache_subh[tminhx] = g_subh;
	cache_minh[tminhx] = g_minh;
# ifdef CACHE_SUBH_X
	cache_cos[tminhx] = all.x;
	cache_sin[tminhx] = all.y;
# else
	cache_sin[tminhx] = (float2){ all.x, all.y};
# endif
      }

      __syncthreads();

# ifdef HST_SHFL_MINH 
      float regcache_subh[CACHE_BLOCK / 32];
#  pragma unroll
      for (int i = 0; i < (CACHE_BLOCK / 32); i ++) {
        regcache_subh[i] = cache_subh[32 * i + wrapid];
      }
# endif
# if CACHE_BLOCK > 32
      const int last_proj = min(num_proj, proje + CACHE_BLOCK);
      for (int subproj = 0; proje < last_proj; subproj++, proje += PROJ_BLOCK) {
# else
#pragma unroll
      for (int subproj = 0; subproj < (32 / PROJ_BLOCK); subproj++, proje += PROJ_BLOCK) {
#endif
# ifdef HST_PRECOMPUTE_OFFSETS
	 const int proj_offset = subproj * PROJ_BLOCK;
	 const int proj_var = (proj_offset) >> 5;
	 const int proj_idx = (proj_offset)&0x1F;
# endif

#else
    for (int proje=0; proje < num_proj; proje += PROJ_BLOCK) {
#endif
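	// With HST_CACHE_MINH the projections are processed in CACHE_BLOCK-sized chunks whose
	// per-projection parameters were preloaded above, each chunk swept in PROJ_BLOCK steps;
	// otherwise there is only the single PROJ_BLOCK loop.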

	if (cache_tidy < PROJ_BLOCK) {
#if PSTEP < PROJ_BLOCK
#warning "multi-step caching..."
# define P(proje) (proje + p)
#pragma unroll
	  for (int p = 0; p < PROJ_BLOCK; p += PSTEP) {
#else
# define P(proje) (proje)
	  {
#endif
#ifdef HST_CACHE_MINH
	    const float minh = cache_minh[subproj * PROJ_BLOCK + P(cache_tidy)];
#else // ! HST_CACHE_MINH


# ifdef HST_C_TRIG
	    const float4 all = (float4){c_trig[P(proje + cache_tidy)].x, c_trig[P(proje + cache_tidy)].y, c_ofst[P(proje + cache_tidy)].x, c_ofst[P(proje + cache_tidy)].y };
//	    const float minh = FLOOR(__fmaf_rz(-by, all.y, __fmaf_rz(bx, all.x, all.w)));
	    const float minh = FLOOR(bx * all.x - by * all.y + all.w);
# else
		// cache_tidy is either equal to tidy or to tidy/2
	    float4 all = c_all[P(proje + cache_tidy)];
	    float minh = FLOOR(all.z + bx * all.x - by * all.y + all.w);
# endif

# ifdef HST_CACHE_SUBH
#  ifdef HST_C_TRIG
	    float subh = all.z - SHMEM_SAMPLING * minh;
#  else
	    float subh = SHMEM_SAMPLING * (all.z - FANCY_MINH_CORRECTION(minh));
#  endif
#  if defined(OVERSAMPLE)&&!defined(FANCY_ROUND)
	    subh += 0.5f;
#  endif

#  ifdef CACHE_SUBH_X
#   if (defined(SHMEM_LOAD_FULLWARP))
/*	if (cache_tidx < BSX) 
	    cache_subhx[P(cache_tidy)][cache_tidx].x = subh + _x * all.x;
	else
	    cache_subhx[P(cache_tidy)][cache_tidx - BSX].y = subh + _x * all.x;
*/
#if PPT == 2
	const int hxpos = (cache_tidx < BSX)?(2*cache_tidx):(2*(cache_tidx - BSX) + 1);
	*(((float*)(&cache_subhx[P(cache_tidy)][0])) + hxpos) = subh + _x * all.x;
#else
	const int hxhalf = (cache_tidx < BSX)?0:1;
	const int hxpos = (cache_tidx < BSX)?(2*cache_tidx):(2*(cache_tidx - BSX) + 1);
	*(((float2*)(&cache_subhx[P(cache_tidy)][0])) + hxpos) = (float2){
		subh + (_x + (2 * hxhalf    ) * BSX * SHMEM_SAMPLING) * all.x,
		subh + (_x + (2 * hxhalf + 1) * BSX * SHMEM_SAMPLING) * all.x
	};
#endif

#    ifdef HST_CACHE_SIN
	if (cache_tidx == 0)
	    cache_sin[P(cache_tidy)] = all.y;
#    endif

	{
#   else
#    ifdef SHMEM_LOAD_FULLWARP
	    if (cache_tidx < BSX) 
#    endif
	    {


	    cache_subhx[P(cache_tidy)][cache_tidx] = (PTYPE){
		subh + _x * all.x
		, subh + (_x + BSX * SHMEM_SAMPLING) * all.x
#    if PPT > 2
		, subh + (_x + 2 * BSX * SHMEM_SAMPLING) * all.x
		, subh + (_x + 3 * BSX * SHMEM_SAMPLING) * all.x
#    endif
	    };


#    ifdef HST_CACHE_SIN
	    if (cache_tidx == 0)
	        cache_sin[P(cache_tidy)] = all.y;
#    endif
#   endif // SHMEM_LOAD_FULLWARP
#  else
	    if (cache_tidx == 0) {
		cache_subh[P(cache_tidy)] = subh;
#   ifdef HST_CACHE_SIN
		cache_sin[P(cache_tidy)] = (float2){all.x, all.y};
#   endif
#  endif
	    }
# endif // HST_CACHE_SUBH
#endif // ! HST_CACHE_MINH


	    HST_FETCH_VAR(binf, minh, 0.0f);
#if defined(NEWCACHE)
	    const int bin_start = SHMEM_LOAD_ITERATIONS * cache_tidx;
	    float v1 = GET_SINO(binf + bin_start, P(proje) + projf);
	    float v = v1;
#pragma unroll
	    for (int i = 0; i < (SHMEM_LOAD_ITERATIONS - 1); i++) {
		const int pos = bin_start + i;
		const float vnext = GET_SINO(binf + pos + 1, P(proje) + projf);
		CACHE2(P(cache_tidy), pos) = (float2){ v, vnext - v };
		v = vnext;
	    }
//	    const int last_bin = bin_start + SHMEM_LOAD_ITERATIONS;
# if defined(SHFL_CACHE)
#  ifdef SHMEM_LOAD_FULLWARP
	    v1 = __shfl_down(v1, 1, 32);
#  else
	    v1 = __shfl_down(v1, 1, BSX);
#  endif
# else
	    __threadfence_block();
#  ifdef HST_NEWCACHE_UNPAD
#   ifdef SHMEM_LOAD_FULLWARP
	    v1 = (cache_tidx<31)?CACHE(P(cache_tidy),  bin_start + SHMEM_LOAD_ITERATIONS, 0):v;
#   else
	    v1 = (cache_tidx<(BSX-1))?CACHE(P(cache_tidy),  bin_start + SHMEM_LOAD_ITERATIONS, 0):v;
#   endif
#  else
	    v1 = CACHE(P(cache_tidy),  bin_start + SHMEM_LOAD_ITERATIONS, 0);
#  endif
# endif
	    CACHE2(P(cache_tidy), bin_start + SHMEM_LOAD_ITERATIONS - 1) = (float2){ v, (v1 - v) };
#else
# pragma unroll
	    for (int i = 0; i < SHMEM_LOAD_ITERATIONS; i++) {
		int pos = CACHE_POS(i);

		sfloat v = GET_SINO(binf + OVERSAMPLE_MULTIPLIER * pos, P(proje) + projf);
# if defined(SPLIT_SHMEM32)
		CACHE(P(cache_tidy), pos, 0) = v.x;
		CACHE(P(cache_tidy), pos, SHMEM_ARRAYS) = v.y;
#  if SLICE_BLOCK > 2
		CACHE(P(cache_tidy), pos, 2 * SHMEM_ARRAYS) = v.z;
		CACHE(P(cache_tidy), pos, 3 * SHMEM_ARRAYS) = v.w;
#  endif
# elif defined(SPLIT_SHMEM)
		CACHE(P(cache_tidy), pos, 0) = (float2){v.x, v.y};
		CACHE(P(cache_tidy), pos, SHMEM_ARRAYS) = (float2){v.z, v.w};
# elif defined(SHMEM_LOAD2)
		float v2 = GET_SINO(binf + OVERSAMPLE_MULTIPLIER * (pos + 1), P(proje) + projf);
		*(float2*)&CACHE(P(cache_tidy), pos, 0) = (float2){v, v2};
# else
		CACHE(P(cache_tidy), pos, 0) = v;
# endif
	    }

# ifdef CACHE_Y
	    // We don't need to sync all groups here, only to keep the optimizer from reshuffling the
	    // instructions; __syncthreads() would have to be executed by all threads and is slightly slower.
	__threadfence_block();

# pragma unroll
	    for (int i = 0; i < SHMEM_LOAD_ITERATIONS; i++) {
		int pos = CACHE_POS(i);
		CACHE(P(cache_tidy), pos, 1) = CACHE(P(cache_tidy), pos + 1, 0) - CACHE(P(cache_tidy), pos, 0);
#  ifdef SPLIT_SHMEM
		CACHE(P(cache_tidy), pos, 3) = CACHE(P(cache_tidy), pos + 1, 2) - CACHE(P(cache_tidy), pos, 2);
#  endif
	    }
# endif // CACHE_Y
#endif // NEWCACHE
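	    // NEWCACHE path above: each thread fills a contiguous run of SHMEM_LOAD_ITERATIONS bins
	    // with (value, next - value) pairs so interpolation needs one float2 read; the last pair
	    // borrows the neighbouring thread's first value, either via __shfl_down or by re-reading
	    // it through shared memory. The classic path strides the row via CACHE_POS and, with
	    // CACHE_Y, forms the differences in a second pass.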

	  } // p
	} //cache_tidy in range

	__syncthreads();
	

#if defined(HST_CACHE_MINH)&&defined(CACHE_SUBH_X)
	const int pid = CACHE_IDX(sidy);

# if PROJ_BLOCK < YSTEP
#  if P_BLOCK > 1
	if (sidy < PROJ_BLOCK) {
	    cache_subhx[sidy/P_BLOCK][sidx][sidy%P_BLOCK] = cache_subh[pid] + x * cache_cos[pid];
	}
#  else
	if (sidy < PROJ_BLOCK) {
	    cache_subhx[sidy][sidx] = cache_subh[pid] + x * cache_cos[pid];
	}
#  endif
# else
#  if P_BLOCK > 1
#   pragma unroll
	for (int i = 0; i < (PROJ_BLOCK / YSTEP); i++) {
	    cache_subhx[(i * YSTEP)/P_BLOCK + sidy/P_BLOCK][sidx][sidy%P_BLOCK] = cache_subh[pid + i * YSTEP] + x * cache_cos[pid + i * YSTEP];
	}
#  else /* P_BLOCK == 1 */
#   pragma unroll
	for (int i = 0; i < (PROJ_BLOCK / YSTEP); i++) {
	    cache_subhx[(i * YSTEP) + sidy][sidx] = cache_subh[pid + i * YSTEP] + x * cache_cos[pid + i * YSTEP];
	}
#  endif /* P_BLOCK == 1 */
# endif
	__syncthreads(); //?
#endif
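	// With CACHE_SUBH_X the per-projection offset subh + x * cos has just been expanded to one
	// entry per output column, so the accumulation loop below only needs to subtract the y term.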



#if defined(HST_CACHE_MINH)&&defined(CACHE_SUBH_X)
# if P_BLOCK > 1
//#  pragma unroll
	for (int pblock = 0; pblock < (PROJ_BLOCK/P_BLOCK); pblock ++) {
/*	    PTYPE _subh = *(PTYPE*)(&cache_subhx[pblock][sidx][0]);
	    PTYPE _sincos = *(PTYPE*)(&cache_sin[CACHE_IDX(P_BLOCK * pblock)]);
	    float *pb_subh = (float*)&_subh;
	    float *pb_sincos = (float*)&_sincos;*/


	    float pb_subh[P_BLOCK];
	    float pb_sincos[P_BLOCK];

#  pragma unroll
	    for (int i = 0; i < P_BLOCK; i++) {
		pb_sincos[i] = cache_sin[CACHE_IDX(P_BLOCK * pblock + i)];
#  ifdef HST_XCACHE_EARLY_Y
		pb_subh[i] = cache_subhx[pblock][sidx][i] - y * pb_sincos[i];
#  else
		pb_subh[i] = cache_subhx[pblock][sidx][i];
#  endif
	    }

#  pragma unroll
	    for (int pstep = 0; pstep < P_BLOCK; pstep++) {
		const int p = pblock * P_BLOCK + pstep;
		const float subh = pb_subh[pstep];
		const float sincos = pb_sincos[pstep];

# else

            {
# pragma unroll
	      for (int p = 0; p < PROJ_BLOCK; p++) {
		float sincos = cache_sin[CACHE_IDX(p)];
#  ifdef HST_XCACHE_EARLY_Y
		float subh = cache_subhx[p][sidx] - y * sincos;
#  else
		float subh = cache_subhx[p][sidx];
#  endif
# endif

#else /* ! CACHE_MINH && CAHE SUBH_X */
	    {
# if !defined(HST_SHMEM64)
#  if SLICE_BLOCK > 2
#   pragma unroll 2
#  else
#   pragma unroll	// over-unrolling hurts performance (but 4 is needed to allow joining accesses to subh_cache)
#  endif
# elif (ASSYMETRY > 2)||(SLICE_BLOCK > 1)||(!defined(HST_FANCY_ROUND))
#  pragma unroll 2	// over-unrolling hurts performance (but 4 is needed to allow joining accesses to subh_cache)
# else
#  pragma unroll 4
	// over-unrolling hurts performance (but 4 is needed to allow joining accesses to subh_cache)
# endif
	    for (int p = 0; p < PROJ_BLOCK; p++) {

# if !defined(HST_CACHE_SUBH)
#  ifdef HST_C_TRIG
		const float4 all = (float4){c_trig[proje + p].x, c_trig[proje + p].y, c_ofst[proje + p].x, c_ofst[proje + p].y };
#  else
		const float4 all = c_all[proje + p];
#  endif
# elif !defined(HST_CACHE_SIN)
#  if defined(CACHE_SUBH_X)&&defined(HST_C_SIN)
		const float all = c_sin[proje + p];
#  elif defined(HST_C_TRIG)
		const float2 all = c_trig[proje + p];
#  else
		const float4 all = c_all[proje + p];
#  endif
# endif

# if defined(HST_CACHE_SUBH)&&defined(HST_CACHE_SIN)
#  ifdef CACHE_SUBH_X
 		const float sincos = cache_sin[CACHE_IDX(p)];
#  else
		const float2 sincos = cache_sin[CACHE_IDX(p)];
#  endif
# elif defined(HST_C_SIN)
#  define sincos all
# elif defined(CACHE_SUBH_X)
#  define sincos all.y
# else
#  define sincos all
# endif

# if defined(HST_CACHE_MINH)
#  if defined(HST_SHFL_MINH)
#   ifdef HST_PRECOMPUTE_OFFSETS
		float subh = __shfl(regcache_subh[proj_var], proj_idx + p, 32);
#   else
		float subh = __shfl(regcache_subh[subproj * PROJ_BLOCK >> 5], (subproj * PROJ_BLOCK + p)&0x1F, 32);
#   endif // HST_PRECOMPUTE_OFFSETS
#  else
		float subh = cache_subh[CACHE_IDX(p)];
#  endif // CACHE_SUBH_X
# elif defined(HST_CACHE_SUBH)
#  ifdef CACHE_SUBH_X
//	    	const int pidx = CACHE_IDX(p);
//	    	const float subh = cache_subh[pidx/4][sidx][pidx%4];
//	    	float subh = cache_subh[CACHE_IDX(pblock * P_BLOCK + p)];
		const float *subh = (float*)&cache_subhx[CACHE_IDX(p)][sidx];
#  else
		float subh = cache_subh[CACHE_IDX(p)];
#  endif // ! CACHE_SUBH_X
# else // DIRECT MODE
#  ifdef HST_C_TRIG
		const float minh = FLOOR(__fmaf_rz(-by, all.y, __fmaf_rz(bx, all.x, all.w)));
//	    	float minh = FLOOR(bx * all.x - by * all.y + all.w);
		const float subh = all.z - SHMEM_SAMPLING * minh;
#  else
		float minh = FLOOR(all.z + bx * all.x - by * all.y + all.w);
		float subh = SHMEM_SAMPLING * (all.z - FANCY_MINH_CORRECTION (minh));
#  endif
#  if defined(OVERSAMPLE)&&!defined(FANCY_ROUND)
		subh += 0.5f; // bringing to center of the pixel to avoid adding 0.5f before interpolation
#  endif
# endif // ! HST_CACHE_MINH

# ifndef CACHE_SUBH_X
//	    	subh += SHMEM_SAMPLING_MULTIPLIER * (x * sincos.x - y * sincos.y);
//	    	subh = (subh + x * sincos.x) - y * sincos.y;
		subh = __fmaf_rz(-y, sincos.y, __fmaf_rz(x, sincos.x, subh));
# endif
#endif /* No p-blocks */

#ifdef HST_SQUARE_PPT
# pragma unroll
		for (int i = 0; i < PPT; i++) {
# ifdef CACHE_SUBH_X
	    	  float hsquare[PPT];
#  pragma unroll
	    	  for (int j = 0; j < PPT; j++) {
	    	    hsquare[j] = subh[j] - (y + i * BSY * SHMEM_SAMPLING) * sincos;
	    	  }
# endif
# pragma unroll
	    	  for (int j = 0; j < PPT; j++) {
# ifdef CACHE_SUBH_X
		    // This is faster for some reason on GTX580
		    const float h = hsquare[j];//subh[j] - (y + i * BSY * SHMEM_SAMPLING) * sincos;

# else
		    const float h = subh + (j * BSX * SHMEM_SAMPLING) * sincos.x - (i * BSY * SHMEM_SAMPLING) * sincos.y;
# endif
#else // ! HST_SQUARE_PPT
# pragma unroll
        	  for (int i = 0; i < YSIZE; i++) {
# ifdef CACHE_SUBH_X
#  ifdef HST_XCACHE_EARLY_Y
            	    const float h = subh - (i * YSTEP * SHMEM_SAMPLING) * sincos;
#  else
            	    const float h = subh - (y + i * YSTEP * SHMEM_SAMPLING) * sincos;
#  endif
# else
            	    const float h = subh  -    (i * YSTEP * SHMEM_SAMPLING) * sincos.y;
# endif
#endif // ! HST_SQUARE_PPT

#ifdef OVERSAMPLE
# ifdef FANCY_ROUND
		    int idx;
		    FANCY_ROUND(idx, h);
# else
		    int idx = (int)h;
		    //int idx = (int)(h + 0.5f);	// DS!!! 0.5f can go to subh [OVERSAMPLE NON FANCY]
		    //int idx = (int)rint(h);	// DS!!! We now add 0.5f, so this should not be used
# endif
#else
# ifdef FANCY_ROUND
		    int idx; float diff;
		    FANCY_ROUND(idx, diff, h);
# else
		    float diff = floorf(h);
		    int idx = (int)diff;
# endif
		    diff = h - diff;
#endif


#ifdef DEBUG_OFFSETS
		    if (debug_run > 0) {
#endif

#ifdef HST_SQUARE_PPT
			res[i][j] += INTERPOLATE_FROM_CACHE(p, idx); 
#else
			res[i] += INTERPOLATE_FROM_CACHE(p, idx); 
#endif

#ifdef DEBUG_OFFSETS
		    }

		    if ((debug_run >=0 )&&((proje + p)==0)) { 
# ifdef HST_SQUARE_PPT
			*(float*)(&res[i][j]) = idx;
# else
			*(float*)(&res[i]) = idx;
# endif
			debug_run = -1;
		    }
#endif
#ifdef HST_SQUARE_PPT
	        } // j
#endif
    	      } // i
    	    } // p
    	} // pblock

	__syncthreads();
#ifdef HST_CACHE_MINH
      }
#endif
    }
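    // Write-back: each thread stores its PPT x PPT (or YSIZE) accumulators to the slice; the
    // row pitch is BSX * PPT * gridDim.x, i.e. the slice width covered by the whole grid.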


#ifdef HST_SQUARE_PPT
# pragma unroll
    for (int i = 0; i < PPT; i++) {
# pragma unroll
	for (int j = 0; j < PPT; j++) {
	    d_SLICE[BSX * PPT * gridDim.x * (idy + i * BSY) + idx + j * BSX] = res[i][j];
	}
    }
#else
# pragma unroll
    for (int i = 0; i < YSIZE; i++) {
	d_SLICE[BSX * PPT * gridDim.x * (idy + i * YSTEP) + idx] = res[i];
    }
#endif

}