/*
 * The PyHST program is Copyright (C) 2002-2011 of the
 * European Synchrotron Radiation Facility (ESRF) and
 * Karlsruhe Institute of Technology (KIT).
 *
 * PyHST is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * hst is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program. If not, see <http://www.gnu.org/licenses/>.
 */
21
/* Per-projection constants uploaded from the host. Per the (truncated)
 * original comment: x = cosine, y = sine, z = axis position, w = min bin.
 * Constant memory is broadcast-friendly when all lanes read the same entry,
 * as the backprojection kernels below do. */
__constant__ float4 c_all[ MAXNPROJECTIONS ] ; //!< (cosinuses, sinuses, axis position, min bin
22
/* 2D texture holding the filtered projections (one row per projection);
 * sampled with tex2D() in every kernel below. Legacy texture-reference API. */
texture<float, 2, cudaReadModeElementType> tex_projes;
25
/*
 * NOTE(review): this block is a garbled extraction — stray line-number tokens
 * sit between statements and several lines are missing (the declarations/loads
 * of tmp1 and tmp2's type, the closing brace), so it does not compile as-is.
 * Comments describe only the visible code.
 *
 * Unpack kernel for full-turn ("fai360") acquisitions: each thread writes two
 * adjacent output samples of a row pair, weighting them so the two half-turn
 * sinogram halves blend in the overlap zone around the rotation axis.
 */
__global__ static void hst_cuda_unpack_kernel_fai360(cufftReal *out, int dpitch, cufftReal *data, int spitch, int half, float *params, int batch) {
28
/* Thread -> (column, row); BLOCK_SIZE is defined elsewhere in the project. */
int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
29
int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
31
/* Columns at or past `half` belong to the second vector of the row pair. */
int dest_vector = (idx >= half)?1:0;
32
int dest_pos = 2 * (idx - dest_vector * half);
34
int src_pos = 2 * (idy * spitch + dest_pos) + dest_vector;
35
int src_vector = dest_vector;//(src_pos >= 0);
38
/* NOTE(review): tmp1's load and tmp2's declaration are missing from this
 * extraction; only this assignment survived. */
tmp2 = data[src_pos + 2];
40
/* c_all[].z carries the corrected rotation-axis position per projection. */
float axis_position_corr = c_all[batch + 2*idy + src_vector].z;
41
/* params packs, per row and half: (flat_zone, ramp length `param`). */
float flat_zone = params[idy * 4 + 2*src_vector];
42
float param = params[idy * 4 + 2*src_vector + 1];
44
int pos = (2 * idy + dest_vector)*dpitch + dest_pos;
46
float apc_plus_flat = axis_position_corr + flat_zone;
47
float apc_param_flat = apc_plus_flat + param - dest_pos;
49
/* Blending weights; __fdividef is the fast, reduced-accuracy division. */
int flag1 = dest_pos < apc_plus_flat;
50
float multiplier1 = __fdividef((flag1?(apc_param_flat - 2 * flat_zone):apc_param_flat),param);
52
int flag2 =(dest_pos + 1) < apc_plus_flat;
53
float multiplier2 = __fdividef((flag2?(apc_param_flat - 2 * flat_zone):apc_param_flat) - 1,param);
56
/* NOTE(review): out[pos]/out[pos+1] are written twice with different clamp
 * orders; in the original file these two variants were presumably separated
 * by conditional compilation or other lost context — confirm upstream. */
out[pos] = tmp1 * min(2., max(flag1?1.:0., multiplier1));
57
out[pos+1] = tmp2 * min(2., max(flag2?1.:0., multiplier2));
59
out[pos] = tmp1 * max(0., min(flag1?1.:2., multiplier1));
60
out[pos+1] = tmp2 * max(0., min(flag2?1.:2., multiplier2));
66
/*
 * NOTE(review): garbled extraction — stray line-number tokens remain and the
 * declarations of `h` and `res` plus the loop/function closing braces are
 * missing, so this does not compile as-is. Comments describe the visible code.
 *
 * Reference backprojector, one output pixel per thread: accumulates one
 * texture sample per projection at bin h = axis + x*cos - y*sin.
 */
__global__ static void hst_cuda_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
67
const int idx = blockIdx.x * BLOCK_SIZE_X + threadIdx.x;
68
/* `batch` offsets the y range so the slice can be processed in row batches. */
const int idy = blockIdx.y * BLOCK_SIZE_Y + threadIdx.y + batch;
73
/* Pixel coordinates shifted by the rotation-axis offsets. */
const float bx = idx + apos_off_x;
74
const float by = idy + apos_off_y;
77
for (int proje=0; proje<num_proj; proje++) {
78
/* c_all[] packs per-projection (cos, sin, axis position, min bin). */
const float4 all = c_all[proje];
79
h = all.z + bx * all.x - by * all.y;
81
/* +0.5f centers the lookup on this projection's texture row. */
res=res+tex2D(tex_projes, h, proje + 0.5f) ;
84
d_SLICE[ BLOCK_SIZE_X*gridDim.x*idy + idx ] = res;
88
/*
 * NOTE(review): garbled extraction — stray line-number tokens remain; the
 * __syncthreads() barriers, the store of the reduced value into fin[], `h`'s
 * declaration, and closing braces are missing, so this does not compile
 * as-is. Comments describe only the visible code.
 *
 * Tiled backprojector: each thread accumulates a 4x4 pixel patch over 16
 * projections per outer iteration, then partial sums are combined across
 * threads through shared memory (warp shuffles on the Kepler build).
 */
__global__ static void hst_kepler_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
90
/* 4x4 per-thread partial sums. */
float res[4][4] = {0};
92
#ifdef HST_OPTIMIZE_KEPLER
93
__shared__ float buf[16][18]; // 64b for kepler
94
/* NOTE(review): fill[] is never referenced in the visible code — presumably
 * shared-memory layout padding; confirm against upstream. */
__shared__ float fill[48];
95
__shared__ float fin[16][18];
96
#else // HST_OPTIMIZE_KEPLER
97
/* Inner dimension 17 = 16 + 1 padding to avoid shared-memory bank conflicts. */
__shared__ float buf[16][17]; // 32b for Fermi & GT200
98
__shared__ float fill[56];
99
__shared__ float fin[16][17];
100
#endif // HST_OPTIMIZE_KEPLER
102
const int tidx = threadIdx.x;
103
const int tidy = threadIdx.y;
105
const int bidx = blockIdx.x * BLOCK_SIZE_X;
106
const int bidy = batch + blockIdx.y * BLOCK_SIZE_Y;
108
/* tidx (0..15) is split into a 4x4 sub-tile coordinate. */
const int sidx = tidx % 4;
109
const int sidy = tidx / 4;
111
const float x = bidx + sidx + apos_off_x;
112
const float y = bidy + sidy + apos_off_y;
114
/* Each tidy handles a different projection row of the 16-projection group. */
const float projf = tidy + 0.5f;
116
for (int proje=0; proje<num_proj; proje+=16) {
117
const float4 all = c_all[proje+tidy];
118
/* NOTE(review): `h` is not declared in the visible code. */
h = all.z + x * all.x - y * all.y;
121
for (int i = 0; i < 4; i++) {
123
for (int j = 0; j < 4; j++) {
124
/* Patch pixels are 4 apart, so shift h by 4*cos / 4*sin per step. */
float subh = h + 4 * j * all.x - 4 * i * all.y;
125
res[i][j] += tex2D(tex_projes, subh, proje + projf);
133
/* Reduce the 16 per-projection partials of each res[i][j] across threads. */
for (int i = 0; i < 4; i++) {
135
for (int j = 0; j < 4; j++) {
136
buf[tidx][tidy] = res[i][j];
141
#ifdef HST_OPTIMIZE_KEPLER
142
float val = buf[tidy][tidx];
143
for (int k=16; k>=1; k/=2)
144
/* Legacy mask-less shuffle (pre-Volta); SM70+ would need __shfl_xor_sync. */
val += __shfl_xor(val, k, 16);
145
#else // HST_OPTIMIZE_KEPLER
146
/* Fermi/GT200 path: volatile in-warp tree reduction in shared memory. */
volatile float *ptr = &buf[tidy][0];
147
for (int k=8; k>=1; k/=2)
148
ptr[tidx] += ptr[tidx+k];
150
#endif // HST_OPTIMIZE_KEPLER
152
/* rx/ry locate the reduced value inside the 16x16 output tile; the code
 * that stores it into fin[] is missing from this extraction. */
const int rx = 4 * j + tidy%4;
153
const int ry = 4 * i + tidy/4;
164
const int idx = bidx + tidx;
165
const int idy = bidy + tidy;
167
d_SLICE[BLOCK_SIZE_X * gridDim.x * idy + idx] = fin[tidy][tidx];
172
/*
 * NOTE(review): garbled extraction — stray line-number tokens remain,
 * __syncthreads() barriers are missing, and the four dangling expressions
 * below look like the arguments of a lost statement (something like
 * `float4 minh = make_float4(...)`, since minh.x..minh.w are then used as
 * cache indices). Does not compile as-is; comments describe the visible code.
 *
 * Nearest-neighbour backprojector, 2x2 pixels per thread: for each group of
 * BLOCK_SIZE_Y projections it stages a window of projection samples in shared
 * memory, then accumulates by direct (non-interpolated) lookups.
 */
__global__ static void hst_cuda_nearest_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
173
/* res[0] = output row idy, res[1] = row idy+1; .x/.y = columns idx/idx+1. */
float2 res[2] = {0.f, 0.f, 0.f, 0.f};
175
const int tidx = threadIdx.x;
176
const int tidy = threadIdx.y;
178
const int bidx = 2 * blockIdx.x * BLOCK_SIZE_X;
179
const int bidy = batch + 2 * blockIdx.y * BLOCK_SIZE_Y;
181
const int idx = bidx + 2 * tidx;
182
const int idy = bidy + 2 * tidy;
184
/* bx/by: block-corner coordinates (used for the shared window base);
 * x/y: this thread's pixel coordinates. */
const float bx = bidx + apos_off_x;
185
const float by = bidy + apos_off_y;
187
const float x = idx + apos_off_x;
188
const float y = idy + apos_off_y;
191
/* +1 column pads cache_subh against shared-memory bank conflicts. */
__shared__ float cache_subh[2*BLOCK_SIZE_X][BLOCK_SIZE_Y + 1];
192
__shared__ float cache_sin[BLOCK_SIZE_Y];
193
__shared__ float cache[BLOCK_SIZE_Y][3 * BLOCK_SIZE_X];
196
float projf = tidy + 0.5;
197
for (int proje=0; proje<num_proj; proje += BLOCK_SIZE_Y) {
198
float4 all = c_all[proje + tidy];
199
/* Leftmost bin of this block's window; all.w is the "min bin" offset. */
int minh = (int)floor(all.z + bx * all.x - by * all.y + all.w);
201
/* Fractional bin offsets for columns idx and idx+1, cached per projection. */
float subh = (all.z + x * all.x - minh);
202
cache_subh[2*tidx][tidy] = subh;
203
cache_subh[2*tidx+1][tidy] = subh + all.x;
204
if (tidx == 0) cache_sin[tidy] = all.y;
208
/* Stage a 3*BLOCK_SIZE_X-bin window of this projection into shared memory. */
for (int i = 0; i < 3; i++) {
209
int pos = i * BLOCK_SIZE_X + tidx;
210
cache[tidy][pos] = tex2D(tex_projes, minh + pos, proje + projf);
216
for (int i = 0; i < BLOCK_SIZE_Y; i++) {
218
cache_subh[2 * tidx][i] - (y) * cache_sin[i],
219
cache_subh[2 * tidx + 1][i] - (y) * cache_sin[i],
220
cache_subh[2 * tidx][i] - (y + 1) * cache_sin[i],
221
cache_subh[2 * tidx + 1][i] - (y + 1) * cache_sin[i],
224
/* minh here is a 4-vector of nearest-bin indices — see NOTE above. */
res[0].x += cache[i][minh.x];
225
res[0].y += cache[i][minh.y];
226
res[1].x += cache[i][minh.z];
227
res[1].y += cache[i][minh.w];
236
/* Vectorized float2 stores: two adjacent columns of two adjacent rows. */
*(float2*)(d_SLICE + BLOCK_SIZE_X * 2 * gridDim.x * (idy ) + idx) = res[0];
237
*(float2*)(d_SLICE + BLOCK_SIZE_X * 2 * gridDim.x * (idy + 1) + idx) = res[1];
241
/*
 * NOTE(review): garbled extraction — stray line-number tokens remain;
 * __syncthreads() barriers, the declarations of `c`, `v1`, `v2`, and the
 * statement that builds the float4 `minh` (whose arguments appear dangling
 * below, while `minh` is declared as int above) are missing. Does not compile
 * as-is; comments describe only the visible code.
 *
 * Linearly-interpolating backprojector, 2x2 pixels per thread: the shared
 * cache stores each sample in .x and its forward difference in .y so the
 * interpolation is a single fused value + fraction*slope per lookup.
 */
__global__ static void hst_cuda_linear_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
242
/* Four accumulators: (row y col x, row y col x+1, row y+1 col x, row y+1 col x+1). */
float4 res = {0.f, 0.f, 0.f, 0.f};
244
const int tidx = threadIdx.x;
245
const int tidy = threadIdx.y;
247
const int bidx = 2 * blockIdx.x * BLOCK_SIZE_X;
248
const int bidy = batch + 2 * blockIdx.y * BLOCK_SIZE_Y;
250
const int idx = bidx + 2 * tidx;
251
const int idy = bidy + 2 * tidy;
253
const float bx = bidx + apos_off_x;
254
const float by = bidy + apos_off_y;
256
const float x = idx + apos_off_x;
257
const float y = idy + apos_off_y;
260
__shared__ float cache_subh[2*BLOCK_SIZE_X][BLOCK_SIZE_Y + 1];
261
__shared__ float cache_sin[BLOCK_SIZE_Y];
262
/* +1 slot so the forward difference of the last bin can read pos+1. */
__shared__ float2 cache[BLOCK_SIZE_Y][3 * BLOCK_SIZE_X + 1];
265
float projf = tidy + 0.5;
266
for (int proje=0; proje<num_proj; proje += BLOCK_SIZE_Y) {
267
float4 all = c_all[proje + tidy];
268
/* Leftmost bin of this block's cached window; all.w is the "min bin" term. */
int minh = (int)floor(all.z + bx * all.x - by * all.y + all.w);
270
float subh = (all.z + x * all.x - minh);
271
cache_subh[2*tidx][tidy] = subh;
272
cache_subh[2*tidx+1][tidy] = subh + all.x;
273
if (tidx == 0) cache_sin[tidy] = all.y;
277
/* Stage the sample values (.x) for a 3*BLOCK_SIZE_X-bin window. */
for (int i = 0; i < 3; i++) {
278
int pos = i * BLOCK_SIZE_X + tidx;
279
cache[tidy][pos].x = tex2D(tex_projes, minh + pos, proje + projf);
285
/* Precompute forward differences (.y) for linear interpolation. */
for (int i = 0; i < 3; i++) {
286
int pos = i * BLOCK_SIZE_X + tidx;
287
cache[tidy][pos].y = cache[tidy][pos + 1].x - cache[tidy][pos].x;
293
for (int i = 0; i < BLOCK_SIZE_Y; i++) {
297
cache_subh[2 * tidx][i] - (y) * cache_sin[i],
298
cache_subh[2 * tidx + 1][i] - (y) * cache_sin[i],
299
cache_subh[2 * tidx][i] - (y + 1) * cache_sin[i],
300
cache_subh[2 * tidx + 1][i] - (y + 1) * cache_sin[i],
303
/* Gather value (.x -> v1) and slope (.y -> v2) for the four pixels. */
c = cache[i][(int)minh.x];
304
v1.x = c.x; v2.x = c.y;
305
c = cache[i][(int)minh.y];
306
v1.y = c.x; v2.y = c.y;
307
c = cache[i][(int)minh.z];
308
v1.z = c.x; v2.z = c.y;
309
c = cache[i][(int)minh.w];
310
v1.w = c.x; v2.w = c.y;
312
/* value + frac*slope; float4 arithmetic relies on operator overloads
 * provided elsewhere in the project (not standard CUDA headers). */
res += v1 + (minh - floorf(minh)) * v2;
319
*(float2*)(d_SLICE + BLOCK_SIZE_X * 2 * gridDim.x * (idy ) + idx) = make_float2(res.x, res.y);
320
*(float2*)(d_SLICE + BLOCK_SIZE_X * 2 * gridDim.x * (idy + 1) + idx) = make_float2(res.z, res.w);
325
/*
 * NOTE(review): garbled extraction — stray line-number tokens remain,
 * __syncthreads() barriers are missing, and the four dangling expressions
 * below appear to be arguments of a lost float4-`minh` construction (minh.x..w
 * are used as indices afterwards). Does not compile as-is; comments describe
 * only the visible code.
 *
 * 4x-oversampling backprojector, 2x2 pixels per thread: the projection window
 * is sampled at quarter-bin steps into shared memory, then accumulated by
 * nearest lookup in the oversampled grid (all offsets scaled by 4).
 */
__global__ static void hst_cuda_oversample4_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
326
/* res[0] = output row idy, res[1] = row idy+1; .x/.y = columns idx/idx+1. */
float2 res[2] = {0.f, 0.f, 0.f, 0.f};
328
const int tidx = threadIdx.x;
329
const int tidy = threadIdx.y;
331
const int bidx = 2 * blockIdx.x * BLOCK_SIZE_X;
332
const int bidy = batch + 2 * blockIdx.y * BLOCK_SIZE_Y;
334
const int idx = bidx + 2 * tidx;
335
const int idy = bidy + 2 * tidy;
337
const float bx = bidx + apos_off_x;
338
const float by = bidy + apos_off_y;
340
const float x = idx + apos_off_x;
341
const float y = idy + apos_off_y;
344
__shared__ float cache_subh[2*BLOCK_SIZE_X][BLOCK_SIZE_Y + 1];
345
__shared__ float cache_sin[BLOCK_SIZE_Y];
346
/* 12 = 3 * 4: the 3*BLOCK_SIZE_X-bin window at 4x oversampling. */
__shared__ float cache[BLOCK_SIZE_Y][12 * BLOCK_SIZE_X];
349
float projf = tidy + 0.5;
350
for (int proje=0; proje<num_proj; proje += BLOCK_SIZE_Y) {
351
float4 all = c_all[proje + tidy];
352
int minh = (int)floor(all.z + bx * all.x - by * all.y + all.w);
354
/* Offsets premultiplied by 4 to index the oversampled cache directly. */
float subh = 4 * (all.z + x * all.x - minh);
355
cache_subh[2*tidx][tidy] = subh;
356
cache_subh[2*tidx+1][tidy] = subh + 4 * all.x;
357
if (tidx == 0) cache_sin[tidy] = 4 * all.y;
361
/* Stage the window at quarter-bin sampling positions. */
for (int i = 0; i < 12; i++) {
362
int pos = i * BLOCK_SIZE_X + tidx;
363
cache[tidy][pos] = tex2D(tex_projes, minh + ((float)pos)/4, proje + projf);
369
for (int i = 0; i < BLOCK_SIZE_Y; i++) {
371
cache_subh[2 * tidx][i] - (y) * cache_sin[i],
372
cache_subh[2 * tidx + 1][i] - (y) * cache_sin[i],
373
cache_subh[2 * tidx][i] - (y + 1) * cache_sin[i],
374
cache_subh[2 * tidx + 1][i] - (y + 1) * cache_sin[i],
377
/* minh here is a 4-vector of oversampled-cache indices — see NOTE above. */
res[0].x += cache[i][minh.x];
378
res[0].y += cache[i][minh.y];
379
res[1].x += cache[i][minh.z];
380
res[1].y += cache[i][minh.w];
389
*(float2*)(d_SLICE + BLOCK_SIZE_X * 2 * gridDim.x * (idy ) + idx) = res[0];
390
*(float2*)(d_SLICE + BLOCK_SIZE_X * 2 * gridDim.x * (idy + 1) + idx) = res[1];
394
// Both kernels below have problems with pre-SM2 (compute capability < 2.0) builds; the root cause is unclear.
397
// Iterations per line
399
//#define PROJ_LINE (2 * IPT * PPT * 3)
400
#define PROJ_LINE (PPT * BLOCK_SIZE_X * 3 / 2)
403
/*
 * NOTE(review): garbled extraction — stray line-number tokens remain, and the
 * declarations of fminh[], subh/subh1, the __syncthreads() barriers (one was
 * near the surviving "we may use fence instead" comment), and closing braces
 * are missing. Does not compile as-is; comments describe only the visible code.
 *
 * Multi-projection linear-interpolation backprojector: projections processed
 * in groups of 32 (4 groups prefetched per outer iteration), IPT projection
 * rows staged in shared memory as (value, forward difference) float2 pairs,
 * PPT x PPT pixels accumulated per thread (res1/res2 = even/odd sub-rows).
 */
__global__ static void hst_cuda_mplinear_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
404
float res1[PPT][PPT] = {0.f};
405
float res2[PPT][PPT] = {0.f};
407
const int tidx = threadIdx.x;
408
const int tidy = threadIdx.y;
410
const int bidx = PPT * blockIdx.x * BLOCK_SIZE_X;
411
const int bidy = batch + PPT * blockIdx.y * BLOCK_SIZE_Y;
414
const float bx = bidx + apos_off_x;
415
const float by = bidy + apos_off_y;
417
/* Per-projection leftmost-bin values, shared across the block (non-Kepler path). */
__shared__ float f_minh[128];
418
__shared__ float2 cache[IPT * PROJ_LINE + 1];
420
/* Thread-block sub-decomposition: tbidy selects the staged projection row,
 * tbidx the cache segment this thread helps fill. */
const int tbidy = tidy/(PPT/2);
421
const int tbidx = tidy%(PPT/2);
423
const int wrapid = (tidy&1) * 16 + tidx;
425
const int sbidx = 8 * ((tidy / 2) % 2);
426
const int sbidy = 8 * ((tidy / 2) / 2);
428
const int stidx = tidx % 8;
429
const int stidy = 2 * (2 * (tidy&1) + tidx / 8);
431
const int sidx = (sbidx + stidx);
432
const int sidy = (sbidy + stidy);
434
const int idx = bidx + sidx;
435
const int idy = bidy + sidy;
437
const float x = idx + apos_off_x;
438
const float y = idy + apos_off_y;
440
/* 2^23: adding it to a small non-negative float pushes the integer part into
 * the low mantissa bits — used below for fast float->int splitting. */
const float exp23 = exp2(23.f);
442
const int ttidx = 16 * tidy + tidx;
444
float projf = tbidy + 0.5f;
446
const int num_proj_blocks = num_proj / 32;
448
for (int proj_block=0; proj_block<num_proj_blocks; proj_block += 4) {
449
const int proj = proj_block * 32;
451
/* Each of the 128 threads precomputes one projection's leftmost bin. */
float4 all = c_all[proj + ttidx];
452
float minh = floor(all.z + bx * all.x - by * all.y + all.w);
453
f_minh[ttidx] = minh;
460
/* NOTE(review): fminh[] (per-thread register copy for the Kepler shuffle
 * path) is not declared in the visible code. */
for (int i = 0; i < 4; i++) {
461
fminh[i] = f_minh[32 * i + wrapid];
464
/* Clamp the last iteration when num_proj_blocks is not a multiple of 4. */
int max_proj = min(num_proj_blocks - proj_block, 4);
467
for (int subproj32 = 0; subproj32 < max_proj; subproj32++) {
469
for (int subproj = 0; subproj < (32/IPT); subproj++) {
470
const int loop_proj = 32 * subproj32 + IPT * subproj;
471
const int proje = proj + loop_proj;
473
float4 all = c_all[proje + tbidy];
475
#ifdef HST_OPTIMIZE_KEPLER
476
/* Legacy mask-less __shfl (pre-Volta); float implicitly truncated to int. */
int minh = __shfl(fminh[subproj32], IPT * subproj + tbidy, 32);
477
#else // HST_OPTIMIZE_KEPLER
478
int minh = f_minh[loop_proj + tbidy];
479
#endif // HST_OPTIMIZE_KEPLER
482
/* Stage sample values (.x) for IPT projection rows of PROJ_LINE bins. */
for (int i = 0; i < 3; i++) {
483
int pos = (i * (PPT/2) + tbidx) * BLOCK_SIZE_X + tidx;
485
cache[PROJ_LINE * tbidy + pos].x = tex2D(tex_projes, minh + pos, proje + projf);
488
// we may use fence instead
492
/* Forward differences (.y) for the linear interpolation below. */
for (int i = 0; i < 3; i++) {
493
int pos = (i * (PPT/2) + tbidx) * BLOCK_SIZE_X + tidx;
494
cache[tbidy * PROJ_LINE + pos].y = cache[tbidy * PROJ_LINE + pos + 1].x - cache[tbidy * PROJ_LINE + pos].x;
499
#pragma unroll 4 // IPT
500
for (int p = 0; p < IPT; p++) {
501
float4 all = c_all[proje + p];
503
#ifdef HST_OPTIMIZE_KEPLER
504
float minh = __shfl(fminh[subproj32], IPT * subproj + p, 32);
505
#else // HST_OPTIMIZE_KEPLER
506
float minh = f_minh[loop_proj + p];
507
#endif // HST_OPTIMIZE_KEPLER
509
float h = all.z + x * all.x - y * all.y - minh;
512
/* NOTE(review): cache_row is computed but never used in the visible code. */
char *cache_row = ((char*)(&cache)) + p * PROJ_LINE * sizeof(float2);
515
#pragma unroll 4 // PPT
516
for (int i = 0; i < PPT; i++) {
519
#pragma unroll 4 // PPT
520
for (int j = 0; j < PPT; j++) {
522
/* NOTE(review): subh/subh1 declarations are missing from this extraction;
 * subh2 is the bin offset one pixel row down (-sin). */
float subh2 = subh - all.y;
525
/* Bias-and-reinterpret trick: fsubh1 + 2^23 places the integer part in the
 * mantissa; subtracting 0x4B000000 (the bit pattern of 2^23) from the bits
 * yields the index, and the remainder is the interpolation fraction. */
float fsubh1 = subh1 + exp23;
526
int idx1 = (*(int*)(&fsubh1)) - 0x4B000000;
527
fsubh1 = subh1 - (fsubh1 - exp23);
529
float fsubh2 = subh2 + exp23;
530
int idx2 = (*(int*)(&fsubh2)) - 0x4B000000;
531
fsubh2 = subh2 - (fsubh2 - exp23);
534
/* Linear interpolation: value + fraction * forward-difference. */
float2 c1 = cache[p * PROJ_LINE + idx1];
535
res1[i][j] += c1.x + fsubh1 * c1.y;
537
float2 c2 = cache[p * PROJ_LINE + idx2];
538
res2[i][j] += c2.x + fsubh2 * c2.y;
540
/* Advance one BLOCK_SIZE_X (16) column stride: 16 * cos. */
subh += 16.f * all.x;
551
/* Write back the PPT x PPT patch; res2 rows are offset by one pixel row. */
for (int i = 0; i < PPT; i++) {
553
for (int j = 0; j < PPT; j++) {
554
d_SLICE[BLOCK_SIZE_X * PPT * gridDim.x * (idy + i * BLOCK_SIZE_Y) + idx + j * BLOCK_SIZE_X] = res1[i][j];
555
d_SLICE[BLOCK_SIZE_X * PPT * gridDim.x * (idy + i * BLOCK_SIZE_Y + 1) + idx + j * BLOCK_SIZE_X] = res2[i][j];
562
#define PROJ_LINE (2 * PPT * BLOCK_SIZE_X * 3)
564
/*
 * NOTE(review): garbled extraction — stray line-number tokens remain, and the
 * declarations of `res` and fminh[], the __syncthreads() barriers, and closing
 * braces are missing. Does not compile as-is; comments describe only the
 * visible code.
 *
 * Multi-projection 4x-oversampling backprojector: projections processed in
 * groups of 32 (8 groups prefetched per outer iteration), IPT oversampled
 * projection rows staged in shared memory, PPT x PPT pixels per thread with
 * nearest lookup in the quarter-bin grid.
 */
__global__ static void hst_cuda_mpoversample4_kernel(int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
567
const int tidx = threadIdx.x;
568
const int tidy = threadIdx.y;
570
const int bidx = PPT * blockIdx.x * BLOCK_SIZE_X;
571
const int bidy = batch + PPT * blockIdx.y * BLOCK_SIZE_Y;
574
const float bx = bidx + apos_off_x;
575
const float by = bidy + apos_off_y;
577
/* Per-projection leftmost-bin values shared across the block (non-Kepler path). */
__shared__ float f_minh[256];
578
__shared__ float cache[IPT * PROJ_LINE];
580
/* tbidy selects the staged projection row, tbidx the cache segment. */
const int tbidy = tidy/(16 / IPT);
581
const int tbidx = tidy%(16 / IPT);
583
const int wrapid = (tidy&1) * 16 + tidx;
585
const int sbidx = tidy % 4;
586
const int sbidy = tidy / 4;
588
const int stidx = tidx % 4;
589
const int stidy = tidx / 4;
591
const int sidx = (4 * sbidx + stidx);
592
const int sidy = (4 * sbidy + stidy);
594
const int idx = bidx + sidx;
595
const int idy = bidy + sidy;
597
const float x = idx + apos_off_x;
598
const float y = idy + apos_off_y;
600
const int ttidx = 16 * tidy + tidx;
602
float projf = tbidy + 0.5f;
604
const int num_proj_blocks = num_proj / 32;
606
for (int proj_block=0; proj_block<num_proj_blocks; proj_block += 8) {
607
const int proj = proj_block * 32;
609
/* Each of the 256 threads precomputes one projection's leftmost bin. */
float4 all = c_all[proj + ttidx];
610
float minh = floor(all.z + bx * all.x - by * all.y + all.w);
611
f_minh[ttidx] = minh;
615
#ifdef HST_OPTIMIZE_KEPLER
619
/* NOTE(review): fminh[] (register copy feeding __shfl) is not declared in
 * the visible code. */
for (int i = 0; i < 8; i++) {
620
fminh[i] = f_minh[32 * i + wrapid];
622
#endif // HST_OPTIMIZE_KEPLER
624
/* Clamp the last iteration when num_proj_blocks is not a multiple of 8. */
int max_proj = min(num_proj_blocks - proj_block, 8);
627
for (int subproj32 = 0; subproj32 < max_proj; subproj32++) {
629
for (int subproj = 0; subproj < (32/IPT); subproj++) {
630
const int loop_proj = 32 * subproj32 + IPT * subproj;
631
const int proje = proj + loop_proj;
633
float4 all = c_all[proje + tbidy];
635
#ifdef HST_OPTIMIZE_KEPLER
636
/* Legacy mask-less __shfl (pre-Volta); float implicitly truncated to int. */
int minh = __shfl(fminh[subproj32], IPT * subproj + tbidy, 32);
637
#else // HST_OPTIMIZE_KEPLER
638
int minh = f_minh[loop_proj + tbidy];
639
#endif // HST_OPTIMIZE_KEPLER
643
/* Stage the window at quarter-bin (0.25f * pos) sampling positions. */
for (int i = 0; i < 6; i++) {
644
int pos = (i * PPT + tbidx) * BLOCK_SIZE_X + tidx;
645
cache[PROJ_LINE * tbidy + pos] = tex2D(tex_projes, minh + 0.25f * pos, proje + projf);
651
#pragma unroll 4 // IPT
652
for (int p = 0; p < IPT; p++) {
653
/* Scaling by 4 converts (cos, sin, axis) to oversampled-grid units; the
 * float4 operator* comes from helpers elsewhere in the project. */
float4 all = 4 * c_all[proje + p];
656
#ifdef HST_OPTIMIZE_KEPLER
657
float minh = __shfl(fminh[subproj32], IPT * subproj + p, 32);
658
#else // HST_OPTIMIZE_KEPLER
659
float minh = f_minh[loop_proj + p];
660
#endif // HST_OPTIMIZE_KEPLER
662
/* NOTE(review): `+ y * all.y` differs in sign from the `- y * all.y` used
 * by the other kernels here — confirm against upstream whether intended. */
float h = all.z + x * all.x + y * all.y - 4 * minh;
667
#pragma unroll 4 // PPT
668
for (int i = 0; i < PPT; i++) {
669
#pragma unroll 4 // PPT
670
for (int j = 0; j < PPT; j++) {
671
float subh = h + i * all.y + j * all.x;
672
/* NOTE(review): `res` is not declared in the visible code. */
res[i][j] += cache[p*PROJ_LINE + (int)subh];//cache[p * PROJ_LINE + (int)subh];
685
for (int i = 0; i < PPT; i++) {
687
for (int j = 0; j < PPT; j++) {
688
d_SLICE[BLOCK_SIZE_X * PPT * gridDim.x * (idy + i * BLOCK_SIZE_Y) + idx + j * BLOCK_SIZE_X] = res[i][j];