2
__global__ static void hst_kepler_orig_kernel(cudaTextureObject_t texptr, int num_proj, int num_bins, float *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
6
#ifdef HST_OPTIMIZE_KEPLER
7
__shared__ float buf[16][18]; // 64b for kepler
8
// __shared__ float fill[48];
9
__shared__ float fin[16][18];
10
#else // HST_OPTIMIZE_KEPLER
11
__shared__ float buf[16][17]; // 32b for Fermi & GT200
12
// __shared__ float fill[56];
13
__shared__ float fin[16][17];
14
#endif // HST_OPTIMIZE_KEPLER
16
const int tidx = threadIdx.x;
17
const int tidy = threadIdx.y;
19
const int bidx = blockIdx.x * BLOCK_SIZE_X;
20
const int bidy = batch + blockIdx.y * BLOCK_SIZE_Y;
22
const int sidx = tidx % 4;
23
const int sidy = tidx / 4;
25
const float x = bidx + sidx + apos_off_x;
26
const float y = bidy + sidy + apos_off_y;
28
const float projf = tidy + 0.5f;
30
for (int proje=0; proje<num_proj; proje+=16) {
31
const float4 all = c_all[proje+tidy];
32
h = all.z + x * all.x - y * all.y;
35
for (int i = 0; i < 4; i++) {
37
for (int j = 0; j < 4; j++) {
38
float subh = h + 4 * j * all.x - 4 * i * all.y;
39
res[i][j] += hst_tex(texptr, subh, proje + projf);
47
for (int i = 0; i < 4; i++) {
49
for (int j = 0; j < 4; j++) {
50
buf[tidx][tidy] = res[i][j];
55
#ifdef HST_OPTIMIZE_KEPLER
56
float val = buf[tidy][tidx];
57
for (int k=16; k>=1; k/=2)
58
val += __shfl_xor(val, k, 16);
59
#else // HST_OPTIMIZE_KEPLER
60
volatile float *ptr = &buf[tidy][0];
61
for (int k=8; k>=1; k/=2)
62
ptr[tidx] += ptr[tidx+k];
64
#endif // HST_OPTIMIZE_KEPLER
66
const int rx = 4 * j + tidy%4;
67
const int ry = 4 * i + tidy/4;
78
const int idx = bidx + tidx;
79
const int idy = bidy + tidy;
81
d_SLICE[BLOCK_SIZE_X * gridDim.x * idy + idx] = fin[tidy][tidx];
b'\\ No newline at end of file'