/tomo/pyhst

To get this branch, use:
bzr branch http://darksoft.org/webbzr/tomo/pyhst
/*
 * The PyHST program is Copyright (C) 2002-2011 of the
 * European Synchrotron Radiation Facility (ESRF) and
 * Karlsruhe Institute of Technology (KIT).
 *
 * PyHST is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * hst is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

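/*
 * Texture-mode configuration: HST_TEXTURE16 switches the backprojection
 * kernels below to a 16-bit integer projection texture, and
 * HST_TEXTURE16_PACKED packs two 16-bit bins into each 32-bit texel.
 * HST_TEXTURE16 itself is left commented out here, so the guarded block
 * below is only compiled if the macro is defined elsewhere (presumably by
 * the build configuration).
 */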
#if CUDA_VERSION_MAJOR > 3
//# define HST_TEXTURE16
# define HST_TEXTURE16_PACKED
#endif /* CUDA_VERSION_MAJOR > 3 */

#ifdef HST_TEXTURE16
# ifdef HST_TEXTURE16_PACKED
texture<uint32_t, 2, cudaReadModeElementType> int_projes;
# else /* HST_TEXTURE16_PACKED */
texture<uint16_t, 2, cudaReadModeElementType> int_projes;
# endif /* HST_TEXTURE16_PACKED */

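/* Rescale a float projection buffer to 16-bit integers: every value is mapped
 * linearly from [params[0], params[1]] (global minimum/maximum) onto
 * [0, 65535] and rounded. */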
__global__ static void  hst_cuda_normalize_kernel(unsigned short *res, float *data, int pitch, float *params) {

    int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    
    float min_val = params[0];
    float max_val = params[1];
    
    int pos = idy * pitch + idx;

    res[pos] = round(65535. * (data[pos] - min_val) / (max_val - min_val));
}

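/* Convert the accumulated integer slice back to float, in place: each pixel
 * holds a sum over num_proj projections of 16-bit-normalized values, so the
 * result is num_proj * min_val + sum * (max_val - min_val) / 65535. */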
__global__ static void  hst_cuda_denormalize_kernel(float *data, float *params, int num_proj) {

    int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    
    float min_val = params[0];
    float max_val = params[1];
    
    int pos = BLOCK_SIZE_X*gridDim.x * idy + idx;
    data[pos] = num_proj * min_val + ((uint32_t*)data)[pos] * (max_val - min_val) / 65535.;
}

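/* Direct backprojection: one thread per slice pixel, one texture fetch per
 * projection at bin position h = all.z + x * all.x - y * all.y, with the
 * per-projection constants taken from c_all. */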
__global__ static void hst_cuda_intkernel_esrf(int num_proj, int num_bins, uint32_t *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
    const int idx = blockIdx.x * BLOCK_SIZE_X + threadIdx.x;
    const int idy = blockIdx.y * BLOCK_SIZE_Y + threadIdx.y + batch;

    float h;
    int res = 0;

    const float bx = idx + apos_off_x;
    const float by = idy + apos_off_y;

#pragma unroll 8
    for (int proje=0; proje<num_proj; proje++) {
        const float4 all = c_all[proje];
        h = all.z + bx * all.x - by * all.y;

        res += tex2D(int_projes, h, proje + 0.5f);
    }

    d_SLICE[ BLOCK_SIZE_X*gridDim.x*idy + idx ] = res;
}

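/* Shared-memory backprojection for the packed texture: each block processes
 * BLOCK_SIZE_Y projections per iteration, staging a window of packed texels
 * in `cache` and the per-projection offsets and sines in `cache_subh` and
 * `cache_sin`, then unpacking the 16-bit bins from shared memory instead of
 * issuing one texture fetch per pixel and projection.  The loop assumes
 * num_proj is a multiple of BLOCK_SIZE_Y. */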
__global__ static void hst_cuda_intkernel(int num_proj, int num_bins, uint32_t *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
    int i;
    
    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;

    const int bidx = blockIdx.x * BLOCK_SIZE_X;
    const int bidy = batch + blockIdx.y * BLOCK_SIZE_Y;

    const int idx = bidx + threadIdx.x;
    const int idy = bidy + threadIdx.y;

    float4 all;

    uint32_t res = 0;

    const float bx = bidx + apos_off_x;
    const float by = bidy + apos_off_y;

    const float x = bx + tidx;
    const float y = by + tidy;

    float h;
    int minh;

    __shared__  float   cache_subh[BLOCK_SIZE_X][BLOCK_SIZE_Y + 1];
    __shared__  float   cache_sin[BLOCK_SIZE_Y];
    __shared__  uint32_t   cache[BLOCK_SIZE_Y][BLOCK_SIZE_X + 1];
    
    for (int proje = 0; proje < num_proj; proje += BLOCK_SIZE_Y) {
        int proj = proje + tidy;

        all = c_all[proj];
        h = all.z + bx * all.x - by * all.y;
        minh = (int)(h + all.w);
        minh = (minh >> 1);

        cache_subh[tidx][tidy] = all.z + x * all.x - (minh << 1);
        if (tidx == 0) cache_sin[tidy] = all.y;

        cache[tidy][tidx] = tex2D(int_projes, minh + tidx, proj);

        __syncthreads();

//#pragma unroll 16
        for (i = 0; i < BLOCK_SIZE_Y; i++) {
            uint16_t *cache_row = (uint16_t*)(&cache[i]);

            minh = cache_subh[tidx][i] - y * cache_sin[i];
            res += cache_row[minh];
        }

        __syncthreads();
    }

    d_SLICE[ BLOCK_SIZE_X*gridDim.x*idy + idx ] = res;
}

#endif /* HST_TEXTURE16 */
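
The listing above contains only the device code; the host-side setup (normalization, texture binding, kernel launches) lives elsewhere in PyHST and is not shown here. As a rough orientation, a minimal driver for the packed-texture path might look like the sketch below. It is an illustration, not PyHST code: the helper name hst_cuda_backproject_sketch, the buffer layout and the error handling are assumptions; only the kernel, texture and macro names (BLOCK_SIZE_X, BLOCK_SIZE_Y, c_all) are taken from the listing, and the function is assumed to live in the same .cu file as the texture reference, using the pre-CUDA-12 texture-reference API.

/* Hypothetical host-side driver (illustration only). Assumes d_proj holds the
 * 16-bit-normalized sinogram with two bins packed per uint32_t texel, that
 * c_all has already been filled with the per-projection constants, and that
 * num_x, num_y and num_proj are multiples of the block dimensions. */
static int hst_cuda_backproject_sketch(uint32_t *d_proj, size_t proj_pitch_bytes,
                                       uint32_t *d_slice,
                                       int num_proj, int num_bins,
                                       int num_x, int num_y,
                                       float off_x, float off_y) {
    size_t tex_offset = 0;
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<uint32_t>();

    /* Legacy texture-reference binding: the texture width is in packed texels,
     * i.e. half the number of bins. */
    if (cudaBindTexture2D(&tex_offset, int_projes, d_proj, desc,
                          num_bins / 2, num_proj, proj_pitch_bytes) != cudaSuccess)
        return -1;

    /* One thread per reconstructed pixel. */
    dim3 block(BLOCK_SIZE_X, BLOCK_SIZE_Y);
    dim3 grid(num_x / BLOCK_SIZE_X, num_y / BLOCK_SIZE_Y);

    hst_cuda_intkernel<<<grid, block>>>(num_proj, num_bins, d_slice,
                                        off_x, off_y, 0);
    cudaDeviceSynchronize();

    cudaUnbindTexture(int_projes);
    return 0;
}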