/*
 * The PyHST program is Copyright (C) 2002-2011 of the
 * European Synchrotron Radiation Facility (ESRF) and
 * Karlsruhe Institute of Technology (KIT).
 *
 * PyHST is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * hst is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program. If not, see <http://www.gnu.org/licenses/>.
 */
/*
 * NOTE(review): this chunk is a corrupted extraction — the bare numeric
 * lines ("23", "24", ...) are leftover original source line numbers, and
 * several lines are missing from this copy (the declarations of tmp1/tmp2,
 * the read of data[pos], the stores into `out`, and the closing brace).
 * Restore the pristine file; do not compile or hand-repair this fragment.
 *
 * Apparent purpose (hedged — confirm against the complete source): packs
 * two consecutive rows of `data` (row pitch `spitch`) into an interleaved
 * even/odd layout in `out` (row pitch `dpitch`), one element per thread of
 * a 2D BLOCK_SIZE x BLOCK_SIZE launch.
 */
__global__ static void hst_cuda_pack_kernel(cufftReal *out, int dpitch, cufftReal *data, int spitch) {
23
/* Global 2D thread coordinates. */
int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
24
int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
26
/* Source index: element idx of row 2*idy of the spitch-pitched input. */
int pos = 2 * idy * spitch + idx;
29
/* tmp2 is undeclared here — its declaration line was lost in extraction. */
tmp2 = data[pos + spitch];
31
/* Destination index: pairs interleaved in the dpitch-pitched output. */
pos = 2 * (idy * dpitch + idx);
37
/*
 * NOTE(review): corrupted extraction — bare numeric lines are leftover
 * line numbers; the declarations of pos/tmp1/tmp2, the guard that
 * presumably uses `num_bins` (zero-padding variant), the stores into
 * `out`, and the closing brace are missing from this copy.
 *
 * Apparent purpose (hedged): same packing as hst_cuda_pack_kernel, but
 * zero-padding elements beyond `num_bins` — confirm against the pristine
 * source.
 */
__global__ static void hst_cuda_pack_kernel_zpad(cufftReal *out, int dpitch, cufftReal *data, int spitch, int num_bins) {
40
int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
41
int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
46
/* pos is undeclared here — its declaration line was lost in extraction. */
pos = 2 * idy * spitch + idx;
49
tmp2 = data[pos + spitch];
55
pos = 2 * (idy * dpitch + idx);
61
/*
 * NOTE(review): corrupted extraction — bare numeric lines are leftover
 * line numbers; the opening `if (idx < num_bins)` (or similar), the
 * declarations of pos/tmp1/tmp2, the final `else` opener, the stores into
 * `out`, and the closing braces are missing from this copy.
 *
 * Apparent purpose (hedged): edge-padded packing variant — in-range bins
 * are copied; for idx in [num_bins, mid) the last bin (num_bins - 1) is
 * replicated; for idx >= mid the first bin (offset 0) is replicated.
 * Confirm against the pristine source.
 */
__global__ static void hst_cuda_pack_kernel_epad(cufftReal *out, int dpitch, cufftReal *data, int spitch, int num_bins, int mid) {
64
int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
65
int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
69
/* In-range case: read the actual bin. */
pos = 2 * idy * spitch + idx;
72
tmp2 = data[pos + spitch];
73
} else if (idx < mid) {
74
/* Right padding: replicate the last valid bin. */
pos = 2 * idy * spitch + num_bins - 1;
77
tmp2 = data[pos + spitch];
79
/* Remaining case (its `else` opener is missing): replicate bin 0. */
pos = 2 * idy * spitch;
82
tmp2 = data[pos + spitch];
85
pos = 2 * (idy * dpitch + idx);
92
/*
 * Multiply each row of `data` (rows of length/pitch `vector_size`,
 * one row per y-block/thread) elementwise by `filter[idx]` — presumably
 * the frequency-domain filtering step; confirm against the caller.
 *
 * NOTE(review): corrupted extraction — a bounds guard (if any) and the
 * closing brace are missing from this copy.
 */
__global__ static void hst_cuda_filter_kernel(int vector_size, float *data, const float *filter) {
93
int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
94
/* Flat index: element idx of row (blockIdx.y*BLOCK_SIZE + threadIdx.y). */
int id = idx + (blockIdx.y * BLOCK_SIZE + threadIdx.y)*vector_size;
96
data[id] *= filter[idx];
101
flat_zone_start = (axis_position_corr - flat_zone)
102
dx = flat_zone_start - i
103
multiplier = (2 * prof_shift + prof_fact) + dx * (prof_fact / pente_zone)
105
(2 * prof_shift + prof_fact) is always 1
108
1 + prof_fact * min(1, (flat_zone_start - i) / pente_zone)
110
param2 is prof_fact / pente_zone
111
param1 is flat_zone_start + 1/param2
112
multiplier = max(0, (param1 - i)*param2)
114
param2 is prof_fact / pente_zone
115
param1 is flat_zone_start + 1/param2
116
multiplier = min(2, (param1 - i)*param2)
119
flat_zone_start = (axis_position_corr + flat_zone)
120
dx = flat_zone_start - i
122
multiplier = (2 * prof_shift + prof_fact) + dx * (prof_fact / pente_zone)
124
(2 * prof_shift + prof_fact) is always 1
127
1 + prof_fact * max(-1, (flat_zone_start - i) / pente_zone)
129
param2 is prof_fact / pente_zone
130
param1 is flat_zone_start + 1/param2
131
multiplier = min(2, (param1 - i)*param2)
133
param2 is prof_fact / pente_zone
134
param1 is flat_zone_start + 1/param2
135
multiplier = max(0, (param1 - i)*param2)
137
/*
 * Scale data[i] by the clamped ramp max(0, (param1 - i) * param2); see the
 * derivation comment above for what param1/param2 encode.
 *
 * NOTE(review): corrupted extraction — a bounds guard (if any) and the
 * closing brace are missing from this copy. Also note the double literal
 * `0.` promotes the expression to double math; the pristine source should
 * be checked for whether 0.f was intended.
 */
__global__ static void limit_kernel_max(float *data, float param1, float param2) {
138
int i = blockIdx.x * BLOCK_SIZE + threadIdx.x;
140
data[i] *= max(0., (param1 - i) * param2);
143
/*
 * Scale data[i] by the clamped ramp min(2, (param1 - i) * param2); mirror
 * of limit_kernel_max — see the derivation comment above.
 *
 * NOTE(review): corrupted extraction — a bounds guard (if any) and the
 * closing brace are missing from this copy. The `2.` double literal
 * promotes to double math; verify against the pristine source.
 */
__global__ static void limit_kernel_min(float *data, float param1, float param2) {
144
int i = blockIdx.x * BLOCK_SIZE + threadIdx.x;
146
data[i] *= min(2., (param1 - i) * param2);
/* The next functions are joint (fused left/right variants). */
/*
 * Joint (fused) limiter: points left of `first_right` use param1a with a
 * lower clamp of 1, points at/after it use param1b with a lower clamp of
 * 0; everything is capped at 2. The branchless ?: selects keep the warp
 * convergent.
 *
 * NOTE(review): corrupted extraction — a bounds guard (if any) and the
 * closing brace are missing from this copy; `1.`/`0.`/`2.` are double
 * literals (double-precision min/max).
 */
__global__ static void limit_kernel_positive(float *data, int first_right, float param1a, float param1b, float param2) {
152
int i = blockIdx.x * BLOCK_SIZE + threadIdx.x;
153
/* flag == 1 on the left side of the split point. */
int flag = i < first_right;
155
data[i] *= min(2., max(flag?1.:0., ((flag?param1a:param1b) - i) * param2));
158
/*
 * Mirror of limit_kernel_positive for the opposite ramp sign: left-side
 * points cap at 1, right-side points cap at 2, with a floor of 0.
 *
 * NOTE(review): corrupted extraction — a bounds guard (if any) and the
 * closing brace are missing from this copy; double literals as above.
 */
__global__ static void limit_kernel_negative(float *data, int first_right, float param1a, float param1b, float param2) {
159
int i = blockIdx.x * BLOCK_SIZE + threadIdx.x;
160
int flag = i < first_right;
162
data[i] *= max(0., min(flag?1.:2., ((flag?param1a:param1b) - i) * param2));
/* And now completely blocked and joint: */
/*
 * Per-row limiting over a 2D pitched buffer: each row idy gets its axis
 * position from constant memory (c_axiss) and two packed floats from
 * `params` (flat_zone at [idy*2], the ramp parameter at [idy*2+1]);
 * `ystart` offsets the y range for chunked launches.
 *
 * NOTE(review): corrupted extraction — bare numeric lines are leftover
 * line numbers; the two unconditional `data[i] *=` statements below were
 * almost certainly alternative branches (an if/else whose condition line
 * is missing — as written both would apply). Also missing: any bounds
 * guard and the closing brace. Restore from the pristine source.
 */
__global__ static void hst_cuda_limit_kernel(float *data, int pitch, int ystart, float *params) {
166
int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
167
int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y + ystart;
168
int i = idy * pitch + idx;
170
/* Per-row rotation-axis position from constant memory. */
float axis_position_corr = c_axiss[idy];
171
/* Two params per row: width of the flat zone and the ramp parameter. */
float flat_zone = params[idy * 2];
172
float param = params[idy * 2 + 1];
174
/* flag == 1 left of the flat zone's right edge. */
int flag = idx < (axis_position_corr + flat_zone);
176
float multiplier = ((flag?((axis_position_corr - flat_zone) + param):((axis_position_corr + flat_zone) + param)) - idx)/param;
180
/* NOTE(review): presumably one of these two applied per ramp sign. */
data[i] *= min(2., max(flag?1.:0., multiplier));
182
data[i] *= max(0., min(flag?1.:2., multiplier));
188
/*
 * Inverse of the pack kernels (apparent purpose — hedged): de-interleave
 * adjacent pairs of `data` (pitch `spitch`) back into two consecutive rows
 * of `out` (pitch `dpitch`).
 *
 * NOTE(review): corrupted extraction — the declarations of tmp1/tmp2, the
 * read `tmp1 = data[pos];`, the store `out[pos] = tmp1;`, any bounds
 * guard, and the closing brace are missing from this copy.
 */
__global__ static void hst_cuda_unpack_kernel(cufftReal *out, int dpitch, cufftReal *data, int spitch) {
191
int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
192
int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
194
/* Source index of the interleaved pair. */
int pos = 2 * (idy * spitch + idx);
197
tmp2 = data[pos + 1];
199
/* Destination: element idx of row 2*idy; pair partner one row below. */
pos = 2 * idy * dpitch + idx ;
202
out[pos + dpitch] = tmp2;
/*
 * This is the backup variant. Theoretically, the first function should work
 * slightly faster due to the reduced number of memory accesses; in
 * practice, however, the difference could hardly be observed.
 */
/*
 * Unpack combined with half-turn (360-degree acquisition, "fai360")
 * weighting: each of the two de-interleaved rows gets its own axis
 * position from constant memory and its own flat_zone/param pair, then the
 * same clamped-ramp multiplier as hst_cuda_limit_kernel is applied.
 *
 * NOTE(review): corrupted extraction — declarations of tmp1/tmp2, the read
 * of data[pos], the `else` structure around the paired multiply lines, any
 * use of `half`, and the closing brace are missing from this copy. Two
 * apparent inconsistencies to verify against the pristine source (missing
 * lines may explain them):
 *   - first row indexes params with stride 4 (params[idy*4]) but the
 *     second row uses stride 2 after ++idy (params[idy*2]);
 *   - first row reads c_axiss[batch + 2*idy] but the second reads
 *     c_axiss[batch + (++idy)], i.e. +1 rather than +2.
 */
__global__ static void hst_cuda_unpack_kernel_fai360(cufftReal *out, int dpitch, cufftReal *data, int spitch, int half, float *params, int batch) {
213
int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
214
int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
216
/* Source index of the interleaved pair. */
int pos = 2 * (idy * spitch + idx);
219
tmp2 = data[pos + 1];
221
pos = 2 * idy * dpitch + idx ;
224
/* First row of the pair: axis position and packed parameters. */
float axis_position_corr = c_axiss[batch + 2*idy];
225
float flat_zone = params[idy * 4];
226
float param = params[idy * 4 + 1];
228
int flag = idx < (axis_position_corr + flat_zone);
229
float multiplier = ((flag?((axis_position_corr - flat_zone) + param):((axis_position_corr + flat_zone) + param)) - idx)/param;
233
/* NOTE(review): presumably alternative branches per ramp sign. */
out[pos] = tmp1 * min(2., max(flag?1.:0., multiplier));
235
out[pos] = tmp1 * max(0., min(flag?1.:2., multiplier));
238
/* Second row of the pair (idy advanced in place). */
axis_position_corr = c_axiss[batch + (++idy)];
239
flat_zone = params[idy * 2];
240
param = params[idy * 2 + 1];
242
flag = idx < (axis_position_corr + flat_zone);
243
multiplier = ((flag?((axis_position_corr - flat_zone) + param):((axis_position_corr + flat_zone) + param)) - idx)/param;
247
out[pos] = tmp2 * min(2., max(flag?1.:0., multiplier));
249
out[pos] = tmp2 * max(0., min(flag?1.:2., multiplier));