bzr branch
http://darksoft.org/webbzr/tomo/pyhst
28
by csa
Use pinned result buffer to perform device2host memory transfer parallel with computations, add ESRF copyright information in files appeared after redesign |
1 |
/*
|
78
by Suren A. Chilingaryan
Add COPYING and fix license statements |
2 |
* The PyHST program is Copyright (C) 2002-2011 of the
|
3 |
* European Synchrotron Radiation Facility (ESRF) and
|
|
4 |
* Karlsruhe Institute of Technology (KIT).
|
|
28
by csa
Use pinned result buffer to perform device2host memory transfer parallel with computations, add ESRF copyright information in files appeared after redesign |
5 |
*
|
6 |
* PyHST is free software: you can redistribute it and/or modify it
|
|
7 |
* under the terms of the GNU General Public License as published by the
|
|
8 |
* Free Software Foundation, either version 3 of the License, or
|
|
9 |
* (at your option) any later version.
|
|
10 |
*
|
|
11 |
* hst is distributed in the hope that it will be useful, but
|
|
12 |
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
14 |
* See the GNU General Public License for more details.
|
|
15 |
*
|
|
16 |
* You should have received a copy of the GNU General Public License along
|
|
17 |
* with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
18 |
*/
|
|
19 |
||
18
by csa
Big redesign (early commit) |
20 |
#include <stdio.h> |
21 |
#include <stdlib.h> |
|
22 |
#include <string.h> |
|
23 |
#include <assert.h> |
|
28
by csa
Use pinned result buffer to perform device2host memory transfer parallel with computations, add ESRF copyright information in files appeared after redesign |
24 |
#include <errno.h> |
18
by csa
Big redesign (early commit) |
25 |
#include <math.h> |
26 |
||
28
by csa
Use pinned result buffer to perform device2host memory transfer parallel with computations, add ESRF copyright information in files appeared after redesign |
27 |
#include <sys/mman.h> |
28 |
||
168
by Suren A. Chilingaryan
Placeholders for DFI implementation |
29 |
|
30 |
#undef HW_USE_SIMD
|
|
31 |
||
132
by Suren A. Chilingaryan
Pipelined preprocessing (normalize & transpose) |
32 |
#ifdef HW_USE_SIMD
|
33 |
# include <xmmintrin.h>
|
|
34 |
#endif /* HW_USE_SIMD */ |
|
35 |
||
18
by csa
Big redesign (early commit) |
36 |
#include "Vhst_calculate_limits.h" |
37 |
||
38 |
#include "debug.h" |
|
39 |
#include "hst.h" |
|
40 |
#include "hst_reconstructor.h" |
|
41 |
||
58
by Suren A. Chilingaryan
Re-engineering of OpenCL detection |
42 |
#ifdef HW_USE_CUDA
|
21
by csa
Make gputomo library to follow reconstruction modules naming conventions (hst_cuda now) |
43 |
# include "hst_cuda/hst_cuda.h"
|
175
by Suren A. Chilingaryan
Make DFI optional |
44 |
# ifdef PYHST_ENABLE_DFI
|
45 |
# include "dfi_cuda/dfi_cuda.h"
|
|
46 |
# endif /* PYHST_ENABLE_DFI */ |
|
58
by Suren A. Chilingaryan
Re-engineering of OpenCL detection |
47 |
#endif /* HW_USE_CUDA */ |
18
by csa
Big redesign (early commit) |
48 |
|
58
by Suren A. Chilingaryan
Re-engineering of OpenCL detection |
49 |
#ifdef HW_USE_OPENCL
|
57
by Suren A. Chilingaryan
Separate CPU code into the hst_cpu |
50 |
# include "hst_opencl/hst_opencl.h"
|
58
by Suren A. Chilingaryan
Re-engineering of OpenCL detection |
51 |
#endif /* HW_USE_OPENCL */ |
52 |
||
53 |
#ifdef HW_USE_CPU
|
|
54 |
# include "hst_cpu/hst_cpu.h"
|
|
55 |
#endif /* HW_USE_CPU */ |
|
49
by root
Merge /home/matthias/dev/pyHST |
56 |
|
30
by csa
Multi-GPU, Multi-CPU, and Hybrid modes support |
57 |
#include "hw_sched.h" |
58 |
||
18
by csa
Big redesign (early commit) |
59 |
#define PRE_RADIUS 1    //!< When non-zero, hst_configure_limits precomputes a per-row radius table (RRadius) instead of recomputing it per projection

static int hst_copies = 0;      // NOTE(review): not referenced in this chunk — presumably counts live HST context instances; confirm against the rest of the file

#ifdef HW_USE_CUDA
static HSTReconstructor *cuda_recon = NULL;             //!< NOTE(review): never assigned in this chunk — presumably the currently selected CUDA reconstructor; verify
static HSTReconstructor *cuda_fbp_recon = NULL;         //!< CUDA filtered back-projection reconstructor (set in hst_init)
# ifdef PYHST_ENABLE_DFI
static HSTReconstructor *cuda_dfi_recon = NULL;         //!< CUDA direct Fourier inversion reconstructor (set in hst_init)
# endif /* PYHST_ENABLE_DFI */
#endif /* HW_USE_CUDA */

#ifdef HW_USE_OPENCL
static HSTReconstructor *opencl_recon = NULL;           //!< OpenCL reconstructor (set in hst_init)
#endif /* HW_USE_OPENCL */

#ifdef HW_USE_CPU
static HSTReconstructor *cpu_recon = NULL;              //!< CPU reconstructor (set in hst_init)
#endif /* HW_USE_CPU */
49
by root
Merge /home/matthias/dev/pyHST |
78 |
|
79 |
/**
 * Initializes the HST subsystem: starts the hardware scheduler and
 * instantiates one global reconstructor per compiled-in backend
 * (CUDA FBP, optionally CUDA DFI, OpenCL, CPU).
 *
 * @return always 0; failures of the backend initializers are not
 *   propagated here (the module-level pointers may stay NULL)
 */
int hst_init(void) {
    hw_sched_init();

#ifdef HW_USE_CUDA
    cuda_fbp_recon = hst_cuda_init(HW_USE_CUDA);
# ifdef PYHST_ENABLE_DFI
    cuda_dfi_recon = dfi_cuda_init(HW_USE_CUDA);
# endif /* PYHST_ENABLE_DFI */
#endif /* HW_USE_CUDA */

#ifdef HW_USE_OPENCL
    opencl_recon = hst_opencl_init(HW_USE_OPENCL);
#endif /* HW_USE_OPENCL */

#ifdef HW_USE_CPU
    cpu_recon = hst_cpu_init(HST_RECONSTRUCTOR_USE_CPU);
#endif /* HW_USE_CPU */

    return 0;
}
|
|
99 |
||
100 |
/**
 * Releases the global per-backend reconstructors created by hst_init().
 * CPU is torn down first, then CUDA (DFI before FBP), then OpenCL.
 * NOTE(review): the module pointers are not reset to NULL here — confirm
 * callers never use them after hst_free().
 */
void hst_free() {
#ifdef HW_USE_CPU
    hst_cpu_free();
#endif /* HW_USE_CPU */

#ifdef HW_USE_CUDA
# ifdef PYHST_ENABLE_DFI
    dfi_cuda_free();
# endif /* PYHST_ENABLE_DFI */
    hst_cuda_free();
#endif /* HW_USE_CUDA */

#ifdef HW_USE_OPENCL
    hst_opencl_free();
#endif /* HW_USE_OPENCL */
}
|
116 |
||
142
by Suren A. Chilingaryan
Adjust number of slices per block to number of reconstructors |
117 |
/**
 * Computes the number of parallel reconstruction units available: the sum
 * of CUDA and OpenCL device counts plus CPU cores (one core is reserved
 * for data preloading). Pure CPU cores are only counted in hybrid mode
 * (HW_USE_CPU != 1) when no accelerator is present. Without threading
 * support the answer is always 1.
 *
 * Fixes: the '# endif' trailer comments for the CUDA and OpenCL sections
 * were swapped; the definition now uses a proper (void) prototype.
 *
 * @return number of reconstruction units to balance slices across
 */
size_t hst_get_balanced_number_of_reconstructors(void) {
    size_t count = 0;

#ifdef HW_USE_THREADS
# ifdef HW_USE_CUDA
    count += cuda_fbp_recon->devices;
# endif /* HW_USE_CUDA */

# ifdef HW_USE_OPENCL
    count += opencl_recon->devices;
# endif /* HW_USE_OPENCL */

# if HW_USE_CPU == 1
    count += cpu_recon->devices - 1; // one core reserved for data preloading
# elif defined(HW_USE_CPU)
    if (!count) count += cpu_recon->devices - 1;
# endif /* HW_USE_CPU */
#else /* HW_USE_THREADS */
    count = 1;
#endif /* HW_USE_THREADS */

    // we should try to estimate relative performance here and compute LCM
    // this should allow PyHST to allocate such number of slices, that all
    // reconstructors terminate simultaneously
    return count;
}
|
|
143 |
||
22
by csa
Timing reporting, few bug fixes |
144 |
/**
 * HST Context: all state of one reconstruction session (configuration,
 * scheduler handles, reconstructor pool, I/O targets, and bookkeeping
 * for the slice/projection pipeline).
 */
struct HSTContextT {
    HSTSetup setup;                     //!< HST parameters

    int num_recon;                      //!< Number of initialized reconstructors
    HWSched sched;                      //!< Scheduler
    HWSched cpu_sched;                  //!< Preprocessing scheduler
    HSTReconstructorContext **recon;    //!< Pool of reconstructors (NULL-terminated)

    // some cached values set in hst_set_projections
    const float *custom_angles;         //!< angles of projection axis (per projection)
    const float *axis_corrections;      //!< rotation axis corrections (per projection)

    float *result_tmpbuf;               //!< Temporary buffer for reconstructed slices
    float *result_reqbuf;               //!< Result buffer supplied in reconstruction request
    const float *sinograms;             //!< Sinograms for processing
    HWMutex output_mutex;               //!< Output serializer
    FILE *output;                       //!< Output file (open and close is done in PyHST_c)
#ifdef HST_USE_FASTWRITER
    fastwriter_t *fw;                   //!< Output with fastwriter
#else /* HST_USE_FASTWRITER */
    void *fw;                           //!< Placeholder so the member exists (and stays NULL) without fastwriter support
#endif /* HST_USE_FASTWRITER */

    int limits_configured;              //!< Flag indicating if limits are already configured
    int filter_configured;              //!< Flag indicating if filter is already configured

    // NOTE(review): the five members below look like preprocessing-pipeline
    // state (see "Pipelined preprocessing" code elsewhere) — confirm exact
    // semantics against the code that sets them
    int num_projections;                //!< Projections handled by the preprocessing pipeline
    int current_projection;             //!< Next projection to preprocess
    int preproc_slices;                 //!< Slices per preprocessing batch
    float *preproc_sinograms;           //!< Destination buffer for preprocessed sinograms
    float **projection_data;            //!< Per-projection source data pointers

    int num_slices;                     //!< Slices in the current reconstruction request
    int current_slice;                  //!< Next slice to schedule

    int slice_offset;                   //!< Number of slices already processed (for navigation in output file)
    int processing;                     //!< Flag indicating the working threads are running and processing slices

    void *mmap;                         //!< NOTE(review): presumably a memory-mapped region (sys/mman.h is included) — verify where it is set

#ifdef PYHST_MEASURE_TIMINGS
    double comp_timer;                  //!< Accumulated computation time
    double io_timer;                    //!< Accumulated I/O time
#endif /* PYHST_MEASURE_TIMINGS */
};
typedef struct HSTContextT HSTContext;  //!< a short name
18
by csa
Big redesign (early commit) |
194 |
|
195 |
||
20
by csa
do more correct initialization |
196 |
/**
 * Prepares per-projection trigonometry tables and the filter buffer.
 *
 * Caches the angle/axis-correction arrays in the context, derives the FFT
 * size from the oversampling factor and bin count, allocates the filter
 * buffer, and fills cos/sin and corrected-axis-position tables for every
 * projection.
 *
 * @param ctx               reconstruction context (setup must be filled in)
 * @param custom_angles     optional per-projection angles (radians); when NULL,
 *                          angles follow angle_offset + i * angle_increment
 * @param axis_corrections  optional per-projection axis corrections; when NULL,
 *                          the uncorrected axis position is used
 * @return 0 on success (allocation failures abort via check_alloc)
 */
static int hst_configure_data_structures(HSTContext *ctx, const float *custom_angles, const float *axis_corrections) {
    int projection; // projection counter
    int num_projections; // Number of projections

    float angle; // projections angle
    float angle_offset; // Offset rotation angle in radians, could be overriden with custom_angles
    float angle_increment; // Angle in radians between each, could be overriden with custom_angles
    float *cos_s, *sin_s;

    float axis_position; // Position of rotation axis
    float *axis_position_corr_s;

    ctx->custom_angles = custom_angles;
    ctx->axis_corrections = axis_corrections;

    num_projections = ctx->setup.num_projections;

    angle_offset = ctx->setup.angle_offset;
    angle_increment = ctx->setup.angle_increment;
    axis_position = ctx->setup.axis_position;

    // Computing FFT dimensions, hard to understand why the -1 is there but it works nicely
    // DS: This is actually two times more than the closest power of 2. Do we need that?
    ctx->setup.dim_fft = 1<<((int)(log((1. * ctx->setup.fft_oversampling * ctx->setup.num_bins - 1)) / log(2.0) + 0.9999));
    printf("FFT convolution: %u\n", ctx->setup.dim_fft);
    // 2 * dim_fft floats: the filter is stored as interleaved complex coefficients
    ctx->setup.filter = malloc((2 * ctx->setup.dim_fft) * sizeof(float));
    check_alloc(ctx->setup.filter, " FILTER");

    axis_position_corr_s = (float *) malloc(num_projections * sizeof(float));
    check_alloc(axis_position_corr_s, "axis_position_corr_s");
    ctx->setup.axis_position_corr_s = axis_position_corr_s;

    cos_s = (float *) malloc(num_projections * sizeof(float));
    check_alloc(cos_s, "cos_s");
    ctx->setup.cos_s = cos_s;

    sin_s = (float *) malloc(num_projections * sizeof(float));
    check_alloc(sin_s, "sin_s");
    ctx->setup.sin_s = sin_s;

    for (projection = 0; projection < num_projections; projection++) {
        if (custom_angles) {
            angle = custom_angles[projection];
        } else {
            angle = angle_offset + projection * angle_increment ;
        }

        cos_s[projection] = cos(angle);
        sin_s[projection] = sin(angle);

        if (axis_corrections) axis_position_corr_s[projection] = axis_position + axis_corrections[projection];
        else axis_position_corr_s[projection] = axis_position;
    }

    return 0;
}
|
|
252 |
||
253 |
/**
 * Computes the reconstructable region of each slice.
 *
 * Depending on the scan geometry (dimensions, axis position, angular
 * coverage) only part of the slice can be back-projected from all
 * sinograms. This routine determines per-row x-limits, optionally builds
 * the min/max masks used in zerooffmask mode, records the axis-centering
 * offsets, and flags the limits as configured.
 *
 * @param ctx  reconstruction context (setup and trig tables must be filled)
 * @return 0 on success (allocation failures abort via check_alloc)
 */
static int hst_configure_limits(HSTContext *ctx) {
    int i;

    int projection; // projection counter

    // Some variables from configuration
    int num_projections; // Number of projections
    int num_bins; // Number of bins in sinograms

    int start_x; // First X-pixel in region required by user
    int start_y; // First Y-pixel in region required by user
    int num_x; // Number of X-pixels in reconstruction region
    int num_y; // Number of Y-pixels in reconstruction region

    float axis_position; // Position of rotation axis
    float angle_increment; // Angle in radians between each, could be overriden with custom_angles

    int axis_to_the_center;

    int zerooffmask;

    int *minX = NULL, *maxX = NULL;
    float *cos_s, *sin_s;
    float *axis_position_corr_s;

    // The block of variables for reconstruction limits
    int status;
    int *X_STARTS, *X_ENDS;
    int y_start;
    int y_end;
    int startxdum, startydum, finxdum, finydum;

    // The next block are variables needed for zerooffmask-mode computations
#if PRE_RADIUS
    float * RRadius = NULL;
#endif /* PRE_RADIUS */
    float yy, rradius;
    int yoristart, yoriend;
    int yorigin ;
    int x_start, x_end ;
    int y;
    float EXT1, EXT2;
    int XH;


    /* The block of variables describing the position of axes (and used to move
       the axis to the center of the reconstructed zone) as well as projection
       angles */
    float MOVE_X;
    float MOVE_Y;

    int fai360;
    float axis_position_corr = 0;
    float cos_angle , sin_angle;
    float corre; // axis correction

    assert(ctx);

    num_bins = ctx->setup.num_bins;
    num_projections = ctx->setup.num_projections;

    start_x = ctx->setup.start_x;
    num_x = ctx->setup.num_x;
    start_y = ctx->setup.start_y;
    num_y = ctx->setup.num_y;

    axis_position_corr_s = ctx->setup.axis_position_corr_s;
    cos_s = ctx->setup.cos_s;
    sin_s = ctx->setup.sin_s;

    axis_position = ctx->setup.axis_position;
    angle_increment = ctx->setup.angle_increment;

    axis_to_the_center = ctx->setup.center_axis;
    zerooffmask = ctx->setup.zerooffmask;


    /*************************************************************
     * Depending on geometry ( dimensions, axis positions .... )*
     * only a restraint part of the slice can be back projected *
     * from all the sinograms.                                  *
     * X_STARTS tells the limits for each y ( idem for X_ENDS)  *
     *************************************************************/
    X_STARTS = malloc(num_bins * sizeof(int));
    check_alloc(X_STARTS," X_STARTS allocation in c_hst_recon_overslices");
    X_ENDS = malloc(num_bins * sizeof(int));
    check_alloc(X_ENDS," X_ENDS allocation in c_hst_recon_overslices");

    /************************************************************
     * calculate limits of slices to reconstruct                *
     ************************************************************/
    /* Restrict the area if projections are covering more than 360 grads */
    startxdum=1;
    startydum=1;
    finxdum = num_bins ;
    finydum = num_bins ;

    /* DSBug: case of custom angles is not handled here */
    if (fabs((num_projections)*(angle_increment) - 2*M_PI)>0.001 ) {
        hst_calculate_limits__(&num_bins, &startxdum, &startydum, &finxdum, &finydum, &axis_position, &y_start, &y_end, X_STARTS, X_ENDS, &status);
        y_start+=1;
        y_end -=1;
    } else {
        y_start=-1;
        y_end =-1;
    }

    // fai360: full 360-degree angular coverage (no restricted y-range)
    fai360 = (y_start == -1 && y_end == -1);
    ctx->setup.fai360 = fai360;


    if (axis_to_the_center) {
        MOVE_X = (((start_x - 1) + num_x / 2.0) - axis_position);
        MOVE_Y = (((start_y - 1) + num_y / 2.0) - axis_position);
    } else {
        MOVE_X = 0;
        MOVE_Y = 0;
    }

    ctx->setup.offset_x = start_x - MOVE_X ;
    ctx->setup.offset_y = start_y - MOVE_Y ;

    if (zerooffmask) {
        /* these should be allocated once and for all out of the routine */
        if (ctx->setup.minX) minX = ctx->setup.minX;
        else {
            minX = (int*) malloc(num_y * sizeof(int));
            check_alloc(minX, "minX");
            ctx->setup.minX = minX;
        }

        if (ctx->setup.maxX) maxX = ctx->setup.maxX;
        else {
            maxX = (int*) malloc(num_y * sizeof(int));
            check_alloc(maxX, "maxX");
            ctx->setup.maxX = maxX;
        }

        // Start with an empty mask (max below min) and widen it per projection below
        for (i = 0; i < num_y; i++) {
            maxX[i] = 0;
            minX[i] = 1000000;
        }

#if PRE_RADIUS
        /* The difference with !PRE_RADIUS is that we do not use axis_corrections,
           i.e. axis_position is used in place of axis_position_corr */
        RRadius = (float *) malloc(num_y * sizeof(float));
        check_alloc(RRadius, "RRadius");

        for (y = 0; y < num_y; y++) {
            rradius = MAX(axis_position , num_bins - axis_position) ;
            yy = ((y + start_y - MOVE_Y - 1) + 0.5 - axis_position);
            if (fabs(yy) > rradius) {
                RRadius[y] = 0.0;
            } else {
                RRadius[y] = sqrt(rradius * rradius - yy * yy);
            }
        }
#endif /* PRE_RADIUS */
    }

    for (projection = 0; projection < num_projections; projection++) {
        cos_angle = cos_s[projection];
        sin_angle = sin_s[projection];
        axis_position_corr = axis_position_corr_s[projection];

        //corre = axis_position_corr - axis_position;
        corre = ctx->axis_corrections?ctx->axis_corrections[projection]:0;

        if (zerooffmask) {
            if (fai360) {
                yoristart = 0;
                yoriend = num_y ;
            } else {
                yoristart = y_start - 1 ;
                yoriend = y_end ;
            }

            for (yorigin = yoristart ; yorigin < yoriend ; yorigin++) {
                if (fai360) {
                    y = yorigin;
                } else {
                    /* y_start and y_end are computed for a maximal mask */
                    /* y is relative to the user's window */
                    y = (yorigin + MOVE_Y + corre - start_y) + 100000; /* so that -0.1 is not truncated to 0 like 0.9 would be */
                    y = y - 100000;
                }

                /* one could possibly remove the 2 if everything works: ps 2 -> 1 */
                if (y < 0 || y >= num_y - 0) continue;

                if (fai360) {
#if PRE_RADIUS
                    rradius = RRadius[y];
#else /*PRE_RADIUS */
                    rradius = MAX(axis_position_corr , num_bins - axis_position_corr) ;
                    yy = ((y + start_y - MOVE_Y - 1) + 0.5 - axis_position);
                    if (fabs(yy) > rradius) {
                        rradius = 0.0;
                    } else {
                        rradius = sqrt(rradius * rradius - yy * yy);
                    }
#endif /* PRE_RADIUS */


                    if (fabs(cos_angle) > 1.0e-6) { /* one could keep only the y part here */
                        EXT1 = -((float)(((axis_position_corr +
                                ((start_x - MOVE_X - 1) + 0.5 - axis_position) * cos_angle -
                                ((y + start_y - MOVE_Y - 1) + 0.5 - axis_position) * sin_angle - 0.5)))) / cos_angle ;
                        EXT2 = (num_bins - (float)(((axis_position_corr +
                                ((start_x - MOVE_X - 1) + 0.5 - axis_position) * cos_angle -
                                ((y + start_y - MOVE_Y - 1) + 0.5 - axis_position) * sin_angle - 0.5)))) / cos_angle ;
                    } else {
                        XH = ((float)(((axis_position_corr -
                                ((y + start_y - MOVE_Y - 1) + 0.5 - axis_position) * sin_angle - 0.5)))
                            );
                        if (XH <= 0 || XH >= num_bins) continue;
                        EXT1 = 0;
                        EXT2 = num_x;
                    }
                    x_start = MIN(EXT1, EXT2) + 1;
                    x_end = MAX(EXT1, EXT2) - 1;

                    /********************************************************************************************/
                    x_start = MAX(x_start , axis_position - rradius - start_x); /* SUSPICIOUS */
                    x_end = MIN(x_end , axis_position + rradius - start_x);
                    /********************************************************************************************/

                } else { // fai360 if
                    x_start = (X_STARTS[yorigin] - 1) - start_x + MOVE_X + corre ;
                    /* if( x_start< + DX0) x_start= DX0; */
                    x_end = (X_ENDS[yorigin] - 1) - start_x + MOVE_X + corre ;
                } // end of fai360 if

                {
                    if (x_start < 0) x_start = 0;
                    if (x_end >= num_x - 0) x_end = num_x - 0;
                    /* here too, if everything works, one could try to be wider:
                       x_start+=2;
                       x_end-=2; */
                }

                if (zerooffmask) {
                    minX[y] = MIN(minX[y], x_start);
                    maxX[y] = MAX(maxX[y], x_end);
                }
            } // end of yorigin loop
        } // zerooffmask if
    } // end of projections loop

#if PRE_RADIUS
    if (zerooffmask) {
        free(RRadius);
    }
#endif /* PRE_RADIUS */

    free(X_ENDS);
    free(X_STARTS);

    ctx->limits_configured = 1;
    ctx->setup.what_changed |= HST_LIMITS_CHANGED;

    return 0;
}
|
|
517 |
||
228
by Suren A. Chilingaryan
Quality fixes (tex-based) |
518 |
#include "astra_fourier.h" |
18
by csa
Big redesign (early commit) |
519 |
/**
 * Builds the convolution filter (interleaved complex coefficients,
 * 2 * dim_fft floats) used for filtered back-projection.
 *
 * With PYHST_ASTRA_SCALING the filter is constructed in the spatial
 * domain (ramp filter), transformed with cdft(), and scaled to match
 * ASTRA conventions. Otherwise a user callback or the default ramp
 * formula fills the non-redundant coefficients, which are then expanded
 * to the full cuFFT/FFTW layout.
 *
 * @param ctx         reconstruction context (dim_fft and filter buffer must be set)
 * @param func        optional callback filling the filter; NULL selects the default ramp
 * @param func_param  opaque argument forwarded to func
 * @return 0 on success
 */
int hst_set_filter(HSTContext *ctx, HSTFilterFunction func, void *func_param) {
    int j;
    int dim_fft;
    float *FILTER;

    assert(ctx);

    dim_fft = ctx->setup.dim_fft;
    FILTER = ctx->setup.filter;

#ifdef PYHST_ASTRA_SCALING
    // dim_fft is next power of two of (2 * num_proj)
    // num_bins is dim_fft/2 + 1
    int num_bins = dim_fft / 2 + 1; // number of non-symmetric coefficients

    memset(FILTER, 0, 2 * dim_fft * sizeof(float));
    FILTER[0] = 0.25;
    FILTER[1] = 0;
    for (j = 1; j < dim_fft; j++) {
        if (j&1) {
            double coef;
            if (2 * j > dim_fft)
                coef = M_PI * (dim_fft - j);
            else
                coef = M_PI * j;
            FILTER[2 * j] = -1. / (coef * coef);
        } else {
            FILTER[2 * j] = 0;
        }
        FILTER[2 * j + 1] = 0;
    }

    // work areas required by the cdft() FFT routine (ip[0] = 0 forces table init)
    int *ip = (int*)malloc((2 + sqrt(dim_fft) + 1)*sizeof(int)); ip[0] = 0;
    float *w = (float*)malloc(dim_fft * sizeof(float) / 2);
    cdft(2 * dim_fft, -1, FILTER, ip, w);
    free(w);
    free(ip);

    for (j = 0; j < num_bins; j++) {
        float _fD = 1.0f;
        float fRelIndex = (float)j / (float)dim_fft;
        float pfW = 2. * M_PI * fRelIndex;

        // On last iteration it is actually equal. So, rounding error works differently with ASTRA and here.... Therefore, we require a bit on top.
        if (pfW > (M_PI * _fD + 1E-6)) {
            FILTER[2 * j] = 0;
            FILTER[2 * j + 1] = 0;
        } else {
            FILTER[2 * j] = 2.0 * FILTER[2 * j];
            FILTER[2 * j + 1] = FILTER[2 * j];
        }
    }

    for (j = num_bins; j < dim_fft; j++) {
        FILTER[2 * j] = 0;
        FILTER[2 * j + 1] = 0;
    }

    /*
    for (j = 0; j < num_bins; j++) {
        printf("%8.4f %8.4f ", FILTER[2 * j], FILTER[2 * j + 1]);
    }
    printf("\n");
    */
#else /* PYHST_ASTRA_SCALING */
    if (func) {
        func(func_param, dim_fft, FILTER);
    } else {
        FILTER[0]=0.0;
        FILTER[1]= 1.0/dim_fft ; // This is the real part of the highest frequency

        for (j=1;j<dim_fft / 2; ++j) {
            FILTER[2 * j] = j * 2.0 / dim_fft / dim_fft;
            FILTER[2 * j + 1] = FILTER[2 * j];
        }
    }

    if (ctx->setup.sum_rule == 2) {
        FILTER[0]=FILTER[2]/4.0;
    }

    if (ctx->setup.no_filtering) {
        FILTER[1] = -9999.9999; // sentinel value; NOTE(review): presumably recognized downstream as "filtering disabled" — verify
    }

    /* This is due to different organization of data representation used after by
       real to complex FFT transfer. In both cases only non-redundant coefficients
       are stored however, the current CPU code stores High Frequency Term (HFT) in the
       res[1], while cuFFT (and FFTW as well) have always res[1] = 0 and stores
       the HFT in res[dim_fft] */
    FILTER[dim_fft] = FILTER[1]; // or should it be 0?

    /* CUDA code now includes optimization allowing to compute two convolutions
       in parallel using a single Complex-Complex Fourier transform. For that we need
       to fill the complete FILTER matrix including redundant coefficients. Therefore,
       the filter is filled with coefficients according to the cuFFT/FFTW approach.
       The case of embedded FFT code (when compressed FFT representation is needed)
       is handled separately in hst.c/hst_cpu code. */
    FILTER[dim_fft + 1] = FILTER[1];
    FILTER[1] = FILTER[0];
#endif /* PYHST_ASTRA_SCALING */

    // Mirror the non-redundant half into the redundant upper half (conjugate-symmetric layout)
    for (j = dim_fft + 2; j < 2 * dim_fft; j+=2) {
        FILTER[j] = FILTER[2 * dim_fft - j];
        FILTER[j + 1] = FILTER[2 * dim_fft - j + 1];
    }


    ctx->filter_configured = 1;
    ctx->setup.what_changed |= HST_FILTER_CHANGED;

    return 0;
}
|
|
633 |
||
634 |
/**
 * Selects a stdio FILE as the reconstruction output target, displacing
 * any previously configured fastwriter. Marks the output as changed only
 * when the target actually differs.
 *
 * @param ctx     reconstruction context
 * @param output  destination stream (opened/closed by PyHST_c)
 * @return always 0
 */
int hst_set_output_file(HSTContext *ctx, FILE *output) {
    int unchanged = (!ctx->fw) && (output == ctx->output);
    if (!unchanged) {
        ctx->fw = NULL;
        ctx->output = output;
        ctx->setup.what_changed |= HST_OUTPUT_CHANGED;
    }
    return 0;
}
|
|
643 |
||
152
by Suren A. Chilingaryan
Support fast writter |
644 |
#ifdef HST_USE_FASTWRITER
|
645 |
/**
 * Selects a fastwriter as the reconstruction output target, displacing
 * any previously configured stdio stream. Only supported in multislice
 * mode.
 *
 * Fix: the function previously fell off the end without a return
 * statement when HST_MULTISLICE_MODE is defined (undefined behavior if
 * the caller reads the result); it now returns 0 on success.
 *
 * @param ctx     reconstruction context
 * @param output  fastwriter handle
 * @return 0 on success, -1 if fastwriter output is not supported
 */
int hst_set_fastwriter(HSTContext *ctx, fastwriter_t *output) {
# ifndef HST_MULTISLICE_MODE
    pyhst_error("Fastwriter is not supported in non-multislice mode");
    return -1;
# endif /* HST_MULTISLICE_MODE */

    if ((ctx->output)||(ctx->fw != output)) {
        ctx->output = NULL;
        ctx->fw = output;
        ctx->setup.what_changed |= HST_OUTPUT_CHANGED;
    }

    return 0;
}
|
|
657 |
#endif /* HST_USE_FASTWRITER */ |
|
658 |
||
659 |
||
18
by csa
Big redesign (early commit) |
660 |
/**
 * Sets the zero-padding mode, flagging the change for reconfiguration
 * only when the value actually differs from the current setting.
 *
 * @param ctx           reconstruction context
 * @param zero_padding  new zero-padding flag
 * @return always 0
 */
int hst_set_padding(HSTContext *ctx, int zero_padding) {
    if (zero_padding == ctx->setup.zero_padding) return 0; // no change
    ctx->setup.zero_padding = zero_padding;
    ctx->setup.what_changed |= HST_PADDING_CHANGED;
    return 0;
}
|
|
667 |
||
668 |
/**
 * Choose whether the rotation axis is moved to the center of the slice.
 *
 * Unlike the other setters, this one re-derives the projection limits
 * immediately (via hst_configure_limits) because they depend on the axis
 * placement; no what_changed flag is raised here.
 *
 * @param ctx                HST context (must be non-NULL)
 * @param axis_to_the_center non-zero to center the rotation axis
 * @return 0 (always succeeds)
 */
int hst_set_axis_mode(HSTContext *ctx, int axis_to_the_center) {
    if (ctx->setup.center_axis == axis_to_the_center) return 0;

    ctx->setup.center_axis = axis_to_the_center;

    /* The projections configuration is dependent on this parameter. */
    hst_configure_limits(ctx);

    return 0;
}
|
|
676 |
||
30
by csa
Multi-GPU, Multi-CPU, and Hybrid modes support |
677 |
/**
 * Per-thread configuration callback executed by the scheduler on every
 * device: forwards the pending what_changed bitmask to the device's
 * reconstructor backend.
 *
 * @param thr       scheduler thread handle (unused here)
 * @param hwctx     scheduler hardware context (unused here)
 * @param device_id index into ctx->recon
 * @param data      HSTContext pointer passed through the scheduler
 * @return 0 on success or the backend's configure() error code
 */
static int hst_threads_configure(HWThread thr, void *hwctx, int device_id, void *data) {
    HSTContext *hst = (HSTContext*)data;
    HSTReconstructorContextPtr rctx = hst->recon[device_id];

    /* The configure hook is optional; backends without one need no work. */
    if (!rctx->recon.configure) return 0;

    return rctx->recon.configure(rctx, hst->setup.what_changed);
}
|
|
691 |
||
20
by csa
do more correct initialization |
692 |
/*
 * Apply all pending setup changes (accumulated in ctx->setup.what_changed)
 * before reconstruction starts.
 *
 * Steps, in order:
 *   1. Install the default filter if none was configured explicitly.
 *   2. Derive projection/slice limits if not yet done.
 *   3. Broadcast the what_changed bitmask to every device reconstructor
 *      via the scheduler (hst_threads_configure runs once per device).
 *   4. Reset the running output slice offset when the output target changed.
 *   5. Clear what_changed so subsequent calls become no-ops.
 *
 * check_code() is a project macro that bails out of this function on a
 * non-zero error code, so each step only runs if the previous succeeded.
 *
 * Returns 0 on success (errors exit through check_code).
 */
static int hst_configure(HSTContext *ctx) {
    int err;

    assert(ctx);

    /* Fall back to the default filter when the caller never set one. */
    if (!ctx->filter_configured) {
        err = hst_set_filter(ctx, NULL, NULL);
        check_code(err, "hst_set_filter");
    }

    if (!ctx->limits_configured) {
        err = hst_configure_limits(ctx);
        check_code(err, "hst_configure_limits");
    }

    /* Run the per-device configure hook on every scheduler thread. */
    err = hw_sched_execute_thread_task(ctx->sched, ctx, hst_threads_configure);
    check_code(err, "hst_configure");

    /*
    int i;
    for (i = 0; i < ctx->num_recon; ++i) {
        rctx = ctx->recon[i];
        if (rctx->recon.configure) {
            err = rctx->recon.configure(rctx, ctx->setup.what_changed);
            check_code(err, "hst_configure");
        }
    }
    */

    /* A new output target starts writing from slice 0 again. */
    if (ctx->setup.what_changed&HST_OUTPUT_CHANGED) {
        ctx->slice_offset = 0;
    }

    ctx->setup.what_changed = 0;

    return 0;
}
|
|
18
by csa
Big redesign (early commit) |
729 |
|
730 |
/*
 * Reconstruct one slice block on the given device reconstructor.
 *
 * In multislice mode, backends exposing a send() hook get a one-stage
 * pipeline: the freshly supplied sinogram is uploaded immediately, while the
 * slice actually processed in this call is the one saved on the PREVIOUS
 * invocation (saved_slice/saved_sino are swapped with the arguments). A call
 * with slice_result == NULL (after the swap) just primes the pipeline and
 * returns; callers flush the last pending slice by passing NULLs.
 *
 * The per-slice flow is: optional preprocess hook, mandatory reconstruct
 * hook, optional postprocess hook. Any hook error aborts and is returned.
 * On success the device's processed-slice counter advances by SLICE_BLOCK
 * (slices per block).
 *
 * Returns 0 on success, a backend error code otherwise.
 */
static int hst_reconstruct_slice(HSTContext *ctx, HSTReconstructorContextPtr rctx, float *slice_result, const float *sinograms ) {
    int err;

    assert(ctx);
    assert(rctx);
    assert(rctx->recon.reconstruct);

#ifdef HST_MULTISLICE_MODE
    if (rctx->recon.send) {
        const float *last_sino;
        float *last_slice;

        /* Remember what was queued on the previous call ... */
        last_slice = rctx->saved_slice;
        last_sino = rctx->saved_sino;

        /* ... start the upload of the new sinogram right away ... */
        if (sinograms)
            rctx->recon.send(rctx, sinograms);

        /* ... queue the new request and process the previous one below. */
        rctx->saved_slice = slice_result;
        rctx->saved_sino = sinograms;

        slice_result = last_slice;
        sinograms = last_sino;
    }

    /* Nothing pending from a previous call: pipeline is only being primed. */
    if (!slice_result) return 0;
#endif /* HST_MULTISLICE_MODE */
    //

    if (rctx->recon.preprocess) {
        err = rctx->recon.preprocess(rctx, slice_result, sinograms);
        if (err) return err;
    }

    err = rctx->recon.reconstruct(rctx, slice_result, sinograms);
    if (err) return err;

    if (rctx->recon.postprocess) {
        err = rctx->recon.postprocess(rctx, slice_result, sinograms);
        if (err) return err;
    }

    /* One block of SLICE_BLOCK slices finished on this device. */
    rctx->processed_slices += SLICE_BLOCK;

    return 0;
}
|
|
776 |
||
30
by csa
Multi-GPU, Multi-CPU, and Hybrid modes support |
777 |
/*
 * Scheduler callback: reconstruct slice block `slice_id` on the calling
 * thread's device and, when no caller-supplied result buffer exists
 * (ctx->result_reqbuf == NULL), write the result to the configured output
 * under the output mutex.
 *
 * Special slice_id values (multislice scheduling):
 *   HW_SCHED_CHUNK_FREE       - no new work; flush the device's pipelined
 *                               slice (NULL/NULL call) and wait for it.
 *   HW_SCHED_CHUNK_TERMINATOR - all slices done; in multislice mode dump
 *                               the whole accumulated result buffer.
 *
 * Buffer selection:
 *   multislice mode      - results are indexed by slice_id within one large
 *                          buffer (reqbuf if provided, else tmpbuf).
 *   non-multislice mode  - reqbuf is indexed by slice_id; the shared tmpbuf
 *                          holds one slice block per device (indexed by
 *                          device_id) and is flushed to file immediately,
 *                          seeking to the slice's absolute file position.
 *
 * PYHST_IO_BENCHMARK skips computation; PYHST_RECON_BENCHMARK skips output.
 * I/O time is accumulated in ctx->io_timer while the mutex is held.
 *
 * Returns 0 on success, non-zero on reconstruction or output failure.
 */
static int hst_threads_reconstruct_slice(HWThread thr, void *hwctx, int slice_id, void *data) {
    int err = 0;
    int device_id = thr->thread_id;
    HSTContext *ctx = (HSTContext*)data;
    int cnt, count = ctx->setup.num_x * ctx->setup.num_y;    // floats per slice

#ifdef PYHST_MEASURE_TIMINGS
    GTimer *timer;
#endif /* PYHST_MEASURE_TIMINGS */

#ifndef PYHST_IO_BENCHMARK
    if (slice_id == HW_SCHED_CHUNK_FREE) {
        /* No new slice: flush the device pipeline and synchronize. */
        err = hst_reconstruct_slice(ctx, ctx->recon[device_id], NULL, NULL);
        if (ctx->recon[device_id]->recon.wait)
            ctx->recon[device_id]->recon.wait(ctx->recon[device_id]);
    } else if (slice_id >= 0) {
        err = hst_reconstruct_slice(ctx, ctx->recon[device_id],
# ifdef HST_MULTISLICE_MODE
            /* All results live in one big buffer, indexed by slice. */
            (ctx->result_reqbuf?ctx->result_reqbuf:ctx->result_tmpbuf) + SLICE_BLOCK * slice_id * count,
# else /* HST_MULTISLICE_MODE */
            /* tmpbuf holds one block per device; reqbuf is per slice. */
            ctx->result_reqbuf?(ctx->result_reqbuf + SLICE_BLOCK * slice_id * count):(ctx->result_tmpbuf + SLICE_BLOCK * device_id * count),
# endif /* HST_MULTISLICE_MODE */
            ctx->sinograms + SLICE_BLOCK * slice_id * ctx->setup.num_bins * ctx->setup.num_projections
        );
    }

    if (err) return err;
#endif /* ! PYHST_IO_BENCHMARK */


#ifndef PYHST_RECON_BENCHMARK
    /* No caller buffer: we own the output, serialize writers. */
    if (!ctx->result_reqbuf) {
        hw_sched_lock_mutex(ctx->output_mutex);

# ifdef PYHST_MEASURE_TIMINGS
        timer = g_timer_new();
# endif /* PYHST_MEASURE_TIMINGS */

# ifdef HST_MULTISLICE_MODE
        /* In multislice mode everything is written once, at termination. */
        if (slice_id == HW_SCHED_CHUNK_TERMINATOR) {
# ifdef HST_USE_FASTWRITER
            if (ctx->fw) {
retry:
                err = fastwriter_push_data(ctx->fw, SLICE_BLOCK * ctx->num_slices * count * sizeof(float), ctx->result_tmpbuf);
                if (err) {
                    /* Writer queue full: spin until it accepts the push. */
                    if (err == EWOULDBLOCK) goto retry;
# ifdef PYHST_MEASURE_TIMINGS
                    if (timer) g_timer_destroy(timer);
# endif
                    hw_sched_unlock_mutex(ctx->output_mutex);
                    pyhst_error("faswriter failed wrting %u MB of data, error %i", SLICE_BLOCK * ctx->num_slices * count * sizeof(float)/1024/1024, err);
                    return 1;
                }
            } else {
# endif /* HST_USE_FASTWRITER */
                cnt = fwrite(ctx->result_tmpbuf, sizeof(float), SLICE_BLOCK * ctx->num_slices * count, ctx->output);
                if (cnt != SLICE_BLOCK * ctx->num_slices * count) {
# ifdef PYHST_MEASURE_TIMINGS
                    if (timer) g_timer_destroy(timer);
# endif
                    hw_sched_unlock_mutex(ctx->output_mutex);
                    pyhst_error("fwrite failed, send pushed characters %i, but written %i, errno %i", SLICE_BLOCK * ctx->num_slices * count, cnt, errno);
                    return 1;
                }
# ifdef HST_USE_FASTWRITER
            }
# endif /* HST_USE_FASTWRITER */
        }
# else /* HST_MULTISLICE_MODE */
        /* Seek to this slice's absolute position: output order is arbitrary. */
        err = fseek(ctx->output, ((long)(ctx->slice_offset + slice_id * SLICE_BLOCK)) * count * sizeof(float), SEEK_SET);
        if (err) {
# ifdef PYHST_MEASURE_TIMINGS
            if (timer) g_timer_destroy(timer);
# endif
            hw_sched_unlock_mutex(ctx->output_mutex);
            pyhst_error("fseek failed, errno: %i", errno);
            return err;
        }

        cnt = fwrite(ctx->result_tmpbuf + device_id * count, sizeof(float), SLICE_BLOCK * count, ctx->output);
        if (cnt != SLICE_BLOCK * count) {
# ifdef PYHST_MEASURE_TIMINGS
            if (timer) g_timer_destroy(timer);
# endif
            hw_sched_unlock_mutex(ctx->output_mutex);
            pyhst_error("fwrite failed, send pushed characters %i, but written %i, errno %i", SLICE_BLOCK * count, cnt, errno);
            return 1;
        }
# endif /* HST_MULTISLICE_MODE */
        //fsync(fileno(ctx->output));


# ifdef PYHST_MEASURE_TIMINGS
        if (timer) {
            // We should be inside the lock, otherwise multiple accesses.
            g_timer_stop(timer);
            ctx->io_timer += g_timer_elapsed(timer, NULL);
            //ctx->comp_timer -= g_timer_elapsed(timer, NULL);
            g_timer_destroy(timer);
        }
# endif /* PYHST_MEASURE_TIMINGS */

        hw_sched_unlock_mutex(ctx->output_mutex);
    }

    /*
    FILE *f;
    char tmp[64];
    sprintf(tmp, "/tmp/test/%i-%i",slice_id + ctx->slice_offset, device_id);
    f=fopen(tmp, "w");
    fwrite(ctx->result_tmpbuf + device_id * count, sizeof(float), count, f);
    fclose(f);
    */
#endif /* ! PYHST_RECON_BENCHMARK */

    return 0;
}
|
|
892 |
||
18
by csa
Big redesign (early commit) |
893 |
|
22
by csa
Timing reporting, few bug fixes |
894 |
/*
 * Synchronously reconstruct `num_slices` slices from `sinograms`.
 *
 * If `result` is non-NULL the reconstructed slices are stored there;
 * otherwise they are written to the configured output (file or fastwriter)
 * by the worker threads. Applies any pending configuration first, then
 * schedules hst_threads_reconstruct_slice across all devices and blocks
 * until every slice is done. Wall time is accumulated in ctx->comp_timer.
 *
 * @param ctx        HST context (must be non-NULL)
 * @param num_slices number of slices to reconstruct (the scheduler works in
 *                   blocks of SLICE_BLOCK, hence the division below)
 * @param result     optional caller-owned result buffer
 * @param sinograms  input sinograms, num_bins * num_projections floats per slice
 * @return 0 on success, scheduler/reconstruction error code otherwise
 */
int hst_reconstruct_slices(HSTContextPtr ctx, int num_slices, float *result, const float *sinograms) {
    int err;
#ifdef PYHST_MEASURE_TIMINGS
    GTimer *timer;
#endif /* PYHST_MEASURE_TIMINGS */

    assert(ctx);

    if (ctx->setup.what_changed) {
        hst_configure(ctx);
    }

#ifdef PYHST_MEASURE_TIMINGS
    timer = g_timer_new();
#endif /* PYHST_MEASURE_TIMINGS */

    /* The scheduler iterates over slice BLOCKS, not individual slices. */
    ctx->num_slices = num_slices / SLICE_BLOCK;
    ctx->result_reqbuf = result;
    ctx->sinograms = sinograms;

    pyhst_info("num_projection: %d, angle_increment: %e, fai360: %s", ctx->setup.num_projections, ctx->setup.angle_increment, ctx->setup.fai360?"on":"off" );
    hw_sched_schedule_task(ctx->sched, ctx, hst_threads_reconstruct_slice);
    err = hw_sched_wait_task(ctx->sched);
    // NOTE(review): hst_reconstruct_wait advances the offset by
    // SLICE_BLOCK * ctx->num_slices (== num_slices rounded down), while this
    // uses the raw parameter, i.e. SLICE_BLOCK times more when SLICE_BLOCK > 1.
    // One of the two looks wrong -- verify against callers before changing.
    ctx->slice_offset += SLICE_BLOCK * num_slices;

#ifdef PYHST_MEASURE_TIMINGS
    if (timer) {
        ctx->comp_timer += g_timer_elapsed(timer, NULL);
        g_timer_destroy(timer);
    }
#endif /* PYHST_MEASURE_TIMINGS */

    /*
    int i;
    for (i=0; i<num_slices; i++) {
        err = hst_reconstruct_slice(ctx,
            ctx->recon[0],
            result?result:ctx->result_tmpbuf,
            sinograms + i * ctx->setup.num_bins * ctx->setup.num_projections
        );
        if (err) return err;

        if (!result) {
            fwrite(ctx->result_tmpbuf, ctx->setup.num_x * sizeof(float), ctx->setup.num_y, ctx->output);
        }
    }
    */

    return err;
}
|
945 |
||
38
by csa
Preload data from EDF images while computing slices |
946 |
/*
 * Asynchronously start reconstructing `num_slices` slices; results go to
 * the configured output (result_reqbuf is forced to NULL). Pair with
 * hst_reconstruct_wait() to block for completion.
 *
 * Without HW_USE_PARALLEL_IO the wait happens right here, making the call
 * effectively synchronous. Scheduling time is added to ctx->comp_timer.
 *
 * @param ctx        HST context (must be non-NULL)
 * @param num_slices number of slices to reconstruct (SLICE_BLOCK granular)
 * @param sinograms  input sinograms
 * @return 0 (errors surface from hst_reconstruct_wait)
 */
int hst_reconstruct_start(HSTContextPtr ctx, int num_slices, const float *sinograms) {
#ifdef PYHST_MEASURE_TIMINGS
    GTimer *timer;
#endif /* PYHST_MEASURE_TIMINGS */

    assert(ctx);

    if (ctx->setup.what_changed) {
        hst_configure(ctx);
    }

#ifdef PYHST_MEASURE_TIMINGS
    timer = g_timer_new();
#endif /* PYHST_MEASURE_TIMINGS */

    /* The scheduler iterates over slice BLOCKS, not individual slices. */
    ctx->num_slices = num_slices / SLICE_BLOCK;
    ctx->result_reqbuf = NULL;
    ctx->sinograms = sinograms;

    pyhst_info("num_projection: %d, angle_increment: %e, fai360: %s", ctx->setup.num_projections, ctx->setup.angle_increment, ctx->setup.fai360?"on":"off" );
    hw_sched_schedule_task(ctx->sched, ctx, hst_threads_reconstruct_slice);
    /* Mark an outstanding task so hst_reconstruct_wait knows to join it. */
    ctx->processing = 1;

#ifdef PYHST_MEASURE_TIMINGS
    if (timer) {
        ctx->comp_timer += g_timer_elapsed(timer, NULL);
        g_timer_destroy(timer);
    }
#endif /* PYHST_MEASURE_TIMINGS */

#ifndef HW_USE_PARALLEL_IO
    /* No I/O parallelism available: degrade to a synchronous call. */
    hst_reconstruct_wait(ctx);
#endif /* ! HW_USE_PARALLEL_IO */

    return 0;
}
|
|
982 |
||
983 |
/*
 * Block until the reconstruction started by hst_reconstruct_start() has
 * finished, then advance the output slice offset past the processed slices.
 * Safe to call when nothing is in flight (ctx->processing == 0): returns 0.
 * Wait time is accumulated in ctx->comp_timer.
 *
 * @param ctx HST context
 * @return 0 on success, the scheduler's error code otherwise
 */
int hst_reconstruct_wait(HSTContextPtr ctx) {
    int err = 0;

#ifdef PYHST_MEASURE_TIMINGS
    GTimer *timer;
    timer = g_timer_new();
#endif /* PYHST_MEASURE_TIMINGS */

    if (ctx->processing) {
        err = hw_sched_wait_task(ctx->sched);
        /* num_slices counts blocks, so this advances by the raw slice count. */
        ctx->slice_offset += SLICE_BLOCK * ctx->num_slices;
        ctx->processing = 0;
    }

#ifdef PYHST_MEASURE_TIMINGS
    if (timer) {
        ctx->comp_timer += g_timer_elapsed(timer, NULL);
        g_timer_destroy(timer);
    }
#endif /* PYHST_MEASURE_TIMINGS */

    return err;
}
|
|
1006 |
||
1007 |
||
171
by Suren A. Chilingaryan
Disable broken SIMD |
1008 |
#undef HW_USE_SIMD
|
132
by Suren A. Chilingaryan
Pipelined preprocessing (normalize & transpose) |
1009 |
/*
 * Scheduler callback: normalize projection `proj` and transpose it into the
 * sinogram layout (slice-major: each slice's bins for this projection land
 * at `i * num_bins * num_proj + proj * num_bins`).
 *
 * Busy-waits (100 us usleep polls) until the projection pointer has been
 * published by hst_preprocess_projection() -- the I/O thread may still be
 * loading it. There is no explicit memory barrier here; the polling loop is
 * the only synchronization (works in practice on the targeted platforms).
 *
 * When HW_USE_SIMD is enabled (currently #undef'd above as broken), the bulk
 * of each row is scaled with SSE in groups of 4 and a scalar loop handles
 * the remainder; otherwise the scalar loop does the whole row.
 *
 * NOTE(review): ctx->preproc_sinograms is written through here although it
 * was assigned from a `const float *` parameter in hst_preprocess_start --
 * presumably the buffer is actually writable; confirm against callers.
 */
static int hst_threads_preprocess_projection(HWThread thr, void *hwctx, int proj, void *data) {
    HSTContext *ctx = (HSTContext*)data;

    int i,j;
    int num_slices = ctx->preproc_slices;
    int num_bins = ctx->setup.num_bins;
    int num_proj = ctx->setup.num_projections;
    float norm = ctx->setup.norm;        // flat-field normalization factor

    int num_bins4 = (num_bins/4);        // whole SSE vectors per row (SIMD path only)

#ifdef HW_USE_SIMD
    __m128 norm4 = _mm_set_ps1(norm);
#endif /* HW_USE_SIMD */

    /* Spin until the loader publishes this projection's data. */
    while (!ctx->projection_data[proj])
        usleep(100);

    // printf("[%p] %i, %i of %i, %i %f\n", thr, num_slices, proj, num_proj, num_bins, norm);
    for (i = 0; i < num_slices; i++) {
        int src_offset = i * num_bins;
        int dst_offset = i * num_bins * num_proj + proj * num_bins;

#ifdef HW_USE_SIMD
        // Looks pretty inoptimal... Enforce padding? Hand-written assembler?
        __m128* src = (__m128*)(ctx->projection_data[proj] + src_offset);
        __m128* dst = (__m128*)(ctx->preproc_sinograms + dst_offset);

        for (j = 0; j < num_bins4; j++) {
            dst[j] = _mm_mul_ps(src[j], norm4);
        }

        /* Scalar tail for the last num_bins % 4 bins. */
        for (j = num_bins4 * 4; j < num_bins; j++) {
#else /* HW_USE_SIMD */
        for (j = 0; j < num_bins; j++) {
#endif /* HW_USE_SIMD */
            ctx->preproc_sinograms[dst_offset + j] = norm * ctx->projection_data[proj][src_offset + j];
        }
    }

    return 0;
}
|
|
1051 |
||
1052 |
/**
 * Begin the (optionally pipelined) normalize-and-transpose preprocessing of
 * raw projections into `sinograms`.
 *
 * Records the target buffer and slice count, then -- when parallel I/O is
 * enabled -- clears the per-projection pointer table (workers poll it until
 * the loader publishes each projection) and schedules the per-projection
 * workers on the CPU scheduler. Without HW_USE_PARALLEL_IO the work is done
 * synchronously inside hst_preprocess_projection() instead.
 *
 * @param ctx        HST context
 * @param num_slices number of slices per projection
 * @param sinograms  destination sinogram buffer
 * @return 0 on success, a scheduler error code otherwise
 */
int hst_preprocess_start(HSTContextPtr ctx, int num_slices, const float *sinograms) {
    ctx->num_projections = ctx->setup.num_projections;
    ctx->preproc_slices = num_slices;
    ctx->preproc_sinograms = sinograms;

#ifdef HW_USE_PARALLEL_IO
    memset(ctx->projection_data, 0, ctx->setup.num_projections * sizeof(float*));
    return hw_sched_schedule_task(ctx->cpu_sched, ctx, hst_threads_preprocess_projection);
#else /* HW_USE_PARALLEL_IO */
    return 0;
#endif /* HW_USE_PARALLEL_IO */
}
|
|
1066 |
||
1067 |
/**
 * Wait for the preprocessing workers scheduled by hst_preprocess_start().
 * A no-op (returning 0) when parallel I/O is compiled out, since the work
 * already happened synchronously.
 *
 * @param ctx HST context
 * @return 0 on success, the CPU scheduler's error code otherwise
 */
int hst_preprocess_wait(HSTContextPtr ctx) {
#ifdef HW_USE_PARALLEL_IO
    return hw_sched_wait_task(ctx->cpu_sched);
#else /* HW_USE_PARALLEL_IO */
    return 0;
#endif /* HW_USE_PARALLEL_IO */
}
|
|
1074 |
||
1075 |
/**
 * Publish projection `proj`'s raw data for preprocessing.
 *
 * With HW_USE_PARALLEL_IO, storing the pointer is enough: the worker thread
 * scheduled by hst_preprocess_start() polls projection_data[proj] and picks
 * the projection up as soon as it becomes non-NULL. Without parallel I/O the
 * normalize-and-transpose step runs inline, right here.
 *
 * @param ctx  HST context
 * @param proj projection index
 * @param data caller-owned projection buffer (num_slices * num_bins floats);
 *             must stay valid until preprocessing completes
 * @return 0 (always succeeds)
 */
int hst_preprocess_projection(HSTContextPtr ctx, int proj, float *data) {
    ctx->projection_data[proj] = data;

#ifndef HW_USE_PARALLEL_IO
    /* Synchronous fallback: process immediately on the calling thread. */
    hst_threads_preprocess_projection(NULL, NULL, proj, ctx);
#endif /* ! HW_USE_PARALLEL_IO */

    return 0;
}
|
|
1099 |
||
1100 |
||
22
by csa
Timing reporting, few bug fixes |
1101 |
/**
 * Expose the context's per-device reconstructors as a read-only,
 * NULL-terminated array (the array is allocated with num_recon + 1 entries
 * and the last one set to NULL in hst_init_context).
 *
 * @param ctx HST context (must be non-NULL)
 * @return borrowed pointer to the reconstructor array; do not free
 */
HSTReconstructorConstContextPtr *hst_get_configured_reconstructors(HSTContextPtr ctx) {
    assert(ctx);
    return (HSTReconstructorConstContextPtr*)ctx->recon;
}
|
|
1106 |
||
43
by csa
Option to disable preloading of image files, additional timers |
1107 |
#ifdef PYHST_MEASURE_TIMINGS
/** Seconds spent writing results to the output (accumulated under the output mutex). */
double hst_get_io_timer(HSTContextPtr ctx) {
    assert(ctx);
    return ctx->io_timer;
}

/** Seconds spent scheduling/waiting for reconstruction work. */
double hst_get_recon_timer(HSTContextPtr ctx) {
    assert(ctx);
    return ctx->comp_timer;
}
#endif /* PYHST_MEASURE_TIMINGS */
|
1118 |
||
1119 |
||
49
by root
Merge /home/matthias/dev/pyHST |
1120 |
HSTContext *hst_create_context(void) { |
18
by csa
Big redesign (early commit) |
1121 |
if (hst_copies) pyhst_fail("Only a single HST context is supported by the current version"); |
1122 |
||
1123 |
HSTContext *ctx = (HSTContext*)malloc(sizeof(HSTContext)); |
|
1124 |
if (ctx) { |
|
1125 |
memset(ctx, 0, sizeof(HSTContext)); |
|
1126 |
++hst_copies; |
|
1127 |
}
|
|
1128 |
return ctx; |
|
1129 |
}
|
|
1130 |
||
30
by csa
Multi-GPU, Multi-CPU, and Hybrid modes support |
1131 |
/**
 * Per-thread initialization callback: binds the scheduler thread to its
 * device's reconstructor context and initializes that backend.
 *
 * @param thr       scheduler thread handle, forwarded to the backend
 * @param hwctx     scheduler hardware context (unused)
 * @param device_id index into ctx->recon
 * @param data      HSTContext pointer passed through the scheduler
 * @return 0 (initialization errors exit through check_code)
 */
static int hst_threads_init(HWThread thr, void *hwctx, int device_id, void *data) {
    HSTContext *hst = (HSTContext*)data;

    int status = hst_reconstructor_init(hst->recon[device_id], thr);
    check_code(status, "hst_reconstructor_init");

    return 0;
}
|
|
1142 |
||
133
by Suren A. Chilingaryan
Pinned OpenCL memory |
1143 |
/**
 * Allocate host memory suitable for fast device transfers.
 *
 * Delegates to the active GPU runtime's pinned-allocation call (OpenCL takes
 * precedence over CUDA when both are compiled in). In the CPU-only build it
 * degrades to a plain malloc: page-locking via mmap/mlock would need root or
 * special system configuration, so pinning is left to the GPU runtimes.
 * Release with hst_pinned_free(), never plain free().
 *
 * @param size bytes to allocate
 * @param a    intended access pattern hint for the GPU runtime
 * @return allocated buffer, or NULL on failure
 */
void *hst_pinned_malloc(size_t size, HSTPinnedAccess a) {
#ifdef HW_USE_OPENCL
    return hst_opencl_host_malloc(size, a);
#else /* OPENCL */
# ifdef HW_USE_CUDA
    return hst_cuda_host_malloc(size, a);
# else /* USEGPU */
    return malloc(size);
# endif /* USEGPU */
#endif /* OPENCL */
}
|
1162 |
||
1163 |
/**
 * Release a buffer obtained from hst_pinned_malloc(), routing through the
 * same runtime (OpenCL, then CUDA, then plain free) that allocated it.
 *
 * @param ptr buffer to release (runtime-specific whether NULL is accepted)
 */
void hst_pinned_free(void *ptr) {
#ifdef HW_USE_OPENCL
    hst_opencl_host_free(ptr);
#else /* OPENCL */
# ifdef HW_USE_CUDA
    hst_cuda_host_free(ptr);
# else /* USEGPU */
    free(ptr);
# endif /* USEGPU */
#endif /* OPENCL */
}
|
1174 |
||
18
by csa
Big redesign (early commit) |
1175 |
int hst_init_context(HSTContext *ctx, HSTSetup *setup, const float *custom_angles, const float *axis_corrections) { |
1176 |
int err; |
|
58
by Suren A. Chilingaryan
Re-engineering of OpenCL detection |
1177 |
int i, limit, threads; |
18
by csa
Big redesign (early commit) |
1178 |
size_t size; |
1179 |
||
1180 |
assert(ctx); |
|
1181 |
assert(setup); |
|
1182 |
||
1183 |
hst_setup_init(&ctx->setup); |
|
20
by csa
do more correct initialization |
1184 |
|
18
by csa
Big redesign (early commit) |
1185 |
size = ((void*)&setup->END_OF_GLOBAL_PART) - ((void*)setup); |
1186 |
memcpy(&ctx->setup, setup, size); |
|
1187 |
||
20
by csa
do more correct initialization |
1188 |
ctx->setup.what_changed = HST_ALL_CHANGED; |
1189 |
hst_configure_data_structures(ctx, custom_angles, axis_corrections); |
|
18
by csa
Big redesign (early commit) |
1190 |
|
58
by Suren A. Chilingaryan
Re-engineering of OpenCL detection |
1191 |
|
1192 |
# ifdef HW_USE_CUDA
|
|
168
by Suren A. Chilingaryan
Placeholders for DFI implementation |
1193 |
switch (setup->method) { |
1194 |
case HST_METHOD_FBP: |
|
1195 |
cuda_recon = cuda_fbp_recon; |
|
1196 |
break; |
|
175
by Suren A. Chilingaryan
Make DFI optional |
1197 |
# ifdef PYHST_ENABLE_DFI
|
168
by Suren A. Chilingaryan
Placeholders for DFI implementation |
1198 |
case HST_METHOD_DFI: |
1199 |
cuda_recon = cuda_dfi_recon; |
|
1200 |
break; |
|
175
by Suren A. Chilingaryan
Make DFI optional |
1201 |
# endif /* PYHST_ENABLE_DFI */ |
182
by Suren A. Chilingaryan
Fix DFI reconstruction |
1202 |
default: |
1203 |
pyhst_error("Unknown reconstruction mode specififed"); |
|
1204 |
exit(-1); |
|
168
by Suren A. Chilingaryan
Placeholders for DFI implementation |
1205 |
}
|
173
by Suren A. Chilingaryan
Fix few bugs in scheduller causing crashes in non-threaded mode (still inoperational) |
1206 |
#endif /* HW_USE_CUDA */ |
1207 |
||
1208 |
#ifdef HW_USE_THREADS
|
|
1209 |
threads = 0; |
|
1210 |
||
1211 |
# ifdef HW_USE_CUDA
|
|
58
by Suren A. Chilingaryan
Re-engineering of OpenCL detection |
1212 |
threads += cuda_recon->devices; |
1213 |
# endif /* HW_USE_CUDA */ |
|
1214 |
||
1215 |
# ifdef HW_USE_OPENCL
|
|
1216 |
threads += opencl_recon->devices; |
|
1217 |
# endif /* HW_USE_OPENCL */ |
|
1218 |
||
1219 |
# if HW_USE_CPU == 1
|
|
1220 |
threads += cpu_recon->devices - 1; // one core reserved for data preloading |
|
1221 |
# elif defined(HW_USE_CPU)
|
|
1222 |
if (!threads) threads += cpu_recon->devices - 1; |
|
1223 |
# endif /* HW_USE_CPU */ |
|
30
by csa
Multi-GPU, Multi-CPU, and Hybrid modes support |
1224 |
#else /* HW_USE_THREADS */ |
1225 |
threads = 1; |
|
1226 |
#endif /* HW_USE_THREADS */ |
|
1227 |
||
1228 |
ctx->num_recon = threads; |
|
22
by csa
Timing reporting, few bug fixes |
1229 |
ctx->recon = (HSTReconstructorContext**)malloc((ctx->num_recon + 1) * sizeof(HSTReconstructorContext*)); |
18
by csa
Big redesign (early commit) |
1230 |
check_alloc(ctx->recon, "pool of HSTReconstructors"); |
1231 |
||
22
by csa
Timing reporting, few bug fixes |
1232 |
ctx->recon[ctx->num_recon] = NULL; |
1233 |
||
58
by Suren A. Chilingaryan
Re-engineering of OpenCL detection |
1234 |
i = 0; limit = 0; |
1235 |
#ifdef HW_USE_CUDA
|
|
1236 |
limit += cuda_recon->devices; |
|
1237 |
for (; (i < limit)&&(i < threads); i++) { |
|
1238 |
ctx->recon[i] = hst_reconstructor_create(cuda_recon, &ctx->setup, i); |
|
30
by csa
Multi-GPU, Multi-CPU, and Hybrid modes support |
1239 |
check_alloc(ctx->recon[i], "GPUContext"); |
1240 |
}
|
|
58
by Suren A. Chilingaryan
Re-engineering of OpenCL detection |
1241 |
#endif /* HW_USE_CUDA */ |
49
by root
Merge /home/matthias/dev/pyHST |
1242 |
|
58
by Suren A. Chilingaryan
Re-engineering of OpenCL detection |
1243 |
#ifdef HW_USE_OPENCL
|
1244 |
limit += opencl_recon->devices; |
|
1245 |
for (; (i < limit) && (i < threads); i++) { |
|
49
by root
Merge /home/matthias/dev/pyHST |
1246 |
ctx->recon[i] = hst_reconstructor_create(opencl_recon, &ctx->setup, i); |
1247 |
check_alloc(ctx->recon[i], "OpenCLContext"); |
|
1248 |
}
|
|
58
by Suren A. Chilingaryan
Re-engineering of OpenCL detection |
1249 |
#endif /* HW_USE_OPENCL */ |
49
by root
Merge /home/matthias/dev/pyHST |
1250 |
|
58
by Suren A. Chilingaryan
Re-engineering of OpenCL detection |
1251 |
#ifdef HW_USE_CPU
|
30
by csa
Multi-GPU, Multi-CPU, and Hybrid modes support |
1252 |
for (; i < threads; i++) { |
1253 |
ctx->recon[i] = hst_reconstructor_create(cpu_recon, &ctx->setup, threads - i - 1); |
|
1254 |
check_alloc(ctx->recon[i], "CPUContext"); |
|
1255 |
}
|
|
58
by Suren A. Chilingaryan
Re-engineering of OpenCL detection |
1256 |
#endif /* HW_USE_CPU */ |
49
by root
Merge /home/matthias/dev/pyHST |
1257 |
|
58
by Suren A. Chilingaryan
Re-engineering of OpenCL detection |
1258 |
if (!i) pyhst_fail("no reconstructors found"); |
1259 |
||
30
by csa
Multi-GPU, Multi-CPU, and Hybrid modes support |
1260 |
pyhst_warning("Running %i reconstructors\n", threads); |
1261 |
||
1262 |
ctx->sched = hw_sched_create(threads); |
|
1263 |
check_alloc(ctx->sched, "scheduller"); |
|
132
by Suren A. Chilingaryan
Pipelined preprocessing (normalize & transpose) |
1264 |
|
1265 |
ctx->cpu_sched = hw_sched_create((cpu_recon->devices>1)?2:1); |
|
1266 |
// ctx->cpu_sched = hw_sched_create((cpu_recon->devices>1)?(cpu_recon->devices - 1):1);
|
|
1267 |
check_alloc(ctx->cpu_sched, "cpu scheduller"); |
|
30
by csa
Multi-GPU, Multi-CPU, and Hybrid modes support |
1268 |
|
1269 |
ctx->output_mutex = hw_sched_create_mutex(); |
|
1270 |
//Can return NULL if threads are disabled
|
|
1271 |
//check_alloc(ctx->output_mutex, "hw_sched_create_mutex");
|
|
1272 |
||
151
by Suren A. Chilingaryan
Multislice mode: preload into the GPU memory complete slices |
1273 |
#ifdef HST_MULTISLICE_MODE
|
1274 |
hw_sched_set_sequential_mode(ctx->sched, &ctx->num_slices, &ctx->current_slice, HW_SCHED_FLAG_FREE_CALL|HW_SCHED_FLAG_TERMINATOR_CALL); |
|
1275 |
#else /* HST_MULTISLICE_MODE */ |
|
1276 |
hw_sched_set_sequential_mode(ctx->sched, &ctx->num_slices, &ctx->current_slice, 0); |
|
1277 |
#endif /* HST_MULTISLICE_MODE */ |
|
30
by csa
Multi-GPU, Multi-CPU, and Hybrid modes support |
1278 |
|
1279 |
err = hw_sched_schedule_thread_task(ctx->sched, ctx, hst_threads_init); |
|
1280 |
check_code(err, "hw_sched_schedule_task"); |
|
1281 |
||
151
by Suren A. Chilingaryan
Multislice mode: preload into the GPU memory complete slices |
1282 |
hw_sched_set_sequential_mode(ctx->cpu_sched, &ctx->num_projections, &ctx->current_projection, 0); |
132
by Suren A. Chilingaryan
Pipelined preprocessing (normalize & transpose) |
1283 |
|
30
by csa
Multi-GPU, Multi-CPU, and Hybrid modes support |
1284 |
/*
|
1285 |
for (i = 0; i < threads; i++) {
|
|
1286 |
err = hst_reconstructor_init(ctx->recon[i]);
|
|
1287 |
check_code(err, "hst_cpu_init_context");
|
|
1288 |
}
|
|
1289 |
*/
|
|
132
by Suren A. Chilingaryan
Pipelined preprocessing (normalize & transpose) |
1290 |
ctx->projection_data = (float**)malloc(ctx->setup.num_projections * sizeof(float*)); |
1291 |
check_alloc(ctx->projection_data, "projection_data"); |
|
30
by csa
Multi-GPU, Multi-CPU, and Hybrid modes support |
1292 |
|
151
by Suren A. Chilingaryan
Multislice mode: preload into the GPU memory complete slices |
1293 |
#ifdef HST_MULTISLICE_MODE
|
1294 |
ctx->result_tmpbuf = hst_pinned_malloc(ctx->setup.max_slices * ctx->setup.num_x * ctx->setup.num_y * sizeof(float), HST_PINNED_W); |
|
1295 |
#else /* HST_MULTISLICE_MODE */ |
|
133
by Suren A. Chilingaryan
Pinned OpenCL memory |
1296 |
ctx->result_tmpbuf = hst_pinned_malloc(threads * ctx->setup.num_x * ctx->setup.num_y * sizeof(float), HST_PINNED_W); |
151
by Suren A. Chilingaryan
Multislice mode: preload into the GPU memory complete slices |
1297 |
#endif /* HST_MULTISLICE_MODE */ |
30
by csa
Multi-GPU, Multi-CPU, and Hybrid modes support |
1298 |
check_alloc(ctx->result_tmpbuf, "temporary buffer for reconstructed slices"); |
1299 |
||
1300 |
err = hw_sched_wait_thread_task(ctx->sched); |
|
1301 |
check_code(err, "hw_sched_wait_task"); |
|
132
by Suren A. Chilingaryan
Pipelined preprocessing (normalize & transpose) |
1302 |
|
30
by csa
Multi-GPU, Multi-CPU, and Hybrid modes support |
1303 |
return 0; |
1304 |
}
|
|
1305 |
||
1306 |
/**
 * Scheduler callback: per-thread cleanup executed once on each worker.
 *
 * Releases the reconstructor instance bound to this worker's slot in the
 * context. Matches the hst_threads_init counterpart scheduled during setup.
 *
 * @param thr        scheduler thread handle (unused here)
 * @param hwctx      scheduler-private context (unused here)
 * @param device_id  index of this worker's reconstructor in ctx->recon
 * @param data       opaque pointer to the owning HSTContext
 * @return 0 on completion (callback convention; no failure path here)
 */
static int hst_threads_free(HWThread thr, void *hwctx, int device_id, void *data) {
    HSTContext *hst = data;     /* void* converts implicitly in C; no cast needed */
    hst_reconstructor_free(hst->recon[device_id]);
    return 0;
}
|
|
1312 |
||
1313 |
/**
 * Release every resource owned by the context, leaving the structure
 * zeroed but not freed (see hst_destroy_context for heap-allocated
 * contexts).
 *
 * Teardown order: pinned result buffer, projection pointer table,
 * per-thread reconstructors (freed on their own scheduler threads via
 * hst_threads_free, then destroyed), output mutex, both schedulers,
 * and finally the setup structure.
 *
 * @param ctx  initialized context; must not be NULL (asserted)
 */
void hst_free_context(HSTContext *ctx) {
    int i;

    assert(ctx);

    hst_pinned_free(ctx->result_tmpbuf);

    /* free(NULL) is a defined no-op, so the former NULL guard was redundant */
    free(ctx->projection_data);

    if (ctx->recon) {
            /* Per-device cleanup must run on the thread that owns the device
               (relevant for CUDA/OpenCL contexts), hence the scheduler task. */
        if (ctx->sched) {
            hw_sched_execute_thread_task(ctx->sched, ctx, hst_threads_free);
        }

        for (i = 0; i < ctx->num_recon; i++) {
            if (ctx->recon[i]) hst_reconstructor_destroy(ctx->recon[i]);
        }

        free(ctx->recon);
    }

        /* Mutex may legitimately be NULL when threading is disabled */
    if (ctx->output_mutex) {
        hw_sched_destroy_mutex(ctx->output_mutex);
    }

    if (ctx->cpu_sched) {
        hw_sched_destroy(ctx->cpu_sched);
    }

    if (ctx->sched) {
        hw_sched_destroy(ctx->sched);
    }

    hst_setup_free(&ctx->setup);

        /* Zero the struct so stale pointers cannot be reused after free */
    memset(ctx, 0, sizeof(HSTContext));
}
|
|
1352 |
||
1353 |
/**
 * Tear down a heap-allocated context: release all owned resources,
 * free the structure itself, and decrement the global instance counter.
 *
 * @param ctx  context previously created on the heap; must not be NULL
 *             (hst_free_context asserts this)
 */
void hst_destroy_context(HSTContext *ctx) {
    hst_free_context(ctx);
    free(ctx);
    --hst_copies;   /* global bookkeeping of live context instances */
}
|