/tomo/pyhst

To get this branch, use:
bzr branch http://darksoft.org/webbzr/tomo/pyhst
28 by csa
Use pinned result buffer to perform device2host memory transfer parallel with computations, add ESRF copyright information in files appeared after redesign
1
/*
78 by Suren A. Chilingaryan
Add COPYING and fix license statements
2
 * The PyHST program is Copyright (C) 2002-2011 of the
3
 * European Synchrotron Radiation Facility (ESRF) and
4
 * Karlsruhe Institute of Technology (KIT).
28 by csa
Use pinned result buffer to perform device2host memory transfer parallel with computations, add ESRF copyright information in files appeared after redesign
5
 *
6
 * PyHST is free software: you can redistribute it and/or modify it
7
 * under the terms of the GNU General Public License as published by the
8
 * Free Software Foundation, either version 3 of the License, or
9
 * (at your option) any later version.
10
 * 
11
 * hst is distributed in the hope that it will be useful, but
12
 * WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
14
 * See the GNU General Public License for more details.
15
 * 
16
 * You should have received a copy of the GNU General Public License along
17
 * with this program.  If not, see <http://www.gnu.org/licenses/>.
18
 */
19
18 by csa
Big redesign (early commit)
20
#include <stdio.h>
21
#include <stdlib.h>
22
#include <string.h>
23
#include <assert.h>
28 by csa
Use pinned result buffer to perform device2host memory transfer parallel with computations, add ESRF copyright information in files appeared after redesign
24
#include <errno.h>
18 by csa
Big redesign (early commit)
25
#include <math.h>
26
28 by csa
Use pinned result buffer to perform device2host memory transfer parallel with computations, add ESRF copyright information in files appeared after redesign
27
#include <sys/mman.h>
28
168 by Suren A. Chilingaryan
Placeholders for DFI implementation
29
30
#undef HW_USE_SIMD
31
132 by Suren A. Chilingaryan
Pipelined preprocessing (normalize & transpose)
32
#ifdef HW_USE_SIMD
33
# include <xmmintrin.h>
34
#endif /* HW_USE_SIMD */
35
18 by csa
Big redesign (early commit)
36
#include "Vhst_calculate_limits.h"
37
38
#include "debug.h"
39
#include "hst.h"
40
#include "hst_reconstructor.h"
41
58 by Suren A. Chilingaryan
Re-engineering of OpenCL detection
42
#ifdef HW_USE_CUDA
21 by csa
Make gputomo library to follow reconstruction modules naming conventions (hst_cuda now)
43
# include "hst_cuda/hst_cuda.h"
175 by Suren A. Chilingaryan
Make DFI optional
44
# ifdef PYHST_ENABLE_DFI
45
#  include "dfi_cuda/dfi_cuda.h"
46
# endif /* PYHST_ENABLE_DFI */
58 by Suren A. Chilingaryan
Re-engineering of OpenCL detection
47
#endif /* HW_USE_CUDA */
18 by csa
Big redesign (early commit)
48
58 by Suren A. Chilingaryan
Re-engineering of OpenCL detection
49
#ifdef HW_USE_OPENCL
57 by Suren A. Chilingaryan
Separate CPU code into the hst_cpu
50
# include "hst_opencl/hst_opencl.h"
58 by Suren A. Chilingaryan
Re-engineering of OpenCL detection
51
#endif /* HW_USE_OPENCL */
52
53
#ifdef HW_USE_CPU
54
# include "hst_cpu/hst_cpu.h"
55
#endif /* HW_USE_CPU */
49 by root
Merge /home/matthias/dev/pyHST
56
30 by csa
Multi-GPU, Multi-CPU, and Hybrid modes support
57
#include "hw_sched.h"
58
18 by csa
Big redesign (early commit)
59
#define PRE_RADIUS 1	/* 1: precompute per-row radii (RRadius table) in hst_configure_limits; 0: recompute per projection */

static int hst_copies = 0;	/* number of live HST contexts (module-wide bookkeeping) */

	/* Per-backend reconstructor singletons, created in hst_init() and torn down in hst_free() */
#ifdef HW_USE_CUDA
static HSTReconstructor *cuda_recon = NULL;		/* NOTE(review): appears unused in this file — possibly superseded by cuda_fbp_recon; confirm before removing */
static HSTReconstructor *cuda_fbp_recon = NULL;		/* CUDA filtered back-projection backend */
# ifdef PYHST_ENABLE_DFI
static HSTReconstructor *cuda_dfi_recon = NULL;		/* CUDA direct Fourier inversion backend (optional) */
# endif /* PYHST_ENABLE_DFI */
#endif /* HW_USE_CUDA */

#ifdef HW_USE_OPENCL
static HSTReconstructor *opencl_recon = NULL;		/* OpenCL backend */
#endif /* HW_USE_OPENCL */

#ifdef HW_USE_CPU
static HSTReconstructor *cpu_recon = NULL;		/* CPU backend */
#endif /* HW_USE_CPU */
49 by root
Merge /home/matthias/dev/pyHST
78
79
/**
 * Initializes the HST module: brings up the hardware scheduler and every
 * reconstruction backend enabled at compile time, storing the resulting
 * reconstructor handles in the file-scope singletons above.
 *
 * Must be called once before any reconstruction; pair with hst_free().
 *
 * @return 0 (backend init failures are not propagated here)
 */
int hst_init(void) {
    hw_sched_init();	// scheduler first: backends may rely on it

#ifdef HW_USE_CUDA
    cuda_fbp_recon = hst_cuda_init(HW_USE_CUDA);
# ifdef PYHST_ENABLE_DFI
    cuda_dfi_recon = dfi_cuda_init(HW_USE_CUDA);
# endif /* PYHST_ENABLE_DFI */
#endif /* HW_USE_CUDA */

#ifdef HW_USE_OPENCL
    opencl_recon = hst_opencl_init(HW_USE_OPENCL);
#endif /* HW_USE_OPENCL */

#ifdef HW_USE_CPU
    cpu_recon = hst_cpu_init(HST_RECONSTRUCTOR_USE_CPU);
#endif /* HW_USE_CPU */

    return 0;
}
99
100
/**
 * Releases all reconstruction backends initialized by hst_init().
 * Backends are shut down in roughly the reverse order of initialization
 * (DFI before the CUDA FBP module it may share resources with).
 */
void hst_free() {
#ifdef HW_USE_CPU
    hst_cpu_free();
#endif /* HW_USE_CPU */

#ifdef HW_USE_CUDA
# ifdef PYHST_ENABLE_DFI
    dfi_cuda_free();
# endif /* PYHST_ENABLE_DFI */
    hst_cuda_free();
#endif /* HW_USE_CUDA */

#ifdef HW_USE_OPENCL
    hst_opencl_free();
#endif /* HW_USE_OPENCL */
}
116
142 by Suren A. Chilingaryan
Adjust number of slices per block to number of reconstructors
117
/**
 * Counts the reconstructor instances (devices) available across all enabled
 * backends, so that the caller can size slice blocks such that every
 * reconstructor gets work.
 *
 * In hybrid CPU+GPU builds (HW_USE_CPU defined but != 1) the CPU devices are
 * only counted when no accelerator devices were found. Without threading
 * support exactly one reconstructor is reported.
 *
 * @return number of reconstructor devices (at least 1 in non-threaded builds)
 */
size_t hst_get_balanced_number_of_reconstructors() {
    size_t count = 0;

#ifdef HW_USE_THREADS
# ifdef HW_USE_CUDA
    count += cuda_fbp_recon->devices;
# endif /* HW_USE_CUDA */

# ifdef HW_USE_OPENCL
    count += opencl_recon->devices;
# endif /* HW_USE_OPENCL */

# if HW_USE_CPU	== 1
    count += cpu_recon->devices - 1; // one core reserved for data preloading
# elif defined(HW_USE_CPU)
    if (!count) count += cpu_recon->devices - 1;	// CPU as fallback only
# endif /* HW_USE_CPU */
#else
    count = 1;
#endif

	// we should try to estimate relative performance here and compute LCM
	// this should allow PyHST to allocate such number of slices, that all 
	// reconstructors terminate simultaneously
    return count;
}
143
22 by csa
Timing reporting, few bug fixes
144
/**
145
 * HST Context
146
 */
18 by csa
Big redesign (early commit)
147
/**
 * HST Context: holds the reconstruction setup, the scheduler handles, the
 * pool of per-device reconstructor contexts and the I/O state for one
 * reconstruction session.
 */
struct HSTContextT {
    HSTSetup setup;			//!< HST parameters
    
    int num_recon;			//!< Number of initialized reconstructors
    HWSched sched;			//!< Scheduler
    HWSched cpu_sched;                  //!< Preprocessing scheduler
    HSTReconstructorContext **recon;	//!< Pool of reconstructors (NULL-terminated)
	
	// some cached values set in hst_set_projections
    const float *custom_angles;		//!< angles of projection axis (per projection)
    const float *axis_corrections; 	//!< rotation axis corrections (per projection)

    float *result_tmpbuf;		//!< Temporary buffer for reconstructed slices
    float *result_reqbuf;		//!< Result buffer supplied in reconstruction request
    const float *sinograms;		//!< Sinograms for processing
    HWMutex output_mutex;		//!< Output serializer
    FILE *output;            		//!< Output file (open and close is done in PyHST_c)
#ifdef HST_USE_FASTWRITER
    fastwriter_t *fw;			//!< Output with fastwriter
#else /* HST_USE_FASTWRITER */
    void *fw;				//!< Placeholder so fw tests compile without fastwriter support
#endif /* HST_USE_FASTWRITER */

    int limits_configured;		//!< Flag indicating if limits are already configured
    int filter_configured;		//!< Flag indicating if filter is already configured

	// Preprocessing pipeline state (normalize & transpose)
    int num_projections;		//!< Total projections in the current preprocessing run
    int current_projection;		//!< Next projection to preprocess
    int preproc_slices;			//!< Slices handled per preprocessing pass — TODO confirm
    float *preproc_sinograms;		//!< Destination buffer for preprocessed sinograms
    float **projection_data;		//!< Per-projection source data pointers

	// Reconstruction progress
    int num_slices;			//!< Slices in the current request
    int current_slice;			//!< Next slice to reconstruct

    int slice_offset;			//!< Number of slices already processed (for navigation in output file)
    int processing;			//!< Flag indicating the working threads are running and processing slices

    void *mmap;				//!< NOTE(review): presumably a memory-mapped region for I/O — confirm usage

#ifdef PYHST_MEASURE_TIMINGS
    double comp_timer;			//!< Accumulated computation time
    double io_timer;			//!< Accumulated I/O time
#endif /* PYHST_MEASURE_TIMINGS */
};
typedef struct HSTContextT HSTContext; //!< a short name
18 by csa
Big redesign (early commit)
194
195
20 by csa
do more correct initialization
196
/**
 * Allocates and fills the per-projection trigonometric tables and the
 * corrected axis positions, and sizes the FFT used for filtered
 * back-projection.
 *
 * Ownership: the allocated filter/cos/sin/axis tables are stored in
 * ctx->setup and are expected to be released with the setup.
 *
 * @param ctx               HST context (setup must carry num_projections,
 *                          angle_offset/increment, axis_position, num_bins,
 *                          fft_oversampling)
 * @param custom_angles     optional per-projection angles (radians);
 *                          overrides offset+increment when non-NULL
 * @param axis_corrections  optional per-projection axis corrections
 * @return 0 on success (allocation failures abort via check_alloc)
 */
static int hst_configure_data_structures(HSTContext *ctx, const float *custom_angles, const float *axis_corrections) {
    int projection;		// projection counter
    int num_projections;	// Number of projections

    float angle;		// projections angle
    float angle_offset;         // Offset rotation angle in radians, could be overriden with custom_angles
    float angle_increment;      // Angle in radians between each, could be overriden with custom_angles
    float *cos_s, *sin_s;

    float axis_position;       	// Position of rotation axis
    float *axis_position_corr_s;

    ctx->custom_angles = custom_angles;
    ctx->axis_corrections = axis_corrections;

    num_projections = ctx->setup.num_projections;

    angle_offset = ctx->setup.angle_offset;
    angle_increment = ctx->setup.angle_increment;
    axis_position = ctx->setup.axis_position;

	// Computing FFT dimensions, hard to understand why the -1 is there but it works nicely
	// DS: This is actually two times more than the closest power of 2. Do we need that?
    ctx->setup.dim_fft = 1<<((int)(log((1. * ctx->setup.fft_oversampling * ctx->setup.num_bins - 1)) / log(2.0) + 0.9999));
    printf("FFT convolution: %u\n", ctx->setup.dim_fft);

	// 2*dim_fft floats: the filter is stored as interleaved complex pairs
    ctx->setup.filter = malloc((2 * ctx->setup.dim_fft) * sizeof *ctx->setup.filter);
    check_alloc(ctx->setup.filter, " FILTER");

    axis_position_corr_s = malloc(num_projections * sizeof *axis_position_corr_s);
    check_alloc(axis_position_corr_s, "axis_position_corr_s");
    ctx->setup.axis_position_corr_s = axis_position_corr_s;
    
    cos_s = malloc(num_projections * sizeof *cos_s);
    check_alloc(cos_s, "cos_s");
    ctx->setup.cos_s = cos_s;

    sin_s = malloc(num_projections * sizeof *sin_s);
    check_alloc(sin_s, "sin_s");
    ctx->setup.sin_s = sin_s;

    for (projection = 0; projection < num_projections; projection++) {
        if (custom_angles) {
            angle = custom_angles[projection];
        } else {
            angle = angle_offset + projection * angle_increment ;
        }

        cos_s[projection] = cos(angle);
        sin_s[projection] = sin(angle);

	if (axis_corrections) axis_position_corr_s[projection] = axis_position + axis_corrections[projection];
	else axis_position_corr_s[projection] = axis_position;
    }

    return 0;
}
252
253
/**
 * Computes the per-row reconstruction limits of the slice.
 *
 * Depending on the geometry (detector size, axis position, angular range)
 * only part of the slice is covered by every sinogram. For scans covering
 * less than a full turn, hst_calculate_limits__ (Fortran) supplies per-row
 * X ranges; for full-turn scans (fai360) the limits follow from the circle
 * of radius max(axis, num_bins - axis) around the rotation axis. In
 * zerooffmask mode the union of the ranges over all projections is kept in
 * ctx->setup.minX/maxX (allocated lazily here, one int per output row).
 *
 * Side effects: sets setup.fai360, setup.offset_x/offset_y, optionally
 * setup.minX/maxX, marks limits configured and raises HST_LIMITS_CHANGED.
 *
 * @param ctx  HST context with setup geometry already filled in
 * @return 0 on success (allocation failures abort via check_alloc)
 */
static int hst_configure_limits(HSTContext *ctx) {
    int i;
    
    int projection;		// projection counter
    
	// Some variables from configuration
    int num_projections;	// Number of projections
    int num_bins;		// Number of bins in sinograms

    int start_x;                // First X-pixel in region required by user
    int start_y;                // First Y-pixel in region required by user
    int num_x;			// Number of X-pixels in reconstruction region
    int num_y;			// Number of Y-pixels in reconstruction region

    float axis_position;       	// Position of rotation axis
    float angle_increment;      // Angle in radians between each, could be overriden with custom_angles

    int axis_to_the_center;
    
    int zerooffmask;

    int *minX = NULL, *maxX = NULL;
    float *cos_s, *sin_s;
    float *axis_position_corr_s;

	// The block of variables for reconstruction limits
    int status;
    int *X_STARTS, *X_ENDS;
    int y_start;
    int y_end;
    int startxdum, startydum, finxdum, finydum;

	// The next block are variables needed for zeroofmask-mode computations 
#if PRE_RADIUS
    float * RRadius = NULL;
#endif /* PRE_RADIUS */
    float yy, rradius;
    int yoristart, yoriend;
    int yorigin ;
    int x_start, x_end ;
    int y;
    float EXT1, EXT2;
    int XH;


    /* The block of variables describing the position of axes (and used to move
    the axis to the center of the reconstructed zone) as well as projection
    angles */
    float MOVE_X;
    float MOVE_Y;
    int fai360;
    float axis_position_corr = 0;
    float cos_angle , sin_angle;
    float corre; // axis correction

    assert(ctx);
    
    num_bins = ctx->setup.num_bins;
    num_projections = ctx->setup.num_projections;
    
    start_x = ctx->setup.start_x;
    num_x = ctx->setup.num_x;
    start_y = ctx->setup.start_y;
    num_y = ctx->setup.num_y;

    axis_position_corr_s = ctx->setup.axis_position_corr_s;
    cos_s = ctx->setup.cos_s;
    sin_s = ctx->setup.sin_s;
    
    axis_position = ctx->setup.axis_position;
    angle_increment = ctx->setup.angle_increment;

    axis_to_the_center = ctx->setup.center_axis;
    zerooffmask = ctx->setup.zerooffmask;

    
    /*************************************************************
     * Depending on geometry ( dimensions, axis positions ....  )*
     * only a restraint part of the slice can be back projected  *
     * from all the sinograms.                                   *
     * X_Start tells the limits for each y  ( idem for X_ENDS)   *
     *************************************************************/
    X_STARTS = malloc(num_bins * sizeof(int));
    check_alloc(X_STARTS," X_STARTS allocation in c_hst_recon_overslices");
    X_ENDS = malloc(num_bins * sizeof(int));
    check_alloc(X_ENDS," X_ENDS allocation in c_hst_recon_overslices");

    /************************************************************
     * calculate limits of slices to reconstruct                *
     ************************************************************/
    /* Restrict the area if projections are covering more than 360 grads  */
    startxdum=1;
    startydum=1;
    finxdum = num_bins ;
    finydum = num_bins ;

    /* DSBug: case of custom angles is not handled here */
    if (fabs((num_projections)*(angle_increment) - 2*M_PI)>0.001 ) {
        hst_calculate_limits__(&num_bins, &startxdum, &startydum, &finxdum, &finydum, &axis_position, &y_start, &y_end, X_STARTS, X_ENDS, &status);
        y_start+=1;
        y_end  -=1;
    } else {
        y_start=-1;
        y_end  =-1;
    }

	// Both set to -1 above means a full 360-degree scan
    fai360 = (y_start == -1 && y_end == -1);
    ctx->setup.fai360 = fai360;


    if (axis_to_the_center) {
        MOVE_X = (((start_x - 1) + num_x / 2.0) - axis_position);
        MOVE_Y = (((start_y - 1) + num_y / 2.0) - axis_position);
    } else {
        MOVE_X = 0;
        MOVE_Y = 0;
    }

    ctx->setup.offset_x = start_x - MOVE_X ;
    ctx->setup.offset_y = start_y - MOVE_Y ;

    if (zerooffmask) {
        /* these should be allocated once and for all out of the routine */
	if (ctx->setup.minX) minX = ctx->setup.minX;
	else {
    	    minX = (int*) malloc(num_y * sizeof(int));
    	    check_alloc(minX, "minX");
	    ctx->setup.minX = minX;
	}
	
	if (ctx->setup.maxX) maxX = ctx->setup.maxX;
	else {
    	    maxX = (int*) malloc(num_y * sizeof(int));
    	    check_alloc(maxX, "maxX");
	    ctx->setup.maxX = maxX;
	}
	
	    // Initialize to an empty range; the projection loop widens it
        for (i = 0; i < num_y; i++) {
            maxX[i] = 0;
            minX[i] = 1000000;
        }

#if PRE_RADIUS
        /* The difference with !pre_radius is that we are not using axis_corrections,
        i.e axis_position is used in place of axis_position_corr */
        RRadius = (float *) malloc(num_y * sizeof(float));
        check_alloc(RRadius, "RRadius");

        for (y = 0; y < num_y; y++) {
            rradius = MAX(axis_position , num_bins - axis_position) ;
            yy = ((y + start_y - MOVE_Y - 1) + 0.5 - axis_position);
            if (fabs(yy) > rradius) {
                RRadius[y] = 0.0;
            } else {
                RRadius[y] = sqrt(rradius * rradius - yy * yy);
            }
        }
#endif /* PRE_RADIUS */
    }

    for (projection = 0; projection < num_projections; projection++) {
        cos_angle = cos_s[projection];
        sin_angle = sin_s[projection];
	axis_position_corr = axis_position_corr_s[projection];
	
	//corre = axis_position_corr - axis_position;
	corre = ctx->axis_corrections?ctx->axis_corrections[projection]:0;

        if (zerooffmask) {
            if (fai360) {
                yoristart = 0;
                yoriend = num_y ;
            } else {
                yoristart = y_start - 1 ;
                yoriend = y_end ;
            }

            for (yorigin = yoristart ; yorigin < yoriend ; yorigin++) {
                if (fai360) {
                    y = yorigin;
                } else {
                    /* y_start and y_end are computed for a maximal mask */
                    /* y is relative to the user's window */
                    y = (yorigin + MOVE_Y + corre - start_y) + 100000; /* so that -0.1 is not truncated to 0 like 0.9 is */
                    y = y - 100000;
                }

                /* the 2 could possibly be removed if everything works fine: ps 2 ->1 */
                if (y < 0 || y >= num_y - 0) continue;

                if (fai360) {
#if PRE_RADIUS
                    rradius = RRadius[y];
#else /*PRE_RADIUS */
                    rradius = MAX(axis_position_corr , num_bins - axis_position_corr) ;
                    yy = ((y + start_y - MOVE_Y - 1) + 0.5 - axis_position);
                    if (fabs(yy) > rradius) {
                        rradius = 0.0;
                    } else {
                        rradius = sqrt(rradius * rradius - yy * yy);
                    }
#endif /* PRE_RADIUS */


                    if (fabs(cos_angle) > 1.0e-6) { /* one could keep only the y part here */
                        EXT1 = -((float)(((axis_position_corr +
                                           ((start_x - MOVE_X - 1) + 0.5 - axis_position) * cos_angle -
                                           ((y + start_y - MOVE_Y - 1) + 0.5 - axis_position) * sin_angle - 0.5)))) / cos_angle ;
                        EXT2 = (num_bins - (float)(((axis_position_corr +
                                                     ((start_x - MOVE_X - 1) + 0.5 - axis_position) * cos_angle -
                                                     ((y + start_y - MOVE_Y - 1) + 0.5 - axis_position) * sin_angle - 0.5)))) / cos_angle ;
                    } else {
                        XH = ((float)(((axis_position_corr -
                                        ((y + start_y - MOVE_Y - 1) + 0.5 - axis_position) * sin_angle - 0.5)))
                             );
                        if (XH <= 0 || XH >= num_bins) continue;
                        EXT1 = 0;
                        EXT2 = num_x;
                    }
                    x_start = MIN(EXT1, EXT2) + 1;
                    x_end = MAX(EXT1, EXT2) - 1;

                    /********************************************************************************************/
                    x_start = MAX(x_start , axis_position - rradius - start_x); /* SUSPECT */
                    x_end = MIN(x_end , axis_position + rradius - start_x);
                    /********************************************************************************************/

                } else { // fai360 if
                    x_start = (X_STARTS[yorigin] - 1) - start_x + MOVE_X + corre ;
                    /* if( x_start< + DX0) x_start= DX0; */
                    x_end = (X_ENDS[yorigin] - 1) - start_x + MOVE_X + corre ;
                } // end of fai360 if

                {
                    if (x_start < 0) x_start = 0;
                    if (x_end >= num_x - 0) x_end = num_x - 0;
                    /* here too, if everything works fine, we could try to be wider:
                    x_start+=2;
                    x_end-=2; */
                }

                if (zerooffmask) {
                    minX[y] = MIN(minX[y], x_start);
                    maxX[y] = MAX(maxX[y], x_end);
                }
            } // end of yorigin loop
        } // zerooffmask if
    } // end of projections loop

#if PRE_RADIUS
    if (zerooffmask) {
	free(RRadius);
    }
#endif /* PRE_RADIUS */

    free(X_ENDS);
    free(X_STARTS);

    ctx->limits_configured = 1;
    ctx->setup.what_changed |= HST_LIMITS_CHANGED;
    
    return 0;
}
517
228 by Suren A. Chilingaryan
Quality fixes (tex-based)
518
#include "astra_fourier.h"
18 by csa
Big redesign (early commit)
519
/**
 * Fills ctx->setup.filter (2*dim_fft interleaved floats) with the frequency
 * response of the back-projection filter.
 *
 * Two mutually exclusive builds:
 *  - PYHST_ASTRA_SCALING: ramp filter built in the spatial domain, converted
 *    to frequency space with cdft() (astra_fourier), then scaled/clamped to
 *    match ASTRA's convention.
 *  - default: analytic ramp coefficients, optionally produced by a custom
 *    callback, laid out to match the cuFFT/FFTW real-to-complex convention.
 * In both cases the redundant (symmetric) upper half of the spectrum is
 * filled at the end so a complex-to-complex transform can be used.
 *
 * @param ctx         HST context; setup.dim_fft and setup.filter must be set
 * @param func        optional filter generator callback (NULL for default ramp)
 * @param func_param  opaque argument forwarded to func
 * @return 0 on success
 */
int hst_set_filter(HSTContext *ctx, HSTFilterFunction func, void *func_param) {
    int j;
    int dim_fft;
    float *FILTER;

    assert(ctx);

    dim_fft = ctx->setup.dim_fft;
    FILTER = ctx->setup.filter;

#ifdef PYHST_ASTRA_SCALING
	// dim_fft is next power of two of (2 * num_proj)
	// num_bins is dim_fft/2 + 1
    int num_bins = dim_fft / 2 + 1;	// number of non-symmetric coefficients

	// Spatial-domain ramp filter: 0.25 at the center, -1/(pi*k)^2 at odd offsets
    memset(FILTER, 0, 2 * dim_fft * sizeof(float));
    FILTER[0] = 0.25;
    FILTER[1] = 0;
    for (j = 1; j < dim_fft; j++) {
	if (j&1) {
	    double coef;
	    if (2 * j > dim_fft)
		coef = M_PI * (dim_fft - j);
	    else
		coef = M_PI * j;
	    FILTER[2 * j] = -1. / (coef * coef);
	} else {
	    FILTER[2 * j] = 0;
	}
	FILTER[2 * j + 1] = 0;
    }

	// In-place complex DFT (Ooura-style API: ip = bit-reversal work area, w = twiddle table)
    int *ip = (int*)malloc((2 + sqrt(dim_fft) + 1)*sizeof(int)); ip[0] = 0;
    float *w = (float*)malloc(dim_fft * sizeof(float) / 2);
    cdft(2 * dim_fft, -1, FILTER, ip, w);
    free(w);
    free(ip);

    for (j = 0; j < num_bins; j++) {
	float _fD = 1.0f;
        float fRelIndex = (float)j / (float)dim_fft;
        float pfW = 2. * M_PI * fRelIndex;

	    // On last iteration it is actually equal. So, rounding error works differently with ASTRA and here.... Therefore, we require a bit on top.
	if (pfW > (M_PI * _fD + 1E-6)) {
	    FILTER[2 * j] = 0;
	    FILTER[2 * j + 1] = 0;
	} else {
	    FILTER[2 * j] = 2.0 * FILTER[2 * j];
	    FILTER[2 * j + 1] = FILTER[2 * j];
	}
    }

    for (j = num_bins; j < dim_fft; j++) {
	FILTER[2 * j] = 0;
	FILTER[2 * j + 1] = 0;
    }

/*
    for (j = 0; j < num_bins; j++) {
	printf("%8.4f %8.4f ", FILTER[2 * j], FILTER[2 * j + 1]);
    }
    printf("\n");
*/
#else /* ASTRA_SCALING */
    if (func) {
        func(func_param, dim_fft, FILTER);
    } else {
        FILTER[0]=0.0;
        FILTER[1]= 1.0/dim_fft ;             // This is the real part of the highest frequency 

        for (j=1;j<dim_fft / 2; ++j) {
            FILTER[2 * j] = j * 2.0 / dim_fft / dim_fft;
            FILTER[2 * j + 1] = FILTER[2 * j];
        }
    }

    if (ctx->setup.sum_rule == 2) {
        FILTER[0]=FILTER[2]/4.0;
    }

    if (ctx->setup.no_filtering) {
        FILTER[1] = -9999.9999;	// sentinel recognized downstream — TODO confirm consumer
    }
    /* This is due to different organization of data representation used after by
    real to complex FFT transfer. In both cases only non-redundant coefficients
    are stored however, the current CPU code stores High Frequency Term (HFT) in the
    res[1], while cuFFT (and FFTW as well) have always res[1] = 0 and stores
    the HFT in res[dim_fft] */
    FILTER[dim_fft] = FILTER[1]; // or should it be 0?

    /* CUDA code now includes an optimization allowing to compute two convolutions
    in parallel using a single Complex-Complex Fourier transform. For that we need
    to fill the complete FILTER matrix including redundant coefficients. Therefore,
    the filter is filled with coefficients according to the cuFFT/FFTW approach.
    The case of embedded FFT code (when compressed FFT representation is needed)
    is handled separately in hst.c/hst_cpu code. */

    FILTER[dim_fft + 1] = FILTER[1];
    FILTER[1] = FILTER[0];
#endif /* ASTRA_SCALING */

	// Mirror the non-redundant half into the upper (conjugate-symmetric) half
    for (j = dim_fft + 2; j < 2 * dim_fft; j+=2) {
	FILTER[j] = FILTER[2 * dim_fft - j];
	FILTER[j + 1] = FILTER[2 * dim_fft - j + 1];
    }

    ctx->filter_configured = 1;
    ctx->setup.what_changed |= HST_FILTER_CHANGED;
    
    return 0;
}
633
634
/**
 * Selects a plain FILE stream as the reconstruction output, replacing any
 * fastwriter output that may have been set. Marks the output as changed
 * unless the exact same stream is already active.
 *
 * @param ctx     HST context
 * @param output  destination stream (opened/closed by PyHST_c)
 * @return 0
 */
int hst_set_output_file(HSTContext *ctx, FILE *output) {
	// Already writing to this very stream via plain FILE: nothing changed
    if (!ctx->fw && output == ctx->output)
        return 0;

    ctx->fw = NULL;
    ctx->output = output;
    ctx->setup.what_changed |= HST_OUTPUT_CHANGED;

    return 0;
}
643
152 by Suren A. Chilingaryan
Support fast writter
644
#ifdef HST_USE_FASTWRITER
/**
 * Selects a fastwriter handle as the reconstruction output, replacing any
 * plain FILE output. Only supported in multislice mode.
 *
 * @param ctx     HST context
 * @param output  fastwriter handle (owned by the caller)
 * @return 0 on success, -1 if built without multislice mode
 */
int hst_set_fastwriter(HSTContext *ctx, fastwriter_t *output) {
# ifndef HST_MULTISLICE_MODE
    pyhst_error("Fastwriter is not supported in non-multislice mode");
    return -1;
# endif /* HST_MULTISLICE_MODE */

    if ((ctx->output)||(ctx->fw != output)) {
	ctx->output = NULL;
	ctx->fw = output;
	ctx->setup.what_changed |= HST_OUTPUT_CHANGED;
    }

    return 0;	// FIX: function previously fell off the end without a return (UB if result used)
}
#endif /* HST_USE_FASTWRITER */
658
659
18 by csa
Big redesign (early commit)
660
/**
 * Sets the zero-padding mode for the FFT-based filtering and flags the
 * padding as changed when the value actually differs.
 *
 * @param ctx           HST context
 * @param zero_padding  non-zero to enable zero padding
 * @return 0
 */
int hst_set_padding(HSTContext *ctx, int zero_padding) {
    if (zero_padding == ctx->setup.zero_padding)
        return 0;	// no change, keep what_changed untouched

    ctx->setup.zero_padding = zero_padding;
    ctx->setup.what_changed |= HST_PADDING_CHANGED;
    return 0;
}
667
668
/**
 * Sets whether the rotation axis is moved to the center of the reconstructed
 * zone. Changing this invalidates the reconstruction limits, so they are
 * recomputed immediately.
 *
 * @param ctx                 HST context
 * @param axis_to_the_center  non-zero to center the axis
 * @return 0 on success, or the error code of hst_configure_limits
 *         (FIX: previously the limits-recomputation result was discarded)
 */
int hst_set_axis_mode(HSTContext *ctx, int axis_to_the_center) {
    if (axis_to_the_center != ctx->setup.center_axis) {
        ctx->setup.center_axis = axis_to_the_center;
	    // The projections configuration is dependent on this parameter
	return hst_configure_limits(ctx);
    }
    return 0;
}
676
30 by csa
Multi-GPU, Multi-CPU, and Hybrid modes support
677
/**
 * Scheduler callback: (re)configures the reconstructor bound to one device,
 * forwarding the accumulated what_changed flags from the HST context.
 *
 * @param thr        scheduler thread handle (unused)
 * @param hwctx      scheduler context (unused)
 * @param device_id  index into the reconstructor pool
 * @param data       HSTContext pointer
 * @return 0, or the reconstructor's configure() error code
 */
static int hst_threads_configure(HWThread thr, void *hwctx, int device_id, void *data) {
    HSTContext *hst = (HSTContext*)data;
    HSTReconstructorContextPtr rctx = hst->recon[device_id];

	// Backends without a configure hook are always considered up to date
    if (!rctx->recon.configure)
        return 0;

    return rctx->recon.configure(rctx, hst->setup.what_changed);
}
691
20 by csa
do more correct initialization
692
/*
 * Propagate accumulated setup changes to all reconstructor backends.
 *
 * Lazily applies the default filter and reconstruction limits on first
 * use, then lets every worker thread run its backend configure hook, and
 * finally clears the change mask.  A changed output target restarts the
 * slice numbering.  Always returns 0 (check_code handles fatal errors).
 */
static int hst_configure(HSTContext *ctx) {
    int err;

    assert(ctx);

    /* Apply the default filter if the caller never configured one */
    if (!ctx->filter_configured) {
        err = hst_set_filter(ctx, NULL, NULL);
        check_code(err, "hst_set_filter");
    }

    /* Compute reconstruction limits if not done yet */
    if (!ctx->limits_configured) {
        err = hst_configure_limits(ctx);
        check_code(err, "hst_configure_limits");
    }

    /* Each scheduler thread reconfigures its own backend context */
    err = hw_sched_execute_thread_task(ctx->sched, ctx, hst_threads_configure);
    check_code(err, "hst_configure");

    /* A new output target restarts slice numbering */
    if (ctx->setup.what_changed & HST_OUTPUT_CHANGED) {
        ctx->slice_offset = 0;
    }

    ctx->setup.what_changed = 0;

    return 0;
}
18 by csa
Big redesign (early commit)
729
730
/*
 * Reconstruct one block of SLICE_BLOCK slices on the given backend.
 *
 * In multislice mode the backend may provide a "send" hook; in that case
 * this function implements a one-stage pipeline: the freshly supplied
 * sinograms are uploaded to the device, while reconstruction proceeds on
 * the sinogram/slice pair saved on the previous call.  The saved pair is
 * swapped in and the new pair saved for the next call.  A NULL
 * slice_result/sinograms pair is used to drain the pipeline.
 *
 * @param ctx           HST context
 * @param rctx          backend reconstructor context to run on
 * @param slice_result  destination buffer for the reconstructed slices
 * @param sinograms     input sinogram data
 * @return 0 on success, first backend error code otherwise
 */
static int hst_reconstruct_slice(HSTContext *ctx, HSTReconstructorContextPtr rctx, float *slice_result, const float *sinograms ) {
    int err;

    assert(ctx);
    assert(rctx);
    assert(rctx->recon.reconstruct);

#ifdef HST_MULTISLICE_MODE
        if (rctx->recon.send) {
            const float *last_sino;
            float *last_slice;

            /* Take the pair saved on the previous invocation ... */
            last_slice = rctx->saved_slice;
            last_sino = rctx->saved_sino;

            /* ... start uploading the new sinograms to the device ... */
            if (sinograms)
                rctx->recon.send(rctx, sinograms);

            /* ... remember the new pair for the next invocation ... */
            rctx->saved_slice = slice_result;
            rctx->saved_sino = sinograms;

            /* ... and reconstruct the previously uploaded pair below. */
            slice_result = last_slice;
            sinograms = last_sino;
        }

        /* Nothing saved yet (pipeline warm-up) or drain call: done */
        if (!slice_result) return 0;
#endif /* HST_MULTISLICE_MODE */

    if (rctx->recon.preprocess) {
        err = rctx->recon.preprocess(rctx, slice_result, sinograms);
        if (err) return err;
    }

    err = rctx->recon.reconstruct(rctx, slice_result, sinograms);
    if (err) return err;

    if (rctx->recon.postprocess) {
        err = rctx->recon.postprocess(rctx, slice_result, sinograms);
        if (err) return err;
    }

    /* Each task covers SLICE_BLOCK slices (statistics/timing counter) */
    rctx->processed_slices += SLICE_BLOCK;

    return 0;
}
776
30 by csa
Multi-GPU, Multi-CPU, and Hybrid modes support
777
/*
 * Scheduler task: reconstruct one slice block and, when no caller-supplied
 * result buffer exists, write the result to the configured output under
 * the output mutex.
 *
 * slice_id semantics (from the sequential scheduler):
 *   - HW_SCHED_CHUNK_FREE:        drain the backend's multislice pipeline
 *   - HW_SCHED_CHUNK_TERMINATOR:  all slices done; flush the whole batch
 *                                 to the output (multislice mode only)
 *   - >= 0:                       reconstruct slice block slice_id
 *
 * Output paths:
 *   - multislice mode: a single bulk write (fastwriter or fwrite) of the
 *     complete batch on the terminator call;
 *   - per-slice mode: fseek to the slice's file position and fwrite one
 *     block per task.
 *
 * Returns 0 on success, a non-zero error code otherwise; error paths
 * release the output mutex and the timer before returning.
 */
static int hst_threads_reconstruct_slice(HWThread thr, void *hwctx, int slice_id, void *data) {
    int err = 0;
    int device_id = thr->thread_id;
    HSTContext *ctx = (HSTContext*)data;
    int cnt, count =  ctx->setup.num_x * ctx->setup.num_y;

#ifdef PYHST_MEASURE_TIMINGS
    GTimer *timer;
#endif /* PYHST_MEASURE_TIMINGS */

#ifndef PYHST_IO_BENCHMARK
    if (slice_id == HW_SCHED_CHUNK_FREE) {
        /* Drain the pipelined backend and wait for device completion */
        err = hst_reconstruct_slice(ctx, ctx->recon[device_id], NULL, NULL);
        if (ctx->recon[device_id]->recon.wait)
            ctx->recon[device_id]->recon.wait(ctx->recon[device_id]);
    } else if (slice_id >= 0) {
        err = hst_reconstruct_slice(ctx, ctx->recon[device_id],
# ifdef HST_MULTISLICE_MODE
            /* Whole batch lives in one buffer: index by slice */
            (ctx->result_reqbuf?ctx->result_reqbuf:ctx->result_tmpbuf) + SLICE_BLOCK * slice_id * count,
# else /* HST_MULTISLICE_MODE */
            /* Temporary buffer holds one block per device: index by device */
            ctx->result_reqbuf?(ctx->result_reqbuf + SLICE_BLOCK * slice_id * count):(ctx->result_tmpbuf + SLICE_BLOCK * device_id * count),
# endif /* HST_MULTISLICE_MODE */
            ctx->sinograms + SLICE_BLOCK * slice_id * ctx->setup.num_bins * ctx->setup.num_projections
        );
    }
    if (err) return err;
#endif /* ! PYHST_IO_BENCHMARK */

#ifndef PYHST_RECON_BENCHMARK
    /* No caller buffer: results go to the configured output stream */
    if (!ctx->result_reqbuf) {
        hw_sched_lock_mutex(ctx->output_mutex);

# ifdef PYHST_MEASURE_TIMINGS
        timer = g_timer_new();
# endif /* PYHST_MEASURE_TIMINGS */

# ifdef HST_MULTISLICE_MODE
        /* Bulk write of the complete batch once all slices are done */
        if (slice_id == HW_SCHED_CHUNK_TERMINATOR) {
#  ifdef HST_USE_FASTWRITER
            if (ctx->fw) {
retry:
                err = fastwriter_push_data(ctx->fw, SLICE_BLOCK * ctx->num_slices * count * sizeof(float), ctx->result_tmpbuf);
                if (err) {
                    /* NOTE(review): busy-retry without backoff while
                     * holding the output mutex — confirm intended */
                    if (err == EWOULDBLOCK) goto retry;
#   ifdef PYHST_MEASURE_TIMINGS
                    if (timer) g_timer_destroy(timer);
#   endif
                    hw_sched_unlock_mutex(ctx->output_mutex);
                    /* NOTE(review): %u used for a size_t-typed expression
                     * (sizeof promotes the product) — confirm format */
                    pyhst_error("faswriter failed wrting %u MB of data, error %i", SLICE_BLOCK * ctx->num_slices * count * sizeof(float)/1024/1024, err);
                    return 1;
                }
            } else {
#  endif /* HST_USE_FASTWRITER */
                cnt = fwrite(ctx->result_tmpbuf, sizeof(float), SLICE_BLOCK * ctx->num_slices * count, ctx->output);
                if (cnt !=  SLICE_BLOCK * ctx->num_slices * count) {
#  ifdef PYHST_MEASURE_TIMINGS
                    if (timer) g_timer_destroy(timer);
#  endif
                    hw_sched_unlock_mutex(ctx->output_mutex);
                    pyhst_error("fwrite failed, send pushed characters %i, but written %i, errno %i", SLICE_BLOCK * ctx->num_slices * count, cnt, errno);
                    return 1;
                }
#  ifdef HST_USE_FASTWRITER
            }
#  endif /* HST_USE_FASTWRITER */
        }
# else /* HST_MULTISLICE_MODE */
        /* Per-slice mode: seek to this block's position in the file */
        err = fseek(ctx->output, ((long)(ctx->slice_offset + slice_id * SLICE_BLOCK)) * count * sizeof(float), SEEK_SET);
        if (err) {
#  ifdef PYHST_MEASURE_TIMINGS
            if (timer) g_timer_destroy(timer);
#  endif
            hw_sched_unlock_mutex(ctx->output_mutex);
            pyhst_error("fseek failed, errno: %i", errno);
            return err;
        }

        cnt = fwrite(ctx->result_tmpbuf + device_id * count, sizeof(float), SLICE_BLOCK * count, ctx->output);
        if (cnt != SLICE_BLOCK * count) {
#  ifdef PYHST_MEASURE_TIMINGS
            if (timer) g_timer_destroy(timer);
#  endif
            hw_sched_unlock_mutex(ctx->output_mutex);
            pyhst_error("fwrite failed, send pushed characters %i, but written %i, errno %i", SLICE_BLOCK * count, cnt, errno);
            return 1;
        }
# endif /* HST_MULTISLICE_MODE */
        //fsync(fileno(ctx->output));

# ifdef PYHST_MEASURE_TIMINGS
        if (timer) {
                // We should be inside the lock, otherwise multiple accesses.
            g_timer_stop(timer);
            ctx->io_timer += g_timer_elapsed(timer, NULL);
            //ctx->comp_timer -= g_timer_elapsed(timer, NULL);
            g_timer_destroy(timer);
        }
# endif /* PYHST_MEASURE_TIMINGS */

        hw_sched_unlock_mutex(ctx->output_mutex);
    }
#endif /* ! PYHST_RECON_BENCHMARK */

    return 0;
}
892
18 by csa
Big redesign (early commit)
893
22 by csa
Timing reporting, few bug fixes
894
/**
 * Reconstruct a batch of slices synchronously.
 *
 * Applies any pending setup changes, dispatches the batch to the worker
 * scheduler, waits for completion, and advances the output slice offset.
 *
 * @param ctx         HST context
 * @param num_slices  number of individual slices in this batch
 * @param result      output buffer for the reconstructed slices, or NULL
 *                    to write the results to the configured output stream
 * @param sinograms   input sinograms
 * @return 0 on success, scheduler error code otherwise
 */
int hst_reconstruct_slices(HSTContextPtr ctx, int num_slices, float *result, const float *sinograms) {
    int err;
#ifdef PYHST_MEASURE_TIMINGS
    GTimer *timer;
#endif /* PYHST_MEASURE_TIMINGS */

    assert(ctx);

    /* Apply pending setup changes before processing */
    if (ctx->setup.what_changed) {
        hst_configure(ctx);
    }

#ifdef PYHST_MEASURE_TIMINGS
    timer = g_timer_new();
#endif /* PYHST_MEASURE_TIMINGS */

    /* The scheduler hands out blocks of SLICE_BLOCK slices */
    ctx->num_slices = num_slices / SLICE_BLOCK;
    ctx->result_reqbuf = result;
    ctx->sinograms = sinograms;

    pyhst_info("num_projection: %d, angle_increment: %e, fai360: %s", ctx->setup.num_projections, ctx->setup.angle_increment, ctx->setup.fai360?"on":"off" );
    hw_sched_schedule_task(ctx->sched, ctx, hst_threads_reconstruct_slice);
    err = hw_sched_wait_task(ctx->sched);

    /* BUG FIX: advance by the number of slices actually processed,
     * SLICE_BLOCK * ctx->num_slices, exactly as hst_reconstruct_wait()
     * does.  The old code added SLICE_BLOCK * num_slices (the raw
     * parameter), over-advancing the output offset by a factor of
     * SLICE_BLOCK whenever SLICE_BLOCK > 1. */
    ctx->slice_offset += SLICE_BLOCK * ctx->num_slices;

#ifdef PYHST_MEASURE_TIMINGS
    if (timer) {
        ctx->comp_timer += g_timer_elapsed(timer, NULL);
        g_timer_destroy(timer);
    }
#endif /* PYHST_MEASURE_TIMINGS */

    return err;
}
945
38 by csa
Preload data from EDF images while computing slices
946
/**
 * Start an asynchronous reconstruction of a batch of slices.
 *
 * Results are written to the configured output stream (result_reqbuf is
 * forced to NULL).  With HW_USE_PARALLEL_IO the call returns immediately
 * and the caller must invoke hst_reconstruct_wait(); without it the call
 * is synchronous.
 *
 * @param ctx         HST context
 * @param num_slices  number of individual slices in this batch
 * @param sinograms   input sinograms
 * @return 0 on success, error code otherwise
 */
int hst_reconstruct_start(HSTContextPtr ctx, int num_slices, const float *sinograms) {
#ifdef PYHST_MEASURE_TIMINGS
    GTimer *timer;
#endif /* PYHST_MEASURE_TIMINGS */

    assert(ctx);

    /* Apply pending setup changes before processing */
    if (ctx->setup.what_changed) {
        hst_configure(ctx);
    }

#ifdef PYHST_MEASURE_TIMINGS
    timer = g_timer_new();
#endif /* PYHST_MEASURE_TIMINGS */

    /* The scheduler hands out blocks of SLICE_BLOCK slices */
    ctx->num_slices = num_slices / SLICE_BLOCK;
    ctx->result_reqbuf = NULL;
    ctx->sinograms = sinograms;

    pyhst_info("num_projection: %d, angle_increment: %e, fai360: %s", ctx->setup.num_projections, ctx->setup.angle_increment, ctx->setup.fai360?"on":"off" );
    hw_sched_schedule_task(ctx->sched, ctx, hst_threads_reconstruct_slice);
    ctx->processing = 1;

#ifdef PYHST_MEASURE_TIMINGS
    if (timer) {
        ctx->comp_timer += g_timer_elapsed(timer, NULL);
        g_timer_destroy(timer);
    }
#endif /* PYHST_MEASURE_TIMINGS */

#ifndef HW_USE_PARALLEL_IO
    /* BUG FIX: without parallel I/O the call is synchronous — propagate
     * the wait result instead of silently discarding it. */
    return hst_reconstruct_wait(ctx);
#else /* HW_USE_PARALLEL_IO */
    return 0;
#endif /* HW_USE_PARALLEL_IO */
}
982
983
/**
 * Block until a reconstruction started with hst_reconstruct_start() has
 * completed.  Advances the output slice offset by the processed slice
 * count and clears the in-progress flag.
 *
 * @param ctx  HST context
 * @return 0 on success (or when nothing was in progress), scheduler
 *         error code otherwise
 */
int hst_reconstruct_wait(HSTContextPtr ctx) {
    int err = 0;

#ifdef PYHST_MEASURE_TIMINGS
    GTimer *timer;
    timer = g_timer_new();
#endif /* PYHST_MEASURE_TIMINGS */

    if (ctx->processing) {
        err = hw_sched_wait_task(ctx->sched);
        ctx->slice_offset += SLICE_BLOCK * ctx->num_slices;
        ctx->processing = 0;
    }

#ifdef PYHST_MEASURE_TIMINGS
    if (timer) {
        /* Waiting on the workers is accounted as computation time */
        ctx->comp_timer += g_timer_elapsed(timer, NULL);
        g_timer_destroy(timer);
    }
#endif /* PYHST_MEASURE_TIMINGS */

    return err;
}
1006
1007
171 by Suren A. Chilingaryan
Disable broken SIMD
1008
#undef HW_USE_SIMD
132 by Suren A. Chilingaryan
Pipelined preprocessing (normalize & transpose)
1009
/*
 * Scheduler task: normalize one projection and transpose it into the
 * per-slice sinogram layout (slice-major, projection rows).
 *
 * The worker busy-waits until hst_preprocess_projection() publishes the
 * projection's data pointer.  NOTE(review): the polled pointer is a plain
 * (non-atomic, non-volatile) read from another thread — presumably safe
 * on the targeted platforms/compilers, but worth confirming.
 *
 * The SIMD path (currently disabled via #undef HW_USE_SIMD above)
 * multiplies four bins at a time and handles the remainder scalar-wise;
 * num_bins4 is only used by that path.
 *
 * @param thr    worker thread handle (unused)
 * @param hwctx  hardware context (unused)
 * @param proj   projection index to process
 * @param data   HSTContext pointer
 * @return always 0
 */
static int hst_threads_preprocess_projection(HWThread thr, void *hwctx, int proj, void *data) {
    HSTContext *ctx = (HSTContext*)data;

    int i,j;
    int num_slices = ctx->preproc_slices;
    int num_bins = ctx->setup.num_bins;
    int num_proj = ctx->setup.num_projections;
    float norm = ctx->setup.norm;

    int num_bins4 = (num_bins/4);

#ifdef HW_USE_SIMD
    __m128 norm4 = _mm_set_ps1(norm);
#endif /* HW_USE_SIMD */

    /* Wait until the reader thread publishes this projection's buffer */
    while (!ctx->projection_data[proj])
        usleep(100);

    for (i = 0; i < num_slices; i++) {
        int src_offset = i * num_bins;
        int dst_offset = i * num_bins * num_proj + proj * num_bins;

#ifdef HW_USE_SIMD
            // Looks pretty inoptimal... Enforce padding? Hand-written assembler?
        __m128* src = (__m128*)(ctx->projection_data[proj] + src_offset);
        __m128* dst = (__m128*)(ctx->preproc_sinograms + dst_offset);

        for (j = 0; j < num_bins4; j++) {
            dst[j] = _mm_mul_ps(src[j], norm4);
        }

        /* Scalar tail for bins not covered by the 4-wide SIMD loop */
        for (j = num_bins4 * 4; j < num_bins; j++) {
#else /* HW_USE_SIMD */
        for (j = 0; j < num_bins; j++) {
#endif /* HW_USE_SIMD */
            ctx->preproc_sinograms[dst_offset + j] = norm * ctx->projection_data[proj][src_offset + j];
        }
    }

    return 0;
}
1051
1052
/**
 * Begin the normalize-and-transpose preprocessing stage for a new batch.
 *
 * With HW_USE_PARALLEL_IO the per-projection workers are scheduled
 * immediately and each one busy-waits for its data; otherwise the work
 * happens synchronously inside hst_preprocess_projection().
 *
 * @param ctx         HST context
 * @param num_slices  number of slices in the batch
 * @param sinograms   destination sinogram buffer
 * @return 0 on success, scheduler error code otherwise
 */
int hst_preprocess_start(HSTContextPtr ctx, int num_slices, const float *sinograms) {
    int err = 0;

    ctx->num_projections = ctx->setup.num_projections;
    ctx->preproc_slices = num_slices;
    ctx->preproc_sinograms = sinograms;

#ifdef HW_USE_PARALLEL_IO
    /* Clear all per-projection pointers: workers poll them for arrival */
    memset(ctx->projection_data, 0, ctx->setup.num_projections * sizeof(float*));
    err = hw_sched_schedule_task(ctx->cpu_sched, ctx, hst_threads_preprocess_projection);
#endif /* HW_USE_PARALLEL_IO */

    return err;
}
1066
1067
/**
 * Wait for the preprocessing stage scheduled by hst_preprocess_start().
 * A no-op returning 0 when parallel I/O is disabled.
 */
int hst_preprocess_wait(HSTContextPtr ctx) {
#ifdef HW_USE_PARALLEL_IO
    return hw_sched_wait_task(ctx->cpu_sched);
#else /* HW_USE_PARALLEL_IO */
    return 0;
#endif /* HW_USE_PARALLEL_IO */
}
1074
1075
/**
 * Hand one projection's raw data to the preprocessing stage.
 *
 * Publishing the pointer releases the worker thread busy-waiting in
 * hst_threads_preprocess_projection(); without parallel I/O the
 * projection is processed synchronously on the caller's thread instead.
 *
 * @param ctx   HST context
 * @param proj  projection index
 * @param data  projection data (num_slices * num_bins floats); must stay
 *              valid until hst_preprocess_wait() returns
 * @return always 0
 */
int hst_preprocess_projection(HSTContextPtr ctx, int proj, float *data) {
    ctx->projection_data[proj] = data;

#ifndef HW_USE_PARALLEL_IO
    /* Serial fallback: run the worker body inline */
    hst_threads_preprocess_projection(NULL, NULL, proj, ctx);
#endif /* ! HW_USE_PARALLEL_IO */

    return 0;
}
1099
1100
22 by csa
Timing reporting, few bug fixes
1101
/**
 * Return the NULL-terminated array of reconstructor contexts owned by the
 * context (allocated in hst_init_context; callers must not free it).
 */
HSTReconstructorConstContextPtr *hst_get_configured_reconstructors(HSTContextPtr ctx) {
    assert(ctx);

    return (HSTReconstructorConstContextPtr*)ctx->recon;
}
1106
43 by csa
Option to disable preloading of image files, additional timers
1107
#ifdef PYHST_MEASURE_TIMINGS
1108
double hst_get_io_timer(HSTContextPtr ctx) {
1109
    assert(ctx);
1110
    return ctx->io_timer;
1111
}
1112
1113
double hst_get_recon_timer(HSTContextPtr ctx) {
1114
    assert(ctx);
1115
    return ctx->comp_timer;
1116
}
1117
#endif /* PYHST_MEASURE_TIMINGS */
1118
1119
49 by root
Merge /home/matthias/dev/pyHST
1120
HSTContext *hst_create_context(void) {
18 by csa
Big redesign (early commit)
1121
    if (hst_copies) pyhst_fail("Only a single HST context is supported by the current version");
1122
    
1123
    HSTContext *ctx = (HSTContext*)malloc(sizeof(HSTContext));
1124
    if (ctx) {
1125
        memset(ctx, 0, sizeof(HSTContext));
1126
	++hst_copies;
1127
    }
1128
    return ctx;
1129
}
1130
30 by csa
Multi-GPU, Multi-CPU, and Hybrid modes support
1131
/*
 * Per-thread scheduler hook: initialize the reconstructor backend bound
 * to this device on its owning thread (check_code handles failures).
 */
static int hst_threads_init(HWThread thr, void *hwctx, int device_id, void *data) {
    HSTContext *hst = (HSTContext*)data;

    int rc = hst_reconstructor_init(hst->recon[device_id], thr);
    check_code(rc, "hst_reconstructor_init");

    return 0;
}
1142
133 by Suren A. Chilingaryan
Pinned OpenCL memory
1143
/**
 * Allocate a host buffer suitable for fast device<->host transfers.
 *
 * Delegates to the OpenCL or CUDA pinned-memory allocator when the
 * corresponding backend is compiled in; otherwise falls back to plain
 * malloc.  Free with hst_pinned_free().
 *
 * @param size  number of bytes to allocate
 * @param a     intended access direction (read/write hint for the driver)
 * @return pointer to the buffer, or NULL on failure
 */
void *hst_pinned_malloc(size_t size, HSTPinnedAccess a) {
#ifdef HW_USE_OPENCL
    return hst_opencl_host_malloc(size, a);
#else /* OPENCL */
# ifdef HW_USE_CUDA
    return hst_cuda_host_malloc(size, a);
# else /* USEGPU */
    return malloc(size);
/*
    Locking the memory ourselves (mlock / MAP_LOCKED mmap) would demand
    ROOT privileges (or a non-standard system configuration) and is not
    honored by CUDA 2.2 anyway, so the only way to get pinned memory is
    to allocate through the GPU runtime above.
    if (ctx->result_tmpbuf) mlock(ctx->result, 1 * ctx->setup.num_x * ctx->setup.num_y * sizeof(float));
    ctx->result_tmpbuf = mmap(NULL, 1 * ctx->setup.num_x * ctx->setup.num_y * sizeof(float), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS|MAP_LOCKED, -1, 0);
*/
# endif /* USEGPU */
#endif /* OPENCL */
}
1162
1163
/**
 * Release a buffer obtained from hst_pinned_malloc(), dispatching to the
 * allocator (OpenCL / CUDA / plain free) that produced it.
 */
void hst_pinned_free(void *ptr) {
#ifdef HW_USE_OPENCL
    hst_opencl_host_free(ptr);
#else /* OPENCL */
# ifdef HW_USE_CUDA
    hst_cuda_host_free(ptr);
# else /* USEGPU */
    free(ptr);
# endif /* USEGPU */
#endif /* OPENCL */
}
1174
18 by csa
Big redesign (early commit)
1175
/**
 * Initialize a context created by hst_create_context().
 *
 * Copies the global part of the setup, selects the CUDA reconstruction
 * method, creates one reconstructor per available device (CUDA, then
 * OpenCL, then CPU), sets up the work schedulers and the temporary
 * result buffer, and initializes every backend on its owning thread.
 *
 * @param ctx               context to initialize
 * @param setup             reconstruction setup (global part is copied)
 * @param custom_angles     optional per-projection angles
 * @param axis_corrections  optional per-projection axis corrections
 * @return always 0 (check_alloc/check_code abort on fatal errors)
 */
int hst_init_context(HSTContext *ctx, HSTSetup *setup, const float *custom_angles, const float *axis_corrections) {
    int err;
    int i, limit, threads;
    size_t size;

    assert(ctx);
    assert(setup);

    hst_setup_init(&ctx->setup);

    /* Copy only the global (shared) prefix of the setup structure */
    size = ((void*)&setup->END_OF_GLOBAL_PART) - ((void*)setup);
    memcpy(&ctx->setup, setup,  size);

    ctx->setup.what_changed = HST_ALL_CHANGED;
    hst_configure_data_structures(ctx, custom_angles, axis_corrections);


# ifdef HW_USE_CUDA
    /* Select the CUDA reconstruction engine for the requested method */
    switch (setup->method) {
     case HST_METHOD_FBP:
        cuda_recon = cuda_fbp_recon;
        break;
# ifdef PYHST_ENABLE_DFI
     case HST_METHOD_DFI:
        cuda_recon = cuda_dfi_recon;
        break;
# endif /* PYHST_ENABLE_DFI */
      default:
        pyhst_error("Unknown reconstruction mode specififed");
        exit(-1);
    }
#endif /* HW_USE_CUDA */

#ifdef HW_USE_THREADS
    /* One worker thread per usable device */
    threads = 0;

# ifdef HW_USE_CUDA
    threads += cuda_recon->devices;
# endif /* HW_USE_CUDA */

# ifdef HW_USE_OPENCL
    threads += opencl_recon->devices;
# endif /* HW_USE_OPENCL */

# if HW_USE_CPU	== 1
    threads += cpu_recon->devices - 1; // one core reserved for data preloading
# elif defined(HW_USE_CPU)
    /* CPU cores only serve as a fallback when no GPU devices exist */
    if (!threads) threads += cpu_recon->devices - 1;
# endif /* HW_USE_CPU */
#else /* HW_USE_THREADS */
    threads = 1;
#endif /* HW_USE_THREADS */

    ctx->num_recon = threads;
    /* One extra slot for the NULL terminator */
    ctx->recon = (HSTReconstructorContext**)malloc((ctx->num_recon + 1) * sizeof(HSTReconstructorContext*));
    check_alloc(ctx->recon, "pool of HSTReconstructors");

    ctx->recon[ctx->num_recon] = NULL;

    /* Fill the pool: CUDA devices first, then OpenCL, then CPU cores */
    i = 0; limit = 0;
#ifdef HW_USE_CUDA
    limit += cuda_recon->devices;
    for (; (i < limit)&&(i < threads); i++) {
        ctx->recon[i] = hst_reconstructor_create(cuda_recon, &ctx->setup, i);
        check_alloc(ctx->recon[i], "GPUContext");
    }
#endif /* HW_USE_CUDA */

#ifdef HW_USE_OPENCL
    limit += opencl_recon->devices;
    for (; (i < limit) && (i < threads); i++) {
        ctx->recon[i] = hst_reconstructor_create(opencl_recon, &ctx->setup, i);
        check_alloc(ctx->recon[i], "OpenCLContext");
    }
#endif /* HW_USE_OPENCL */

#ifdef HW_USE_CPU
    for (; i < threads; i++) {
        ctx->recon[i] = hst_reconstructor_create(cpu_recon, &ctx->setup, threads - i - 1);
        check_alloc(ctx->recon[i], "CPUContext");
    }
#endif /* HW_USE_CPU */

    if (!i) pyhst_fail("no reconstructors found");

    pyhst_warning("Running %i reconstructors\n", threads);

    ctx->sched = hw_sched_create(threads);
    check_alloc(ctx->sched, "scheduller");

    /* Separate small scheduler for CPU-side preprocessing */
    ctx->cpu_sched = hw_sched_create((cpu_recon->devices>1)?2:1);
//    ctx->cpu_sched = hw_sched_create((cpu_recon->devices>1)?(cpu_recon->devices - 1):1);
    check_alloc(ctx->cpu_sched, "cpu scheduller");

    ctx->output_mutex = hw_sched_create_mutex();
    //Can return NULL if threads are disabled
    //check_alloc(ctx->output_mutex, "hw_sched_create_mutex");

#ifdef HST_MULTISLICE_MODE
    /* Multislice mode needs the extra FREE/TERMINATOR scheduler calls to
     * drain the device pipeline and flush the batched output */
    hw_sched_set_sequential_mode(ctx->sched, &ctx->num_slices, &ctx->current_slice, HW_SCHED_FLAG_FREE_CALL|HW_SCHED_FLAG_TERMINATOR_CALL);
#else /* HST_MULTISLICE_MODE */
    hw_sched_set_sequential_mode(ctx->sched, &ctx->num_slices, &ctx->current_slice, 0);
#endif /* HST_MULTISLICE_MODE */

    /* Backend initialization runs asynchronously on the worker threads */
    err = hw_sched_schedule_thread_task(ctx->sched, ctx, hst_threads_init);
    check_code(err, "hw_sched_schedule_task");

    hw_sched_set_sequential_mode(ctx->cpu_sched, &ctx->num_projections, &ctx->current_projection, 0);

    ctx->projection_data = (float**)malloc(ctx->setup.num_projections * sizeof(float*));
    check_alloc(ctx->projection_data, "projection_data");

#ifdef HST_MULTISLICE_MODE
    /* Whole batch buffered at once */
    ctx->result_tmpbuf = hst_pinned_malloc(ctx->setup.max_slices * ctx->setup.num_x * ctx->setup.num_y * sizeof(float), HST_PINNED_W);
#else /* HST_MULTISLICE_MODE */
    /* One slice block per worker thread */
    ctx->result_tmpbuf = hst_pinned_malloc(threads * ctx->setup.num_x * ctx->setup.num_y * sizeof(float), HST_PINNED_W);
#endif /* HST_MULTISLICE_MODE */
    check_alloc(ctx->result_tmpbuf, "temporary buffer for reconstructed slices");

    err = hw_sched_wait_thread_task(ctx->sched);
    check_code(err, "hw_sched_wait_task");

    return 0;
}
1305
1306
/**
 * Scheduler callback: release the per-device reconstructor state.
 *
 * Invoked once per worker thread by hw_sched_execute_thread_task(); each
 * invocation tears down the reconstructor bound to its device slot.
 *
 * @param thr        scheduler thread handle (unused here)
 * @param hwctx      scheduler-private context (unused here)
 * @param device_id  index into ctx->recon selecting this thread's reconstructor
 * @param data       opaque pointer, actually the HSTContext being torn down
 * @return 0 (callback convention: success)
 */
static int hst_threads_free(HWThread thr, void *hwctx, int device_id, void *data) {
    HSTContext *hst = (HSTContext*)data;

    hst_reconstructor_free(hst->recon[device_id]);

    return 0;
}
1312
1313
/**
 * Release all resources owned by an HSTContext, in place.
 *
 * Tear-down order mirrors initialization in reverse where it matters:
 * the per-device reconstructors are freed via the scheduler (so each
 * device's resources are released on the thread that owns them) before
 * the scheduler itself is destroyed.  The context structure is zeroed
 * at the end so stale pointers cannot be reused; the structure itself
 * is NOT freed here (see hst_destroy_context for heap-allocated contexts).
 *
 * @param ctx  context to tear down; must be non-NULL (asserted)
 */
void hst_free_context(HSTContext *ctx) {
    int i;

    assert(ctx);

    hst_pinned_free(ctx->result_tmpbuf);

        /* free(NULL) is a no-op, so no guard is needed here */
    free(ctx->projection_data);

    if (ctx->recon) {
	    /* Free device-side reconstructor state on the owning threads
	       before destroying the reconstructor objects themselves. */
	if (ctx->sched) {
	    hw_sched_execute_thread_task(ctx->sched, ctx, hst_threads_free);
	}

	for (i = 0; i < ctx->num_recon; i++) {
	    if (ctx->recon[i]) hst_reconstructor_destroy(ctx->recon[i]);
	}

	free(ctx->recon);
    }

    if (ctx->output_mutex) {
	hw_sched_destroy_mutex(ctx->output_mutex);
    }

    if (ctx->cpu_sched) {
        hw_sched_destroy(ctx->cpu_sched);
    }

    if (ctx->sched) {
	hw_sched_destroy(ctx->sched);
    }

    hst_setup_free(&ctx->setup);

        /* Zero the whole structure: defends against accidental reuse of
           freed members and makes double-free of the context harmless. */
    memset(ctx, 0, sizeof(HSTContext));
}
1352
1353
/**
 * Destroy a heap-allocated HSTContext.
 *
 * Decrements the module-wide instance counter (hst_copies), releases every
 * resource the context owns via hst_free_context(), and finally frees the
 * context structure itself.  Counterpart of the allocating constructor.
 *
 * @param ctx  heap-allocated context to destroy
 */
void hst_destroy_context(HSTContext *ctx) {
        /* One fewer live context instance. */
    hst_copies--;

    hst_free_context(ctx);
    free(ctx);
}