/*
 * The PyHST program is Copyright (C) 2002-2011 of the
 * European Synchrotron Radiation Facility (ESRF) and
 * Karlsruhe Institute of Technology (KIT).
 *
 * PyHST is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * hst is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program. If not, see <http://www.gnu.org/licenses/>.
 */
#define PARALLEL_PER_GPU 1

// To prevent errors with newer glib and CUDA 4
#define GLIB_DISABLE_DEPRECATION_WARNINGS
#if CUDA_VERSION_MAJOR < 5
# include <cutil_math.h>
#else
# include <helper_cuda.h>
# include <helper_math.h>

# define CUDA_SAFE_CALL(val) check ( (val), #val, __FILE__, __LINE__ )
# define CUFFT_SAFE_CALL CUDA_SAFE_CALL
#endif

#ifndef CUDA_SAFE_CALL
// Fallback: error checking disabled, calls pass through unchecked
#define CUDA_SAFE_CALL(x) (x)
#define CUFFT_SAFE_CALL(x) (x)
#endif
#include "hst_setup.h"
#include "hst_reconstructor.h"

//#define BUGGY_CUFFT_SET_STREAM

#include "dfi_cuda_defines.h"
#include "dfi_cuda_kernels.h"
#include "dfi_cuda_bp_kernels.h"
#ifdef HST_TEXTURE16
/* (16-bit texture kernel headers elided) */
#endif /* HST_TEXTURE16 */
static int blocking = 0;

static int hst_cuda_device_num[HST_CUDA_MAX_DEVICES];             //!< CUDA device numbers of the enumerated CUDA-enabled devices
static cudaDeviceProp hst_cuda_device_prop[HST_CUDA_MAX_DEVICES]; //!< Properties of the enumerated CUDA-enabled devices
static HSTConstString hst_cuda_timers[] = { "complete reconstruction", "transfer to device", "transfer from device", "texture mapping", "projection filtering", "backprojection", "*initialization and cleanup", NULL }; //!< List of supported timers

#define GPU_CONTEXT(ctx) ((GPUContext*)ctx)
/**
 * This implementation of HSTReconstructor uses NVidia GeForce-family graphic
 * cards and NVidia Tesla accelerators to accelerate the reconstruction process.
 * The implementation is based on the CUDA toolkit and uses the NVidia cuFFT
 * library for performing Fourier transformations.
 * GPUContext is an extension of #HSTReconstructorContext which provides the
 * additional data members needed to communicate with the graphic hardware.
 */
struct GPUContextT {
    HSTReconstructorContext recon;

#ifdef PYHST_MEASURE_TIMINGS
# define GPU_CONTEXT_MEMSET_OFFSET (6 * sizeof(GTimer*))
    GTimer *main_timer;         //!< Counts time spent in backprojection code
    GTimer *pre_timer;          //!< Counts time spent in filtering
    GTimer *togpu_timer;        //!< Counts time spent in host-to-device memory transfers
    GTimer *init_timer;         //!< Counts time spent in initialization
    GTimer *fromgpu_timer;      //!< Counts time spent in device-to-host memory transfers
    GTimer *texture_timer;      //!< Counts time spent in texture binding/unbinding
#else /* PYHST_MEASURE_TIMINGS */
# define GPU_CONTEXT_MEMSET_OFFSET 0
#endif /* PYHST_MEASURE_TIMINGS */
    int device;                 //!< Sequence number [0..MAX_DEVICES-1]
    int initialized;            //!< Indicates if the context is at least partly initialized

    cudaStream_t stream[4];     //!< For interleaving memory transfers and computations (stream 2: to GPU, stream 3: from GPU)

    int kepler;                 //!< Indicates if the compute capability is 3.0 or higher
    int fermi;                  //!< Indicates if the compute capability is 2.0 or higher
    int fft_batch_size;         //!< Number of projections to process at once
    int bp_batch_size;          //!< Number of rows to process at once
    int bin_pitch_size;         //!< The number of floats to allocate for each projection (gpu_data)

    int bp_grid_lines;          //!< Number of CUDA blocks along the Y-axis of the result slice (batched processing)
    int bp_grid_columns;        //!< Number of CUDA blocks along the X-axis of the result slice
    int projection_grid_size;   //!< Number of CUDA blocks along projections (each block consists of BLOCK_SIZE threads)
    int bin_grid_size;          //!< Number of CUDA blocks along projection bins (each block consists of BLOCK_SIZE threads)
    int filter_grid_size;       //!< Horizontal CUDA blocks for filtering (2 * dim_fft / BLOCK_SIZE)
    int points_per_thread;      //!< How many points are processed by a single thread (actually the square of that)

#ifdef HST_TEXTURE16
    int allprojection_grid_size;//!< Number of CUDA blocks along all projections (each block consists of BLOCK_SIZE threads)
#endif /* HST_TEXTURE16 */

    int mmax_buffer_size;       //!< Buffer size required for the float to uint16 conversion of projection data

    cudaChannelFormatDesc float_desc; //!< Texture channel description
    cufftHandle fft_plan[2];    //!< Complex plans for Fourier transformations (both forward and inverse)

    int current_buffer;         //!< 1 or 2 (0 means no active buffer yet)
    int synchronized;           //!< Indicates if the current input buffer is synchronized
    float *gpu_input[2];        //!< Input buffers (linked by gpu_data)
    float *gpu_output[2];       //!< Output buffers (linked by gpu_result)

    float *gpu_limits;          //!< Parameters for fai360-mode corrections
    float *gpu_data;            //!< Sinogram buffer in device memory
    float *gpu_buffer;          //!< Temporary computation buffer in GPU memory (for filtering, can hold a single batch)
    float *gpu_filter;          //!< Filter buffer in device memory
    float *gpu_result;          //!< Buffer for the reconstructed slice

#ifdef HST_TEXTURE16
    unsigned short *gpu_intdata;//!< Sinogram integer buffer
#endif /* HST_TEXTURE16 */
};
typedef struct GPUContextT GPUContext;

#define hst_cuda_calc_blocks hw_calc_blocks
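/* hst_cuda_calc_blocks(n, block, NULL) is used throughout as a ceiling
   division: the smallest number of blocks of size `block` covering n items.
   For example, 1000 bins with a block size of 16 give 63 blocks (covering
   1008 threads), which is why the padded sizes computed below come out as
   multiples of the block or batch size. */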
static int hst_cuda_configure_limits(GPUContext *ctx, HSTSetup *setup) {
    int projection;
    float param;
    float *param_s;

    int num_proj = setup->num_projections;
    int pad_proj = ctx->fft_batch_size * hst_cuda_calc_blocks(num_proj, ctx->fft_batch_size, NULL);

    int num_bins = setup->num_bins;
    float slope_zone = setup->pente_zone;

    float axis_position_corr;
    float overlapping, flat_zone;

    param_s = (float*)malloc(2 * pad_proj * sizeof(float));
    if (!param_s) return ENOMEM;

    for (projection = 0; projection < pad_proj; projection++) {
        if (projection < num_proj) {
            axis_position_corr = setup->axis_position_corr_s[projection];
        } else {
            axis_position_corr = setup->axis_position;
        }

        if (2 * axis_position_corr > num_bins) {
            overlapping = num_bins - axis_position_corr;
            param = 1;  // pente_zone / prof_fact
        } else {
            overlapping = axis_position_corr;
            param = -1; // pente_zone / prof_fact
        }

        if (overlapping <= 0) {
            free(param_s);
            return ERANGE;
        }

        slope_zone = MIN(slope_zone, overlapping);
        flat_zone = overlapping - slope_zone;

        param_s[2*projection] = flat_zone;
        param_s[2*projection + 1] = param * slope_zone;
    }

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */
    CUDA_SAFE_CALL(cudaMemcpy(ctx->gpu_limits, param_s, 2 * pad_proj * sizeof(float), cudaMemcpyHostToDevice));
#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    free(param_s);
    return 0;
}
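/* Worked example for the limits computed above (illustrative numbers only):
   with num_bins = 2048, axis_position_corr = 1200 and pente_zone = 300, we
   have 2 * 1200 > 2048, so overlapping = 2048 - 1200 = 848 and param = 1;
   then slope_zone = MIN(300, 848) = 300 and flat_zone = 548. Each projection
   thus stores the pair (flat_zone, signed slope_zone) in gpu_limits, which
   the fai360 unpack kernel uses to weight the overlapping region when the
   two halves of a 360-degree scan are blended. */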
/**
 * Create GPU context (uninitialized)
 *
 * @param prototype is a pointer to the HSTReconstructor describing the reconstruction module
 * @param setup is a pointer to HSTSetup with various HST parameters
 * @result the created context
 */
static HSTReconstructorContext *hst_cuda_create_context(HSTReconstructor *prototype, HSTSetup *setup, int id) {
    GPUContext *ctx;

    assert((id / PARALLEL_PER_GPU) < prototype->devices);

    /* FIXME: no error code in case of out-of-memory */
    ctx = (GPUContext*)malloc(sizeof(struct GPUContextT));
    if (!ctx) return NULL;

    memset(ctx, 0, sizeof(struct GPUContextT));

#ifdef PYHST_MEASURE_TIMINGS
    ctx->main_timer = g_timer_new();
    if (ctx->main_timer) g_timer_stop(ctx->main_timer);
    ctx->pre_timer = g_timer_new();
    if (ctx->pre_timer) g_timer_stop(ctx->pre_timer);
    ctx->togpu_timer = g_timer_new();
    if (ctx->togpu_timer) g_timer_stop(ctx->togpu_timer);
    ctx->init_timer = g_timer_new();
    if (ctx->init_timer) g_timer_stop(ctx->init_timer);
    ctx->fromgpu_timer = g_timer_new();
    if (ctx->fromgpu_timer) g_timer_stop(ctx->fromgpu_timer);
    ctx->texture_timer = g_timer_new();
    if (ctx->texture_timer) g_timer_stop(ctx->texture_timer);

    if ((!ctx->main_timer)||(!ctx->pre_timer)||(!ctx->togpu_timer)||(!ctx->init_timer)||(!ctx->fromgpu_timer)||(!ctx->texture_timer)) {
        if (ctx->texture_timer) g_timer_destroy(ctx->texture_timer);
        if (ctx->fromgpu_timer) g_timer_destroy(ctx->fromgpu_timer);
        if (ctx->init_timer) g_timer_destroy(ctx->init_timer);
        if (ctx->togpu_timer) g_timer_destroy(ctx->togpu_timer);
        if (ctx->pre_timer) g_timer_destroy(ctx->pre_timer);
        if (ctx->main_timer) g_timer_destroy(ctx->main_timer);

        free(ctx);
        return NULL;
    }
#endif /* PYHST_MEASURE_TIMINGS */

    hst_reconstructor_init_context((HSTReconstructorContext*)ctx, prototype, setup);

    ctx->device = id / PARALLEL_PER_GPU;

    return (HSTReconstructorContext*)ctx;
}
/**
 * Initializes the GPU context
 *
 * @param ctx is an uninitialized GPU context
 * @result a return code, 0 indicates success
 */
static int hst_cuda_init_context(HSTReconstructorContext *rctx, HWThread thr) {
    GPUContext *ctx;
    HSTSetup *setup;

    int talign;              // The mandatory alignment of a texture line, in floats
    int dim_result;          // The Y-dimension of the result buffer
    int num_projections;     // Projection dimensions (allocation size, rounded for blocked processing)
    //int filled_projections; // Number of projections which will be filled in the kernels
    int ppt = 1;             // Points per thread
    ctx = GPU_CONTEXT(rctx);
    setup = rctx->setup;

    /* This limitation is due to the statically allocated memory for sin_s, cos_s, etc. It could probably be done
       dynamically at run time; the access performance would just have to be checked */
    assert(setup->num_projections < MAXNPROJECTIONS);

    /* Otherwise, the filter kernel will corrupt the data, and we would need to limit the CUDA block_size here as well */
    assert(2 * setup->dim_fft >= BLOCK_SIZE);

    /* Why is the width doubled?
       We compute two real convolutions as a single complex convolution.
       To achieve that, we interleave the data from two float arrays into a
       complex one (i.e. the first real vector forms the real part of the
       complex numbers and the second the imaginary part). Then we perform
       the C2C convolution of the resulting vector and disassemble the
       resulting complex vector into the two real-valued sequences.
       See http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM
    */
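    /* A minimal worked sketch of the packing trick (illustration only; the
       identities below are standard and not taken from this file). With
       z[n] = a[n] + i*b[n] and Z = FFT(z), the two real spectra are
           A[k] = (Z[k] + conj(Z[N-k])) / 2
           B[k] = (Z[k] - conj(Z[N-k])) / (2i)
       Because the reconstruction filter H[k] is real-valued in the frequency
       domain, the separation step can be skipped: multiplying Z[k] by H[k]
       filters both interleaved projections at once, and after the inverse
       C2C transform the real part holds the filtered a[n] and the imaginary
       part the filtered b[n]. This halves the number of transforms compared
       to one C2C per projection. */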
    ctx->bin_grid_size = hst_cuda_calc_blocks(setup->num_bins, BLOCK_SIZE, NULL);
    ctx->bin_pitch_size = ctx->bin_grid_size * BLOCK_SIZE;

    talign = hst_cuda_device_prop[ctx->device].textureAlignment / sizeof(float);
    ctx->bin_pitch_size = talign * hst_cuda_calc_blocks(ctx->bin_pitch_size, talign, NULL);
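    /* Example of the two-step padding above, assuming BLOCK_SIZE = 16 and a
       512-byte texture alignment (talign = 128 floats): num_bins = 1000
       gives bin_grid_size = 63 and a preliminary pitch of 1008 floats, which
       is then rounded up to 1024 floats so that every projection row starts
       on an address cudaBindTexture2D() accepts as a pitch. */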
    ctx->filter_grid_size = 2 * setup->dim_fft / BLOCK_SIZE;

    if (setup->num_projections < BLOCK_SIZE * FFT_BATCH_SIZE) {
        num_projections = setup->num_projections;
        if (num_projections % 2) ++num_projections;

        ctx->projection_grid_size = hst_cuda_calc_blocks(num_projections / 2, BLOCK_SIZE, NULL);
        num_projections = 2 * ctx->projection_grid_size * BLOCK_SIZE;

        ctx->fft_batch_size = num_projections;
    } else {
        ctx->fft_batch_size = BLOCK_SIZE * FFT_BATCH_SIZE;
        ctx->projection_grid_size = FFT_BATCH_SIZE / 2;

        num_projections = ctx->fft_batch_size * hst_cuda_calc_blocks(setup->num_projections, ctx->fft_batch_size, NULL);
    }

    //filled_projections = num_projections;
    num_projections = 2 * BLOCK_SIZE_Y * hst_cuda_calc_blocks(num_projections, 2 * BLOCK_SIZE_Y, NULL);
    if (hst_cuda_device_prop[ctx->device].major > 2) {
        ctx->kepler = 1;
#ifdef HST_OPTIMIZE_KEPLER
        if (setup->oversampling) ppt = 4;
#endif /* HST_OPTIMIZE_KEPLER */
    } else if (hst_cuda_device_prop[ctx->device].major > 1) {
        ctx->fermi = 1;
#ifndef HST_TEXTURE16
# ifdef HW_IGNORE_OLD_HARDWARE
        if (setup->oversampling > 1) ppt = 4;
# endif /* HW_IGNORE_OLD_HARDWARE */
#endif /* ! HST_TEXTURE16 */
    }
    ctx->points_per_thread = ppt;

    ctx->bp_grid_columns = ppt * (int)(setup->num_x * 1.0 / (ppt * BLOCK_SIZE_X) + 0.9999999999);
    if ((!BP_BATCH_SIZE) || (setup->num_y < BLOCK_SIZE_Y * BP_BATCH_SIZE)) {
        ctx->bp_batch_size = setup->num_y;

        ctx->bp_grid_lines = ppt * (int)(setup->num_y * 1.0 / (ppt * BLOCK_SIZE_Y) + 0.9999999999);

        dim_result = ctx->bp_grid_lines * BLOCK_SIZE_Y;
    } else {
        ctx->bp_batch_size = BLOCK_SIZE_Y * BP_BATCH_SIZE;
        ctx->bp_grid_lines = BP_BATCH_SIZE;

        dim_result = ctx->bp_batch_size * hst_cuda_calc_blocks(setup->num_y, ctx->bp_batch_size, NULL);
    }

#ifdef HST_TEXTURE16
    ctx->allprojection_grid_size = hst_cuda_calc_blocks(setup->num_projections, BLOCK_SIZE, NULL);
#endif /* HST_TEXTURE16 */
    ctx->initialized = 1;

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */

#ifdef HW_USE_THREADS
    CUDA_SAFE_CALL(cudaSetDevice(hst_cuda_device_num[ctx->device]));
#endif /* HW_USE_THREADS */
#ifdef HST_TEXTURE16
# ifdef HST_TEXTURE16_PACKED
    ctx->float_desc = cudaCreateChannelDesc<uint32_t>();
# else /* HST_TEXTURE16_PACKED */
    ctx->float_desc = cudaCreateChannelDesc<uint16_t>();
# endif /* HST_TEXTURE16_PACKED */

    int_projes.filterMode = cudaFilterModePoint;
    int_projes.addressMode[0] = cudaAddressModeClamp;
    int_projes.addressMode[1] = cudaAddressModeClamp;

    nppsMinMaxGetBufferSize_32f(setup->num_projections * ctx->bin_pitch_size, &ctx->mmax_buffer_size); // DS: argument in elements, result in bytes?
    ctx->mmax_buffer_size += 2 * sizeof(float); // min & max
#else /* HST_TEXTURE16 */
    ctx->float_desc = cudaCreateChannelDesc<float>();
#endif /* HST_TEXTURE16 */
    tex_projes.filterMode = cudaFilterModeLinear; //cudaFilterModePoint;
    tex_projes.addressMode[0] = cudaAddressModeBorder;
    tex_projes.addressMode[1] = cudaAddressModeBorder;
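    /* With cudaAddressModeBorder, texture reads outside the sinogram return
       zero, so bins falling outside the detector contribute nothing to the
       backprojection sum; cudaFilterModeLinear gives hardware-interpolated
       reads between neighbouring bins. */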
    float4 *all = (float4*)malloc(num_projections * sizeof(float4));
    if (!all) return ENOMEM;
    // int max_num = 0, num;
    for (int i = 0; i < setup->num_projections; i++) {
        all[i].x = setup->cos_s[i];
        all[i].y = setup->sin_s[i];
        all[i].z = setup->axis_position_corr_s[i];
        all[i].w = floor(ppt * MIN(MIN(BLOCK_SIZE_X * all[i].x, - BLOCK_SIZE_Y * all[i].y), MIN(0., BLOCK_SIZE_X * all[i].x - BLOCK_SIZE_Y * all[i].y)));

        // num = ceil(2*MAX(MAX(BLOCK_SIZE_X * all[i].x, - BLOCK_SIZE_Y * all[i].y), MAX(0., BLOCK_SIZE_X * all[i].x - BLOCK_SIZE_Y * all[i].y))) - all[i].w;
        // if (num > max_num) max_num = num;
    }
    // printf("Max: %i\n", max_num);

    if (setup->num_projections < num_projections) {
        memset(all + setup->num_projections, 0, (num_projections - setup->num_projections) * sizeof(float4));
    }

    CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_all, all, num_projections * sizeof(float4)));
    free(all);

    // CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_coss, setup->cos_s, setup->num_projections*sizeof(float)));
    // CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sins, setup->sin_s, setup->num_projections*sizeof(float)));
    // CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_axiss, setup->axis_position_corr_s, setup->num_projections*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_input[0], num_projections * ctx->bin_pitch_size * sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_input[1], num_projections * ctx->bin_pitch_size * sizeof(float)));

    if (setup->num_projections < num_projections) {
        CUDA_SAFE_CALL(cudaMemset(ctx->gpu_input[0] + setup->num_projections * ctx->bin_pitch_size, 0, (num_projections - setup->num_projections) * ctx->bin_pitch_size * sizeof(float)));
        CUDA_SAFE_CALL(cudaMemset(ctx->gpu_input[1] + setup->num_projections * ctx->bin_pitch_size, 0, (num_projections - setup->num_projections) * ctx->bin_pitch_size * sizeof(float)));
    }

    ctx->gpu_data = ctx->gpu_input[0];

#ifdef HST_TEXTURE16
    CUDA_SAFE_CALL(cudaMemset(ctx->gpu_input[0], 0, setup->num_projections * ctx->bin_pitch_size * sizeof(float)));
    CUDA_SAFE_CALL(cudaMemset(ctx->gpu_input[1], 0, setup->num_projections * ctx->bin_pitch_size * sizeof(float)));

    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_intdata, num_projections * ctx->bin_pitch_size * sizeof(unsigned short)));
    CUDA_SAFE_CALL(cudaMemset(ctx->gpu_intdata, 0, num_projections * ctx->bin_pitch_size * sizeof(unsigned short)));

    if (setup->num_projections < num_projections) {
        CUDA_SAFE_CALL(cudaMemset(ctx->gpu_intdata + setup->num_projections * ctx->bin_pitch_size, 0, (num_projections - setup->num_projections) * ctx->bin_pitch_size * sizeof(unsigned short)));
    }
#endif /* HST_TEXTURE16 */
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_buffer, MAX(ctx->fft_batch_size * setup->dim_fft * sizeof(float), ctx->mmax_buffer_size)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_filter, 2 * setup->dim_fft * sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_limits, 2 * num_projections * sizeof(float)));

    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_output[0], dim_result * ctx->bp_grid_columns * BLOCK_SIZE_X * sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_output[1], dim_result * ctx->bp_grid_columns * BLOCK_SIZE_X * sizeof(float)));

    ctx->gpu_result = ctx->gpu_output[0];

    // CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_result, dim_result * ctx->bp_grid_columns * BLOCK_SIZE_X * sizeof(float)));
    // CUDA_SAFE_CALL(cudaMemset(ctx->gpu_result, 0, sizeof(float) * dim_result));
    cudaStreamCreate(&ctx->stream[0]);
    cudaStreamCreate(&ctx->stream[1]);
    cudaStreamCreate(&ctx->stream[2]);
    cudaStreamCreate(&ctx->stream[3]);

    CUFFT_SAFE_CALL(cufftPlan1d(&ctx->fft_plan[0], setup->dim_fft, CUFFT_C2C, ctx->fft_batch_size/2));
    CUFFT_SAFE_CALL(cufftPlan1d(&ctx->fft_plan[1], setup->dim_fft, CUFFT_C2C, ctx->fft_batch_size/2));
#ifndef BUGGY_CUFFT_SET_STREAM
    CUFFT_SAFE_CALL(cufftSetStream(ctx->fft_plan[0], ctx->stream[0]));
    CUFFT_SAFE_CALL(cufftSetStream(ctx->fft_plan[1], ctx->stream[1]));
#endif /* ! BUGGY_CUFFT_SET_STREAM */
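    /* Two cuFFT plans bound to two streams implement simple double
       buffering: while the plan on stream[i%2] filters batch i, the
       host-to-device copy of batch i+1 proceeds on the other stream.
       stream[2] and stream[3] carry the whole-sinogram uploads and the
       result downloads issued by hst_cuda_send() and collected by
       hst_cuda_wait(). */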
    /* Shall we create a smaller plan for the last iteration? */

    // cudaFuncSetCacheConfig(hst_cuda_mplinear_kernel, cudaFuncCachePreferL1);
    // cudaFuncSetCacheConfig(hst_cuda_mplinear_kernel, cudaFuncCachePreferShared);

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    return 0;
}
/**
 * Free resources occupied by the GPU context
 *
 * @param ctx is an initialized GPU context
 */
static void hst_cuda_free_context(HSTReconstructorContext *rctx) {
    GPUContext *ctx;

    ctx = GPU_CONTEXT(rctx);

    if (ctx->initialized) {
#ifdef PYHST_MEASURE_TIMINGS
        g_timer_continue(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */
        if (ctx->fft_plan[1]) CUFFT_SAFE_CALL(cufftDestroy(ctx->fft_plan[1]));
        if (ctx->fft_plan[0]) CUFFT_SAFE_CALL(cufftDestroy(ctx->fft_plan[0]));

        if (ctx->stream[3]) cudaStreamDestroy(ctx->stream[3]);
        if (ctx->stream[2]) cudaStreamDestroy(ctx->stream[2]);
        if (ctx->stream[1]) cudaStreamDestroy(ctx->stream[1]);
        if (ctx->stream[0]) cudaStreamDestroy(ctx->stream[0]);

        if (ctx->gpu_output[1]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_output[1]));
        if (ctx->gpu_output[0]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_output[0]));
        if (ctx->gpu_limits) CUDA_SAFE_CALL(cudaFree(ctx->gpu_limits));
        if (ctx->gpu_filter) CUDA_SAFE_CALL(cudaFree(ctx->gpu_filter));
        if (ctx->gpu_buffer) CUDA_SAFE_CALL(cudaFree(ctx->gpu_buffer));
#ifdef HST_TEXTURE16
        if (ctx->gpu_intdata) CUDA_SAFE_CALL(cudaFree(ctx->gpu_intdata));
#endif /* HST_TEXTURE16 */
        if (ctx->gpu_input[1]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_input[1]));
        if (ctx->gpu_input[0]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_input[0]));
#ifdef PYHST_MEASURE_TIMINGS
        g_timer_stop(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */
    }

    memset(((char*)ctx) + sizeof(HSTReconstructorContext) + GPU_CONTEXT_MEMSET_OFFSET, 0, sizeof(GPUContext) - sizeof(HSTReconstructorContext) - GPU_CONTEXT_MEMSET_OFFSET);
}
/**
 * Free resources and destroy the GPU context
 *
 * @param ctx is an initialized GPU context
 */
static void hst_cuda_destroy_context(HSTReconstructorContext *rctx) {
#ifdef PYHST_MEASURE_TIMINGS
    GPUContext *ctx;
#endif /* PYHST_MEASURE_TIMINGS */

    hst_cuda_free_context(rctx);
    hst_reconstructor_free_context(rctx);

#ifdef PYHST_MEASURE_TIMINGS
    ctx = GPU_CONTEXT(rctx);

    g_timer_destroy(ctx->texture_timer);
    g_timer_destroy(ctx->fromgpu_timer);
    g_timer_destroy(ctx->init_timer);
    g_timer_destroy(ctx->togpu_timer);
    g_timer_destroy(ctx->pre_timer);
    g_timer_destroy(ctx->main_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    free(rctx);
}
static int hst_cuda_configure(HSTReconstructorContextPtr rctx, HSTChanged what_changed) {
    GPUContext *ctx;
    HSTSetup *setup;

    ctx = GPU_CONTEXT(rctx);
    setup = rctx->setup;

    if (what_changed & HST_FILTER_CHANGED) {
#ifdef PYHST_MEASURE_TIMINGS
        g_timer_continue(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */
        CUDA_SAFE_CALL(cudaMemcpy(ctx->gpu_filter, setup->filter, (2 * setup->dim_fft) * sizeof(float), cudaMemcpyHostToDevice));
#ifdef PYHST_MEASURE_TIMINGS
        g_timer_stop(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */
    }

    if (what_changed & HST_LIMITS_CHANGED) {
        check_code(hst_cuda_configure_limits(ctx, setup), "hst_cuda_configure_limits");
    }

    return 0;
}
static HSTConstString hst_cuda_get_title(HSTReconstructorConstContextPtr rctx) {
    return hst_cuda_device_prop[GPU_CONTEXT(rctx)->device].name;
}
static HSTConstString *hst_cuda_get_timers(HSTReconstructorConstContextPtr rctx, double *timers) {
#ifdef PYHST_MEASURE_TIMINGS
    if (timers) {
        timers[1] = g_timer_elapsed(GPU_CONTEXT(rctx)->togpu_timer, NULL);
        timers[2] = g_timer_elapsed(GPU_CONTEXT(rctx)->fromgpu_timer, NULL);
        timers[3] = g_timer_elapsed(GPU_CONTEXT(rctx)->texture_timer, NULL);
        timers[4] = g_timer_elapsed(GPU_CONTEXT(rctx)->pre_timer, NULL);
        timers[5] = g_timer_elapsed(GPU_CONTEXT(rctx)->main_timer, NULL);
        timers[6] = g_timer_elapsed(GPU_CONTEXT(rctx)->init_timer, NULL);
        timers[0] = timers[1] + timers[2] + timers[3] + timers[4] + timers[5]; // initialization is not included
    }
#endif /* PYHST_MEASURE_TIMINGS */

    return hst_cuda_timers;
}
static int hst_cuda_send(HSTReconstructorContext *rctx, const float *data) {
    GPUContext *ctx;
    HSTSetup *setup;

    ctx = GPU_CONTEXT(rctx);
    setup = rctx->setup;

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->togpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    // The CUDA 4.2 profiler will show nothing if there is not enough run time on the default stream
    if ((!ctx->current_buffer)||(!data)) {
        int i;
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid(setup->dim_fft / BLOCK_SIZE, ctx->projection_grid_size);
        for (i = 0; i < 128; i++) {
            hst_cuda_pack_kernel_zpad<<<dimGrid,dimBlock>>>(ctx->gpu_buffer, setup->dim_fft, ctx->gpu_data, ctx->bin_pitch_size, setup->num_bins);
        }
    }

    if (ctx->current_buffer) {
        cudaStreamSynchronize(ctx->stream[2]);
        ctx->gpu_data = ctx->gpu_input[ctx->current_buffer - 1];
        ctx->gpu_result = ctx->gpu_output[ctx->current_buffer - 1];
        ctx->synchronized = 1;
    }

    if (data) {
        if ((++ctx->current_buffer) > 2) ctx->current_buffer = 1;

        if (setup->num_projections % 2) {
            CUDA_SAFE_CALL(cudaMemset(ctx->gpu_input[ctx->current_buffer - 1] + setup->num_projections * ctx->bin_pitch_size, 0, ctx->bin_pitch_size * sizeof(float)));
        }

        if (blocking) {
            CUDA_SAFE_CALL(cudaMemcpy2D(ctx->gpu_input[ctx->current_buffer - 1], ctx->bin_pitch_size * sizeof(float), data, setup->num_bins * sizeof(float), setup->num_bins * sizeof(float), setup->num_projections, cudaMemcpyHostToDevice));
        } else {
            CUDA_SAFE_CALL(cudaMemcpy2DAsync(ctx->gpu_input[ctx->current_buffer - 1], ctx->bin_pitch_size * sizeof(float), data, setup->num_bins * sizeof(float), setup->num_bins * sizeof(float), setup->num_projections, cudaMemcpyHostToDevice, ctx->stream[2]));
        }
    } else ctx->current_buffer = 0;

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->togpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    return 0;
}
static int hst_cuda_wait(HSTReconstructorContext *rctx) {
    GPUContext *ctx;

    ctx = GPU_CONTEXT(rctx);

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->fromgpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    cudaStreamSynchronize(ctx->stream[3]);

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->fromgpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    return 0;
}
static int hst_cuda_reconstruct(HSTReconstructorContext *rctx, float *SLICE, const float *SINOGRAMS) {
    GPUContext *ctx;
    HSTSetup *setup;

    int i, batch;
    int batch_size;//, nextbatch;

    int kepler, ppt;
    int dimrecx;

    int dim_fft, num_proj, num_bins;
    int num_x, num_y, mid;
    float axis_position;

    const float *data;
    float *gpu_data;
    float *gpu_buffer;

    ctx = GPU_CONTEXT(rctx);
    setup = rctx->setup;

    kepler = ctx->kepler;
    ppt = ctx->points_per_thread;

    gpu_buffer = ctx->gpu_buffer;

    dim_fft = setup->dim_fft;
    num_proj = setup->num_projections;
    num_bins = setup->num_bins;
    num_x = setup->num_x;
    num_y = setup->num_y;
    axis_position = setup->axis_position;

    mid = (num_bins + dim_fft) / 2;
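    /* The FFT buffer is longer than a projection (dim_fft >= num_bins). For
       edge-extension padding, a plausible reading of the epad kernel below
       is that the pad region is split at mid: bins [num_bins, mid) continue
       the right edge while [mid, dim_fft) wrap around to the left edge,
       keeping the circular convolution performed by the FFT consistent at
       both ends of the projection. */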
    dim3 dimPadGrid(setup->dim_fft / BLOCK_SIZE, ctx->projection_grid_size);
    dim3 dimInputGrid(ctx->bin_grid_size, ctx->projection_grid_size);
    dim3 dimFilterGrid(ctx->filter_grid_size, ctx->projection_grid_size);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    batch_size = ctx->fft_batch_size; // could be shorter on the last iteration

    if ((ctx->current_buffer) && (!ctx->synchronized)) {
        hst_cuda_send(rctx, NULL);
    }

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->togpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */
    CUDA_SAFE_CALL(cudaMemcpy2DAsync(ctx->gpu_data, ctx->bin_pitch_size * sizeof(float), SINOGRAMS, num_bins * sizeof(float), num_bins * sizeof(float), batch_size, cudaMemcpyHostToDevice, ctx->stream[0]));
#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->togpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    // Debug: clear to prevent accumulation of error
    //CUDA_SAFE_CALL(cudaMemset(ctx->gpu_data, 0, 2*hst_cuda_calc_blocks(num_proj, 2, NULL)*ctx->bin_pitch_size*sizeof(float)));

    /* In the case of an odd number of projections, our pad function will not handle the second projection of the
       last pair; it will simply be copied from gpu_data. However, even if gpu_data is originally zeroed, the
       convolution may introduce small numbers. Over multiple slices, these small numbers may accumulate to a
       significant value affecting the precision of the convolution and, hence, the value of the first vector of
       the pair. This effect is easily seen if fai360 mode is enabled. */
    if ((num_proj % 2) && (!ctx->synchronized)) {
#ifdef PYHST_MEASURE_TIMINGS
        g_timer_continue(ctx->togpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */
        CUDA_SAFE_CALL(cudaMemset(ctx->gpu_data + num_proj * ctx->bin_pitch_size, 0, ctx->bin_pitch_size * sizeof(float)));
#ifdef PYHST_MEASURE_TIMINGS
        g_timer_stop(ctx->togpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */
    }
    for (batch = 0, i = 0; batch < num_proj; batch += batch_size, ++i) {
        data = SINOGRAMS + batch * num_bins;
        gpu_data = ctx->gpu_data + batch * ctx->bin_pitch_size;

        if (batch + batch_size >= num_proj) {
            batch_size = num_proj - batch;
        }

#ifdef PYHST_MEASURE_TIMINGS
        g_timer_continue(ctx->togpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */
        if (!ctx->synchronized) {
            if (blocking) {
                CUDA_SAFE_CALL(cudaMemcpy2D(gpu_data, ctx->bin_pitch_size * sizeof(float), data, num_bins * sizeof(float), num_bins * sizeof(float), batch_size, cudaMemcpyHostToDevice));
            } else {
                CUDA_SAFE_CALL(cudaMemcpy2DAsync(gpu_data, ctx->bin_pitch_size * sizeof(float), data, num_bins * sizeof(float), num_bins * sizeof(float), batch_size, cudaMemcpyHostToDevice, ctx->stream[i%2]));
            }
        }

        // Instead, we could allocate dedicated gpu_buffers...
        cudaStreamSynchronize(ctx->stream[(i+1)%2]);
#ifdef PYHST_MEASURE_TIMINGS
        g_timer_stop(ctx->togpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */
#ifdef PYHST_MEASURE_TIMINGS
        g_timer_continue(ctx->pre_timer);
#endif /* PYHST_MEASURE_TIMINGS */
        if (dim_fft > num_bins) {
            if (setup->zero_padding) {
                hst_cuda_pack_kernel_zpad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft, gpu_data, ctx->bin_pitch_size, num_bins);
            } else {
                hst_cuda_pack_kernel_epad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft, gpu_data, ctx->bin_pitch_size, num_bins, mid);
            }
        } else {
            hst_cuda_pack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft, gpu_data, ctx->bin_pitch_size);
        }

        CUFFT_SAFE_CALL(cufftExecC2C(ctx->fft_plan[i%2], (cufftComplex*)gpu_buffer, (cufftComplex*)gpu_buffer, CUFFT_FORWARD));
        hst_cuda_filter_kernel<<<dimFilterGrid,dimBlock,0,ctx->stream[i%2]>>>(2*dim_fft, gpu_buffer, ctx->gpu_filter);
        CUFFT_SAFE_CALL(cufftExecC2C(ctx->fft_plan[i%2], (cufftComplex*)gpu_buffer, (cufftComplex*)gpu_buffer, CUFFT_INVERSE));

        if (setup->fai360) {
            hst_cuda_unpack_kernel_fai360<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft, ctx->bin_grid_size * BLOCK_SIZE / 2, ctx->gpu_limits + 2 * batch, batch);
        } else {
            hst_cuda_unpack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft);
        }
#ifdef PYHST_MEASURE_TIMINGS
        g_timer_stop(ctx->pre_timer);
#endif /* PYHST_MEASURE_TIMINGS */
    }

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->pre_timer);
#endif /* PYHST_MEASURE_TIMINGS */
    cudaStreamSynchronize(ctx->stream[(i+1)%2]);
#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->pre_timer);
#endif /* PYHST_MEASURE_TIMINGS */
    // Debug: check the filtered projections themselves
    //CUDA_SAFE_CALL(cudaMemcpy2D(SLICE, num_x * sizeof(float), ctx->gpu_data, ctx->bin_pitch_size * sizeof(float), num_x * sizeof(float), 652, cudaMemcpyDeviceToHost));

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->texture_timer);
#endif /* PYHST_MEASURE_TIMINGS */

#ifdef HST_TEXTURE16
    nppsMinMax_32f(ctx->gpu_data, num_proj * ctx->bin_pitch_size, (float*)(((char*)ctx->gpu_buffer) + ctx->mmax_buffer_size), (float*)(((char*)ctx->gpu_buffer) + ctx->mmax_buffer_size + sizeof(float)), (Npp8u*)ctx->gpu_buffer);

    dim3 dimAllInputGrid(ctx->bin_grid_size, ctx->allprojection_grid_size);
    hst_cuda_normalize_kernel<<<dimAllInputGrid,dimBlock>>>(ctx->gpu_intdata, ctx->gpu_data, ctx->bin_pitch_size, (float*)(((char*)ctx->gpu_buffer) + ctx->mmax_buffer_size));
# ifdef HST_TEXTURE16_PACKED
    CUDA_SAFE_CALL( cudaBindTexture2D(NULL, int_projes, ctx->gpu_intdata, ctx->float_desc, num_bins/2 + num_bins%2, num_proj, ctx->bin_pitch_size * sizeof(unsigned short)) );
# else /* HST_TEXTURE16_PACKED */
    CUDA_SAFE_CALL( cudaBindTexture2D(NULL, int_projes, ctx->gpu_intdata, ctx->float_desc, num_bins, num_proj, ctx->bin_pitch_size * sizeof(unsigned short)) );
# endif /* HST_TEXTURE16_PACKED */
#else /* HST_TEXTURE16 */
    CUDA_SAFE_CALL( cudaBindTexture2D(NULL, tex_projes, ctx->gpu_data, ctx->float_desc, num_bins, num_proj, ctx->bin_pitch_size * sizeof(float)) );
#endif /* HST_TEXTURE16 */

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->texture_timer);
#endif /* PYHST_MEASURE_TIMINGS */
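    /* In the 16-bit texture path the float sinogram is quantized before
       binding: nppsMinMax_32f finds the global min/max, the normalize
       kernel presumably maps each value to uint16 along the lines of
       (v - min) * 65535 / (max - min), and after backprojection the
       denormalize kernel undoes the scaling, so the cheaper 16-bit texture
       fetches cost only quantization precision. */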
    dim3 dimBPBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y);
    dim3 dimGrid(ctx->bp_grid_columns / ppt, ctx->bp_grid_lines / ppt);
    dimrecx = ctx->bp_grid_columns * BLOCK_SIZE_X;

    batch_size = ctx->bp_batch_size; // could be shorter on the last iteration

    for (batch = 0, i = 0; batch < num_y; batch += batch_size, ++i) {
        if (batch + batch_size >= num_y) {
            batch_size = num_y - batch;
        }

#ifdef PYHST_MEASURE_TIMINGS
        g_timer_continue(ctx->main_timer);
#endif /* PYHST_MEASURE_TIMINGS */

#ifdef HST_TEXTURE16
# ifdef HST_TEXTURE16_PACKED
        hst_cuda_intkernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, (uint32_t*)ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
# else /* HST_TEXTURE16_PACKED */
        hst_cuda_intkernel_esrf<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, (uint32_t*)ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
# endif /* HST_TEXTURE16_PACKED */
        hst_cuda_denormalize_kernel<<<dimGrid,dimBPBlock, 0, ctx->stream[i%2]>>>(ctx->gpu_result + dimrecx * batch, (float*)(((char*)ctx->gpu_buffer) + ctx->mmax_buffer_size), num_proj);
#else /* HST_TEXTURE16 */
        if (kepler) {
#ifdef HST_OPTIMIZE_KEPLER
            if (setup->oversampling) {
                hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
            } else {
                hst_kepler_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);

                //dim3 dimBPBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y/2);
                //hst_cuda_mplinear_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
            }
#else /* HST_OPTIMIZE_KEPLER */
            hst_kepler_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
#endif /* HST_OPTIMIZE_KEPLER */
        } else {
# ifndef HST_FAST_MODE
            if (setup->oversampling == 1) {
# endif /* ! HST_FAST_MODE */
                hst_cuda_nearest_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
# ifndef HST_FAST_MODE
            } else if (setup->oversampling) {
#  ifdef HW_IGNORE_OLD_HARDWARE
                hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
#  else /* HW_IGNORE_OLD_HARDWARE */
                hst_cuda_oversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
#  endif /* HW_IGNORE_OLD_HARDWARE */
            } else if (ctx->fermi) {
                hst_cuda_linear_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
            } else {
                hst_cuda_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
            }
# endif /* ! HST_FAST_MODE */
        }
#endif /* HST_TEXTURE16 */
#ifdef PYHST_MEASURE_TIMINGS
        g_timer_stop(ctx->main_timer);
        g_timer_continue(ctx->fromgpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

        if (!ctx->synchronized) {
            if (blocking) {
                CUDA_SAFE_CALL(cudaMemcpy2D(SLICE + num_x * batch, num_x * sizeof(float), ctx->gpu_result + dimrecx * batch, dimrecx * sizeof(float), num_x * sizeof(float), batch_size, cudaMemcpyDeviceToHost));
            } else {
                CUDA_SAFE_CALL(cudaMemcpy2DAsync(SLICE + num_x * batch, num_x * sizeof(float), ctx->gpu_result + dimrecx * batch, dimrecx * sizeof(float), num_x * sizeof(float), batch_size, cudaMemcpyDeviceToHost, ctx->stream[i%2]));
            }
        }

#ifdef PYHST_MEASURE_TIMINGS
        g_timer_stop(ctx->fromgpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */
    }
#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->main_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    if (ctx->synchronized) {
        cudaStreamSynchronize(ctx->stream[0]);
        cudaStreamSynchronize(ctx->stream[1]);
    }

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->main_timer);
    g_timer_continue(ctx->texture_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    cudaUnbindTexture(tex_projes);

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->texture_timer);
    g_timer_continue(ctx->fromgpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    if (ctx->synchronized) {
        if (blocking) {
            CUDA_SAFE_CALL(cudaMemcpy2D(SLICE, num_x * sizeof(float), ctx->gpu_result, dimrecx * sizeof(float), num_x * sizeof(float), num_y, cudaMemcpyDeviceToHost));
        } else {
            cudaStreamSynchronize(ctx->stream[3]);
            CUDA_SAFE_CALL(cudaMemcpy2DAsync(SLICE, num_x * sizeof(float), ctx->gpu_result, dimrecx * sizeof(float), num_x * sizeof(float), num_y, cudaMemcpyDeviceToHost, ctx->stream[3]));
        }
    } else {
        cudaThreadSynchronize();
    }

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->fromgpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    ctx->synchronized = 0;

    return 0;
}
static HSTReconstructor hst_gpu_info = {
    /* ... */
    hst_cuda_create_context,
    hst_cuda_init_context,
    hst_cuda_free_context,
    hst_cuda_destroy_context,
    /* ... */
    hst_cuda_reconstruct,
    hst_reconstructor_postprocess_slice,
    /* ... */
};
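/* Typical call sequence for this module (a sketch; the dispatching code
 * lives elsewhere in PyHST, and the calls below name the functions in this
 * file rather than the exact HSTReconstructor member layout):
 *
 *   HSTReconstructor *rec = dfi_cuda_init(0);
 *   HSTReconstructorContext *rctx = hst_cuda_create_context(rec, setup, 0);
 *   hst_cuda_init_context(rctx, thr);
 *   hst_cuda_configure(rctx, HST_FILTER_CHANGED | HST_LIMITS_CHANGED);
 *   hst_cuda_send(rctx, sinogram);                 // optional pre-upload
 *   hst_cuda_reconstruct(rctx, slice, sinogram);   // filter + backproject
 *   hst_cuda_wait(rctx);                           // wait for the async download
 *   hst_cuda_destroy_context(rctx);
 *   dfi_cuda_free();
 */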
HSTReconstructor *dfi_cuda_init(int flags) {
    int i, dev;
    int device_count;
    const char *stmp;

    CUDA_SAFE_CALL(cudaGetDeviceCount(&device_count));

    if (device_count > HST_CUDA_MAX_DEVICES) {
        pyhst_warning("There are %u CUDA-enabled devices detected in the system, but hst_cuda is configured to use only %u", device_count, HST_CUDA_MAX_DEVICES);
        device_count = HST_CUDA_MAX_DEVICES;
    }

    pyhst_info("NVIDIA devices: %u", device_count);
    for (i = 0, dev = 0; i < device_count; ++i) {
        CUDA_SAFE_CALL(cudaGetDeviceProperties(&hst_cuda_device_prop[dev], i));

        if ((hst_cuda_device_prop[dev].computeMode == cudaComputeModeProhibited)||((hst_cuda_device_prop[dev].major == 9999)&&(hst_cuda_device_prop[dev].minor == 9999))) continue;
#ifdef HW_IGNORE_OLD_HARDWARE
        if (hst_cuda_device_prop[dev].major < 2) continue;
#endif /* HW_IGNORE_OLD_HARDWARE */
        pyhst_debug(" * Device %d: %s v. %d.%d: %d SM, %.2f GHz, %d MB%s", dev, hst_cuda_device_prop[dev].name, hst_cuda_device_prop[dev].major, hst_cuda_device_prop[dev].minor, hst_cuda_device_prop[dev].multiProcessorCount, hst_cuda_device_prop[dev].clockRate * 1e-6, (int)(hst_cuda_device_prop[dev].totalGlobalMem/1048576), hst_cuda_device_prop[dev].deviceOverlap?", overlap":"");

        hst_cuda_device_num[dev++] = i;
    }
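    /* CUDA_LAUNCH_BLOCKING is the standard CUDA runtime switch for
       synchronous kernel launches; when it is set, the transfer paths in
       this file also fall back to plain cudaMemcpy2D instead of the
       stream-based asynchronous variants. */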
    stmp = getenv("CUDA_LAUNCH_BLOCKING");
    if ((stmp)&&(stmp[0])&&(stmp[0] != '0')) blocking = 1;

    hst_gpu_info.devices = dev * PARALLEL_PER_GPU;
    return &hst_gpu_info;
}
void dfi_cuda_free() {
}