/tomo/pyhst : revision 168

1

/*

2

3

* European Synchrotron Radiation Facility (ESRF) and

4

* Karlsruhe Institute of Technology (KIT).

5

*

6

* PyHST is free software: you can redistribute it and/or modify it

7

* under the terms of the GNU General Public License as published by the

8

* Free Software Foundation, either version 3 of the License, or

9

* (at your option) any later version.

10

*

11

* hst is distributed in the hope that it will be useful, but

12

* WITHOUT ANY WARRANTY; without even the implied warranty of

13

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

14

* See the GNU General Public License for more details.

15

*

16

* You should have received a copy of the GNU General Public License along

17

* with this program. If not, see <http://www.gnu.org/licenses/>.

18

*/

19

20

#define PARALLEL_PER_GPU 1

21

// to prevent errors with newer glib and CUDA 4

22

#define GLIB_DISABLE_DEPRECATION_WARNINGS

23

24

#include <stdio.h>

25

#include <stdlib.h>

26

#include <assert.h>

27

#include <errno.h>

28

#include <math.h>

29

#include <stdint.h>

30

31

32

extern "C" {

33

#include <glib.h>

34

}

35

36

37

#include <cufft.h>

38

#if CUDA_VERSION_MAJOR < 5

39

# include <cutil.h>

40

# include <cutil_math.h>

41

#else

42

# include <helper_cuda.h>

43

# include <helper_math.h>

44

45

# define CUDA_SAFE_CALL(val) check ( (val), #val, __FILE__, __LINE__ )

46

# define CUFFT_SAFE_CALL CUDA_SAFE_CALL

47

#endif /* CVM < 5 */

48

49

/*

50

#define CUDA_SAFE_CALL(x) (x)

51

#define CUFFT_SAFE_CALL(x) (x)

52

*/

53

54

#include "debug.h"

55

#include "hw_tools.h"

56

#include "hst_setup.h"

57

#include "hst_reconstructor.h"

58

59

#include "hst.h"

60

#include "dfi_cuda.h"

61

62

//#define BUGGY_CUFFT_SET_STREAM

63

64

#include "dfi_cuda_defines.h"

65

#include "dfi_cuda_kernels.h"

66

#include "dfi_cuda_bp_kernels.h"

67

68

69

#ifdef HST_TEXTURE16

70

# include <npp.h>

71

#endif /* HST_TEXTURE16 */

72

73

74

static int blocking = 0;

75

static int hst_cuda_device_num[HST_CUDA_MAX_DEVICES];

76

static cudaDeviceProp hst_cuda_device_prop[HST_CUDA_MAX_DEVICES]; //!< Enumerated CUDA-enabled devices

77

static HSTConstString hst_cuda_timers[] = { "complete reconstruction", "transfer to device", "transfer from device", "texture mapping", "projection filtering", "backprojection", "*initialization and cleanup", NULL }; //!< List of supported timers

78

79

#define GPU_CONTEXT(ctx) ((GPUContext*)ctx)

80

81

/**

82

* This implementation of HSTReconstructor uses NVidia GeForce-family graphic

83

* cards and NVidia Tesla accelerators to accelerate reconstruction process.

84

* The implementation is based on CUDA toolkit and uses NVidia cuFFT library

85

* for performing Fourier Transformations.

86

* GPUContext is extension of #HSTReconstructorContext which provides additional

87

* data members needed to communicate with graphic hardware

88

*/

89

struct GPUContextT {

90

HSTReconstructorContext recon;

91

#ifdef PYHST_MEASURE_TIMINGS

92

# define GPU_CONTEXT_MEMSET_OFFSET (6 * sizeof(GTimer*))

93

GTimer *main_timer; //!< Counts time spent in backprojection code

94

GTimer *pre_timer; //!< Counts time spent in filtering

95

GTimer *togpu_timer; //!< Counts time spent in memory operations

96

GTimer *init_timer; //!< Counts time spent in initialization

97

GTimer *fromgpu_timer; //!< Counts time spent in memory operations

98

GTimer *texture_timer; //!< Counts time spent in texture binding/unbinding

99

#else

100

# define GPU_CONTEXT_MEMSET_OFFSET 0

101

#endif /* PYHST_MEASURE_TIMINGS */

102

103

int device; //!< Sequence number [0..MAX_DEVICES-1]

104

int initialized; //!< At least partly

105

106

cudaStream_t stream[4]; //!< For interleaving memory writes and computations (2 togpu, 3 fromgpu)

107

108

int kepler; //!< Indicates if compute compatibility is above 3.0

109

int fermi; //!< Indicates if compute compatibility is above 2.0

110

int fft_batch_size; //!< Number of projections to process at once

111

int bp_batch_size; //!< Number of rows to process at once

112

int bin_pitch_size; //!< The number of floats to allocate for each projection (gpu_data)

113

114

int bp_grid_lines; //!< Number of CUDA blocks along Y-axis of the result slice (batched processing)

115

int bp_grid_columns; //!< Number of CUDA blocks along X-axis of the result slice

116

int projection_grid_size; //!< Number of CUDA blocks along projections (each block consists of BLOCK_SIZE threads)

117

int bin_grid_size; //!< Number of CUDA blocks along projection bins (each block consists of BLOCK_SIZE threads)

118

int filter_grid_size; //!< Horizontal CUDA blocks for filtering (2 * dim_fft / BLOCK_SIZE)

119

int points_per_thread; //!< How many points is processed by a single thread (actually square of that)

120

121

#ifdef HST_TEXTURE16

122

int allprojection_grid_size;//!< Number of CUDA blocks along all projections (each block consists of BLOCK_SIZE threads)

123

#endif /* HST_TEXTURE16 */

124

125

int mmax_buffer_size; //!< Buffer size required for float to uint16 conversion of projection data

126

127

128

cudaChannelFormatDesc float_desc; //!< Texture channel description

129

cufftHandle fft_plan[2]; //!< Complex plan for fourier transformations (both forward and inverse)

130

131

int current_buffer; //!< 1 or 2 (0 means no active buffer yet)

132

int synchronized; //!< Indicates if current input buffer is synchronised

133

float *gpu_input[2]; //!< Input buffers (linked by gpu_data)

134

float *gpu_output[2]; //!< Output buffers (linked by gpu_result)

135

136

float *gpu_limits; //!< Parameters for fai360-mode corrections

137

float *gpu_data; //!< Sinogram buffer in device memory

138

float *gpu_buffer; //!< Temporary computation buffer in GPU memory (for filtering, can hold a single batch)

139

float *gpu_filter; //!< Filter buffer in device memory

140

float *gpu_result; //!< Reconstructed slice is going here

141

142

#ifdef HST_TEXTURE16

143

unsigned short *gpu_intdata;//!< Sinogram integer buffer

144

#endif /* HST_TEXTURE16 */

145

};

146

typedef struct GPUContextT GPUContext;

147

148

149

#define hst_cuda_calc_blocks hw_calc_blocks

150

151

static int hst_cuda_configure_limits(GPUContext *ctx, HSTSetup *setup) {

152

float *param_s;

153

154

int projection;

155

int num_proj = setup->num_projections;

156

int pad_proj = ctx->fft_batch_size * hst_cuda_calc_blocks(num_proj, ctx->fft_batch_size, NULL);

157

158

int num_bins = setup->num_bins;

159

float slope_zone = setup->pente_zone;

160

161

float param;

162

float axis_position_corr;

163

float overlapping, flat_zone;

164

165

166

param_s = (float*)malloc(2 * pad_proj * sizeof(float));

167

if (!param_s) return ENOMEM;

168

169

170

for (projection = 0; projection < pad_proj; projection++) {

171

if (projection < num_proj) {

172

axis_position_corr = setup->axis_position_corr_s[projection];

173

} else {

174

axis_position_corr = setup->axis_position;

175

}

176

177

if (2 * axis_position_corr > num_bins) {

178

overlapping = num_bins - axis_position_corr;

179

param = 1; // pente_zone / prof_fact

180

} else {

181

overlapping = axis_position_corr;

182

param = -1; // pente_zone / prof_fact

183

}

184

185

if (overlapping <= 0) {

186

free(param_s);

187

return ERANGE;

188

}

189

190

slope_zone = MIN(slope_zone, overlapping);

191

flat_zone = overlapping - slope_zone;

192

193

param_s[2*projection] = flat_zone;

194

param_s[2*projection + 1] = param * slope_zone;

195

}

196

197

198

#ifdef PYHST_MEASURE_TIMINGS

199

g_timer_continue(ctx->init_timer);

200

#endif /* PYHST_MEASURE_TIMINGS */

201

CUDA_SAFE_CALL(cudaMemcpy(ctx->gpu_limits, param_s, 2*pad_proj*sizeof(float), cudaMemcpyHostToDevice));

202

#ifdef PYHST_MEASURE_TIMINGS

203

g_timer_stop(ctx->init_timer);

204

#endif /* PYHST_MEASURE_TIMINGS */

205

206

free(param_s);

207

208

return 0;

209

}

210

211

212

213

/**

214

*

215

* Create GPU context (uninitialized)

216

*

217

* @param prototype is pointer on HSTReconstructure describing the reconstruction module

218

* @param setup is pointer on HSTSetup with various HST parameters

219

* @result created context

220

*/

221

static HSTReconstructorContext *hst_cuda_create_context(HSTReconstructor *prototype, HSTSetup *setup, int id) {

222

GPUContext *ctx;

223

224

assert(prototype);

225

assert(setup);

226

227

assert((id/PARALLEL_PER_GPU) < prototype->devices);

228

229

/* FIXME: no error code in case of out-of-memory */

230

ctx = (GPUContext*)malloc(sizeof(struct GPUContextT));

231

if (ctx) {

232

memset(ctx, 0, sizeof(struct GPUContextT));

233

234

#ifdef PYHST_MEASURE_TIMINGS

235

ctx->main_timer = g_timer_new();

236

if (ctx->main_timer) g_timer_stop(ctx->main_timer);

237

ctx->pre_timer = g_timer_new();

238

if (ctx->pre_timer) g_timer_stop(ctx->pre_timer);

239

ctx->togpu_timer = g_timer_new();

240

if (ctx->togpu_timer) g_timer_stop(ctx->togpu_timer);

241

ctx->init_timer = g_timer_new();

242

if (ctx->init_timer) g_timer_stop(ctx->init_timer);

243

ctx->fromgpu_timer = g_timer_new();

244

if (ctx->fromgpu_timer) g_timer_stop(ctx->fromgpu_timer);

245

ctx->texture_timer = g_timer_new();

246

if (ctx->texture_timer) g_timer_stop(ctx->texture_timer);

247

248

if ((!ctx->main_timer)||(!ctx->pre_timer)||(!ctx->togpu_timer)||(!ctx->init_timer)||(!ctx->fromgpu_timer)||(!ctx->texture_timer)) {

249

if (ctx->texture_timer) g_timer_destroy(ctx->texture_timer);

250

if (ctx->fromgpu_timer) g_timer_destroy(ctx->fromgpu_timer);

251

if (ctx->init_timer) g_timer_destroy(ctx->init_timer);

252

if (ctx->togpu_timer) g_timer_destroy(ctx->togpu_timer);

253

if (ctx->pre_timer) g_timer_destroy(ctx->pre_timer);

254

if (ctx->main_timer) g_timer_destroy(ctx->main_timer);

255

free(ctx);

256

return NULL;

257

}

258

#endif /* PYHST_MEASURE_TIMINGS */

259

260

hst_reconstructor_init_context((HSTReconstructorContext*)ctx, prototype, setup);

261

262

ctx->device = id / PARALLEL_PER_GPU;

263

}

264

return (HSTReconstructorContext*)ctx;

265

}

266

267

/**

268

* Initializes GPU context

269

*

270

* @param ctx is uninitialized GPU context

271

* @result return code and 0 indicates success

272

*/

273

static int hst_cuda_init_context(HSTReconstructorContext *rctx, HWThread thr) {

274

GPUContext *ctx;

275

HSTSetup *setup;

276

277

int talign; // The mandatory alignment of texture line in floats

278

int dim_result; // The Y-dimmension of result buffer

279

int num_projections; // Projection dimensions (allocation size, rounded for blocked processing)

280

//int filled_projections; // Number of projections which will be filled in the kernels

281

int ppt = 1; // Points per thread

282

283

assert(rctx);

284

285

ctx = GPU_CONTEXT(rctx);

286

setup = rctx->setup;

287

288

assert(setup);

289

/* This limitation is due to statically allocated memory for sin_s, cos_s, etc. It probably could be done

290

dynamically in run-time, just access performance should be checked */

291

assert(setup->num_projections < MAXNPROJECTIONS);

292

/* Otherwise, the filter kernel will corrupt the data and we need to limit the CUDA block_size here as well */

293

assert (2 * setup->dim_fft >= BLOCK_SIZE);

294

295

/* Why the width is doubled?

296

We will compute two real convolutions as a single complex convolution.

297

To achieve that we interleaving the data from two float arrays into

298

the complex one (i.e. first real vector is forming real part of complex

299

numbers and second - the imaginary part). Then, we perform the C2C

300

convolution of resulting vector and disassemble the resulting complex

301

vector into the two real-values sequences.

302

For details, see:

303

http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM

304

*/

305

306

ctx->bin_grid_size = hst_cuda_calc_blocks(setup->num_bins, BLOCK_SIZE, NULL);

307

ctx->bin_pitch_size = ctx->bin_grid_size * BLOCK_SIZE;

308

309

talign = hst_cuda_device_prop[ctx->device].textureAlignment / sizeof(float);

310

ctx->bin_pitch_size = talign * hst_cuda_calc_blocks(ctx->bin_pitch_size, talign, NULL);

311

312

ctx->filter_grid_size = 2 * setup->dim_fft / BLOCK_SIZE;

313

if (setup->num_projections < BLOCK_SIZE * FFT_BATCH_SIZE) {

314

num_projections = setup->num_projections;

315

if (num_projections%2) ++num_projections;

316

317

ctx->projection_grid_size = hst_cuda_calc_blocks(num_projections/2, BLOCK_SIZE, NULL);

318

num_projections = 2 * ctx->projection_grid_size * BLOCK_SIZE;

319

320

ctx->fft_batch_size = num_projections;

321

} else {

322

ctx->fft_batch_size = BLOCK_SIZE * FFT_BATCH_SIZE;

323

ctx->projection_grid_size = FFT_BATCH_SIZE / 2;

324

325

num_projections = ctx->fft_batch_size * hst_cuda_calc_blocks(setup->num_projections, ctx->fft_batch_size, NULL);

326

}

327

328

//filled_projections = num_projections;

329

num_projections = 2 * BLOCK_SIZE_Y * hst_cuda_calc_blocks(num_projections, 2 * BLOCK_SIZE_Y, NULL);

330

331

if (hst_cuda_device_prop[ctx->device].major > 2) {

332

ctx->kepler = 1;

333

#ifdef HST_OPTIMIZE_KEPLER

334

if (setup->oversampling) ppt = 4;

335

#endif /* HST_OPTIMIZE_KEPLER */

336

} else if (hst_cuda_device_prop[ctx->device].major > 1) {

337

ctx->fermi = 1;

338

#ifndef HST_TEXTURE16

339

ppt = 2;

340

# ifdef HW_IGNORE_OLD_HARDWARE

341

if (setup->oversampling > 1) ppt = 4;

342

# endif /* HW_IGNORE_OLD_HARDWARE */

343

#endif /* ! HST_TEXTURE16 */

344

}

345

346

ctx->points_per_thread = ppt;

347

348

ctx->bp_grid_columns = ppt * (int)(setup->num_x * 1.0 / (ppt * BLOCK_SIZE_X) + 0.9999999999);

349

if ((!BP_BATCH_SIZE)||(setup->num_y < BLOCK_SIZE_Y * BP_BATCH_SIZE)) {

350

ctx->bp_batch_size = setup->num_y;

351

352

ctx->bp_grid_lines = ppt * (int)(setup->num_y * 1.0 / (ppt * BLOCK_SIZE_Y) + 0.9999999999);

353

354

dim_result = ctx->bp_grid_lines * BLOCK_SIZE_Y;

355

} else {

356

ctx->bp_batch_size = BLOCK_SIZE_Y * BP_BATCH_SIZE;

357

ctx->bp_grid_lines = BP_BATCH_SIZE;

358

359

dim_result = ctx->bp_batch_size * hst_cuda_calc_blocks(setup->num_y, ctx->bp_batch_size, NULL);

360

}

361

362

#ifdef HST_TEXTURE16

363

ctx->allprojection_grid_size = hst_cuda_calc_blocks(setup->num_projections, BLOCK_SIZE, NULL);

364

#endif /* HST_TEXTURE16 */

365

366

367

ctx->initialized = 1;

368

369

#ifdef PYHST_MEASURE_TIMINGS

370

g_timer_continue(ctx->init_timer);

371

#endif /* PYHST_MEASURE_TIMINGS */

372

373

#ifdef HW_USE_THREADS

374

CUDA_SAFE_CALL(cudaSetDevice(hst_cuda_device_num[ctx->device]));

375

#endif /* HW_USE_THREADS */

376

377

#ifdef HST_TEXTURE16

378

# ifdef HST_TEXTURE16_PACKED

379

ctx->float_desc = cudaCreateChannelDesc<uint32_t>();

380

# else /* HST_TEXTURE16_PACKED */

381

ctx->float_desc = cudaCreateChannelDesc<uint16_t>();

382

# endif /* HST_TEXTURE16_PACKED */

383

384

int_projes.filterMode = cudaFilterModePoint;

385

int_projes.addressMode[0] = cudaAddressModeClamp;

386

int_projes.addressMode[1] = cudaAddressModeClamp;

387

388

nppsMinMaxGetBufferSize_32f(setup->num_projections * ctx->bin_pitch_size, &ctx->mmax_buffer_size); // DS: Argument in elements, result in bytes?

389

ctx->mmax_buffer_size += 2 * sizeof(float); // min & max

390

#else /* HST_TEXTURE16 */

391

ctx->float_desc = cudaCreateChannelDesc<float>();

392

#endif /* HST_TEXTURE16 */

393

394

tex_projes.filterMode = cudaFilterModeLinear; //cudaFilterModePoint;

395

tex_projes.addressMode[0] = cudaAddressModeBorder;

396

tex_projes.addressMode[1] = cudaAddressModeBorder;

397

398

float4 *all = (float4*)malloc(num_projections * sizeof(float4));

399

// int max_num = 0, num;

400

for (int i = 0; i < setup->num_projections; i++) {

401

all[i].x = setup->cos_s[i];

402

all[i].y = setup->sin_s[i];

403

all[i].z = setup->axis_position_corr_s[i];

404

all[i].w = floor(ppt * MIN(MIN(BLOCK_SIZE_X * all[i].x, - BLOCK_SIZE_Y * all[i].y), MIN(0., BLOCK_SIZE_X * all[i].x - BLOCK_SIZE_Y * all[i].y)));

405

406

// num = ceil(2*MAX(MAX(BLOCK_SIZE_X * all[i].x, - BLOCK_SIZE_Y * all[i].y), MAX(0., BLOCK_SIZE_X * all[i].x - BLOCK_SIZE_Y * all[i].y))) - all[i].w;

407

// if (num > max_num) max_num = num;

408

}

409

// printf("Max: %i\n", max_num);

410

411

412

if (setup->num_projections < num_projections) {

413

memset(all + setup->num_projections, 0, (num_projections - setup->num_projections)*sizeof(float4));

414

}

415

416

CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_all, all, num_projections*sizeof(float4)));

417

418

free(all);

419

420

// CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_coss, setup->cos_s, setup->num_projections*sizeof(float)));

421

// CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sins, setup->sin_s, setup->num_projections*sizeof(float)));

422

// CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_axiss, setup->axis_position_corr_s, setup->num_projections*sizeof(float)));

423

424

CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_input[0], num_projections*ctx->bin_pitch_size*sizeof(float)));

425

CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_input[1], num_projections*ctx->bin_pitch_size*sizeof(float)));

426

427

if (setup->num_projections < num_projections) {

428

CUDA_SAFE_CALL(cudaMemset(ctx->gpu_input[0] + setup->num_projections * ctx->bin_pitch_size, 0, (num_projections - setup->num_projections)*ctx->bin_pitch_size*sizeof(float)));

429

CUDA_SAFE_CALL(cudaMemset(ctx->gpu_input[1] + setup->num_projections * ctx->bin_pitch_size, 0, (num_projections - setup->num_projections)*ctx->bin_pitch_size*sizeof(float)));

430

}

431

432

ctx->gpu_data = ctx->gpu_input[0];

433

434

#ifdef HST_TEXTURE16

435

CUDA_SAFE_CALL(cudaMemset(ctx->gpu_input[0], 0, setup->num_projections*ctx->bin_pitch_size*sizeof(float)));

436

CUDA_SAFE_CALL(cudaMemset(ctx->gpu_input[1], 0, setup->num_projections*ctx->bin_pitch_size*sizeof(float)));

437

438

CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_intdata, num_projections*ctx->bin_pitch_size*sizeof(unsigned short)));

439

CUDA_SAFE_CALL(cudaMemset(ctx->gpu_intdata, 0, num_projections*ctx->bin_pitch_size*sizeof(unsigned short)));

440

441

if (setup->num_projections < num_projections) {

442

CUDA_SAFE_CALL(cudaMemset(ctx->gpu_intdata + setup->num_projections * ctx->bin_pitch_size, 0, (num_projections - setup->num_projections)*ctx->bin_pitch_size*sizeof(unsigned short)));

443

}

444

#endif /* HST_TEXTURE16 */

445

446

447

448

CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_buffer, MAX(ctx->fft_batch_size*setup->dim_fft*sizeof(float), ctx->mmax_buffer_size)));

449

CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_filter, 2*setup->dim_fft*sizeof(float)));

450

CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_limits, 2*num_projections*sizeof(float)));

451

452

453

CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_output[0], dim_result * ctx->bp_grid_columns * BLOCK_SIZE_X * sizeof(float)));

454

CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_output[1], dim_result * ctx->bp_grid_columns * BLOCK_SIZE_X * sizeof(float)));

455

456

ctx->gpu_result = ctx->gpu_output[0];

457

458

// CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_result, dim_result * ctx->bp_grid_columns * BLOCK_SIZE_X * sizeof(float)));

459

// CUDA_SAFE_CALL(cudaMemset(ctx->gpu_result, 0, sizeof(float) * dim_result));

460

461

462

cudaStreamCreate(&ctx->stream[0]);

463

cudaStreamCreate(&ctx->stream[1]);

464

cudaStreamCreate(&ctx->stream[2]);

465

cudaStreamCreate(&ctx->stream[3]);

466

467

CUFFT_SAFE_CALL(cufftPlan1d(&ctx->fft_plan[0], setup->dim_fft, CUFFT_C2C, ctx->fft_batch_size/2));

468

CUFFT_SAFE_CALL(cufftPlan1d(&ctx->fft_plan[1], setup->dim_fft, CUFFT_C2C, ctx->fft_batch_size/2));

469

#ifndef BUGGY_CUFFT_SET_STREAM

470

CUFFT_SAFE_CALL(cufftSetStream(ctx->fft_plan[0], ctx->stream[0]));

471

CUFFT_SAFE_CALL(cufftSetStream(ctx->fft_plan[1], ctx->stream[1]));

472

#endif /* ! BUGGY_CUFFT_SET_STREAM */

473

474

/*

475

Shall we create a smaller plan for the last iteration?

476

*/

477

478

// cudaFuncSetCacheConfig(hst_cuda_mplinear_kernel, cudaFuncCachePreferL1);

479

// cudaFuncSetCacheConfig(hst_cuda_mplinear_kernel, cudaFuncCachePreferShared);

480

481

#ifdef PYHST_MEASURE_TIMINGS

482

g_timer_stop(ctx->init_timer);

483

#endif /* PYHST_MEASURE_TIMINGS */

484

485

return 0;

486

}

487

488

/**

489

* Free resources occupied by GPU context

490

*

491

* @param ctx is initialized GPU context

492

*/

493

static void hst_cuda_free_context(HSTReconstructorContext *rctx) {

494

GPUContext *ctx;

495

496

assert(rctx);

497

ctx = GPU_CONTEXT(rctx);

498

assert(ctx);

499

500

if (ctx->initialized) {

501

#ifdef PYHST_MEASURE_TIMINGS

502

g_timer_continue(ctx->init_timer);

503

#endif /* PYHST_MEASURE_TIMINGS */

504

if (ctx->fft_plan[1]) CUFFT_SAFE_CALL(cufftDestroy(ctx->fft_plan[1]));

505

if (ctx->fft_plan[0]) CUFFT_SAFE_CALL(cufftDestroy(ctx->fft_plan[0]));

506

507

if (ctx->stream[3]) cudaStreamDestroy(ctx->stream[3]);

508

if (ctx->stream[2]) cudaStreamDestroy(ctx->stream[2]);

509

if (ctx->stream[1]) cudaStreamDestroy(ctx->stream[1]);

510

if (ctx->stream[0]) cudaStreamDestroy(ctx->stream[0]);

511

512

if (ctx->gpu_output[1]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_output[1]));

513

if (ctx->gpu_output[0]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_output[0]));

514

if (ctx->gpu_limits) CUDA_SAFE_CALL(cudaFree(ctx->gpu_limits));

515

if (ctx->gpu_filter) CUDA_SAFE_CALL(cudaFree(ctx->gpu_filter));

516

if (ctx->gpu_buffer) CUDA_SAFE_CALL(cudaFree(ctx->gpu_buffer));

517

#ifdef HST_TEXTURE16

518

if (ctx->gpu_intdata) CUDA_SAFE_CALL(cudaFree(ctx->gpu_intdata));

519

#endif /* HST_TEXTURE16 */

520

if (ctx->gpu_input[1]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_input[1]));

521

if (ctx->gpu_input[0]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_input[0]));

522

#ifdef PYHST_MEASURE_TIMINGS

523

g_timer_stop(ctx->init_timer);

524

#endif /* PYHST_MEASURE_TIMINGS */

525

}

526

527

memset(((char*)ctx) + sizeof(HSTReconstructorContext) + GPU_CONTEXT_MEMSET_OFFSET, 0, sizeof(GPUContext) - sizeof(HSTReconstructorContext) - GPU_CONTEXT_MEMSET_OFFSET);

528

}

529

530

/**

531

* Free resources and destroy GPU context

532

*

533

* @param ctx is initialized GPU context

534

*/

535

static void hst_cuda_destroy_context(HSTReconstructorContext *rctx) {

536

#ifdef PYHST_MEASURE_TIMINGS

537

GPUContext *ctx;

538

#endif /* PYHST_MEASURE_TIMINGS */

539

540

hst_cuda_free_context(rctx);

541

hst_reconstructor_free_context(rctx);

542

543

#ifdef PYHST_MEASURE_TIMINGS

544

ctx = GPU_CONTEXT(rctx);

545

546

g_timer_destroy(ctx->texture_timer);

547

g_timer_destroy(ctx->fromgpu_timer);

548

g_timer_destroy(ctx->init_timer);

549

g_timer_destroy(ctx->togpu_timer);

550

g_timer_destroy(ctx->pre_timer);

551

g_timer_destroy(ctx->main_timer);

552

#endif /* PYHST_MEASURE_TIMINGS */

553

554

free(rctx);

555

}

556

557

558

static int hst_cuda_configure(HSTReconstructorContextPtr rctx, HSTChanged what_changed) {

559

GPUContext *ctx;

560

HSTSetup *setup;

561

562

assert(rctx);

563

564

ctx = GPU_CONTEXT(rctx);

565

setup = rctx->setup;

566

567

if (what_changed&HST_FILTER_CHANGED) {

568

#ifdef PYHST_MEASURE_TIMINGS

569

g_timer_continue(ctx->init_timer);

570

#endif /* PYHST_MEASURE_TIMINGS */

571

CUDA_SAFE_CALL(cudaMemcpy(ctx->gpu_filter, setup->filter, (2 * setup->dim_fft)*sizeof(float), cudaMemcpyHostToDevice));

572

#ifdef PYHST_MEASURE_TIMINGS

573

g_timer_stop(ctx->init_timer);

574

#endif /* PYHST_MEASURE_TIMINGS */

575

}

576

if (what_changed&HST_LIMITS_CHANGED) {

577

if (setup->fai360) {

578

check_code(hst_cuda_configure_limits(ctx, setup), "hst_cuda_configure_limit");

579

}

580

}

581

582

return 0;

583

}

584

585

static HSTConstString hst_cuda_get_title(HSTReconstructorConstContextPtr rctx) {

586

assert(rctx);

587

588

return hst_cuda_device_prop[GPU_CONTEXT(rctx)->device].name;

589

}

590

591

592

static HSTConstString *hst_cuda_get_timers(HSTReconstructorConstContextPtr rctx, double *timers) {

593

#ifdef PYHST_MEASURE_TIMINGS

594

assert(rctx);

595

596

if (timers) {

597

timers[1] = g_timer_elapsed(GPU_CONTEXT(rctx)->togpu_timer, NULL);

598

timers[2] = g_timer_elapsed(GPU_CONTEXT(rctx)->fromgpu_timer, NULL);

599

timers[3] = g_timer_elapsed(GPU_CONTEXT(rctx)->texture_timer, NULL);

600

timers[4] = g_timer_elapsed(GPU_CONTEXT(rctx)->pre_timer, NULL);

601

timers[5] = g_timer_elapsed(GPU_CONTEXT(rctx)->main_timer, NULL);

602

timers[6] = g_timer_elapsed(GPU_CONTEXT(rctx)->init_timer, NULL);

603

timers[0] = timers[1] + timers[2] + timers[3] + timers[4] + timers[5]; // no init included

604

}

605

#endif /* PYHST_MEASURE_TIMINGS */

606

607

return hst_cuda_timers;

608

}

609

610

611

static int hst_cuda_send(HSTReconstructorContext *rctx, const float *data) {

612

GPUContext *ctx;

613

HSTSetup *setup;

614

615

ctx = GPU_CONTEXT(rctx);

616

setup = rctx->setup;

617

618

#ifdef PYHST_MEASURE_TIMINGS

619

g_timer_continue(ctx->togpu_timer);

620

#endif /* PYHST_MEASURE_TIMINGS */

621

622

/*

623

// CUDA 4.2 profiller will show nothing if there is not enough runtime on default stream

624

if ((!ctx->current_buffer)||(!data)) {

625

int i;

626

dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

627

dim3 dimGrid(1, 1);

628

for (i = 0; i<128;i++) {

629

hst_cuda_pack_kernel_zpad<<<dimGrid,dimBlock>>>(ctx->gpu_buffer, setup->dim_fft, ctx->gpu_data, ctx->bin_pitch_size, setup->num_bins);

630

}

631

}

632

*/

633

634

if (ctx->current_buffer) {

635

cudaStreamSynchronize(ctx->stream[2]);

636

ctx->gpu_data = ctx->gpu_input[ctx->current_buffer - 1];

637

ctx->gpu_result = ctx->gpu_output[ctx->current_buffer - 1];

638

ctx->synchronized = 1;

639

}

640

641

642

if (data) {

643

if ((++ctx->current_buffer) > 2) ctx->current_buffer = 1;

644

645

if (setup->num_projections%2) {

646

CUDA_SAFE_CALL(cudaMemset(ctx->gpu_input[ctx->current_buffer - 1] + setup->num_projections * ctx->bin_pitch_size, 0, ctx->bin_pitch_size*sizeof(float)));

647

}

648

649

if (blocking) {

650

CUDA_SAFE_CALL(cudaMemcpy2D(ctx->gpu_input[ctx->current_buffer - 1], ctx->bin_pitch_size * sizeof(float), data, setup->num_bins * sizeof(float), setup->num_bins * sizeof(float), setup->num_projections, cudaMemcpyHostToDevice));

651

} else {

652

CUDA_SAFE_CALL(cudaMemcpy2DAsync(ctx->gpu_input[ctx->current_buffer - 1], ctx->bin_pitch_size * sizeof(float), data, setup->num_bins * sizeof(float), setup->num_bins * sizeof(float), setup->num_projections, cudaMemcpyHostToDevice, ctx->stream[2]));

653

}

654

} else ctx->current_buffer = 0;

655

656

#ifdef PYHST_MEASURE_TIMINGS

657

g_timer_stop(ctx->togpu_timer);

658

#endif /* PYHST_MEASURE_TIMINGS */

659

660

return 0;

661

}

662

663

static int hst_cuda_wait(HSTReconstructorContext *rctx) {

664

GPUContext *ctx;

665

HSTSetup *setup;

666

667

ctx = GPU_CONTEXT(rctx);

668

setup = rctx->setup;

669

670

#ifdef PYHST_MEASURE_TIMINGS

671

g_timer_continue(ctx->fromgpu_timer);

672

#endif /* PYHST_MEASURE_TIMINGS */

673

674

cudaStreamSynchronize(ctx->stream[3]);

675

676

#ifdef PYHST_MEASURE_TIMINGS

677

g_timer_stop(ctx->fromgpu_timer);

678

#endif /* PYHST_MEASURE_TIMINGS */

679

680

return 0;

681

}

682

683

static int hst_cuda_reconstruct(HSTReconstructorContext *rctx, float *SLICE, const float *SINOGRAMS) {

684

int i = 0;

685

686

int kepler;

687

int fermi;

688

int ppt;

689

690

int batch_size;//, nextbatch;

691

int batch;

692

693

GPUContext *ctx;

694

HSTSetup *setup;

695

696

float *gpu_data;

697

float *gpu_buffer;

698

const float *data;

699

700

int dim_fft, num_proj, num_bins;

701

int num_x, num_y;

702

int dimrecx;

703

float axis_position;

704

705

int mid;

706

707

assert(rctx);

708

ctx = GPU_CONTEXT(rctx);

709

setup = rctx->setup;

710

711

assert(ctx);

712

assert(setup);

713

714

kepler = ctx->kepler;

715

fermi = ctx->fermi;

716

ppt = ctx->points_per_thread;

717

718

gpu_buffer = ctx->gpu_buffer;

719

720

dim_fft = setup->dim_fft;

721

num_proj = setup->num_projections;

722

num_bins = setup->num_bins;

723

num_x = setup->num_x;

724

num_y = setup->num_y;

725

axis_position = setup->axis_position;

726

727

728

mid = (num_bins + dim_fft) / 2;

729

730

dim3 dimPadGrid(setup->dim_fft / BLOCK_SIZE, ctx->projection_grid_size);

731

dim3 dimInputGrid(ctx->bin_grid_size, ctx->projection_grid_size);

732

dim3 dimFilterGrid(ctx->filter_grid_size, ctx->projection_grid_size);

733

dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

734

735

736

batch_size = ctx->fft_batch_size; // Could be shorter on the last iteration

737

738

739

if ((ctx->current_buffer)&&(!ctx->synchronized)) {

740

hst_cuda_send(rctx, NULL);

741

}

742

743

/*

744

#ifdef PYHST_MEASURE_TIMINGS

745

g_timer_continue(ctx->mem_timer);

746

#endif

747

CUDA_SAFE_CALL(cudaMemcpy2DAsync(ctx->gpu_data, ctx->bin_pitch_size * sizeof(float), SINOGRAMS, num_bins * sizeof(float), num_bins * sizeof(float), batch_size, cudaMemcpyHostToDevice, ctx->stream[0]));

748

#ifdef PYHST_MEASURE_TIMINGS

749

g_timer_stop(ctx->mem_timer);

750

#endif

751

*/

752

753

// Debug: Clean to prevent accomulating of error

754

//CUDA_SAFE_CALL(cudaMemset(ctx->gpu_data, 0, 2*hst_cuda_calc_blocks(num_proj, 2, NULL)*ctx->bin_pitch_size*sizeof(float)));

755

756

/* In the case of odd number of projections, our pad function will not handle second projection of the last pair, it

757

will be simply copied from the gpu_data. However, even if gpu_data is originally zeroed, the convolution may introduce

758

small numbers. Over multiple slices, this small numbers may accomulate to significant value affecting the precision of

759

convolution and, hence, the value of the first vector of the pair. This effect could be easily seen if fai360 mode is

760

on */

761

if ((num_proj%2)&&(!ctx->synchronized)) {

762

#ifdef PYHST_MEASURE_TIMINGS

763

g_timer_continue(ctx->togpu_timer);

764

#endif /* PYHST_MEASURE_TIMINGS */

765

CUDA_SAFE_CALL(cudaMemset(ctx->gpu_data + num_proj * ctx->bin_pitch_size, 0, ctx->bin_pitch_size*sizeof(float)));

766

#ifdef PYHST_MEASURE_TIMINGS

767

g_timer_stop(ctx->togpu_timer);

768

#endif /* PYHST_MEASURE_TIMINGS */

769

}

770

771

for (batch = 0, i = 0; batch < num_proj; batch += batch_size, ++i) {

772

data = SINOGRAMS + batch * num_bins;

773

gpu_data = ctx->gpu_data + batch * ctx->bin_pitch_size;

774

775

if (batch + batch_size >= num_proj) {

776

batch_size = num_proj - batch;

777

}

778

779

#ifdef PYHST_MEASURE_TIMINGS

780

g_timer_continue(ctx->togpu_timer);

781

#endif /* PYHST_MEASURE_TIMINGS */

782

if (!ctx->synchronized) {

783

if (blocking) {

784

CUDA_SAFE_CALL(cudaMemcpy2D(gpu_data, ctx->bin_pitch_size * sizeof(float), data, num_bins * sizeof(float), num_bins * sizeof(float), batch_size, cudaMemcpyHostToDevice));

785

} else {

786

CUDA_SAFE_CALL(cudaMemcpy2DAsync(gpu_data, ctx->bin_pitch_size * sizeof(float), data, num_bins * sizeof(float), num_bins * sizeof(float), batch_size, cudaMemcpyHostToDevice, ctx->stream[i%2]));

787

}

788

}

789

790

// Instead we can allocate dedicated gpu_buffers...

791

cudaStreamSynchronize(ctx->stream[(i+1)%2]);

792

#ifdef PYHST_MEASURE_TIMINGS

793

g_timer_stop(ctx->togpu_timer);

794

#endif /* PYHST_MEASURE_TIMINGS */

795

796

#ifdef PYHST_MEASURE_TIMINGS

797

g_timer_continue(ctx->pre_timer);

798

#endif /* PYHST_MEASURE_TIMINGS */

799

if (dim_fft > num_bins) {

800

if (setup->zero_padding) {

801

hst_cuda_pack_kernel_zpad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft, gpu_data, ctx->bin_pitch_size, num_bins);

802

} else {

803

hst_cuda_pack_kernel_epad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft, gpu_data, ctx->bin_pitch_size, num_bins, mid);

804

}

805

} else {

806

hst_cuda_pack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft, gpu_data, ctx->bin_pitch_size);

807

}

808

809

CUFFT_SAFE_CALL(cufftExecC2C(ctx->fft_plan[i%2], (cufftComplex*)gpu_buffer, (cufftComplex*)gpu_buffer, CUFFT_FORWARD));

810

hst_cuda_filter_kernel<<<dimFilterGrid,dimBlock,0,ctx->stream[i%2]>>>(2*dim_fft, gpu_buffer, ctx->gpu_filter);

811

CUFFT_SAFE_CALL(cufftExecC2C(ctx->fft_plan[i%2], (cufftComplex*)gpu_buffer, (cufftComplex*)gpu_buffer, CUFFT_INVERSE));

812

813

if (setup->fai360) {

814

hst_cuda_unpack_kernel_fai360<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft, ctx->bin_grid_size * BLOCK_SIZE / 2, ctx->gpu_limits + 2 * batch, batch);

815

} else {

816

hst_cuda_unpack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft);

817

}

818

819

#ifdef PYHST_MEASURE_TIMINGS

820

g_timer_stop(ctx->pre_timer);

821

#endif /* PYHST_MEASURE_TIMINGS */

822

}

823

824

#ifdef PYHST_MEASURE_TIMINGS

825

g_timer_continue(ctx->pre_timer);

826

#endif /* PYHST_MEASURE_TIMINGS */

827

cudaStreamSynchronize(ctx->stream[(i+1)%2]);

828

#ifdef PYHST_MEASURE_TIMINGS

829

g_timer_stop(ctx->pre_timer);

830

#endif /* PYHST_MEASURE_TIMINGS */

831

832

/*

833

// Debug: Check filtered projections itself

834

CUDA_SAFE_CALL(cudaMemcpy2D(SLICE, num_x * sizeof(float), ctx->gpu_data, ctx->bin_pitch_size * sizeof(float), num_x * sizeof(float), 652, cudaMemcpyDeviceToHost));

835

return 0;

836

*/

837

838

#ifdef PYHST_MEASURE_TIMINGS

839

g_timer_continue(ctx->texture_timer);

840

#endif /* PYHST_MEASURE_TIMINGS */

841

842

#ifdef HST_TEXTURE16

843

nppsMinMax_32f(ctx->gpu_data, num_proj * ctx->bin_pitch_size, (float*)(((char*)ctx->gpu_buffer) + ctx->mmax_buffer_size), (float*)(((char*)ctx->gpu_buffer) + ctx->mmax_buffer_size + sizeof(float)), (Npp8u*)ctx->gpu_buffer);

844

845

dim3 dimAllInputGrid(ctx->bin_grid_size, ctx->allprojection_grid_size);

846

hst_cuda_normalize_kernel<<<dimAllInputGrid,dimBlock>>>(ctx->gpu_intdata, ctx->gpu_data, ctx->bin_pitch_size, (float*)(((char*)ctx->gpu_buffer) + ctx->mmax_buffer_size));

847

# ifdef HST_TEXTURE16_PACKED

848

CUDA_SAFE_CALL( cudaBindTexture2D(NULL, int_projes, ctx->gpu_intdata, ctx->float_desc, num_bins/2 + num_bins%2, num_proj, ctx->bin_pitch_size * sizeof(unsigned short)) );

849

# else /* HST_TEXTURE16_PACKED */

850

CUDA_SAFE_CALL( cudaBindTexture2D(NULL, int_projes, ctx->gpu_intdata, ctx->float_desc, num_bins, num_proj, ctx->bin_pitch_size * sizeof(unsigned short)) );

851

# endif /* HST_TEXTURE16_PACKED */

852

#else /* HST_TEXTURE16 */

853

CUDA_SAFE_CALL( cudaBindTexture2D(NULL, tex_projes, ctx->gpu_data, ctx->float_desc, num_bins, num_proj, ctx->bin_pitch_size * sizeof(float)) );

854

#endif /* HST_TEXTURE16 */

855

856

#ifdef PYHST_MEASURE_TIMINGS

857

g_timer_stop(ctx->texture_timer);

858

#endif /* PYHST_MEASURE_TIMINGS */

859

860

dim3 dimBPBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y);

861

dim3 dimGrid(ctx->bp_grid_columns/ppt, ctx->bp_grid_lines/ppt );

862

dimrecx = ctx->bp_grid_columns * BLOCK_SIZE_X;

863

864

batch_size = ctx->bp_batch_size; // Could be shorter on the last iteration

865

866

for (batch = 0, i = 0; batch < num_y; batch += batch_size, ++i) {

867

if (batch + batch_size >= num_y) {

868

batch_size = num_y - batch;

869

}

870

871

#ifdef PYHST_MEASURE_TIMINGS

872

g_timer_continue(ctx->main_timer);

873

#endif /* PYHST_MEASURE_TIMINGS */

874

875

#ifdef HST_TEXTURE16

876

# ifdef HST_TEXTURE16_PACKED

877

hst_cuda_intkernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, (uint32_t*)ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);

878

# else /* HST_TEXTURE16_PACKED */

879

hst_cuda_intkernel_esrf<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, (uint32_t*)ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);

880

# endif /* HST_TEXTURE16_PACKED */

881

hst_cuda_denormalize_kernel<<<dimGrid,dimBPBlock, 0, ctx->stream[i%2]>>>(ctx->gpu_result + dimrecx * batch, (float*)(((char*)ctx->gpu_buffer) + ctx->mmax_buffer_size), num_proj);

882

#else /* HST_TEXTURE16 */

883

if (kepler) {

884

#ifdef HST_OPTIMIZE_KEPLER

885

if (setup->oversampling) {

886

hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);

887

} else {

888

hst_kepler_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);

889

890

//dim3 dimBPBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y/2);

891

//hst_cuda_mplinear_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);

892

}

893

#else /* HST_OPTIMIZE_KEPLER */

894

hst_kepler_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);

895

#endif /* HST_OPTIMIZE_KEPLER */

896

897

} else if (fermi) {

898

# ifndef HST_FAST_MODE

899

if (setup->oversampling == 1) {

900

# endif /* ! HST_FAST_MODE */

901

hst_cuda_nearest_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);

902

# ifndef HST_FAST_MODE

903

} else if (setup->oversampling) {

904

# ifdef HW_IGNORE_OLD_HARDWARE

905

hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);

906

# else /* HW_IGNORE_OLD_HARDWARE */

907

hst_cuda_oversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);

908

# endif /* HW_IGNORE_OLD_HARDWARE */

909

} else {

910

hst_cuda_linear_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);

911

}

912

} else {

913

hst_cuda_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, ctx->gpu_result, setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);

914

}

915

# endif /* ! HST_FAST_MODE */

916

#endif /* HST_TEXTURE16 */

917

918

#ifdef PYHST_MEASURE_TIMINGS

919

g_timer_stop(ctx->main_timer);

920

g_timer_continue(ctx->fromgpu_timer);

921

#endif /* PYHST_MEASURE_TIMINGS */

922

923

if (!ctx->synchronized) {

924

if (blocking) {

925

CUDA_SAFE_CALL(cudaMemcpy2D(SLICE + num_x * batch, num_x*sizeof(float), ctx->gpu_result + dimrecx * batch, dimrecx * sizeof(float), num_x * sizeof(float), batch_size, cudaMemcpyDeviceToHost));

926

} else {

927

CUDA_SAFE_CALL(cudaMemcpy2DAsync(SLICE + num_x * batch, num_x*sizeof(float), ctx->gpu_result + dimrecx * batch, dimrecx * sizeof(float), num_x * sizeof(float), batch_size, cudaMemcpyDeviceToHost, ctx->stream[i%2]));

928

}

929

}

930

931

#ifdef PYHST_MEASURE_TIMINGS

932

g_timer_stop(ctx->fromgpu_timer);

933

#endif /* PYHST_MEASURE_TIMINGS */

934

}

935

936

937

#ifdef PYHST_MEASURE_TIMINGS

938

g_timer_continue(ctx->main_timer);

939

#endif /* PYHST_MEASURE_TIMINGS */

940

941

if (ctx->synchronized) {

942

cudaStreamSynchronize(ctx->stream[0]);

943

cudaStreamSynchronize(ctx->stream[1]);

944

}

945

946

#ifdef PYHST_MEASURE_TIMINGS

947

g_timer_stop(ctx->main_timer);

948

g_timer_continue(ctx->texture_timer);

949

#endif /* PYHST_MEASURE_TIMINGS */

950

951

cudaUnbindTexture(tex_projes);

952

953

954

#ifdef PYHST_MEASURE_TIMINGS

955

g_timer_stop(ctx->texture_timer);

956

g_timer_continue(ctx->fromgpu_timer);

957

#endif /* PYHST_MEASURE_TIMINGS */

958

959

if (ctx->synchronized) {

960

if (blocking) {

961

CUDA_SAFE_CALL(cudaMemcpy2D(SLICE, num_x * sizeof(float), ctx->gpu_result, dimrecx * sizeof(float), num_x * sizeof(float), num_y, cudaMemcpyDeviceToHost));

962

} else {

963

cudaStreamSynchronize(ctx->stream[3]);

964

CUDA_SAFE_CALL(cudaMemcpy2DAsync(SLICE, num_x * sizeof(float), ctx->gpu_result, dimrecx * sizeof(float), num_x * sizeof(float), num_y, cudaMemcpyDeviceToHost, ctx->stream[3]));

965

}

966

} else {

967

cudaThreadSynchronize();

968

}

969

970

#ifdef PYHST_MEASURE_TIMINGS

971

g_timer_stop(ctx->fromgpu_timer);

972

#endif /* PYHST_MEASURE_TIMINGS */

973

974

ctx->synchronized = 0;

975

976

return 0;

977

}

978

979

980

static HSTReconstructor hst_gpu_info = {

981

0,

982

hst_cuda_get_title,

983

hst_cuda_create_context,

984

hst_cuda_init_context,

985

hst_cuda_free_context,

986

hst_cuda_destroy_context,

987

hst_cuda_configure,

988

hst_cuda_send,

989

NULL,

990

hst_cuda_reconstruct,

991

hst_reconstructor_postprocess_slice,

992

hst_cuda_wait,

993

hst_cuda_get_timers

994

};

995

996

997

HSTReconstructor *dfi_cuda_init(int flags) {

998

int i, dev;

999

char *stmp;

1000

int device_count;

1001

1002

CUDA_SAFE_CALL(cudaGetDeviceCount(&device_count));

1003

1004

if (device_count > HST_CUDA_MAX_DEVICES) {

1005

pyhst_warning("There is %u CUDA-enabled devices detected in the system, but hst_cuda is configured to use only %u", device_count, HST_CUDA_MAX_DEVICES);

1006

device_count = HST_CUDA_MAX_DEVICES;

1007

}

1008

1009

pyhst_info("NVIDIA devices: %u", device_count);

1010

for (i = 0, dev = 0; i < device_count; ++i) {

1011

CUDA_SAFE_CALL(cudaGetDeviceProperties(&hst_cuda_device_prop[dev], i));

1012

1013

if ((hst_cuda_device_prop[dev].computeMode == cudaComputeModeProhibited)||((hst_cuda_device_prop[dev].major == 9999 && hst_cuda_device_prop[dev].minor == 9999))) continue;

1014

#ifdef HW_IGNORE_OLD_HARDWARE

1015

if (hst_cuda_device_prop[dev].major < 2) continue;

1016

#endif /* HW_IGNORE_OLD_HARDWARE */

1017

pyhst_debug(" * Device %d: %s v. %d.%d: %d SM, %.2f MHZ, %d MB%s", dev, hst_cuda_device_prop[dev].name, hst_cuda_device_prop[dev].major, hst_cuda_device_prop[dev].minor, hst_cuda_device_prop[dev].multiProcessorCount, hst_cuda_device_prop[dev].clockRate * 1e-6, hst_cuda_device_prop[dev].totalGlobalMem/1048576, hst_cuda_device_prop[dev].deviceOverlap?", overlap":"");

1018

1019

hst_cuda_device_num[dev++] = i;

1020

}

1021

1022

stmp = getenv("CUDA_LAUNCH_BLOCKING");

1023

if ((stmp)&&(stmp[0])&&(stmp[0] != '0')) blocking = 1;

1024

1025

hst_gpu_info.devices = dev * PARALLEL_PER_GPU;

1026

return &hst_gpu_info;

1027

}

1028

1029

void dfi_cuda_free() {

1030

}

1031