735
742
cudaStreamCreate(&ctx->stream[2]);
736
743
cudaStreamCreate(&ctx->stream[3]);
738
746
CUFFT_SAFE_CALL(cufftPlan1d(&ctx->fft_plan[0], setup->dim_fft, CUFFT_C2C, SLICE_BLOCK * (ctx->fft_batch_size/2)));
739
747
CUFFT_SAFE_CALL(cufftPlan1d(&ctx->fft_plan[1], setup->dim_fft, CUFFT_C2C, SLICE_BLOCK * (ctx->fft_batch_size/2)));
749
int dim_fft = setup->dim_fft;
751
// In Fourier domain, the parameters are given in cufftComplex. I.e. they are double size
752
CUFFT_SAFE_CALL(cufftPlanMany(&ctx->fft_plan[0], 1, &dim_fft, &dim_fft, 1, dim_fft, &dim_fft, 1, dim_fft, CUFFT_R2C, SLICE_BLOCK * (ctx->fft_batch_size)));
753
CUFFT_SAFE_CALL(cufftPlanMany(&ctx->fft_plan[1], 1, &dim_fft, &dim_fft, 1, dim_fft, &dim_fft, 1, dim_fft, CUFFT_R2C, SLICE_BLOCK * (ctx->fft_batch_size)));
754
CUFFT_SAFE_CALL(cufftPlanMany(&ctx->ifft_plan[0], 1, &dim_fft, &dim_fft, 1, dim_fft, &dim_fft, 1, dim_fft, CUFFT_C2R, SLICE_BLOCK * (ctx->fft_batch_size)));
755
CUFFT_SAFE_CALL(cufftPlanMany(&ctx->ifft_plan[1], 1, &dim_fft, &dim_fft, 1, dim_fft, &dim_fft, 1, dim_fft, CUFFT_C2R, SLICE_BLOCK * (ctx->fft_batch_size)));
740
758
#ifndef BUGGY_CUFFT_SET_STREAM
741
759
CUFFT_SAFE_CALL(cufftSetStream(ctx->fft_plan[0], ctx->stream[0]));
742
760
CUFFT_SAFE_CALL(cufftSetStream(ctx->fft_plan[1], ctx->stream[1]));
762
CUFFT_SAFE_CALL(cufftSetStream(ctx->ifft_plan[0], ctx->stream[0]));
763
CUFFT_SAFE_CALL(cufftSetStream(ctx->ifft_plan[1], ctx->stream[1]));
743
765
#endif /* ! BUGGY_CUFFT_SET_STREAM */
1265
1299
g_timer_continue(ctx->pre_timer);
1266
1300
#endif /* PYHST_MEASURE_TIMINGS */
1303
int dim_fft_pitch = dim_fft;
1305
int dim_fft_pitch = 1 * dim_fft;
1268
1308
if (dim_fft > num_bins) {
1269
1309
if (setup->zero_padding) {
1270
hst_cuda_pack_kernel_zpad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft, gpu_data, ctx->bin_pitch_size, num_bins);
1310
hst_cuda_pack_kernel_zpad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft_pitch, gpu_data, ctx->bin_pitch_size, num_bins);
1272
hst_cuda_pack_kernel_epad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft, gpu_data, ctx->bin_pitch_size, num_bins, mid);
1312
hst_cuda_pack_kernel_epad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft_pitch, gpu_data, ctx->bin_pitch_size, num_bins, mid);
1275
hst_cuda_pack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft, gpu_data, ctx->bin_pitch_size);
1315
hst_cuda_pack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft_pitch, gpu_data, ctx->bin_pitch_size);
1319
dim3 dimFilterGrid(ctx->filter_grid_size, SLICE_BLOCK * ctx->projection_grid_size);
1278
1320
CUFFT_SAFE_CALL(cufftExecC2C(ctx->fft_plan[i%2], (cufftComplex*)gpu_buffer, (cufftComplex*)gpu_buffer, CUFFT_FORWARD));
1279
1321
hst_cuda_filter_kernel<<<dimFilterGrid,dimBlock,0,ctx->stream[i%2]>>>(2*dim_fft, gpu_buffer, ctx->gpu_filter);
1280
1322
CUFFT_SAFE_CALL(cufftExecC2C(ctx->fft_plan[i%2], (cufftComplex*)gpu_buffer, (cufftComplex*)gpu_buffer, CUFFT_INVERSE));
1324
dim3 dimFilterGrid(ctx->filter_grid_size, 2 * SLICE_BLOCK * ctx->projection_grid_size); // i.e. (2 * dim_fft) x (2 * (num_proj/2) * SLICE_BLOCK)
1325
CUFFT_SAFE_CALL(cufftExecR2C(ctx->fft_plan[i%2], (cufftReal*)gpu_buffer, (cufftComplex*)ctx->fft_buffer));
1326
hst_cuda_filter_kernel<<<dimFilterGrid,dimBlock,0,ctx->stream[i%2]>>>(2 * dim_fft, ctx->fft_buffer, ctx->gpu_filter);
1327
CUFFT_SAFE_CALL(cufftExecC2R(ctx->ifft_plan[i%2], (cufftComplex*)ctx->fft_buffer, (cufftReal*)gpu_buffer));
1282
1330
if (setup->fai360) {
1283
hst_cuda_unpack_kernel_fai360<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft, ctx->bin_grid_size * BLOCK_SIZE / 2, ctx->gpu_limits + 2 * batch, batch);
1331
#ifdef PYHST_ASTRA_SCALING
1332
printf("Astra scaling is not supported in FAI360 mode yet\n");
1335
hst_cuda_unpack_kernel_fai360<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft_pitch, ctx->bin_grid_size * BLOCK_SIZE / 2, ctx->gpu_limits + 2 * batch, batch);
1286
hst_cuda_unpack_to_half_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>((half*)gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft);
1338
# ifdef PYHST_ASTRA_SCALING
1339
printf("Astra scaling is not supported in HALF mode yet\n");
1342
hst_cuda_unpack_to_half_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>((half*)gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft_pitch);
1287
1343
#else /* HST_FLOAT16 */
1288
hst_cuda_unpack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft);
1344
# ifdef PYHST_ASTRA_SCALING
1345
const float scaling = M_PI_F / (2 * setup->num_projections * dim_fft);
1347
const float scaling = 1.f;
1349
hst_cuda_unpack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft_pitch, scaling);
1289
1350
#endif /* HST_FLOAT16 */
1358
1419
if (ctx->base_kernel) {
1359
1420
hst_report_kernel(ctx, "base", ppt, dimGrid, dimBPBlock);
1360
hst_cuda_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
1421
hst_cuda_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x - axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
1361
1422
} else if (ctx->tex_kernel) {
1362
1423
hst_report_kernel(ctx, "tex", ppt, dimGrid, dimBPBlock);
1363
hst_kepler_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
1364
// hst_kepler_orig_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
1424
hst_kepler_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x - axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
1425
// hst_kepler_orig_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x - axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
1365
1426
} else if (ctx->linear_kernel) {
1367
1428
if (setup->oversampling == 1) {
1368
1429
dim3 dimBPBlockA(ctx->block_size_x, ctx->block_size_y/HST_NN_ASSYMETRY);
1369
1430
hst_report_kernel(ctx, "alu_nn", ppt, dimGrid, dimBPBlockA);
1370
hst_cuda_alu_nn<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
1431
hst_cuda_alu_nn<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x - axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
1371
1432
} else if (setup->oversampling) {
1372
1433
dim3 dimBPBlockA(ctx->block_size_x, ctx->block_size_y/HST_OVERS_ASSYMETRY);
1373
1434
hst_report_kernel(ctx, "alu_oversample4", ppt, dimGrid, dimBPBlockA);
1374
hst_cuda_alu_oversample4<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
1435
hst_cuda_alu_oversample4<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x - axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
1376
1437
#ifdef HST_HYBRID
1377
1438
# if HST_LINEAR_ASSYMETRY > 1
1378
1439
# error "Assymetry is not supported with hybrid kernels"
1380
1441
hst_report_kernel(ctx, "alu_hybrid", ppt, dimGrid, dimBPBlock);
1381
hst_cuda_linear_hybrid<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
1442
hst_cuda_linear_hybrid<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x - axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
1383
1444
dim3 dimBPBlockA(ctx->block_size_x, ctx->block_size_y/HST_LINEAR_ASSYMETRY);
1384
1445
hst_report_kernel(ctx, "alu_linear", ppt, dimGrid, dimBPBlockA);
1385
hst_cuda_alu_linear<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
1446
hst_cuda_alu_linear<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x - axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
1388
1449
/* } else if (ctx->mplinear_kernel) {
1389
1450
if (setup->oversampling) {
1390
1451
hst_report_kernel(ctx, "mpoversample4", ppt, dimGrid, dimBPBlock);
1391
hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
1452
hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x - axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
1393
1454
#ifdef HST_MP_ASYMMETRIC
1394
1455
dim3 dimBPBlockA(ctx->block_size_x, ctx->block_size_y/2);
1395
1456
hst_report_kernel(ctx, "mplinear", ppt, dimGrid, dimBPBlockA);
1396
hst_cuda_mplinear_kernel<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
1457
hst_cuda_mplinear_kernel<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x - axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
1398
1459
hst_report_kernel(ctx, "mplinear/symmetric", ppt, dimGrid, dimBPBlock);
1399
hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x - axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
1460
hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x - axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
1400
1461
#endif // HST_MP_ASYMMETRIC