/tomo/pyhst

To get this branch, use:
bzr branch http://darksoft.org/webbzr/tomo/pyhst

Viewing changes to hst_cuda/hst_cuda.cu

  • Committer: Suren A. Chilingaryan
  • Date: 2017-09-28 10:34:47 UTC
  • Revision ID: csa@suren.me-20170928103447-dggjgnuxmymgew2a
Quality fixes (tex-based)

=== modified file 'hst_cuda/hst_cuda.cu'
@@ -181,6 +181,9 @@
 #endif /* HST_CUDA_ARRAY */
 
     cufftHandle fft_plan[2];            //!< Complex plan for fourier transformations (both forward and inverse)
+#ifndef HST_FILTER2
+    cufftHandle ifft_plan[2];           //!< Inverse (C2R) plans for the real-transform filtering path
+#endif
 
     int current_buffer;         //!< 1 or 2 (0 means no active buffer yet)
     int synchronized;           //!< Indicates if current input buffer is synchronised
@@ -190,6 +193,7 @@
 
     float *gpu_limits;          //!< Parameters for fai360-mode corrections
     float *gpu_data;            //!< Sinogram buffer in device memory
+    float *fft_buffer;          //!< Filtering buffer
     float *gpu_buffer;          //!< Temporary computation buffer in GPU memory (for filtering, can hold a single batch)
     float *gpu_filter;          //!< Filter buffer in device memory
     float *gpu_result;          //!< Reconstructed slice is going here
@@ -706,6 +710,9 @@
 
 
     CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_buffer, SLICE_BLOCK * ctx->fft_batch_size*setup->dim_fft*sizeof(float)));
+    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->fft_buffer,    2 * SLICE_BLOCK * ctx->fft_batch_size*setup->dim_fft*sizeof(float)));
+    CUDA_SAFE_CALL(cudaMemset((void*)  ctx->fft_buffer, 0, 2 * SLICE_BLOCK * ctx->fft_batch_size*setup->dim_fft*sizeof(float)));
+
     CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_filter, 2*setup->dim_fft*sizeof(float)));
     CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_limits, 2*num_projections*sizeof(float)));
 
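Note on the new allocation: a real-to-complex transform of a length-N row produces N/2 + 1 cufftComplex values (N + 2 floats), so reserving one cufftComplex per input sample, i.e. the 2 * N floats per row seen above, is a safe upper bound that also keeps the complex buffer's row pitch in step with the real one. A minimal sketch of the size arithmetic, with illustrative names (rows, dim_fft) standing in for the context fields:

    #include <stddef.h>

    /* One cufftComplex (two floats) per real input sample per row.  This
     * slightly over-allocates: cufftExecR2C only writes dim_fft/2 + 1
     * complex bins per row, but the round pitch simplifies indexing. */
    static size_t fft_buffer_bytes(size_t rows, size_t dim_fft) {
        return rows * 2 * dim_fft * sizeof(float);
    }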
@@ -735,11 +742,26 @@
     cudaStreamCreate(&ctx->stream[2]);
     cudaStreamCreate(&ctx->stream[3]);
 
+#ifdef HST_FILTER2
     CUFFT_SAFE_CALL(cufftPlan1d(&ctx->fft_plan[0], setup->dim_fft, CUFFT_C2C, SLICE_BLOCK * (ctx->fft_batch_size/2)));
     CUFFT_SAFE_CALL(cufftPlan1d(&ctx->fft_plan[1], setup->dim_fft, CUFFT_C2C, SLICE_BLOCK * (ctx->fft_batch_size/2)));
+#else
+    int dim_fft = setup->dim_fft;
+
+        // In the Fourier domain the data is stored as cufftComplex values, i.e. it occupies twice the space
+    CUFFT_SAFE_CALL(cufftPlanMany(&ctx->fft_plan[0], 1, &dim_fft, &dim_fft, 1, dim_fft, &dim_fft, 1, dim_fft, CUFFT_R2C, SLICE_BLOCK * (ctx->fft_batch_size)));
+    CUFFT_SAFE_CALL(cufftPlanMany(&ctx->fft_plan[1], 1, &dim_fft, &dim_fft, 1, dim_fft, &dim_fft, 1, dim_fft, CUFFT_R2C, SLICE_BLOCK * (ctx->fft_batch_size)));
+    CUFFT_SAFE_CALL(cufftPlanMany(&ctx->ifft_plan[0], 1, &dim_fft, &dim_fft, 1, dim_fft, &dim_fft, 1, dim_fft, CUFFT_C2R, SLICE_BLOCK * (ctx->fft_batch_size)));
+    CUFFT_SAFE_CALL(cufftPlanMany(&ctx->ifft_plan[1], 1, &dim_fft, &dim_fft, 1, dim_fft, &dim_fft, 1, dim_fft, CUFFT_C2R, SLICE_BLOCK * (ctx->fft_batch_size)));
+#endif
+
 #ifndef BUGGY_CUFFT_SET_STREAM
     CUFFT_SAFE_CALL(cufftSetStream(ctx->fft_plan[0], ctx->stream[0]));
     CUFFT_SAFE_CALL(cufftSetStream(ctx->fft_plan[1], ctx->stream[1]));
+# ifndef HST_FILTER2
+    CUFFT_SAFE_CALL(cufftSetStream(ctx->ifft_plan[0], ctx->stream[0]));
+    CUFFT_SAFE_CALL(cufftSetStream(ctx->ifft_plan[1], ctx->stream[1]));
+# endif
 #endif /* ! BUGGY_CUFFT_SET_STREAM */
 
     /* 
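In the new non-HST_FILTER2 branch, forward and inverse transforms need separate plans because CUFFT_R2C and CUFFT_C2R are distinct cuFFT plan types, unlike the single C2C plan that previously served both directions. The advanced-layout arguments above (inembed = onembed = &dim_fft, stride 1, distance dim_fft) simply describe densely packed rows. A self-contained sketch of an equivalent plan pair, with placeholder sizes:

    #include <cufft.h>
    #include <stdio.h>

    int main(void) {
        int n = 2048;    /* transform length; stands in for setup->dim_fft */
        int batch = 64;  /* rows per call; stands in for SLICE_BLOCK * fft_batch_size */
        cufftHandle fwd, inv;

        /* embed = &n, stride 1, distance n between rows: dense packing. */
        if (cufftPlanMany(&fwd, 1, &n, &n, 1, n, &n, 1, n, CUFFT_R2C, batch) != CUFFT_SUCCESS ||
            cufftPlanMany(&inv, 1, &n, &n, 1, n, &n, 1, n, CUFFT_C2R, batch) != CUFFT_SUCCESS) {
            fprintf(stderr, "cufftPlanMany failed\n");
            return 1;
        }
        cufftDestroy(inv);
        cufftDestroy(fwd);
        return 0;
    }

Worth keeping in mind for the later hunks: cuFFT transforms are unnormalized, so an R2C/C2R round trip multiplies every sample by dim_fft; the unpack stage compensates for this.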
@@ -796,6 +818,10 @@
 #endif /* PYHST_MEASURE_TIMINGS */
         if (ctx->fft_plan[1]) CUFFT_SAFE_CALL(cufftDestroy(ctx->fft_plan[1]));
         if (ctx->fft_plan[0]) CUFFT_SAFE_CALL(cufftDestroy(ctx->fft_plan[0]));
+#ifndef HST_FILTER2
+        if (ctx->ifft_plan[1]) CUFFT_SAFE_CALL(cufftDestroy(ctx->ifft_plan[1]));
+        if (ctx->ifft_plan[0]) CUFFT_SAFE_CALL(cufftDestroy(ctx->ifft_plan[0]));
+#endif
 
 #ifdef HST_CUDA_ARRAY
 # ifdef HST_CREATE_TEXTURE
@@ -822,6 +848,7 @@
         if (ctx->gpu_output[0]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_output[0]));
         if (ctx->gpu_limits) CUDA_SAFE_CALL(cudaFree(ctx->gpu_limits));
         if (ctx->gpu_filter) CUDA_SAFE_CALL(cudaFree(ctx->gpu_filter));
+        if (ctx->fft_buffer) CUDA_SAFE_CALL(cudaFree(ctx->fft_buffer));
         if (ctx->gpu_buffer) CUDA_SAFE_CALL(cudaFree(ctx->gpu_buffer));
         if (ctx->gpu_input[1]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_input[1]));
         if (ctx->gpu_input[0]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_input[0]));
@@ -960,6 +987,14 @@
 #endif /* HST_CUDA_4SLICE_MODE */
 
 #ifndef PYHST_BP_BENCHMARK
+/*
+        for (int i = 0; i < setup->num_projections; i++) {
+            for (int j = 0; j < setup->num_bins; j++) {
+                printf("%.3f ", data[i * ctx->bin_pitch_size + j]);
+            }
+            printf("\n");
+        }
+        printf("\n\n\n");*/
         if (blocking) {
             CUDA_SAFE_CALL(cudaMemcpy2D(ctx->gpu_input[ctx->current_buffer - 1], ctx->bin_pitch_size * sizeof(float), data, setup->num_bins * sizeof(float), setup->num_bins * sizeof(float), SLICE_BLOCK * setup->num_projections, cudaMemcpyHostToDevice));
         } else {
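The blocking branch uses cudaMemcpy2D because the device sinogram is pitched: each projection row occupies bin_pitch_size floats on the GPU but only num_bins floats on the host, and the 2-D copy skips the padding automatically. A minimal sketch with illustrative sizes:

    #include <cuda_runtime.h>
    #include <stdlib.h>

    int main(void) {
        size_t num_bins = 1000, pitch_bins = 1024, rows = 900;
        float *host = (float*)calloc(num_bins * rows, sizeof(float));
        float *dev;

        cudaMalloc((void**)&dev, pitch_bins * rows * sizeof(float));
        cudaMemcpy2D(dev,  pitch_bins * sizeof(float),   /* device row pitch      */
                     host, num_bins * sizeof(float),     /* host row pitch        */
                     num_bins * sizeof(float), rows,     /* width (bytes), height */
                     cudaMemcpyHostToDevice);

        cudaFree(dev);
        free(host);
        return 0;
    }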
@@ -1191,7 +1226,6 @@
     dim3 dimFullGrid(ctx->bp_grid_columns, ctx->bp_grid_lines);
     dim3 dimPadGrid(setup->dim_fft / BLOCK_SIZE,  SLICE_BLOCK * ctx->projection_grid_size);
     dim3 dimInputGrid(ctx->bin_grid_size, SLICE_BLOCK * ctx->projection_grid_size);
-    dim3 dimFilterGrid(ctx->filter_grid_size, SLICE_BLOCK * ctx->projection_grid_size);
     dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
 
 
@@ -1265,27 +1299,54 @@
         g_timer_continue(ctx->pre_timer);
 #endif /* PYHST_MEASURE_TIMINGS */
 
+#ifdef HST_FILTER2
+        int dim_fft_pitch = dim_fft;
+#else
+        int dim_fft_pitch = 1 * dim_fft;
+#endif
+
         if (dim_fft > num_bins) {
             if (setup->zero_padding) {
-                hst_cuda_pack_kernel_zpad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft, gpu_data, ctx->bin_pitch_size, num_bins);
+                hst_cuda_pack_kernel_zpad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft_pitch, gpu_data, ctx->bin_pitch_size, num_bins);
             } else {
-                hst_cuda_pack_kernel_epad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft, gpu_data, ctx->bin_pitch_size, num_bins, mid);
+                hst_cuda_pack_kernel_epad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft_pitch, gpu_data, ctx->bin_pitch_size, num_bins, mid);
             }
         } else {
-            hst_cuda_pack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft, gpu_data, ctx->bin_pitch_size);
+            hst_cuda_pack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft_pitch, gpu_data, ctx->bin_pitch_size);
         }
 
+#ifdef HST_FILTER2
+        dim3 dimFilterGrid(ctx->filter_grid_size, SLICE_BLOCK * ctx->projection_grid_size);
         CUFFT_SAFE_CALL(cufftExecC2C(ctx->fft_plan[i%2], (cufftComplex*)gpu_buffer, (cufftComplex*)gpu_buffer, CUFFT_FORWARD));
         hst_cuda_filter_kernel<<<dimFilterGrid,dimBlock,0,ctx->stream[i%2]>>>(2*dim_fft, gpu_buffer, ctx->gpu_filter);
         CUFFT_SAFE_CALL(cufftExecC2C(ctx->fft_plan[i%2], (cufftComplex*)gpu_buffer, (cufftComplex*)gpu_buffer, CUFFT_INVERSE));
+#else
+        dim3 dimFilterGrid(ctx->filter_grid_size, 2 * SLICE_BLOCK * ctx->projection_grid_size); // i.e. (2 * dim_fft) x (2 * (num_proj/2) * SLICE_BLOCK)
+        CUFFT_SAFE_CALL(cufftExecR2C(ctx->fft_plan[i%2], (cufftReal*)gpu_buffer, (cufftComplex*)ctx->fft_buffer));
+        hst_cuda_filter_kernel<<<dimFilterGrid,dimBlock,0,ctx->stream[i%2]>>>(2 * dim_fft, ctx->fft_buffer, ctx->gpu_filter);
+        CUFFT_SAFE_CALL(cufftExecC2R(ctx->ifft_plan[i%2], (cufftComplex*)ctx->fft_buffer, (cufftReal*)gpu_buffer));
+#endif
 
         if (setup->fai360) {
-            hst_cuda_unpack_kernel_fai360<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft, ctx->bin_grid_size * BLOCK_SIZE / 2, ctx->gpu_limits + 2 * batch, batch);
+#ifdef PYHST_ASTRA_SCALING
+            printf("Astra scaling is not supported in FAI360 mode yet\n");
+            exit(1);
+#endif
+            hst_cuda_unpack_kernel_fai360<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft_pitch, ctx->bin_grid_size * BLOCK_SIZE / 2, ctx->gpu_limits + 2 * batch, batch);
         } else {
-# ifdef HST_FLOAT16
-            hst_cuda_unpack_to_half_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>((half*)gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft);
+#ifdef HST_FLOAT16
+# ifdef PYHST_ASTRA_SCALING
+            printf("Astra scaling is not supported in HALF mode yet\n");
+            exit(1);
+# endif
+            hst_cuda_unpack_to_half_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>((half*)gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft_pitch);
 #else /* HST_FLOAT16 */
-            hst_cuda_unpack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft);
+# ifdef PYHST_ASTRA_SCALING
+            const float scaling = M_PI_F / (2 * setup->num_projections * dim_fft);
+# else
+            const float scaling = 1.f;
+# endif
+            hst_cuda_unpack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft_pitch, scaling);
 #endif /* HST_FLOAT16 */
         }
 
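This hunk splits the filtering stage in two: under HST_FILTER2 the old in-place C2C path survives (two real rows packed into one complex signal), while the new default path does a genuine R2C transform into the dedicated fft_buffer, multiplies every spectral bin by the filter, and returns with C2R into gpu_buffer; the filter grid's second dimension doubles because each real row now carries its own half-spectrum. A hedged sketch of the per-bin multiply; the kernel name and layout are illustrative, not the project's hst_cuda_filter_kernel:

    #include <cufft.h>

    /* Multiply each half-spectrum row by a real-valued filter (e.g. a ramp).
     * bins_per_row is the row pitch in cufftComplex elements. */
    __global__ void filter_rows(cufftComplex *spec, const float *filt, int bins_per_row)
    {
        int bin = blockIdx.x * blockDim.x + threadIdx.x;
        int row = blockIdx.y * blockDim.y + threadIdx.y;
        if (bin < bins_per_row) {
            cufftComplex v = spec[(size_t)row * bins_per_row + bin];
            float f = filt[bin];
            v.x *= f;
            v.y *= f;
            spec[(size_t)row * bins_per_row + bin] = v;
        }
    }

The PYHST_ASTRA_SCALING constant folds two normalizations into one multiply: 1/dim_fft undoes the unnormalized cuFFT round trip, and pi/(2 * num_projections) is the standard filtered-backprojection angular weight, giving M_PI_F / (2 * setup->num_projections * dim_fft).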
@@ -1357,46 +1418,46 @@
 
         if (ctx->base_kernel) {
             hst_report_kernel(ctx, "base", ppt, dimGrid, dimBPBlock); 
-            hst_cuda_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+            hst_cuda_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
         } else if (ctx->tex_kernel) {
             hst_report_kernel(ctx, "tex", ppt, dimGrid, dimBPBlock); 
-            hst_kepler_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
-//          hst_kepler_orig_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+            hst_kepler_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
+//          hst_kepler_orig_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
         } else if (ctx->linear_kernel) {
 
             if (setup->oversampling == 1) {
                 dim3 dimBPBlockA(ctx->block_size_x, ctx->block_size_y/HST_NN_ASSYMETRY);
                 hst_report_kernel(ctx, "alu_nn", ppt, dimGrid, dimBPBlockA); 
-                hst_cuda_alu_nn<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+                hst_cuda_alu_nn<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
             } else if (setup->oversampling) {
                 dim3 dimBPBlockA(ctx->block_size_x, ctx->block_size_y/HST_OVERS_ASSYMETRY);
                 hst_report_kernel(ctx, "alu_oversample4", ppt, dimGrid, dimBPBlockA); 
-                hst_cuda_alu_oversample4<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+                hst_cuda_alu_oversample4<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
             } else {
 #ifdef HST_HYBRID
 # if HST_LINEAR_ASSYMETRY > 1
 #   error "Assymetry is not supported with hybrid kernels"
 # endif
                 hst_report_kernel(ctx, "alu_hybrid", ppt, dimGrid, dimBPBlock); 
-                hst_cuda_linear_hybrid<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+                hst_cuda_linear_hybrid<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
 #else
                 dim3 dimBPBlockA(ctx->block_size_x, ctx->block_size_y/HST_LINEAR_ASSYMETRY);
                 hst_report_kernel(ctx, "alu_linear", ppt, dimGrid, dimBPBlockA); 
-                hst_cuda_alu_linear<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+                hst_cuda_alu_linear<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
 #endif
             }
 /*      } else if (ctx->mplinear_kernel) {
             if (setup->oversampling) {
                 hst_report_kernel(ctx, "mpoversample4", ppt, dimGrid, dimBPBlock); 
-                hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+                hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
             } else {
 #ifdef HST_MP_ASYMMETRIC
                 dim3 dimBPBlockA(ctx->block_size_x, ctx->block_size_y/2);
                 hst_report_kernel(ctx, "mplinear", ppt, dimGrid, dimBPBlockA); 
-                hst_cuda_mplinear_kernel<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+                hst_cuda_mplinear_kernel<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
 #else
                 hst_report_kernel(ctx, "mplinear/symmetric", ppt, dimGrid, dimBPBlock); 
-                hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+                hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
 #endif // HST_MP_ASYMMETRIC
             }*/
         } else {
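Every backprojection launch in this hunk flips the axis offset from - 0.5f to + 0.5f; together with the commit message ("Quality fixes (tex-based)") this reads as a half-pixel correction of the pixel-center convention used when the filtered sinogram is sampled through the texture unit, where texel centers sit at integer + 0.5 in texture coordinates. A hedged sketch of where such a term enters a texture-based backprojection; the kernel, the constant-memory angle tables, and the parameter names are illustrative, not PyHST's actual hst_cuda_kernel:

    #include <cuda_runtime.h>

    __constant__ float c_sin[2048], c_cos[2048];   /* per-projection sin/cos table */

    __global__ void backproject(cudaTextureObject_t sino, int num_proj,
                                float *slice, int width, int height,
                                float off_x, float off_y)  /* e.g. offset - axis_position + 0.5f */
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x >= width || y >= height) return;

        /* Pixel center relative to the rotation axis; the half-pixel part of
         * off_x/off_y keeps grid centers and texel centers aligned. */
        float px = x + off_x;
        float py = y + off_y;

        float sum = 0.f;
        for (int p = 0; p < num_proj; p++) {
            /* Detector coordinate of this pixel at projection angle p
             * (axis position assumed folded into off_x/off_y). */
            float bin = px * c_cos[p] - py * c_sin[p];
            sum += tex2D<float>(sino, bin, p + 0.5f);  /* hardware linear interpolation */
        }
        slice[y * width + x] = sum;
    }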