/tomo/pyhst

To get this branch, use:
bzr branch http://darksoft.org/webbzr/tomo/pyhst

Viewing changes to hst_cuda/hst_cuda.cu

  • Committer: Suren A. Chilingaryan
  • Date: 2017-09-28 10:34:47 UTC
  • Revision ID: csa@suren.me-20170928103447-dggjgnuxmymgew2a
Quality fixes (tex-based)

=== modified file 'hst_cuda/hst_cuda.cu'
@@ -181,6 +181,9 @@
 #endif /* HST_CUDA_ARRAY */
 
     cufftHandle fft_plan[2];            //!< Complex plan for fourier transformations (both forward and inverse)
+#ifndef HST_FILTER2
+    cufftHandle ifft_plan[2];           //!< Inverse (C2R) plans for the real-transform filtering path
+#endif
 
     int current_buffer;         //!< 1 or 2 (0 means no active buffer yet)
     int synchronized;           //!< Indicates if current input buffer is synchronised
@@ -190,6 +193,7 @@
 
     float *gpu_limits;          //!< Parameters for fai360-mode corrections
     float *gpu_data;            //!< Sinogram buffer in device memory
+    float *fft_buffer;          //!< Filtering buffer
     float *gpu_buffer;          //!< Temporary computation buffer in GPU memory (for filtering, can hold a single batch)
     float *gpu_filter;          //!< Filter buffer in device memory
     float *gpu_result;          //!< Reconstructed slice is going here
@@ -706,6 +710,9 @@
 
 
     CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_buffer, SLICE_BLOCK * ctx->fft_batch_size*setup->dim_fft*sizeof(float)));
+    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->fft_buffer,    2 * SLICE_BLOCK * ctx->fft_batch_size*setup->dim_fft*sizeof(float)));
+    CUDA_SAFE_CALL(cudaMemset((void*)  ctx->fft_buffer, 0, 2 * SLICE_BLOCK * ctx->fft_batch_size*setup->dim_fft*sizeof(float)));
+
     CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_filter, 2*setup->dim_fft*sizeof(float)));
     CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_limits, 2*num_projections*sizeof(float)));
 
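Note on the new allocation: a real-to-complex transform of a length-N row produces N/2 + 1 cufftComplex values (N + 2 floats), so reserving one cufftComplex per input sample, i.e. the 2 * N floats per row seen above, is a safe upper bound that also keeps the complex buffer's row pitch in step with the real one. A minimal sketch of the size arithmetic, with illustrative names (rows, dim_fft) standing in for the context fields:

    #include <stddef.h>

    /* One cufftComplex (two floats) per real input sample per row.  This
     * slightly over-allocates: cufftExecR2C only writes dim_fft/2 + 1
     * complex bins per row, but the round pitch simplifies indexing. */
    static size_t fft_buffer_bytes(size_t rows, size_t dim_fft) {
        return rows * 2 * dim_fft * sizeof(float);
    }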
@@ -735,11 +742,26 @@
     cudaStreamCreate(&ctx->stream[2]);
     cudaStreamCreate(&ctx->stream[3]);
 
+#ifdef HST_FILTER2
     CUFFT_SAFE_CALL(cufftPlan1d(&ctx->fft_plan[0], setup->dim_fft, CUFFT_C2C, SLICE_BLOCK * (ctx->fft_batch_size/2)));
     CUFFT_SAFE_CALL(cufftPlan1d(&ctx->fft_plan[1], setup->dim_fft, CUFFT_C2C, SLICE_BLOCK * (ctx->fft_batch_size/2)));
+#else
+    int dim_fft = setup->dim_fft;
+
+        // In the Fourier domain the data is stored as cufftComplex values, i.e. it occupies twice the space
+    CUFFT_SAFE_CALL(cufftPlanMany(&ctx->fft_plan[0], 1, &dim_fft, &dim_fft, 1, dim_fft, &dim_fft, 1, dim_fft, CUFFT_R2C, SLICE_BLOCK * (ctx->fft_batch_size)));
+    CUFFT_SAFE_CALL(cufftPlanMany(&ctx->fft_plan[1], 1, &dim_fft, &dim_fft, 1, dim_fft, &dim_fft, 1, dim_fft, CUFFT_R2C, SLICE_BLOCK * (ctx->fft_batch_size)));
+    CUFFT_SAFE_CALL(cufftPlanMany(&ctx->ifft_plan[0], 1, &dim_fft, &dim_fft, 1, dim_fft, &dim_fft, 1, dim_fft, CUFFT_C2R, SLICE_BLOCK * (ctx->fft_batch_size)));
+    CUFFT_SAFE_CALL(cufftPlanMany(&ctx->ifft_plan[1], 1, &dim_fft, &dim_fft, 1, dim_fft, &dim_fft, 1, dim_fft, CUFFT_C2R, SLICE_BLOCK * (ctx->fft_batch_size)));
+#endif
+
 #ifndef BUGGY_CUFFT_SET_STREAM
     CUFFT_SAFE_CALL(cufftSetStream(ctx->fft_plan[0], ctx->stream[0]));
     CUFFT_SAFE_CALL(cufftSetStream(ctx->fft_plan[1], ctx->stream[1]));
+# ifndef HST_FILTER2
+    CUFFT_SAFE_CALL(cufftSetStream(ctx->ifft_plan[0], ctx->stream[0]));
+    CUFFT_SAFE_CALL(cufftSetStream(ctx->ifft_plan[1], ctx->stream[1]));
+# endif
 #endif /* ! BUGGY_CUFFT_SET_STREAM */
 
     /* 
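In the new non-HST_FILTER2 branch, forward and inverse transforms need separate plans because CUFFT_R2C and CUFFT_C2R are distinct cuFFT plan types, unlike the single C2C plan that previously served both directions. The advanced-layout arguments above (inembed = onembed = &dim_fft, stride 1, distance dim_fft) simply describe densely packed rows. A self-contained sketch of an equivalent plan pair, with placeholder sizes:

    #include <cufft.h>
    #include <stdio.h>

    int main(void) {
        int n = 2048;    /* transform length; stands in for setup->dim_fft */
        int batch = 64;  /* rows per call; stands in for SLICE_BLOCK * fft_batch_size */
        cufftHandle fwd, inv;

        /* embed = &n, stride 1, distance n between rows: dense packing. */
        if (cufftPlanMany(&fwd, 1, &n, &n, 1, n, &n, 1, n, CUFFT_R2C, batch) != CUFFT_SUCCESS ||
            cufftPlanMany(&inv, 1, &n, &n, 1, n, &n, 1, n, CUFFT_C2R, batch) != CUFFT_SUCCESS) {
            fprintf(stderr, "cufftPlanMany failed\n");
            return 1;
        }
        cufftDestroy(inv);
        cufftDestroy(fwd);
        return 0;
    }

Worth keeping in mind for the later hunks: cuFFT transforms are unnormalized, so an R2C/C2R round trip multiplies every sample by dim_fft; the unpack stage compensates for this.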
@@ -796,6 +818,10 @@
 #endif /* PYHST_MEASURE_TIMINGS */
         if (ctx->fft_plan[1]) CUFFT_SAFE_CALL(cufftDestroy(ctx->fft_plan[1]));
         if (ctx->fft_plan[0]) CUFFT_SAFE_CALL(cufftDestroy(ctx->fft_plan[0]));
+#ifndef HST_FILTER2
+        if (ctx->ifft_plan[1]) CUFFT_SAFE_CALL(cufftDestroy(ctx->ifft_plan[1]));
+        if (ctx->ifft_plan[0]) CUFFT_SAFE_CALL(cufftDestroy(ctx->ifft_plan[0]));
+#endif
 
 #ifdef HST_CUDA_ARRAY
 # ifdef HST_CREATE_TEXTURE
@@ -822,6 +848,7 @@
         if (ctx->gpu_output[0]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_output[0]));
         if (ctx->gpu_limits) CUDA_SAFE_CALL(cudaFree(ctx->gpu_limits));
         if (ctx->gpu_filter) CUDA_SAFE_CALL(cudaFree(ctx->gpu_filter));
+        if (ctx->fft_buffer) CUDA_SAFE_CALL(cudaFree(ctx->fft_buffer));
         if (ctx->gpu_buffer) CUDA_SAFE_CALL(cudaFree(ctx->gpu_buffer));
         if (ctx->gpu_input[1]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_input[1]));
         if (ctx->gpu_input[0]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_input[0]));
@@ -960,6 +987,14 @@
 #endif /* HST_CUDA_4SLICE_MODE */
 
 #ifndef PYHST_BP_BENCHMARK
+/*
+        for (int i = 0; i < setup->num_projections; i++) {
+            for (int j = 0; j < setup->num_bins; j++) {
+                printf("%.3f ", data[i * ctx->bin_pitch_size + j]);
+            }
+            printf("\n");
+        }
+        printf("\n\n\n");*/
         if (blocking) {
             CUDA_SAFE_CALL(cudaMemcpy2D(ctx->gpu_input[ctx->current_buffer - 1], ctx->bin_pitch_size * sizeof(float), data, setup->num_bins * sizeof(float), setup->num_bins * sizeof(float), SLICE_BLOCK * setup->num_projections, cudaMemcpyHostToDevice));
         } else {
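The blocking branch uses cudaMemcpy2D because the device sinogram is pitched: each projection row occupies bin_pitch_size floats on the GPU but only num_bins floats on the host, and the 2-D copy skips the padding automatically. A minimal sketch with illustrative sizes:

    #include <cuda_runtime.h>
    #include <stdlib.h>

    int main(void) {
        size_t num_bins = 1000, pitch_bins = 1024, rows = 900;
        float *host = (float*)calloc(num_bins * rows, sizeof(float));
        float *dev;

        cudaMalloc((void**)&dev, pitch_bins * rows * sizeof(float));
        cudaMemcpy2D(dev,  pitch_bins * sizeof(float),   /* device row pitch      */
                     host, num_bins * sizeof(float),     /* host row pitch        */
                     num_bins * sizeof(float), rows,     /* width (bytes), height */
                     cudaMemcpyHostToDevice);

        cudaFree(dev);
        free(host);
        return 0;
    }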
@@ -1191,7 +1226,6 @@
     dim3 dimFullGrid(ctx->bp_grid_columns, ctx->bp_grid_lines);
     dim3 dimPadGrid(setup->dim_fft / BLOCK_SIZE,  SLICE_BLOCK * ctx->projection_grid_size);
     dim3 dimInputGrid(ctx->bin_grid_size, SLICE_BLOCK * ctx->projection_grid_size);
-    dim3 dimFilterGrid(ctx->filter_grid_size, SLICE_BLOCK * ctx->projection_grid_size);
     dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
 
 
@@ -1265,27 +1299,54 @@
         g_timer_continue(ctx->pre_timer);
 #endif /* PYHST_MEASURE_TIMINGS */
 
+#ifdef HST_FILTER2
+        int dim_fft_pitch = dim_fft;
+#else
+        int dim_fft_pitch = 1 * dim_fft;
+#endif
+
         if (dim_fft > num_bins) {
             if (setup->zero_padding) {
-                hst_cuda_pack_kernel_zpad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft, gpu_data, ctx->bin_pitch_size, num_bins);
+                hst_cuda_pack_kernel_zpad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft_pitch, gpu_data, ctx->bin_pitch_size, num_bins);
             } else {
-                hst_cuda_pack_kernel_epad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft, gpu_data, ctx->bin_pitch_size, num_bins, mid);
+                hst_cuda_pack_kernel_epad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft_pitch, gpu_data, ctx->bin_pitch_size, num_bins, mid);
             }
         } else {
-            hst_cuda_pack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft, gpu_data, ctx->bin_pitch_size);
+            hst_cuda_pack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft_pitch, gpu_data, ctx->bin_pitch_size);
         }
 
+#ifdef HST_FILTER2
+        dim3 dimFilterGrid(ctx->filter_grid_size, SLICE_BLOCK * ctx->projection_grid_size);
         CUFFT_SAFE_CALL(cufftExecC2C(ctx->fft_plan[i%2], (cufftComplex*)gpu_buffer, (cufftComplex*)gpu_buffer, CUFFT_FORWARD));
         hst_cuda_filter_kernel<<<dimFilterGrid,dimBlock,0,ctx->stream[i%2]>>>(2*dim_fft, gpu_buffer, ctx->gpu_filter);
         CUFFT_SAFE_CALL(cufftExecC2C(ctx->fft_plan[i%2], (cufftComplex*)gpu_buffer, (cufftComplex*)gpu_buffer, CUFFT_INVERSE));
+#else
+        dim3 dimFilterGrid(ctx->filter_grid_size, 2 * SLICE_BLOCK * ctx->projection_grid_size); // i.e. (2 * dim_fft) x (2 * (num_proj/2) * SLICE_BLOCK)
+        CUFFT_SAFE_CALL(cufftExecR2C(ctx->fft_plan[i%2], (cufftReal*)gpu_buffer, (cufftComplex*)ctx->fft_buffer));
+        hst_cuda_filter_kernel<<<dimFilterGrid,dimBlock,0,ctx->stream[i%2]>>>(2 * dim_fft, ctx->fft_buffer, ctx->gpu_filter);
+        CUFFT_SAFE_CALL(cufftExecC2R(ctx->ifft_plan[i%2], (cufftComplex*)ctx->fft_buffer, (cufftReal*)gpu_buffer));
+#endif
 
         if (setup->fai360) {
-            hst_cuda_unpack_kernel_fai360<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft, ctx->bin_grid_size * BLOCK_SIZE / 2, ctx->gpu_limits + 2 * batch, batch);
+#ifdef PYHST_ASTRA_SCALING
+            printf("Astra scaling is not supported in FAI360 mode yet\n");
+            exit(1);
+#endif
+            hst_cuda_unpack_kernel_fai360<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft_pitch, ctx->bin_grid_size * BLOCK_SIZE / 2, ctx->gpu_limits + 2 * batch, batch);
         } else {
-# ifdef HST_FLOAT16
-            hst_cuda_unpack_to_half_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>((half*)gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft);
+#ifdef HST_FLOAT16
+# ifdef PYHST_ASTRA_SCALING
+            printf("Astra scaling is not supported in HALF mode yet\n");
+            exit(1);
+# endif
+            hst_cuda_unpack_to_half_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>((half*)gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft_pitch);
 #else /* HST_FLOAT16 */
-            hst_cuda_unpack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft);
+# ifdef PYHST_ASTRA_SCALING
+            const float scaling = M_PI_F / (2 * setup->num_projections * dim_fft);
+# else
+            const float scaling = 1.f;
+# endif
+            hst_cuda_unpack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft_pitch, scaling);
 #endif /* HST_FLOAT16 */
         }
 
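This hunk splits the filtering stage in two: under HST_FILTER2 the old in-place C2C path survives (two real rows packed into one complex signal), while the new default path does a genuine R2C transform into the dedicated fft_buffer, multiplies every spectral bin by the filter, and returns with C2R into gpu_buffer; the filter grid's second dimension doubles because each real row now carries its own half-spectrum. A hedged sketch of the per-bin multiply; the kernel name and layout are illustrative, not the project's hst_cuda_filter_kernel:

    #include <cufft.h>

    /* Multiply each half-spectrum row by a real-valued filter (e.g. a ramp).
     * bins_per_row is the row pitch in cufftComplex elements. */
    __global__ void filter_rows(cufftComplex *spec, const float *filt, int bins_per_row)
    {
        int bin = blockIdx.x * blockDim.x + threadIdx.x;
        int row = blockIdx.y * blockDim.y + threadIdx.y;
        if (bin < bins_per_row) {
            cufftComplex v = spec[(size_t)row * bins_per_row + bin];
            float f = filt[bin];
            v.x *= f;
            v.y *= f;
            spec[(size_t)row * bins_per_row + bin] = v;
        }
    }

The PYHST_ASTRA_SCALING constant folds two normalizations into one multiply: 1/dim_fft undoes the unnormalized cuFFT round trip, and pi/(2 * num_projections) is the standard filtered-backprojection angular weight, giving M_PI_F / (2 * setup->num_projections * dim_fft).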
@@ -1357,46 +1418,46 @@
 
         if (ctx->base_kernel) {
             hst_report_kernel(ctx, "base", ppt, dimGrid, dimBPBlock); 
-            hst_cuda_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+            hst_cuda_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
         } else if (ctx->tex_kernel) {
             hst_report_kernel(ctx, "tex", ppt, dimGrid, dimBPBlock); 
-            hst_kepler_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
-//          hst_kepler_orig_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+            hst_kepler_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
+//          hst_kepler_orig_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
         } else if (ctx->linear_kernel) {
 
             if (setup->oversampling == 1) {
                 dim3 dimBPBlockA(ctx->block_size_x, ctx->block_size_y/HST_NN_ASSYMETRY);
                 hst_report_kernel(ctx, "alu_nn", ppt, dimGrid, dimBPBlockA); 
-                hst_cuda_alu_nn<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+                hst_cuda_alu_nn<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
             } else if (setup->oversampling) {
                 dim3 dimBPBlockA(ctx->block_size_x, ctx->block_size_y/HST_OVERS_ASSYMETRY);
                 hst_report_kernel(ctx, "alu_oversample4", ppt, dimGrid, dimBPBlockA); 
-                hst_cuda_alu_oversample4<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+                hst_cuda_alu_oversample4<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
             } else {
 #ifdef HST_HYBRID
 # if HST_LINEAR_ASSYMETRY > 1
 #   error "Assymetry is not supported with hybrid kernels"
 # endif
                 hst_report_kernel(ctx, "alu_hybrid", ppt, dimGrid, dimBPBlock); 
-                hst_cuda_linear_hybrid<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+                hst_cuda_linear_hybrid<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
 #else
                 dim3 dimBPBlockA(ctx->block_size_x, ctx->block_size_y/HST_LINEAR_ASSYMETRY);
                 hst_report_kernel(ctx, "alu_linear", ppt, dimGrid, dimBPBlockA); 
-                hst_cuda_alu_linear<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+                hst_cuda_alu_linear<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
 #endif
             }
 /*      } else if (ctx->mplinear_kernel) {
             if (setup->oversampling) {
                 hst_report_kernel(ctx, "mpoversample4", ppt, dimGrid, dimBPBlock); 
-                hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+                hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
             } else {
 #ifdef HST_MP_ASYMMETRIC
                 dim3 dimBPBlockA(ctx->block_size_x, ctx->block_size_y/2);
                 hst_report_kernel(ctx, "mplinear", ppt, dimGrid, dimBPBlockA); 
-                hst_cuda_mplinear_kernel<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+                hst_cuda_mplinear_kernel<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
 #else
                 hst_report_kernel(ctx, "mplinear/symmetric", ppt, dimGrid, dimBPBlock); 
-                hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x  -  axis_position - 0.5f, setup->offset_y - axis_position - 0.5f, batch);
+                hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
 #endif // HST_MP_ASYMMETRIC
             }*/
         } else {
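Every backprojection launch in this hunk flips the axis offset from - 0.5f to + 0.5f; together with the commit message ("Quality fixes (tex-based)") this reads as a half-pixel correction of the pixel-center convention used when the filtered sinogram is sampled through the texture unit, where texel centers sit at integer + 0.5 in texture coordinates. A hedged sketch of where such a term enters a texture-based backprojection; the kernel, the constant-memory angle tables, and the parameter names are illustrative, not PyHST's actual hst_cuda_kernel:

    #include <cuda_runtime.h>

    __constant__ float c_sin[2048], c_cos[2048];   /* per-projection sin/cos table */

    __global__ void backproject(cudaTextureObject_t sino, int num_proj,
                                float *slice, int width, int height,
                                float off_x, float off_y)  /* e.g. offset - axis_position + 0.5f */
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x >= width || y >= height) return;

        /* Pixel center relative to the rotation axis; the half-pixel part of
         * off_x/off_y keeps grid centers and texel centers aligned. */
        float px = x + off_x;
        float py = y + off_y;

        float sum = 0.f;
        for (int p = 0; p < num_proj; p++) {
            /* Detector coordinate of this pixel at projection angle p
             * (axis position assumed folded into off_x/off_y). */
            float bin = px * c_cos[p] - py * c_sin[p];
            sum += tex2D<float>(sino, bin, p + 0.5f);  /* hardware linear interpolation */
        }
        slice[y * width + x] = sum;
    }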