/tomo/pyhst : contents of dfi_cuda/dfi

: (revision 276)

To get this branch, use:

bzr branch
http://darksoft.org/webbzr/tomo/pyhst

/*
 * The PyHST program is Copyright (C) 2002-2011 of the
 * European Synchrotron Radiation Facility (ESRF) and
 * Karlsruhe Institute of Technology (KIT).
 *
 * PyHST is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * hst is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along
 * with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#define PARALLEL_PER_GPU 1
    // to prevent errors with newer glib and CUDA 4
#define GLIB_DISABLE_DEPRECATION_WARNINGS
#define USE_R2C_TRANSFORM 0  //using R2C transform at the first stage of 1D FFT

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <errno.h>
#include <math.h>
#include <stdint.h>

#include <gsl/gsl_sort.h>
#include <gsl/gsl_statistics.h>

extern "C" {
#include <glib.h>
}

#include <cufft.h>
#if CUDA_VERSION_MAJOR < 5
# include <cutil.h>
# include <cutil_math.h>
#else 
# include <helper_cuda.h>
# include <helper_math.h>

# define CUDA_SAFE_CALL(val) check ( (val), #val, __FILE__, __LINE__ )
# define CUFFT_SAFE_CALL CUDA_SAFE_CALL
#endif /* CVM < 5 */

#include "debug.h"
#include "hw_tools.h"
#include "hst_setup.h"
#include "hst_reconstructor.h"

#include "hst.h"
#include "dfi_cuda.h"

#include "dfi_cuda_defines.h"
#include "dfi_cuda_sinc_kernels.h"
#include "dfi_cuda_heatup_kernel.h"

// NOTE(review): not referenced in the visible part of this file; purpose unknown — TODO confirm
int blocking = 0;
// CUDA device ordinal for each context sequence number; handed to cudaSetDevice() during context initialization
static int dfi_cuda_device_num[HST_CUDA_MAX_DEVICES];
static cudaDeviceProp dfi_cuda_device_prop[HST_CUDA_MAX_DEVICES];	//!< Properties of the enumerated CUDA-enabled devices (indexed by context sequence number)
// NULL-terminated timer labels; the order matches the slots filled by dfi_cuda_get_timers()
static HSTConstString dfi_cuda_timers[] = { "complete reconstruction", "transfer to device", "transfer from device", "texture mapping", "projection filtering", "backprojection", "*initialization and cleanup", NULL }; //!< List of supported timers


// Reinterprets a generic reconstructor context pointer as the DFI-specific context
#define DFI_CONTEXT(ctx) ((DFIContext*)ctx)

/**
 * This implementation of HSTReconstructor uses NVidia GeForce-family graphic
 * cards and NVidia Tesla accelerators to accelerate the reconstruction process.
 * The implementation is based on the CUDA toolkit and uses the NVidia cuFFT
 * library for performing Fourier transformations.
 * DFIContext is an extension of #HSTReconstructorContext which provides the
 * additional data members needed to communicate with the graphics hardware.
 *
 * Layout note: dfi_cuda_free_context() clears everything located after the
 * embedded HSTReconstructorContext plus DFI_CONTEXT_MEMSET_OFFSET bytes, so
 * the timers must stay directly behind `recon` and the offset must match
 * their number.
 */
struct DFIContextT {
    HSTReconstructorContext recon;	// generic part; must stay the first member (contexts are cast back and forth)
#ifdef PYHST_MEASURE_TIMINGS
// Size of the timer block that dfi_cuda_free_context() must not wipe
# define DFI_CONTEXT_MEMSET_OFFSET (6 * sizeof(GTimer*))

    GTimer *main_timer;     //!< Counts time spent in the interpolation kernel
    GTimer *pre_timer;      //!< Counts time spent in zero-padding, FFTs, quadrant swaps and ROI cropping
    GTimer *togpu_timer;    //!< Counts time spent copying data from host to device
    GTimer *init_timer;     //!< Counts time spent in initialization and cleanup
    GTimer *fromgpu_timer;  //!< Counts time spent copying the result from device to host
    GTimer *texture_timer;  //!< Counts time spent in texture binding/unbinding

#else
# define DFI_CONTEXT_MEMSET_OFFSET 0
#endif /* PYHST_MEASURE_TIMINGS */

    int device;			//!< Sequence number [0..MAX_DEVICES-1]
    int initialized;		//!< At least partly
    
    int pps_grid_cols; //!< Number of CUDA blocks along X-axis of the zero-padded sinogram (covers rho_ext_len)
    int pps_grid_rows; //!< Number of CUDA blocks along Y-axis of the zero-padded sinogram (covers num_projections)
    int interp_grid_cols; //!< Number of CUDA blocks along X-axis of the interpolated 2D Fourier domain
    int interp_grid_rows; //!< Number of CUDA blocks along Y-axis of the interpolated 2D Fourier domain
    int swap_grid_cols; //!< Number of CUDA blocks along X-axis for the complex quadrant swap (covers raster_size/2)
    int swap_grid_rows; //!< Number of CUDA blocks along Y-axis for the complex quadrant swap (covers raster_size/2)
    int swap_quad_grid_cols; //!< Number of CUDA blocks along X-axis for the real quadrant swap (covers raster_size)
    int swap_quad_grid_rows; //!< Number of CUDA blocks along Y-axis for the real quadrant swap (covers raster_size/2)
    int roi_grid_cols; //!< Number of CUDA blocks along X-axis of ROI
    int roi_grid_rows; //!< Number of CUDA blocks along Y-axis of ROI
    int points_per_thread;  //!< How many points are processed by a single thread (actually square of that); set to 1 at initialization

    int rho_len; //!< Initial length of sinogram along rho direction: (num_bins - axis_position) * 2
    int rho_ext_len; //!< Extended length of sinogram along rho direction (rho_len rounded up to a multiple of 128, times oversampling)
    int num_projections; //!< Number of projections in sinogram (the length along theta direction)
    float L; //!< Length of one side of interpolation kernel (setup->dfi_kernel_size)
    float L2; //!< A half of length of interpolation kernel
    int ktbl_len; //!< Number of presampled kernel values (forced to be odd)
    int ktbl_len2; //!< (ktbl_len - 1) / 2, i.e. index of the central presampled kernel value
    int raster_size; //!< Length of side of output (2D Fourier domain); equals rho_ext_len
    int raster_size2; //!< A half of raster_size
    int roi_start_x; //!< Starting coordinate of ROI along X-axis
    int roi_start_y; //!< Starting coordinate of ROI along Y-axis
    int roi_x; //!< The length of ROI along X-axis
    int roi_y; //!< The length of ROI along Y-axis
    float table_spacing; //!< Number of presampled kernel values per unit of kernel length (ktbl_len / L)
    float angle_step_rad; //!< A step between projections, copied from setup->angle_increment (name suggests radians; unit not verifiable here — TODO confirm)
    float theta_max; //!< Max value of theta along Y-axis (set to num_projections)
    float rho_max; //!< Max value of rho along X-axis (set to rho_ext_len / 2)
    int oversampling; //!< An oversampling coefficient
    float scale; //!< Factor handed to the ROI cropping kernel: 1 / (raster_size * fft_oversampling * 2)
    int ppt; // NOTE(review): never assigned in the visible code; points_per_thread is used instead — TODO confirm
    float *ktbl; //!< Host-side array of presampled kernel values
    int spectrum_offset_y; //!< Number of dropped values along Y-axis (due to reducing 2D Fourier domain radius)
    float max_radius; //!< Maximum radius of 2D Fourier domain

#if USE_R2C_TRANSFORM
    int fft_sino_dim; //!< Row length (in floats) of the zero-padded real sinogram, rounded to whole blocks
#endif

    cufftHandle fft1d_plan; //!< Plan for the batched forward 1D Fourier transformations of the projections
    cufftHandle ifft2d_plan;  //!< Plan for the inverse 2D complex-to-real Fourier transformation

    float *gpu_ktbl; //!< Array of presampled values at GPU
    float *gpu_truncated_sino; //!< Sinogram after truncation

#if USE_R2C_TRANSFORM
    float *gpu_zeropadded_sino; //!< Zeropadded sinogram using real values
#else
    cufftComplex *gpu_input; //!< Zeropadded sinogram using complex values
#endif
    
    cufftComplex *gpu_spectrum; //!< Array of reconstructed 2D Fourier domain for GPU
    cufftComplex *gpu_swapped_spectrum; //!< Array of swapped 2D Fourier domain for GPU
    float *gpu_output; //!< Reconstructed slice is going here
    float *gpu_c2r_result; //!< Result of 2D IFFT - Complex-to-Real transform
    
    
    size_t bp_runs; //!< Number of valid entries in stats
    double stats[CUDA_MAX_STATS]; //!< Per-run timings, printed (multiplied by 1000) and summarized by dfi_cuda_print_timings()
    double gflops_start; //!< Throughput measured by the heat-up kernel at initialization (0 if not measured)
    double last_elapsed; // NOTE(review): not used in the visible code; presumably the timer total at the previous run — TODO confirm
};
typedef struct DFIContextT DFIContext;


// Grid-size computation is delegated to the generic helper from hw_tools
#define dfi_cuda_calc_blocks hw_calc_blocks

// Forward declaration: the definition follows the context management routines
static int dfi_cuda_reconstruct(HSTReconstructorContext *rctx, float *SLICE, const float *SINOGRAMS);


/**
 * Prints the timings collected in ctx->stats and their summary statistics.
 *
 * The raw values are printed first (multiplied by 1000); unless
 * HST_MEASURE_GPUBOOST is defined this listing is skipped when more than
 * 10000 runs were recorded. Afterwards every entry of ctx->stats is divided
 * by SLICE_BLOCK exactly once (this normalization is kept in the context)
 * and median, mean, standard deviation, minimum and maximum are printed.
 *
 * The summary is computed on a sorted copy, so ctx->stats keeps its recording
 * order for callers that inspect consecutive entries afterwards.
 *
 * @param ctx is the GPU context holding the statistics
 */
void dfi_cuda_print_timings(DFIContext *ctx) {
    size_t runs = ctx->bp_runs;

	// never read past the end of the statistics buffer
    if (runs > (size_t)CUDA_MAX_STATS) runs = CUDA_MAX_STATS;

    if (!runs) {
	// nothing recorded: the GSL min/max helpers would read an undefined element
	printf("DFI Timings: no runs recorded\n");
	return;
    }

#ifndef HST_MEASURE_GPUBOOST
    if (runs <= 10000) {
#endif
	printf("DFI Timings:");
	for (size_t i = 0; i < runs; i++) {
	    if ((i%10)==0) {
#ifdef HST_MEASURE_GPUBOOST
		printf("\n BP Stats: ");
#else
		printf("\n");
#endif
	    }
	    printf("% 6.3lf   ", ctx->stats[i] * 1000);
	}
	printf("\n\n");
#ifdef HST_MEASURE_GPUBOOST
	printf("FBP Timings:");
	for (size_t i = 0; i < runs; i++) {
	    if ((i%10)==0) printf("\nFBP Stats: ");
	    printf("% 6.3lf   ", ctx->fbp_stats[i] * 1000);
	}
	printf("\n\n");
#else
    }
#endif

	// normalize once; the previous code divided a second time when
	// HST_MEASURE_GPUBOOST was defined
    for (size_t i = 0; i < runs; i++) {
	ctx->stats[i] /= SLICE_BLOCK;
    }

	// sort a private copy; if it cannot be allocated fall back to sorting in place
    double *sorted = (double*)malloc(runs * sizeof(double));
    double *data = ctx->stats;
    if (sorted) {
	for (size_t i = 0; i < runs; i++) sorted[i] = ctx->stats[i];
	data = sorted;
    }

    gsl_sort (data, 1, runs);
    printf(" Runs: %zu, Median: %lf, Mean: %lf, StdDev: %lf, Min: %lf, Max: %lf\n", runs, 
		gsl_stats_median_from_sorted_data(data, 1, runs), 
		gsl_stats_mean(data, 1, runs), 
		gsl_stats_sd(data, 1, runs),
		gsl_stats_min(data, 1, runs),
		gsl_stats_max(data, 1, runs)
    );

    free(sorted);
}

/**
 * Warms up the GPU clocks (init != 0) or reports how far the clocks drifted
 * since the warm-up (init == 0).
 *
 * With init == 0 the collected timings are always printed through
 * dfi_cuda_print_timings(); the rest is only done if a start throughput
 * was recorded before. The measurements themselves are compiled in only
 * when HST_HEATUP is defined.
 *
 * During warm-up the heat-up kernel is repeated until the relative change of
 * the measured throughput drops to CUDA_HEATUP_MAX_CHANGE (at least
 * CUDA_HEATUP_MIN_ITERATIONS, at most CUDA_HEATUP_MAX_ITERATIONS iterations).
 * In report mode the kernel runs once and consecutive entries of ctx->stats
 * (as left by dfi_cuda_print_timings()) are compared.
 *
 * @param ctx is the GPU context
 * @param init selects warm-up (non-zero) or final report (zero)
 */
void dfi_cuda_heatup(DFIContext *ctx, int init) {
#ifdef HST_HEATUP
	float heatup_time;	// cudaEventElapsedTime() reports milliseconds
	size_t heatup_iter = 0;
	const size_t grid =  (size_t)CUDA_HEATUP_GRID * 256;
	const size_t computations = CUDA_HEATUP_ELEMENTS * grid  + (2 * CUDA_HEATUP_ELEMENTS * CUDA_HEATUP_KERNEL_ITERATIONS * grid );
	double gflops, gflops_last = 0, change = 1;
#endif
	if (!init) {
	    printf("\n\n\n");
	    dfi_cuda_print_timings(ctx);
	    if (!ctx->gflops_start) return;
	}

#ifdef HST_HEATUP
	cudaEvent_t start, stop;
	CUDA_SAFE_CALL( cudaEventCreate(&start) );
	CUDA_SAFE_CALL( cudaEventCreate(&stop) );

	do {
	    CUDA_SAFE_CALL( cudaEventRecord(start, 0) );
	    dfi_cuda_heatup_kernel<<<CUDA_HEATUP_GRID,256>>>(0, NULL);
	    CUDA_SAFE_CALL( cudaGetLastError() );	// launch failures are not reported by the launch itself
	    CUDA_SAFE_CALL( cudaEventRecord(stop, 0) );
	    CUDA_SAFE_CALL( cudaEventSynchronize(stop) );
	    CUDA_SAFE_CALL( cudaEventElapsedTime(&heatup_time, start, stop) );
	    gflops = ((double)computations)/heatup_time*1000./(double)(1000*1000*1000);

	    if (gflops_last) {
		change = fabs(gflops - gflops_last) / gflops_last;
	    }
	    gflops_last = gflops;
	    if (heatup_iter > CUDA_HEATUP_MIN_ITERATIONS)
		// arguments follow the format order: throughput first, then the time converted to seconds
		printf("Change: %6.3lf%%    current: %9.3lf GFlops (%9.6lf s)      iter: %zu\n", 100 * change, gflops, heatup_time / 1000., heatup_iter);

	    heatup_iter++;
	} while ((init)&&((heatup_iter < CUDA_HEATUP_MIN_ITERATIONS)||((change > CUDA_HEATUP_MAX_CHANGE)&&(heatup_iter<CUDA_HEATUP_MAX_ITERATIONS))));

	// the events are only needed for the measurement above
	CUDA_SAFE_CALL( cudaEventDestroy(start) );
	CUDA_SAFE_CALL( cudaEventDestroy(stop) );
	
	if (init) {
	    printf("Heating is complete...\n");
	    ctx->gflops_start = gflops;
	} else if (ctx->gflops_start) {
	    change = fabs(gflops - ctx->gflops_start) / ctx->gflops_start;
	    if (change > 0.05) printf("WARNING: ****** ");
	    printf(" Clock change: %6.3lf%%    current: %9.3lf GFlops, start: %9.3lf GFlops\n", 100 * change, gflops, ctx->gflops_start);

	    printf("\n\n Details:\n");
	    for (size_t i = 1; i < ctx->bp_runs; i++) {
		const double before = ctx->stats[i - 1];
		if (before <= 0) continue;	// no meaningful relative change against an empty timing

		change = fabs(ctx->stats[i] - before) / before;
		if (change > CUDA_HEATUP_MAX_CHANGE) {
		    printf("  Timing change: %6.3lf%%    slice: % 5i current: %9.6lf s, before: %9.6lf s\n", 100 * change, (int)i, ctx->stats[i], before);
		}
	    }
	}
#endif
}


/**
 * Allocates a zero-filled, not yet initialized GPU context.
 *
 * The context is bound to device number id / PARALLEL_PER_GPU. When timing
 * measurements are compiled in, all timers are created in the stopped state.
 *
 * @param prototype is pointer on HSTReconstructor describing the reconstruction module
 * @param setup is pointer on HSTSetup with various HST parameters
 * @param id is the sequence number of the context to create
 * @result created context or NULL if the context or one of its timers could not be allocated
 */
static HSTReconstructorContext *dfi_cuda_create_context(HSTReconstructor *prototype, HSTSetup *setup, int id) {
    assert(prototype);
    assert(setup);
    assert((id/PARALLEL_PER_GPU) < prototype->devices);

    /* FIXME: no error code in case of out-of-memory */
    DFIContext *created = (DFIContext*)calloc(1, sizeof(DFIContext));
    if (!created) return NULL;

#ifdef PYHST_MEASURE_TIMINGS
    {
        /* listed in creation order; torn down in reverse on failure */
        GTimer **slots[] = {
            &created->main_timer, &created->pre_timer, &created->togpu_timer,
            &created->init_timer, &created->fromgpu_timer, &created->texture_timer
        };
        const size_t slot_count = sizeof(slots) / sizeof(slots[0]);
        int incomplete = 0;

        for (size_t k = 0; k < slot_count; k++) {
            *slots[k] = g_timer_new();
            if (*slots[k]) g_timer_stop(*slots[k]);
            else incomplete = 1;
        }

        if (incomplete) {
            for (size_t k = slot_count; k-- > 0; ) {
                if (*slots[k]) g_timer_destroy(*slots[k]);
            }
            free(created);
            return NULL;
        }
    }
#endif /* PYHST_MEASURE_TIMINGS */

    hst_reconstructor_init_context((HSTReconstructorContext*)created, prototype, setup);
    created->device = id / PARALLEL_PER_GPU;

    return (HSTReconstructorContext*)created;
}

/**
  * Evaluates the normalized sinc function sin(PI * x) / (PI * x).
  *
  * The computation is carried out in double precision and the result is
  * narrowed to float.
  *
  * @param x is the argument; PI is applied inside, x == 0 yields exactly 1
  * @result value of sinc function
  */
static float dfi_cuda_sinc(float x) {
  if (x == 0.0f) {
    return 1.0f;
  }

  const double phase = M_PI * x;
  return (float)(sin(phase) / phase);
}

/**
  * Calculates the value of Hamming window function
  * 0.54 - 0.46 * cos(2 * PI * i / length).
  *
  * Note that the denominator is `length`, not `length - 1` as in the
  * textbook definition, so the window maximum lies at i = length / 2.
  * NOTE(review): the callers rely on these exact values; whether the
  * denominator is intentional cannot be told from this file — TODO confirm.
  *
  * @param i is a number of current sample
  * @param length is the length of Hamming window (must not be 0)
  * @result value of Hamming window function
  */
static float dfi_cuda_hamming_window(float i, float length) {
  /* both arguments already are floats: no casts needed; the division is
     done in single precision, the cosine in double precision */
  const float position = i / length;
  return (float)(0.54f - 0.46f * cos(2 * M_PI * position));
}


/**
  * Calculates the table of presampled kernel values: a sinc evaluated at
  * `length` equidistant positions from -PI to PI, multiplied by the Hamming
  * window.
  *
  * The positions are derived directly from the sample index, therefore the
  * central sample is exactly 0 and is not affected by accumulated rounding
  * errors.
  *
  * The process is terminated with exit(1) if length is not a positive odd
  * number.
  *
  * @param length is the length of presampled array (positive and odd)
  * @result presampled kernel values allocated with malloc (to be released
  *         with free) or NULL if the allocation failed
  */
static float *dfi_cuda_get_ktbl(int length) 
{
    if (length <= 0) {
      g_print("Error: Length of ktbl must be positive!\n");
      exit(1);
    }

    /* parenthesized on purpose: `!length%2` would negate before taking the remainder */
    if (!(length % 2)) {
      g_print("Error: Length of ktbl cannot be even!\n");
      exit(1);
    }

    float *ktbl = (float *)malloc((size_t)length * sizeof(float));
    if (!ktbl) {
      return NULL;
    }

    const int ktbl_len2 = (length - 1)/2;
    /* a single-sample table consists of the central value only */
    const float step = ktbl_len2 ? M_PI/(float)ktbl_len2 : 0.0f;

    for (int i = 0; i < length; ++i) {
        const float value = (i - ktbl_len2) * step;
        ktbl[i] = dfi_cuda_sinc(value) * dfi_cuda_hamming_window(i, length);
    }

    return ktbl;
}

/** Allocates `bytes` of device memory, stores the address in *ptr and clears the memory to zero */
static void dfi_cuda_alloc_zeroed(void **ptr, size_t bytes) {
    CUDA_SAFE_CALL(cudaMalloc(ptr, bytes));
    CUDA_SAFE_CALL(cudaMemset(*ptr, 0, bytes));
}

/**
  * Initializes GPU context: derives the reconstruction geometry from the
  * setup, computes the kernel launch grids, configures the textures,
  * allocates (zeroed) device buffers, builds the presampled kernel table
  * and creates the cuFFT plans. On devices with compute capability above
  * 2.x the heat-up routine is executed afterwards.
  *
  * The context is flagged as initialized before any resource is acquired,
  * so dfi_cuda_free_context() releases whatever was already obtained.
  *
  * @param rctx is uninitialized GPU context
  * @param thr is the calling thread (not used here)
  * @result return code and 0 indicates success; ENOMEM if the presampled
  *         kernel table could not be allocated
  */
static int dfi_cuda_init_context(HSTReconstructorContext *rctx, HWThread thr) {
    DFIContext *ctx;
    HSTSetup *setup;

    (void)thr;

    assert(rctx);
    ctx = DFI_CONTEXT(rctx);
    setup = rctx->setup;

    int cutted_rho_len = (setup->num_bins - setup->axis_position) * 2;
    int ppt = 1;

    ctx->oversampling = setup->fft_oversampling;
    ctx->rho_len = cutted_rho_len;
	// round up to a multiple of 128 bins, then apply the oversampling
    ctx->rho_ext_len = ((cutted_rho_len) / 128 + (cutted_rho_len % 128?1:0)) * 128 * ctx->oversampling;
    
	// both values are ints, hence %d
    printf("Sino bins %d, oversampling %d\n", ctx->rho_ext_len, ctx->oversampling);
    ctx->num_projections = setup->num_projections;
    ctx->L = (float)setup->dfi_kernel_size;
    ctx->L2 = ctx->L/2.0f;
    ctx->ktbl_len = setup->dfi_kernel_points;

    if (!(ctx->ktbl_len%2)) {
        pyhst_warning("Length of presampled kernel cannot be even.");
        --ctx->ktbl_len;
    }

    ctx->ktbl_len2 = (ctx->ktbl_len - 1)/2;
    ctx->raster_size = ctx->rho_ext_len;
    ctx->raster_size2 = ctx->raster_size/2;
    ctx->table_spacing = (float)ctx->ktbl_len/ctx->L;
    ctx->angle_step_rad = setup->angle_increment;
    ctx->theta_max = ctx->num_projections;
    ctx->rho_max = ctx->rho_ext_len/2;
    ctx->roi_x = setup->num_x;
    ctx->roi_y = setup->num_y;
	// the ROI is centered in the reconstructed raster
    ctx->roi_start_x = ((ctx->raster_size - ctx->roi_x)/2);
    ctx->roi_start_y = ((ctx->raster_size - ctx->roi_y)/2);
    ctx->scale = 1.0/(float)(ctx->raster_size * setup->fft_oversampling * 2);

#if USE_R2C_TRANSFORM
    ctx->fft_sino_dim = (dfi_cuda_calc_blocks((ctx->rho_ext_len/2 + 1) * 2, BLOCK_SIZE, NULL) + 1) * BLOCK_SIZE;
#endif

	// only the part of the spectrum corresponding to the original bins is interpolated
    float spectrum_scaling_rate = (float)ctx->rho_len/(float)ctx->rho_ext_len;

    ctx->pps_grid_cols = dfi_cuda_calc_blocks(ctx->rho_ext_len, BLOCK_SIZE, NULL);
    ctx->pps_grid_rows = dfi_cuda_calc_blocks(ctx->num_projections, BLOCK_SIZE, NULL);
    ctx->interp_grid_cols = dfi_cuda_calc_blocks((ctx->rho_ext_len/2 + 1) * spectrum_scaling_rate, BLOCK_SIZE, NULL);
    ctx->interp_grid_rows = dfi_cuda_calc_blocks(ctx->rho_ext_len * spectrum_scaling_rate, BLOCK_SIZE, NULL);
    ctx->swap_grid_cols = dfi_cuda_calc_blocks(ctx->raster_size/2, BLOCK_SIZE, NULL);
    ctx->swap_grid_rows = dfi_cuda_calc_blocks(ctx->raster_size/2, BLOCK_SIZE, NULL);
    ctx->swap_quad_grid_cols = dfi_cuda_calc_blocks(ctx->raster_size, BLOCK_SIZE, NULL);
    ctx->swap_quad_grid_rows = dfi_cuda_calc_blocks(ctx->raster_size/2, BLOCK_SIZE, NULL);
    ctx->roi_grid_cols = dfi_cuda_calc_blocks(setup->num_x, BLOCK_SIZE, NULL);
    ctx->roi_grid_rows = dfi_cuda_calc_blocks(setup->num_y, BLOCK_SIZE, NULL);

    ctx->points_per_thread = ppt;
    ctx->spectrum_offset_y = (ctx->raster_size - ctx->interp_grid_rows * BLOCK_SIZE)/2;
    ctx->max_radius = (ctx->interp_grid_rows * BLOCK_SIZE)/2.0;
    ctx->initialized = 1;

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */

#ifdef HW_USE_THREADS
    CUDA_SAFE_CALL(cudaSetDevice(dfi_cuda_device_num[ctx->device]));
#endif /* HW_USE_THREADS */

    //init textures
    tex_projections.normalized = false;
    tex_projections.filterMode = cudaFilterModePoint;
    tex_projections.addressMode[0] = cudaAddressModeMirror;
    tex_projections.addressMode[1] = cudaAddressModeWrap;

    tex_ktbl.normalized = false;
    tex_ktbl.filterMode = cudaFilterModeLinear;
    tex_ktbl.addressMode[0] = cudaAddressModeBorder;
    tex_ktbl.addressMode[1] = cudaAddressModeBorder;

    //memory allocations; sizes are computed in size_t to avoid int overflow on large rasters
    const size_t truncated_bytes = (size_t)ctx->num_projections * ctx->rho_len * sizeof(float);
    const size_t spectrum_bytes = (size_t)(ctx->raster_size/2 + 1) * ctx->raster_size * sizeof(cufftComplex);
    const size_t raster_bytes = (size_t)ctx->raster_size * ctx->raster_size * sizeof(float);
    const size_t roi_bytes = (size_t)ctx->roi_x * ctx->roi_y * sizeof(float);
    const size_t ktbl_bytes = (size_t)ctx->ktbl_len * sizeof(float);

    dfi_cuda_alloc_zeroed((void**)&ctx->gpu_truncated_sino, truncated_bytes);

#if USE_R2C_TRANSFORM
    dfi_cuda_alloc_zeroed((void**)&ctx->gpu_zeropadded_sino, (size_t)ctx->num_projections * ctx->fft_sino_dim * sizeof(float));
#else
    dfi_cuda_alloc_zeroed((void**)&ctx->gpu_input, (size_t)ctx->num_projections * ctx->rho_ext_len * sizeof(cufftComplex));
#endif

    dfi_cuda_alloc_zeroed((void**)&ctx->gpu_spectrum, spectrum_bytes);
    dfi_cuda_alloc_zeroed((void**)&ctx->gpu_swapped_spectrum, spectrum_bytes);
    dfi_cuda_alloc_zeroed((void**)&ctx->gpu_c2r_result, raster_bytes);
    dfi_cuda_alloc_zeroed((void**)&ctx->gpu_output, roi_bytes);
    dfi_cuda_alloc_zeroed((void**)&ctx->gpu_ktbl, ktbl_bytes);

    ctx->ktbl = dfi_cuda_get_ktbl(ctx->ktbl_len);
    if (!ctx->ktbl) {
	// the device buffers stay registered in the context and are released by dfi_cuda_free_context()
#ifdef PYHST_MEASURE_TIMINGS
	g_timer_stop(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */
	return ENOMEM;
    }

    //create plans
#if USE_R2C_TRANSFORM
    const int nrank = 1;
    int n[nrank] = {ctx->rho_ext_len};
    int inembed[nrank] = {ctx->rho_ext_len};
    int istride = 1;
    int idist = ctx->fft_sino_dim;
    int onembed[nrank] = {ctx->rho_ext_len/2};
    int ostride = 1;
    int odist = ctx->fft_sino_dim/2;

    CUFFT_SAFE_CALL(cufftPlanMany(&ctx->fft1d_plan, nrank, n,
                    inembed, istride, idist,
                    onembed, ostride, odist,
                    CUFFT_R2C, ctx->num_projections));
#else
    CUFFT_SAFE_CALL(cufftPlan1d(&ctx->fft1d_plan, ctx->rho_ext_len, CUFFT_C2C, ctx->num_projections));       
#endif

    CUFFT_SAFE_CALL(cufftPlan2d(&ctx->ifft2d_plan, ctx->rho_ext_len, ctx->rho_ext_len, CUFFT_C2R));


	// On Kepler and later heatup
    if (dfi_cuda_device_prop[ctx->device].major > 2) {
	dfi_cuda_heatup(ctx, 1);
    }


#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */

//    cufftSetCompatibilityMode(ctx->ifft2d_plan, CUFFT_COMPATIBILITY_NATIVE);
//    cufftSetCompatibilityMode(ctx->fft1d_plan, CUFFT_COMPATIBILITY_NATIVE);

    return 0;
}

/**
  * Releases everything acquired by dfi_cuda_init_context(): prints the final
  * heat-up report, destroys the cuFFT plans and frees device and host buffers.
  * Finally all DFI-specific members are zeroed, except the timers which are
  * located in the first DFI_CONTEXT_MEMSET_OFFSET bytes behind the generic
  * context.
  *
  * @param rctx is initialized GPU context
  */    
static void dfi_cuda_free_context(HSTReconstructorContext *rctx) {
    assert(rctx);

    DFIContext *ctx = DFI_CONTEXT(rctx);
    assert(ctx);

    if (ctx->initialized) {
        dfi_cuda_heatup(ctx, 0);

#ifdef PYHST_MEASURE_TIMINGS
        g_timer_continue(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */

        /* FFT plans */
        if (ctx->fft1d_plan) {
            CUFFT_SAFE_CALL(cufftDestroy(ctx->fft1d_plan));
        }
        if (ctx->ifft2d_plan) {
            CUFFT_SAFE_CALL(cufftDestroy(ctx->ifft2d_plan));
        }

        /* device buffers */
#if USE_R2C_TRANSFORM
        if (ctx->gpu_zeropadded_sino) {
            CUDA_SAFE_CALL(cudaFree(ctx->gpu_zeropadded_sino));
        }
#else
        if (ctx->gpu_input) {
            CUDA_SAFE_CALL(cudaFree(ctx->gpu_input));
        }
#endif
        if (ctx->gpu_output) {
            CUDA_SAFE_CALL(cudaFree(ctx->gpu_output));
        }
        if (ctx->gpu_spectrum) {
            CUDA_SAFE_CALL(cudaFree(ctx->gpu_spectrum));
        }
        if (ctx->gpu_swapped_spectrum) {
            CUDA_SAFE_CALL(cudaFree(ctx->gpu_swapped_spectrum));
        }
        if (ctx->gpu_truncated_sino) {
            CUDA_SAFE_CALL(cudaFree(ctx->gpu_truncated_sino));
        }
        if (ctx->gpu_ktbl) {
            CUDA_SAFE_CALL(cudaFree(ctx->gpu_ktbl));
        }
        if (ctx->gpu_c2r_result) {
            CUDA_SAFE_CALL(cudaFree(ctx->gpu_c2r_result));
        }

        /* host copy of the presampled kernel */
        if (ctx->ktbl) {
            free(ctx->ktbl);
        }

#ifdef PYHST_MEASURE_TIMINGS
        g_timer_stop(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */
    }

    /* wipe the DFI-specific state but keep the generic part and the timers */
    const size_t preserved = sizeof(HSTReconstructorContext) + DFI_CONTEXT_MEMSET_OFFSET;
    char *wipe_from = ((char*)ctx) + preserved;
    memset(wipe_from, 0, sizeof(DFIContext) - preserved);
}

/**
  * Releases the resources held by the GPU context, then the generic part of
  * the context, the timers (if compiled in) and finally the context itself.
  *
  * @param rctx is GPU context; the pointer is invalid after the call
  */
static void dfi_cuda_destroy_context(HSTReconstructorContext *rctx) {
    dfi_cuda_free_context(rctx);
    hst_reconstructor_free_context(rctx);

#ifdef PYHST_MEASURE_TIMINGS
    {
        /* the timers survive dfi_cuda_free_context() and are dropped here, newest first */
        DFIContext *owner = DFI_CONTEXT(rctx);
        GTimer *doomed[] = {
            owner->texture_timer, owner->fromgpu_timer, owner->init_timer,
            owner->togpu_timer, owner->pre_timer, owner->main_timer
        };

        for (size_t k = 0; k < sizeof(doomed) / sizeof(doomed[0]); k++) {
            g_timer_destroy(doomed[k]);
        }
    }
#endif /* PYHST_MEASURE_TIMINGS */

    free(rctx);
}


/**
  * Returns the name of the CUDA device the context is bound to.
  *
  * @param rctx is GPU context
  * @result device name as stored in the enumerated device properties
  */
static HSTConstString dfi_cuda_get_title(HSTReconstructorConstContextPtr rctx) {
    assert(rctx);

    const int device = DFI_CONTEXT(rctx)->device;
    return dfi_cuda_device_prop[device].name;
}


/**
  * Returns the list of timer names and, if timing measurements are compiled
  * in and `timers` is given, stores the elapsed time of every timer there.
  * Slot 0 holds the sum of slots 1 to 5; initialization (slot 6) is not
  * included in that total.
  *
  * @param rctx is GPU context
  * @param timers is an array of at least 7 values to fill, or NULL
  * @result NULL-terminated list of timer names
  */
static HSTConstString *dfi_cuda_get_timers(HSTReconstructorConstContextPtr rctx, double *timers) {
#ifdef PYHST_MEASURE_TIMINGS
    assert(rctx);
    
    if (timers) {
        DFIContext *ctx = DFI_CONTEXT(rctx);
        /* entry k feeds timers[k + 1], matching the order of the name list */
        GTimer *sources[] = {
            ctx->togpu_timer, ctx->fromgpu_timer, ctx->texture_timer,
            ctx->pre_timer, ctx->main_timer, ctx->init_timer
        };

        for (int k = 0; k < 6; k++) {
            timers[k + 1] = g_timer_elapsed(sources[k], NULL);
        }

        double total = timers[1];
        for (int k = 2; k <= 5; k++) {
            total += timers[k];
        }
        timers[0] = total; // no init included
    }
#endif /* PYHST_MEASURE_TIMINGS */

    return dfi_cuda_timers;
}

static int dfi_cuda_reconstruct(HSTReconstructorContext *rctx, float *SLICE, const float *SINOGRAMS) {
    DFIContext *ctx;
    HSTSetup *setup;

    assert(rctx);
    ctx = DFI_CONTEXT(rctx);
    setup = rctx->setup;

    assert(ctx);
    assert(setup);

//    puts("DFI Reconstruction");

    dim3 dimPadGrid(ctx->pps_grid_cols, ctx->pps_grid_rows);
    dim3 dimInterpGrid(ctx->interp_grid_cols, ctx->interp_grid_rows);
    dim3 dimSwapGrid(ctx->swap_grid_cols, ctx->swap_grid_rows);
    dim3 dimSwapQuadsGrid(ctx->swap_quad_grid_cols,ctx->swap_quad_grid_rows);
    dim3 dimRoiGrid(ctx->roi_grid_cols, ctx->roi_grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->togpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    CUDA_SAFE_CALL(cudaMemcpy(ctx->gpu_ktbl, 
                              ctx->ktbl, 
                              ctx->ktbl_len * sizeof(float), 
                              cudaMemcpyHostToDevice));

#ifndef PYHST_BP_BENCHMARK
    if (SINOGRAMS) {
	CUDA_SAFE_CALL(cudaMemcpy2D(ctx->gpu_truncated_sino, 
                                ctx->rho_len * sizeof(float),
                                SINOGRAMS + (setup->num_bins - ctx->rho_len), 
                                setup->num_bins * sizeof(float),
                                ctx->rho_len * sizeof(float), 
                                setup->num_projections, 
                                cudaMemcpyHostToDevice));
    }
#endif

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->togpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */


#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->pre_timer);
#endif /* PYHST_MEASURE_TIMINGS */

#if USE_R2C_TRANSFORM
    dfi_cuda_zeropadding_real<<<dimPadGrid,dimBlock>>>(ctx->gpu_truncated_sino, 
                                                       ctx->rho_len,
                                                       ctx->fft_sino_dim,
                                                       ctx->gpu_zeropadded_sino);

    CUFFT_SAFE_CALL(cufftExecR2C(ctx->fft1d_plan, 
                                 (cufftReal *)ctx->gpu_zeropadded_sino, 
                                 (cufftComplex *)ctx->gpu_zeropadded_sino));
#else
    dfi_cuda_zeropadding_complex<<<dimPadGrid,dimBlock>>>((cufftReal *)ctx->gpu_truncated_sino, 
                                                          ctx->rho_len,
                                                          (cufftComplex *)ctx->gpu_input);

    CUFFT_SAFE_CALL(cufftExecC2C(ctx->fft1d_plan, 
                                 (cufftComplex *)ctx->gpu_input, 
                                 (cufftComplex *)ctx->gpu_input,
                                 CUFFT_FORWARD));
#endif


#if 0
    cufftReal *h_cut_spectrum;
    CUDA_SAFE_CALL(cudaMallocHost((void**)&h_cut_spectrum, ctx->num_projections * (ctx->rho_ext_len/2 + 1) * 2 * sizeof(cufftReal)));
    CUDA_SAFE_CALL(cudaMemcpy(h_cut_spectrum, 
                              ctx->gpu_zeropadded_sino, 
                              ctx->num_projections * (ctx->rho_ext_len/2 + 1) * 2 * sizeof(cufftReal), 
                              cudaMemcpyDeviceToHost));
    memcpy(SLICE, h_cut_spectrum, ctx->num_projections * (ctx->rho_ext_len/2 + 1) * 2 * sizeof(cufftReal));
    return 0;
#endif


#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->pre_timer);
    g_timer_continue(ctx->texture_timer);
#endif /* PYHST_MEASURE_TIMINGS */

#if USE_R2C_TRANSFORM
    CUDA_SAFE_CALL(cudaBindTexture2D(NULL, 
                                     tex_projections, 
                                     ctx->gpu_zeropadded_sino,
                                     tex_projections.channelDesc, 
                                     ctx->rho_ext_len/2 ,
                                     ctx->num_projections,
                                     ctx->fft_sino_dim * sizeof(float)));
#else
    CUDA_SAFE_CALL(cudaBindTexture2D(NULL, 
                                     tex_projections, 
                                     ctx->gpu_input, 
                                     tex_projections.channelDesc, 
                                     ctx->rho_ext_len,
                                     ctx->num_projections,
                                     ctx->rho_ext_len * sizeof(cufftComplex)));
#endif


    CUDA_SAFE_CALL(cudaBindTexture2D(NULL, 
                                     tex_ktbl, 
                                     ctx->gpu_ktbl, 
                                     tex_ktbl.channelDesc, 
                                     ctx->ktbl_len,
                                     1,
                                     ctx->ktbl_len * sizeof(float)));

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->texture_timer);
    g_timer_continue(ctx->main_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    dfi_cuda_interpolation_sinc<<<dimInterpGrid,dimBlock>>>(ctx->spectrum_offset_y,
                                                            ctx->L2, 
                                                            ctx->ktbl_len2, 
                                                            ctx->raster_size, 
                                                            ctx->raster_size2, 
                                                            ctx->table_spacing, 
                                                            ctx->angle_step_rad,
                                                            ctx->theta_max,
                                                            ctx->rho_max,
                                                            (ctx->raster_size/2 + 1),
                                                            ctx->max_radius,
                                                            ctx->gpu_spectrum);

#if 0
    cufftComplex *h_cut_spectrum;
    CUDA_SAFE_CALL(cudaMallocHost((void**)&h_cut_spectrum, (ctx->rho_ext_len/2 + 1) * ctx->rho_ext_len * sizeof(cufftComplex)));
    CUDA_SAFE_CALL(cudaMemcpy(h_cut_spectrum, 
                              ctx->gpu_spectrum, 
                              (ctx->rho_ext_len/2 + 1) * ctx->rho_ext_len * sizeof(cufftComplex), 
                              cudaMemcpyDeviceToHost));
    memcpy(SLICE, h_cut_spectrum, (ctx->rho_ext_len/2 + 1) * ctx->rho_ext_len * sizeof(cufftComplex));
    return 0;
#endif

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->main_timer);
    g_timer_continue(ctx->texture_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    cudaUnbindTexture(tex_projections);
    cudaUnbindTexture(tex_ktbl);
    
#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->texture_timer);
    g_timer_continue(ctx->pre_timer);
#endif /* PYHST_MEASURE_TIMINGS */


    dfi_cuda_swap_quadrants_complex<<<dimSwapGrid,dimBlock>>>(ctx->gpu_spectrum, ctx->gpu_swapped_spectrum, (ctx->raster_size/2 + 1));

#if 0
    cufftComplex *h_cut_spectrum;
    CUDA_SAFE_CALL(cudaMallocHost((void**)&h_cut_spectrum, (ctx->rho_ext_len/2 + 1) * ctx->rho_ext_len * sizeof(cufftComplex)));
    CUDA_SAFE_CALL(cudaMemcpy(h_cut_spectrum, 
                              ctx->gpu_swapped_spectrum, 
                              (ctx->rho_ext_len/2 + 1) * ctx->rho_ext_len * sizeof(cufftComplex), 
                              cudaMemcpyDeviceToHost));
    memcpy(SLICE, h_cut_spectrum, (ctx->rho_ext_len/2 + 1) * ctx->rho_ext_len * sizeof(cufftComplex));
    return 0;
#endif

    cufftExecC2R(ctx->ifft2d_plan, ctx->gpu_swapped_spectrum, ctx->gpu_c2r_result);

#if 0
    cufftComplex *h_cut_spectrum;
    CUDA_SAFE_CALL(cudaMallocHost((void**)&h_cut_spectrum, ctx->raster_size * ctx->raster_size * sizeof(float)));
    CUDA_SAFE_CALL(cudaMemcpy(h_cut_spectrum, 
                              ctx->gpu_c2r_result, 
                              ctx->raster_size * ctx->raster_size * sizeof(float), 
                              cudaMemcpyDeviceToHost));
    memcpy(SLICE, h_cut_spectrum, ctx->raster_size * ctx->raster_size * sizeof(float));
    return 0;
#endif

    
    dfi_cuda_swap_quadrants_real<<<dimSwapQuadsGrid,dimBlock>>>(ctx->gpu_c2r_result);

#if 0
    cufftComplex *h_cut_spectrum;
    CUDA_SAFE_CALL(cudaMallocHost((void**)&h_cut_spectrum, ctx->raster_size * ctx->raster_size * sizeof(cufftComplex)));
    CUDA_SAFE_CALL(cudaMemcpy(h_cut_spectrum, 
                              ctx->gpu_swapped_spectrum, 
                              ctx->raster_size * ctx->raster_size * sizeof(cufftComplex), 
                              cudaMemcpyDeviceToHost));
    memcpy(SLICE, h_cut_spectrum, ctx->raster_size * ctx->raster_size * sizeof(cufftComplex));
    return 0;
#endif

    dfi_cuda_crop_roi<<<dimRoiGrid,dimBlock>>>(ctx->gpu_c2r_result, 
                                               ctx->roi_start_x, 
                                               ctx->roi_start_y,
                                               ctx->roi_x, 
                                               ctx->roi_y,
                                               ctx->raster_size,
                                               ctx->scale,
                                               ctx->gpu_output);

    CUDA_SAFE_CALL(cudaMemset(ctx->gpu_swapped_spectrum, 0, (ctx->raster_size/2 + 1) * ctx->raster_size * sizeof(cufftComplex)));
    CUDA_SAFE_CALL(cudaMemset(ctx->gpu_spectrum, 0, (ctx->raster_size/2 + 1) * ctx->raster_size * sizeof(cufftComplex)));
    
#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->pre_timer);
    g_timer_continue(ctx->fromgpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

#ifndef PYHST_BP_BENCHMARK
  if (SLICE) {
     CUDA_SAFE_CALL(cudaMemcpy(SLICE, 
                            ctx->gpu_output,
                            ctx->roi_x * ctx->roi_y * sizeof(float), 
                            cudaMemcpyDeviceToHost));
  }
#endif 

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->fromgpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */


    {
	double total_elapsed = g_timer_elapsed(DFI_CONTEXT(rctx)->main_timer, NULL) +  g_timer_elapsed(DFI_CONTEXT(rctx)->pre_timer, NULL) +  g_timer_elapsed(DFI_CONTEXT(rctx)->texture_timer, NULL);
	if (SLICE&&SINOGRAMS) {
	    if (ctx->bp_runs < CUDA_MAX_STATS) {
		ctx->stats[ctx->bp_runs] = total_elapsed - ctx->last_elapsed;
		ctx->bp_runs++;
	    }
	} 
	ctx->last_elapsed = total_elapsed;
    }


    return 0;
}

/*
 * Reconstructor descriptor of the DFI CUDA backend.
 *
 * A pointer to this object is returned by dfi_cuda_init(), which also fills
 * in the `devices` member after enumerating the usable GPUs. The initializer
 * is positional, so the order of the entries must match the member order of
 * HSTReconstructor (declared outside of this file; presumably in
 * hst_reconstructor.h -- TODO confirm).
 *
 * NOTE(review): entries set to NULL look like optional callbacks which this
 * backend does not provide -- confirm against the HSTReconstructor definition.
 */
static HSTReconstructor dfi_gpu_info = {
    0,	/* NOTE(review): possibly the `devices` counter overwritten by dfi_cuda_init() -- confirm */
    dfi_cuda_get_title,
    dfi_cuda_create_context,
    dfi_cuda_init_context,
    dfi_cuda_free_context,
    dfi_cuda_destroy_context,
    NULL,
    NULL,
    NULL,
    dfi_cuda_reconstruct,
    NULL,
    NULL,
    dfi_cuda_get_timers
};

/**
 * Enumerates the CUDA devices usable by the DFI reconstructor.
 *
 * At most the first HST_CUDA_MAX_DEVICES device ordinals reported by the CUDA
 * runtime are examined. Devices in prohibited compute mode, the 9999.9999
 * placeholder device and (if HW_IGNORE_OLD_HARDWARE is defined) devices with
 * a compute capability major version below 2 are skipped. For every accepted
 * device its properties are stored in dfi_cuda_device_prop[] and its CUDA
 * ordinal in dfi_cuda_device_num[], both indexed by a compacted counter.
 *
 * The global `blocking` flag is raised if the CUDA_LAUNCH_BLOCKING environment
 * variable is set to a non-empty value which does not start with '0'.
 *
 * @param flags currently not used
 * @return pointer to the static reconstructor descriptor; its `devices` member
 *         is set to the number of accepted devices times PARALLEL_PER_GPU
 */
HSTReconstructor *dfi_cuda_init(int flags) {
    int i, dev;
    char *stmp;
    int device_count;

    (void)flags;	// kept for interface compatibility only

    pyhst_info("DFI method was launched!");

    CUDA_SAFE_CALL(cudaGetDeviceCount(&device_count));

    if (device_count > HST_CUDA_MAX_DEVICES) {
	    // The arguments are cast to match the %u conversions of the format string
	pyhst_warning("There is %u CUDA-enabled devices detected in the system, but dfi_cuda is configured to use only %u", (unsigned int)device_count, (unsigned int)HST_CUDA_MAX_DEVICES);
	device_count = HST_CUDA_MAX_DEVICES;
    }

    pyhst_info("NVIDIA devices: %u", (unsigned int)device_count);
    for (i = 0, dev = 0; i < device_count; ++i) {
	    // The slot is reused by the next ordinal if the current device is rejected
	cudaDeviceProp *prop = &dfi_cuda_device_prop[dev];

	CUDA_SAFE_CALL(cudaGetDeviceProperties(prop, i));

	if (prop->computeMode == cudaComputeModeProhibited) continue;
	    // 9999.9999 is reported for the emulation placeholder by old CUDA runtimes
	if ((prop->major == 9999)&&(prop->minor == 9999)) continue;
#ifdef HW_IGNORE_OLD_HARDWARE
	if (prop->major < 2) continue;
#endif /* HW_IGNORE_OLD_HARDWARE */

	    // clockRate is reported in kHz, hence 1e-3 to get MHz; totalGlobalMem
	    // is a size_t and has to be narrowed explicitly to match %d
	pyhst_debug(" * Device %d: %s v. %d.%d: %d SM, %.2f MHZ, %d MB%s",
	    dev,
	    prop->name,
	    prop->major,
	    prop->minor,
	    prop->multiProcessorCount,
	    prop->clockRate * 1e-3,
	    (int)(prop->totalGlobalMem / 1048576),
	    prop->deviceOverlap?", overlap":""
	);

	dfi_cuda_device_num[dev++] = i;
    }

    stmp = getenv("CUDA_LAUNCH_BLOCKING");
    if ((stmp)&&(stmp[0])&&(stmp[0] != '0')) blocking = 1;

    dfi_gpu_info.devices = dev * PARALLEL_PER_GPU;
    return &dfi_gpu_info;
}

/**
 * Module-level cleanup hook of the DFI CUDA backend.
 *
 * Intentionally a no-op: the function body releases nothing.
 */
void dfi_cuda_free() {
    return;
}