/*
 * Source: /tomo/pyhst, contents of hst_cuda/hst (revision 276).
 *
 * To get this branch, use:
 *
 *   bzr branch http://darksoft.org/webbzr/tomo/pyhst
 */

/*
 * The PyHST program is Copyright (C) 2002-2011 of the
 * European Synchrotron Radiation Facility (ESRF) and
 * Karlsruhe Institute of Technology (KIT).
 *
 * PyHST is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * hst is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along
 * with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#define PARALLEL_PER_GPU 1
    // to prevent errors with newer glib and CUDA 4
#define GLIB_DISABLE_DEPRECATION_WARNINGS


//#define HST_FLOAT16             // just stubs now


#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <math.h>
#include <stdint.h>

#include <gsl/gsl_sort.h>
#include <gsl/gsl_statistics.h>

extern "C" {
#include <glib.h>
}

#ifdef HST_FLOAT16
# include <cuda_fp16.h>
#endif /* HST_FLOAT16 */

#include <cufft.h>
#if CUDA_VERSION_MAJOR < 5
# include <cutil.h>
# include <cutil_math.h>
#else 
# include <helper_cuda.h>
# include <helper_math.h>

# define CUDA_SAFE_CALL(val) check ( (val), #val, __FILE__, __LINE__ )
# define CUFFT_SAFE_CALL CUDA_SAFE_CALL
#endif /* CVM < 5 */


/*
#define CUDA_SAFE_CALL(x) (x)
#define CUFFT_SAFE_CALL(x) (x)
*/

#include "debug.h"
#include "hw_tools.h"
#include "hst_setup.h"
#include "hst_reconstructor.h"

#include "hst.h"
#include "hst_cuda.h"

//#define BUGGY_CUFFT_SET_STREAM

#include "hst_cuda_defines.h"
#include "hst_cuda_kernels.h"
#include "hst_cuda_bp_kernels.h"
//#include "saved_kernels/hst_cuda_old_kernels.h"

#include "hst_cuda_int_kernels.h"

/*
 * Buffer selection macros.
 * When several slices are processed together (SLICE_BLOCK > 1), GPU_RESULT and
 * GPU_SINO resolve to the vector-typed buffers of the context (gpu_result4 /
 * gpu_data4); otherwise they resolve to the plain float buffers.
 * GPU_RESULT_COMPAT always resolves to the plain float result buffer.
 */
#if SLICE_BLOCK > 1
# define GPU_RESULT(ctx) (ctx->gpu_result4)
# define GPU_SINO(ctx) (ctx->gpu_data4)
#else  /* SLICE_BLOCK */
# define GPU_RESULT(ctx) (ctx->gpu_result)
# define GPU_SINO(ctx) (ctx->gpu_data)
#endif /* SLICE_BLOCK */
# define GPU_RESULT_COMPAT(ctx) (ctx->gpu_result)


/*
 * hst_cuda_array2slice launches the hst_cuda_array2slice_4 / hst_cuda_array2slice_2
 * kernel on the given stream, depending on SLICE_BLOCK, and expands to nothing
 * for any other SLICE_BLOCK value.
 * The expansion references dimFullGrid and dimBPBlock, which are not macro
 * parameters: both must be defined in the scope where the macro is used.
 */
#if SLICE_BLOCK == 4
# define hst_cuda_array2slice(stream, res4, x, y, res, pad_x, pad_y) hst_cuda_array2slice_4<<<dimFullGrid, dimBPBlock, 0, stream>>>(res4, x, y, res, pad_x, pad_y)
#elif SLICE_BLOCK == 2
# define hst_cuda_array2slice(stream, res4, x, y, res, pad_x, pad_y) hst_cuda_array2slice_2<<<dimFullGrid, dimBPBlock, 0, stream>>>(res4, x, y, res, pad_x, pad_y)
#else
# define hst_cuda_array2slice(stream, res4, x, y, res, pad_x, pad_y) 
#endif


// NOTE(review): not referenced in this part of the file; presumably toggles blocking (synchronous) operation - confirm against its users
static int blocking = 0;

// CUDA device ordinal for each enumerated device; indexed by GPUContext::device and passed to cudaSetDevice()
static int hst_cuda_device_num[HST_CUDA_MAX_DEVICES];
static cudaDeviceProp hst_cuda_device_prop[HST_CUDA_MAX_DEVICES];	//!< Enumerated CUDA-enabled devices
static HSTConstString hst_cuda_timers[] = { "complete reconstruction", "transfer to device", "transfer from device", "texture mapping", "projection filtering", "backprojection", "*initialization and cleanup", NULL }; //!< List of supported timers

// Casts a generic HSTReconstructorContext pointer to the CUDA-specific context
#define GPU_CONTEXT(ctx) ((GPUContext*)ctx)

/**
 * This implementation of HSTReconstructor uses NVidia GeForce-family graphic
 * cards and NVidia Tesla accelerators to accelerate the reconstruction process.
 * The implementation is based on the CUDA toolkit and uses the NVidia cuFFT
 * library for performing Fourier transformations.
 * GPUContext is an extension of #HSTReconstructorContext which provides the
 * additional data members needed to communicate with the graphics hardware.
 * The context is zero-filled on creation (see hst_cuda_create_context), so
 * every member starts as 0 / NULL.
 */
struct GPUContextT {
    HSTReconstructorContext recon;	//!< Base context, first member: the code casts between GPUContext* and HSTReconstructorContext*
#ifdef PYHST_MEASURE_TIMINGS
    // Combined size of the six timer pointers declared below.
    // NOTE(review): the macro is not used in this part of the file - confirm its purpose against its users.
# define GPU_CONTEXT_MEMSET_OFFSET (6 * sizeof(GTimer*))
    GTimer *main_timer;		//!< Counts time spent in backprojection code
    GTimer *pre_timer;		//!< Counts time spent in filtering
    GTimer *togpu_timer;	//!< Counts time spent in memory operations (host to device)
    GTimer *init_timer;		//!< Counts time spent in initialization
    GTimer *fromgpu_timer;	//!< Counts time spent in memory operations (device to host)
    GTimer *texture_timer;      //!< Counts time spent in texture binding/unbinding
#else
# define GPU_CONTEXT_MEMSET_OFFSET 0
#endif /* PYHST_MEASURE_TIMINGS */

    int device;			//!< Sequence number [0..MAX_DEVICES-1], index into hst_cuda_device_num / hst_cuda_device_prop
    int initialized;		//!< At least partly (set by hst_cuda_init_context before the device resources are allocated)

    cudaStream_t stream[4];	//!< For interleaving memory writes and computations (2 togpu, 3  fromgpu)
    
    // Architecture flags: hst_cuda_init_context sets exactly one of them, based on the major compute capability
    int pascal;                 //!< Major compute capability is 6 or above
    int maxwell;		//!< Major compute capability is 5
    int kepler;			//!< Major compute capability is 3 or 4
    int fermi;			//!< Major compute capability is 2
    int gt200;			//!< Major compute capability is 1 or below
    
    // Kernel family selection (chosen in hst_cuda_init_context, at most one is set there)
    int base_kernel;		//!< Use standard kernel
    int tex_kernel;		//!< Use optimized texture based kernel
    int linear_kernel;		//!< Use linear kernel family
    
    int fft_batch_size;		//!< Number of projections to process at once
    int bp_batch_size;		//!< Number of rows to process at once
    int bin_pitch_size;		//!< The number of floats to allocate for each projection (gpu_data)
    
    int block_size_x;		//!< Revised block size
    int block_size_y;		//!< Revised block size (Y)
    int bp_grid_lines;		//!< Number of CUDA blocks along Y-axis of the result slice (batched processing)
    int bp_grid_columns;	//!< Number of CUDA blocks along X-axis of the result slice 
    int projection_grid_size;	//!< Number of CUDA blocks along projections (each block consists of BLOCK_SIZE threads)
    int bin_grid_size;		//!< Number of CUDA blocks along projection bins (each block consists of BLOCK_SIZE threads)
    int filter_grid_size;	//!< Horizontal CUDA blocks for filtering (2 * dim_fft / BLOCK_SIZE)
    int points_per_thread;	//!< How many points is processed by a single thread (actually square of that)
    int texlin;			//!< Texture or ALU-based kernel. Affects interpolation mode

    size_t bp_runs;			//!< Number of timing samples reported from stats / fbp_stats
    double stats[CUDA_MAX_STATS];	//!< Per-run backprojection timings reported by hst_cuda_print_timings (presumably in seconds)
    double gflops_start;		//!< Throughput measured by hst_cuda_heatup at initialization (stays 0 if no heat-up was done)
    double last_elapsed;		//!< NOTE(review): maintained outside this part of the file - confirm meaning

    double fbp_stats[CUDA_MAX_STATS];	//!< Per-run timings printed as "FBP Timings" (HST_MEASURE_GPUBOOST builds)
    double last_fbp_elapsed;		//!< NOTE(review): maintained outside this part of the file - confirm meaning
    GTimer *fbp_timer;			//!< Created and started at the end of hst_cuda_init_context

    const char *last_kernel;		//!< NOTE(review): presumably the name of the most recently launched kernel - confirm

    
    cudaChannelFormatDesc float_desc;	//!< Texture channel description (float)
    cudaChannelFormatDesc float4_desc;	//!< Texture channel description (tfloat)
#ifdef HST_FLOAT16
    cudaChannelFormatDesc half_desc;	//!< Texture channel description (half precision)
#endif /* HST_FLOAT16 */

    cudaTextureObject_t tex;		//!< Texture object (stays 0 unless both HST_CUDA_ARRAY and HST_CREATE_TEXTURE are defined)
#ifdef HST_CUDA_ARRAY
    cudaArray *gpu_array;		//!< CUDA Array bound to the texture

# ifdef HST_CREATE_TEXTURE
    struct cudaTextureDesc tex_info;	//!< Texture description
    struct cudaResourceDesc tex_desc;	//!< Texture object description
# endif /* HST_CREATE_TEXTURE */
#endif /* HST_CUDA_ARRAY */

    cufftHandle fft_plan[2];		//!< Forward plans, one per stream: C2C with HST_FILTER2 (also used for the inverse transform), R2C otherwise
#ifndef HST_FILTER2
    cufftHandle ifft_plan[2];		//!< Inverse (C2R) plans, one per stream
#endif

    int current_buffer;         //!< 1 or 2 (0 means no active buffer yet)
    int synchronized;           //!< Indicates if current input buffer is synchronised
    float *gpu_const;		//!< Device copy of the per-projection tables (cos, sin, axis correction, block offset), MAXNPROJECTIONS floats each
    float *gpu_input[2];	//!< Input buffers (linked by gpu_data)
    float *gpu_output[2];	//!< Output buffers (linked by gpu_result)

    float *gpu_limits;		//!< Parameters for fai360-mode corrections
    float *gpu_data;		//!< Sinogram buffer in device memory
    float *fft_buffer;		//!< Filtering buffer
    float *gpu_buffer;		//!< Temporary computation buffer in GPU memory (for filtering, can hold a single batch)
    float *gpu_filter;		//!< Filter buffer in device memory
    float *gpu_result;		//!< Reconstructed slice is going here

# if SLICE_BLOCK > 1
    tfloat *gpu_input4[2];	//!< Input buffers (linked by gpu_data4)
    tfloat *gpu_data4;		//!< Currently selected multi-slice input buffer
    vfloat *gpu_output4[2];	//!< Output buffers (linked by gpu_result4)
    vfloat *gpu_result4;	//!< Currently selected multi-slice output buffer
#endif 
};
typedef struct GPUContextT GPUContext;


// Local alias for hw_calc_blocks (expected to come from hw_tools.h).
// NOTE(review): used here as "number of blocks of the given size needed to cover N items" - confirm rounding against hw_tools.
#define hst_cuda_calc_blocks hw_calc_blocks

/**
 * Builds the per-projection limits table (flat zone, signed slope zone) and
 * uploads it to ctx->gpu_limits.
 *
 * The table is padded up to a whole number of FFT batches; the padding entries
 * are computed from the nominal axis position instead of the per-projection
 * corrected one.
 *
 * @param ctx is initialized GPU context (fft_batch_size and gpu_limits are used)
 * @param setup is pointer on HSTSetup with various HST parameters
 * @result 0 on success, ENOMEM if the host table can't be allocated, ERANGE if
 * some projection has no overlapping region
 */
static int hst_cuda_configure_limits(GPUContext *ctx, HSTSetup *setup) {
    const int real_proj = setup->num_projections;
    const int padded_proj = ctx->fft_batch_size * hst_cuda_calc_blocks(real_proj, ctx->fft_batch_size, NULL);
    const int bins = setup->num_bins;

	// Carried over between projections: once clipped by an overlap, the smaller value stays in effect
    float slope = setup->pente_zone;

    float *limits = (float*)malloc(2 * padded_proj * sizeof(float));
    if (!limits) return ENOMEM;

    for (int p = 0; p < padded_proj; p++) {
	float axis, overlap, sign;
	float *entry = limits + 2 * p;

	if (p < real_proj) axis = setup->axis_position_corr_s[p];
	else axis = setup->axis_position;

	    // The overlap is measured from the detector edge which is closer to the axis
	if (2 * axis > bins) {
	    overlap = bins - axis;
	    sign = 1; // pente_zone / prof_fact
	} else {
	    overlap = axis;
	    sign = -1; // pente_zone / prof_fact
	}

	if (overlap <= 0) {
	    free(limits);
	    return ERANGE;
	}

	slope = MIN(slope, overlap);

	entry[0] = overlap - slope;	// flat zone
	entry[1] = sign * slope;	// slope zone, the sign encodes the side
    }

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */
    CUDA_SAFE_CALL(cudaMemcpy(ctx->gpu_limits, limits, 2*padded_proj*sizeof(float), cudaMemcpyHostToDevice));
#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    free(limits);

    return 0;
}

/**
 * Prints the collected backprojection timings and their summary statistics.
 *
 * The individual timings are listed (in milliseconds) unless more than 10000
 * runs were recorded; in HST_MEASURE_GPUBOOST builds both the BP and the FBP
 * timings are always listed. Afterwards ctx->stats is normalized to a single
 * slice (divided by SLICE_BLOCK, exactly once) and the number of runs, median,
 * mean, standard deviation, minimum and maximum are printed.
 *
 * The statistics are computed on a sorted copy, so ctx->stats keeps its
 * per-run order for later consumers (hst_cuda_heatup compares consecutive
 * entries). Only if the copy can't be allocated the array is sorted in place.
 *
 * Note that the normalization modifies ctx->stats: the function is intended
 * to be called once per collected data set.
 *
 * @param ctx is GPU context holding bp_runs, stats and fbp_stats
 */
void hst_cuda_print_timings(GPUContext *ctx) {
    size_t i;
    size_t runs = ctx->bp_runs;
    double *sorted = NULL;

	// Never read beyond the statically sized arrays
    if (runs > (size_t)CUDA_MAX_STATS) runs = (size_t)CUDA_MAX_STATS;

#ifndef HST_MEASURE_GPUBOOST
    if (ctx->bp_runs <= 10000) {
#endif
	printf("BP Timings:");
	for (i = 0; i < runs; i++) {
	    if ((i%10)==0) {
#ifdef HST_MEASURE_GPUBOOST
		printf("\n BP Stats: ");
#else
		printf("\n");
#endif
	    }
	    printf("% 6.3lf   ", ctx->stats[i] * 1000);
	}
	printf("\n\n");
#ifdef HST_MEASURE_GPUBOOST
	printf("FBP Timings:");
	for (i = 0; i < runs; i++) {
	    if ((i%10)==0) printf("\nFBP Stats: ");
	    printf("% 6.3lf   ", ctx->fbp_stats[i] * 1000);
	}
	printf("\n\n");
#else
    }
#endif

	// Normalize to a single slice; done exactly once in every build configuration
    for (i = 0; i < runs; i++) {
	ctx->stats[i] /= SLICE_BLOCK;
    }

	// Sort a private copy to preserve the chronological order of ctx->stats
    if (runs > 0) sorted = (double*)malloc(runs * sizeof(double));
    if (sorted) {
	for (i = 0; i < runs; i++) sorted[i] = ctx->stats[i];
    } else {
	sorted = ctx->stats;
    }

    gsl_sort (sorted, 1, runs);
    printf(" Runs: %zu, Median: %lf, Mean: %lf, StdDev: %lf, Min: %lf, Max: %lf\n", ctx->bp_runs, 
		gsl_stats_median_from_sorted_data(sorted, 1, runs), 
		gsl_stats_mean(sorted, 1, runs), 
		gsl_stats_sd(sorted, 1, runs),
		gsl_stats_min(sorted, 1, runs),
		gsl_stats_max(sorted, 1, runs)
    );

    if (sorted != ctx->stats) free(sorted);
}

/**
 * Heat-up / clock-stability check based on a benchmark kernel.
 *
 * With init set (and HST_HEATUP defined), the benchmark kernel is repeated
 * until the measured throughput stabilizes (or the iteration limit is hit) and
 * the result is stored in ctx->gflops_start.
 *
 * With init cleared, the collected timings are printed first. Then, if a
 * start-up measurement exists and HST_HEATUP is defined, the benchmark is run
 * once more, the throughput is compared with the start-up value, and all
 * slices whose timing differs from the previous one by more than
 * CUDA_HEATUP_MAX_CHANGE are listed.
 *
 * The benchmark is executed in the default stream of the device which is
 * current for the calling thread.
 *
 * @param ctx is GPU context
 * @param init selects between the start-up (non-zero) and the final (zero) mode
 */
void hst_cuda_heatup(GPUContext *ctx, int init) {
#ifdef HST_HEATUP
	float heatup_time;	// milliseconds, as reported by cudaEventElapsedTime
	size_t heatup_iter = 0;
	const size_t grid =  (size_t)CUDA_HEATUP_GRID * 256;
	const size_t computations = CUDA_HEATUP_ELEMENTS * grid  + (2 * CUDA_HEATUP_ELEMENTS * CUDA_HEATUP_KERNEL_ITERATIONS * grid );
	double gflops, gflops_last = 0, change = 1;
#endif

	if (!init) {
	    printf("\n\n\n");
	    hst_cuda_print_timings(ctx);
	    if (!ctx->gflops_start) return;
	}

#ifdef HST_HEATUP
	cudaEvent_t start, stop;
	CUDA_SAFE_CALL( cudaEventCreate(&start) );
	CUDA_SAFE_CALL( cudaEventCreate(&stop) );

	do {
	    CUDA_SAFE_CALL( cudaEventRecord(start, 0) );
	    hst_cuda_heatup_kernel<<<CUDA_HEATUP_GRID,256>>>(0, NULL);
		// A failed launch would otherwise go unnoticed and produce a bogus throughput
	    CUDA_SAFE_CALL( cudaGetLastError() );
	    CUDA_SAFE_CALL( cudaEventRecord(stop, 0) );
	    CUDA_SAFE_CALL( cudaEventSynchronize(stop) );
	    CUDA_SAFE_CALL( cudaEventElapsedTime(&heatup_time, start, stop) );
		// operations / ms * 1000 = operations per second
	    gflops = ((double)computations)/heatup_time*1000./(double)(1000*1000*1000);

	    if (gflops_last) {
		change = fabs(gflops - gflops_last) / gflops_last;
	    }
	    gflops_last = gflops;
	    if (heatup_iter > CUDA_HEATUP_MIN_ITERATIONS)
		printf("Change: %6.3lf%%    current: %9.3lf GFlops (%9.6lf s)      iter: %zu\n", 100 * change, gflops, heatup_time / 1000., heatup_iter);

	    heatup_iter++;
	} while ((init)&&((heatup_iter < CUDA_HEATUP_MIN_ITERATIONS)||((change > CUDA_HEATUP_MAX_CHANGE)&&(heatup_iter<CUDA_HEATUP_MAX_ITERATIONS))));

	CUDA_SAFE_CALL( cudaEventDestroy(stop) );
	CUDA_SAFE_CALL( cudaEventDestroy(start) );
	
	if (init) {
	    printf("Heating is complete...\n");
	    ctx->gflops_start = gflops;
	} else if (ctx->gflops_start) {
	    change = fabs(gflops - ctx->gflops_start) / ctx->gflops_start;
	    if (change > 0.05) printf("WARNING: ****** ");
	    printf(" Clock change: %6.3lf%%    current: %9.3lf GFlops, start: %9.3lf GFlops\n", 100 * change, gflops, ctx->gflops_start);

	    printf("\n\n Details:\n");
		// NOTE(review): meaningful only while ctx->stats is in acquisition order;
		// hst_cuda_print_timings (called above) decides in which order the array is left
	    for (int i = 1; (size_t)i < ctx->bp_runs; i++) {
		if (!ctx->stats[i - 1]) continue;	// no baseline to compare with
		change = fabs(ctx->stats[i] - ctx->stats[i - 1]) / ctx->stats[i - 1];
		if (change > CUDA_HEATUP_MAX_CHANGE) {
		    printf("  Timing change: %6.3lf%%    slice: % 5i current: %9.6lf s, before: %9.6lf s\n", 100 * change, i, ctx->stats[i], ctx->stats[i-1]);
		}
	    }
	}
#endif
}


/**
 * Creates a zero-filled GPU context (device resources are not allocated yet,
 * see hst_cuda_init_context).
 *
 * @param prototype is pointer on HSTReconstructor describing the reconstruction module
 * @param setup is pointer on HSTSetup with various HST parameters
 * @param id is the sequence number of the context; contexts are mapped to
 * devices in groups of PARALLEL_PER_GPU
 * @result created context or NULL if it could not be allocated
 */
static HSTReconstructorContext *hst_cuda_create_context(HSTReconstructor *prototype, HSTSetup *setup, int id) {
    assert(prototype);
    assert(setup);

    assert((id/PARALLEL_PER_GPU) < prototype->devices);

    /* FIXME: no error code in case of out-of-memory */
    GPUContext *gpu = (GPUContext*)calloc(1, sizeof(struct GPUContextT));
    if (!gpu) return NULL;

#ifdef PYHST_MEASURE_TIMINGS
	// All timers are created in the stopped state
    GTimer **timers[] = {
	&gpu->main_timer, &gpu->pre_timer, &gpu->togpu_timer,
	&gpu->init_timer, &gpu->fromgpu_timer, &gpu->texture_timer
    };
    const int n_timers = (int)(sizeof(timers) / sizeof(timers[0]));
    int incomplete = 0;

    for (int t = 0; t < n_timers; t++) {
	*timers[t] = g_timer_new();
	if (*timers[t]) g_timer_stop(*timers[t]);
	else incomplete = 1;
    }

    if (incomplete) {
	    // Release whatever was created, in reverse order
	for (int t = n_timers - 1; t >= 0; t--) {
	    if (*timers[t]) g_timer_destroy(*timers[t]);
	}
	free(gpu);
	return NULL;
    }
#endif /* PYHST_MEASURE_TIMINGS */

    hst_reconstructor_init_context((HSTReconstructorContext*)gpu, prototype, setup);

    gpu->device = id / PARALLEL_PER_GPU;

    return (HSTReconstructorContext*)gpu;
}

/**
  * Stops the initialization timer (when timings are compiled in) and reports
  * an out-of-memory condition. Used by the error paths of hst_cuda_init_context.
  */
static int hst_cuda_init_enomem(GPUContext *ctx) {
#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->init_timer);
#else
    (void)ctx;
#endif /* PYHST_MEASURE_TIMINGS */
    return ENOMEM;
}

/**
  * Initializes GPU context: derives the processing geometry (batch sizes, grid
  * and block sizes, kernel family) from the setup and the device properties,
  * uploads the per-projection constants, allocates the device buffers, and
  * creates the streams and cuFFT plans.
  *
  * Failures of CUDA/cuFFT calls are handled by CUDA_SAFE_CALL / CUFFT_SAFE_CALL.
  * ctx->initialized is set before the device resources are allocated, so that
  * a partially initialized context is still cleaned up.
  *
  * @param rctx is uninitialized GPU context
  * @param thr is the handle of the calling worker thread (not used here)
  * @result return code and 0 indicates success; ENOMEM is returned if a host
  * staging buffer can't be allocated
  */
static int hst_cuda_init_context(HSTReconstructorContext *rctx, HWThread thr) {
    GPUContext *ctx;
    HSTSetup *setup;
    
    int talign;				// The mandatory alignment of texture line in floats
    int dim_result;			// The Y-dimmension of result buffer
    int num_projections;		// Projection dimensions (allocation size, rounded for blocked processing)
    //int filled_projections;		// Number of projections which will be filled in the kernels
    int bsx = BLOCK_SIZE_X;
    int bsy = BLOCK_SIZE_Y;
    int ppt = 1;			// Points per thread
    int texlin = 1;			// Indicates if we are building Tex-based kernel or ALU-based kernel (Hybrid defaults to Tex)

    assert(rctx);

    ctx = GPU_CONTEXT(rctx);
    setup = rctx->setup;
    
    assert(setup);
	/* This limitation is due to statically allocated memory for sin_s, cos_s, etc. It probably could be done
	dynamically in run-time, just access performance should be checked */
    assert(setup->num_projections <= MAXNPROJECTIONS);
	/* Otherwise, the filter kernel will corrupt the data and we need to limit the CUDA block_size here as well */
    assert (2 * setup->dim_fft >= BLOCK_SIZE);
    
    /* Why the width is doubled?
	We will compute two real convolutions as a single complex convolution.
	To achieve that we interleaving the data from two float arrays into 
	the complex one (i.e. first real vector is forming real part of complex
	numbers and second - the imaginary part). Then, we perform the C2C
	convolution of resulting vector and disassemble the resulting complex
	vector into the two real-values sequences.
	For details, see:
	http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM
    */
    
    ctx->bin_grid_size = hst_cuda_calc_blocks(setup->num_bins, BLOCK_SIZE, NULL);
    ctx->bin_pitch_size = ctx->bin_grid_size * BLOCK_SIZE;

	// Additionally round the pitch up to the texture alignment of the device
    talign = hst_cuda_device_prop[ctx->device].textureAlignment / sizeof(float);
    ctx->bin_pitch_size = talign * hst_cuda_calc_blocks(ctx->bin_pitch_size, talign, NULL);

    ctx->filter_grid_size = 2 * setup->dim_fft / BLOCK_SIZE;
    if (setup->num_projections < BLOCK_SIZE * FFT_BATCH_SIZE) {
	    // Everything fits in a single FFT batch: an even number of projections, rounded up to whole blocks
	num_projections = setup->num_projections;
	if (num_projections%2) ++num_projections;
	
	ctx->projection_grid_size = hst_cuda_calc_blocks(num_projections/2, BLOCK_SIZE, NULL);
	num_projections = 2 * ctx->projection_grid_size * BLOCK_SIZE;
	
	 ctx->fft_batch_size = num_projections;
    } else { 
	ctx->fft_batch_size = BLOCK_SIZE * FFT_BATCH_SIZE;
	ctx->projection_grid_size = FFT_BATCH_SIZE / 2;
	
	num_projections = ctx->fft_batch_size * hst_cuda_calc_blocks(setup->num_projections, ctx->fft_batch_size, NULL);
    }	

	// Exactly one architecture flag is set
    if (hst_cuda_device_prop[ctx->device].major > 5) ctx->pascal = 1;
    else if (hst_cuda_device_prop[ctx->device].major > 4) ctx->maxwell = 1;
    else if (hst_cuda_device_prop[ctx->device].major > 2) ctx->kepler = 1;
    else if (hst_cuda_device_prop[ctx->device].major > 1) ctx->fermi = 1;
    else ctx->gt200 = 1;

#if defined(HST_BASE_KERNEL)
    ctx->base_kernel = 1;
#elif defined(HST_TEX_KERNEL)
    ctx->tex_kernel = 1;
#elif defined(HST_LINEAR_KERNEL)
    ctx->linear_kernel = 1;
#else
    if (ctx->gt200) {
	ctx->base_kernel = 1;
    } else if (ctx->fermi) {
	ctx->linear_kernel = 1;
    } else {
# if (SLICE_BLOCK > 1)||(!HST_OPTIMIZE_KEPLER)
	ctx->tex_kernel = 1;
# else
	if (hst_cuda_device_prop[ctx->device].major > 4) {
	    ctx->linear_kernel = 1;
	} else {
	    if (setup->oversampling) ctx->linear_kernel = 1;
	    else ctx->tex_kernel = 1;
	}
# endif
    }
#endif

    if (ctx->linear_kernel) {
	bsx = HST_LINEAR_BLOCK;
	bsy = HST_LINEAR_BLOCK;
	if (setup->oversampling > 1) ppt = HST_OVERS_PPT;
	else if (setup->oversampling > 0) ppt = HST_NN_PPT;
	else ppt = HST_LINEAR_PPT;
    } else if (ctx->base_kernel) {
#if defined(HST_BASE_REMAP)&&defined(HST_BASE_PPT)
        ppt = HST_BASE_PPT;
#else
        ppt = 1;
#endif
    } else ppt = 1;


    //filled_projections = num_projections;
    num_projections = 2 * bsy * hst_cuda_calc_blocks(num_projections, 2 * bsy, NULL);


	// i.e. linear & mplinear kernels without oversampling (linear & nn modes)
#ifndef HST_HYBRID
    if ((ctx->linear_kernel)&&(setup->oversampling <= 1)) {
	texlin = 0;
    }
#endif

#ifdef HST_HALF_MODE
    if (setup->oversampling > 1) {
	printf("The float16 mode is not supported with oversampling, only NN interpolation of texture engine...\n\n");
	exit(1);
    } else if (((ctx->base_kernel)||(ctx->tex_kernel))&&(!setup->oversampling)) {
	printf("The Linear interpolation in float16 mode is only supported using ALU kernel, only NN interpolation of texture engine...\n\n");
	exit(1);
    }
    texlin = 0;
#endif

#ifdef HW_USE_THREADS
	// The device has to be selected before the heat-up below, otherwise the benchmark
	// (and the reference throughput stored in the context) would be taken on whatever
	// device happens to be current for this thread
# ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->init_timer);
# endif /* PYHST_MEASURE_TIMINGS */
    CUDA_SAFE_CALL(cudaSetDevice(hst_cuda_device_num[ctx->device]));
# ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->init_timer);
# endif /* PYHST_MEASURE_TIMINGS */
#endif /* HW_USE_THREADS */

	// On Kepler and later heatup
    if (hst_cuda_device_prop[ctx->device].major > 2) {
	hst_cuda_heatup(ctx, 1);
    }

    ctx->texlin = texlin;
    ctx->points_per_thread = ppt;
    ctx->block_size_x = bsx;
    ctx->block_size_y = bsy;

    ctx->bp_grid_columns = ppt * (int)(setup->num_x * 1.0 / (ppt * bsx) + 0.9999999999);
    if ((!BP_BATCH_SIZE)||(setup->num_y < bsy * BP_BATCH_SIZE)) {
	ctx->bp_batch_size = setup->num_y;

	ctx->bp_grid_lines = ppt * (int)(setup->num_y * 1.0 / (ppt * bsy) + 0.9999999999);

	dim_result = ctx->bp_grid_lines * bsy;
    } else {
	ctx->bp_batch_size = bsy * BP_BATCH_SIZE;
	ctx->bp_grid_lines = BP_BATCH_SIZE;
	
	dim_result = ctx->bp_batch_size * hst_cuda_calc_blocks(setup->num_y, ctx->bp_batch_size, NULL);
    }

	// From here on the context owns resources that have to be released
    ctx->initialized = 1;    

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    ctx->float_desc = cudaCreateChannelDesc<float>();
#ifdef HST_FLOAT16
    ctx->half_desc = cudaCreateChannelDescHalf();
#endif /* HST_FLOAT16 */

    ctx->float4_desc = cudaCreateChannelDesc<tfloat>();

	// All manual kernels processing multiple points, and all Tex-based only one

	// Check when needed
    if (texlin) tex_projes.filterMode = cudaFilterModeLinear; 
    else tex_projes.filterMode = cudaFilterModePoint;

    tex_projes.addressMode[0] = cudaAddressModeBorder;
    tex_projes.addressMode[1] = cudaAddressModeBorder;

#ifndef HST_CREATE_TEXTURE
    if (texlin) array_projes.filterMode = cudaFilterModeLinear; 
    else array_projes.filterMode = cudaFilterModePoint;
    array_projes.addressMode[0] = cudaAddressModeBorder;
    array_projes.addressMode[1] = cudaAddressModeBorder;
#endif
    
	// Per-projection constants: (cos, sin, corrected axis position, block offset); the padding is zeroed
    float4 *all = (float4*)malloc(num_projections * sizeof(float4));
    if (!all) return hst_cuda_init_enomem(ctx);

    for (int i = 0; i < setup->num_projections; i++) {
	all[i].x = setup->cos_s[i];
	all[i].y = setup->sin_s[i];
	all[i].z = setup->axis_position_corr_s[i];
	all[i].w = floor(ppt * MIN(MIN(bsx * all[i].x, - bsy * all[i].y), MIN(0., bsx * all[i].x - bsy * all[i].y)));	// we actually don't need floor here.
    }
    if (setup->num_projections < num_projections) {
	memset(all + setup->num_projections, 0, (num_projections - setup->num_projections)*sizeof(float4));
    }
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_all, all, num_projections*sizeof(float4)));

#ifdef HST_C_SIN
    float *trig = (float*)malloc(num_projections * sizeof(float));
    if (!trig) {
	free(all);
	return hst_cuda_init_enomem(ctx);
    }
    for (int i = 0; i < setup->num_projections; i++) {
	trig[i] = setup->sin_s[i];
    }
    if (setup->num_projections < num_projections) {
	memset(trig + setup->num_projections, 0, (num_projections - setup->num_projections)*sizeof(float));
    }
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sin, trig, num_projections*sizeof(float)));
    free(trig);
#endif

#ifdef HST_C_TRIG
    float2 *ofst = (float2*)malloc(num_projections * sizeof(float2));
    float2 *trig = (float2*)malloc(num_projections * sizeof(float2));
    if ((!ofst)||(!trig)) {
	free(trig);
	free(ofst);
	free(all);
	return hst_cuda_init_enomem(ctx);
    }
    for (int i = 0; i < setup->num_projections; i++) {
	float z = setup->axis_position_corr_s[i];
	float w = floor(ppt * MIN(MIN(bsx * all[i].x, - bsy * all[i].y), MIN(0., bsx * all[i].x - bsy * all[i].y)));

	trig[i].x = setup->cos_s[i];
	trig[i].y = setup->sin_s[i];

	    // SHMEM_SAMPLING * (all.z - FANCY_ROUND_CORRECTION)
#ifdef HST_LINEAR_FANCY_FLOOR
	ofst[i].x = ((setup->oversampling>0)?setup->oversampling:1) * (z + ((setup->oversampling==0)?1.f:0.f));
#else
	ofst[i].x = ((setup->oversampling>0)?setup->oversampling:1) * (z);
#endif
	ofst[i].y = z + w;
	
//	ofst[i].x = setup->axis_position_corr_s[i];
//	ofst[i].y = floor(ppt * MIN(MIN(bsx * all[i].x, - bsy * all[i].y), MIN(0., bsx * all[i].x - bsy * all[i].y)));
    }
    if (setup->num_projections < num_projections) {
	memset(trig + setup->num_projections, 0, (num_projections - setup->num_projections)*sizeof(float2));
	memset(ofst + setup->num_projections, 0, (num_projections - setup->num_projections)*sizeof(float2));
    }
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_trig, trig, num_projections*sizeof(float2)));
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_ofst, ofst, num_projections*sizeof(float2)));
    free(trig);
    free(ofst);
#endif

//    CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_coss, setup->cos_s, setup->num_projections*sizeof(float)));
//    CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sins, setup->sin_s, setup->num_projections*sizeof(float)));
//    CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_axiss, setup->axis_position_corr_s, setup->num_projections*sizeof(float)));

	// The same constants as four consecutive tables of MAXNPROJECTIONS floats in global memory
    float *g_all = (float*)malloc(4 * MAXNPROJECTIONS * sizeof(float));
    if (!g_all) {
	free(all);
	return hst_cuda_init_enomem(ctx);
    }
    memset(g_all, 0, 4 * MAXNPROJECTIONS  * sizeof(float));
    for (int i = 0; i < setup->num_projections; i++) {
	g_all[0 * MAXNPROJECTIONS + i] = setup->cos_s[i];
	g_all[1 * MAXNPROJECTIONS + i] = setup->sin_s[i];
	g_all[2 * MAXNPROJECTIONS + i] = setup->axis_position_corr_s[i];
	g_all[3 * MAXNPROJECTIONS + i] = floor(ppt * MIN(MIN(bsx * all[i].x, - bsy * all[i].y), MIN(0., bsx * all[i].x - bsy * all[i].y)));
    }
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_const, 4 * MAXNPROJECTIONS * sizeof(float)));
    CUDA_SAFE_CALL(cudaMemcpy(ctx->gpu_const, g_all,  4 * MAXNPROJECTIONS  * sizeof(float), cudaMemcpyHostToDevice));
    free(g_all);
    free(all);

#ifdef HST_CUDA_ARRAY
    CUDA_SAFE_CALL(cudaMallocArray(&ctx->gpu_array, &ctx->float4_desc, setup->num_bins, setup->num_projections));
# ifdef HST_CREATE_TEXTURE
    ctx->tex_desc.resType = cudaResourceTypeArray;
    ctx->tex_desc.res.array.array = ctx->gpu_array;
    if (texlin) ctx->tex_info.filterMode = cudaFilterModeLinear; 
    else ctx->tex_info.filterMode = cudaFilterModePoint;

    ctx->tex_info.addressMode[0] = cudaAddressModeBorder;
    ctx->tex_info.addressMode[1] = cudaAddressModeBorder;
//    ctx->tex_info.readMode = cudaReadModeElementType;
//    ctx->tex_info.normalizedCoords = 1;
    CUDA_SAFE_CALL(cudaCreateTextureObject(&ctx->tex, &ctx->tex_desc, &ctx->tex_info, NULL));
# else /* HST_CREATE_TEXTURE */
    CUDA_SAFE_CALL( cudaBindTextureToArray(array_projes, ctx->gpu_array, ctx->float4_desc) );
# endif /* HST_CREATE_TEXTURE */
#endif /* HST_CUDA_ARRAY */


	// Double-buffered sinogram input
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_input[0], SLICE_BLOCK * num_projections * ctx->bin_pitch_size * sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_input[1], SLICE_BLOCK * num_projections * ctx->bin_pitch_size * sizeof(float)));

#if SLICE_BLOCK == 4
# ifdef HST_HALF_MODE
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_input4[0], num_projections * ctx->bin_pitch_size * sizeof(float2)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_input4[1], num_projections * ctx->bin_pitch_size * sizeof(float2)));
# else
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_input4[0], num_projections * ctx->bin_pitch_size * sizeof(float4)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_input4[1], num_projections * ctx->bin_pitch_size * sizeof(float4)));
# endif
    // Only simple algorithm here
#elif SLICE_BLOCK == 2
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_input4[0], num_projections * ctx->bin_pitch_size * sizeof(float2)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_input4[1], num_projections * ctx->bin_pitch_size * sizeof(float2)));
    // Only simple algorithm here
#else /* HST_CUDA_4SLICE_MODE */
	// Zero the padding projections once, they are not overwritten by the real data
    if (setup->num_projections < num_projections) {
	CUDA_SAFE_CALL(cudaMemset(ctx->gpu_input[0] + setup->num_projections * ctx->bin_pitch_size, 0, (num_projections - setup->num_projections)*ctx->bin_pitch_size*sizeof(float)));
	CUDA_SAFE_CALL(cudaMemset(ctx->gpu_input[1] + setup->num_projections * ctx->bin_pitch_size, 0, (num_projections - setup->num_projections)*ctx->bin_pitch_size*sizeof(float)));
    }
#endif /* HST_CUDA_4SLICE_MODE */

#if SLICE_BLOCK > 1
    ctx->gpu_data = ctx->gpu_input[0];
    ctx->gpu_data4 = ctx->gpu_input4[0];
#endif


    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_buffer, SLICE_BLOCK * ctx->fft_batch_size*setup->dim_fft*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->fft_buffer,    2 * SLICE_BLOCK * ctx->fft_batch_size*setup->dim_fft*sizeof(float)));
    CUDA_SAFE_CALL(cudaMemset((void*)  ctx->fft_buffer, 0, 2 * SLICE_BLOCK * ctx->fft_batch_size*setup->dim_fft*sizeof(float)));

    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_filter, 2*setup->dim_fft*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_limits, 2*num_projections*sizeof(float)));
    
	// Double-buffered reconstruction output
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_output[0], SLICE_BLOCK * dim_result * ctx->bp_grid_columns * ctx->block_size_x * sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_output[1], SLICE_BLOCK * dim_result * ctx->bp_grid_columns * ctx->block_size_x * sizeof(float)));

#if SLICE_BLOCK == 4
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_output4[0], dim_result * ctx->bp_grid_columns * ctx->block_size_x * sizeof(float4)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_output4[1], dim_result * ctx->bp_grid_columns * ctx->block_size_x * sizeof(float4)));
#elif SLICE_BLOCK == 2
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_output4[0], dim_result * ctx->bp_grid_columns * ctx->block_size_x * sizeof(float2)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_output4[1], dim_result * ctx->bp_grid_columns * ctx->block_size_x * sizeof(float2)));
#endif /* HST_CUDA_4SLICE_MODE */


#if SLICE_BLOCK > 1
    ctx->gpu_result = ctx->gpu_output[0];
    ctx->gpu_result4 = ctx->gpu_output4[0];
#endif

//    CUDA_SAFE_CALL(cudaMalloc((void**)&ctx->gpu_result, dim_result * ctx->bp_grid_columns * BLOCK_SIZE_X * sizeof(float)));
//    CUDA_SAFE_CALL(cudaMemset(ctx->gpu_result, 0, sizeof(float) * dim_result));


	// The streams are handed to cuFFT and used for all transfers, so a failure here must not pass silently
    CUDA_SAFE_CALL(cudaStreamCreate(&ctx->stream[0]));
    CUDA_SAFE_CALL(cudaStreamCreate(&ctx->stream[1]));
    CUDA_SAFE_CALL(cudaStreamCreate(&ctx->stream[2]));
    CUDA_SAFE_CALL(cudaStreamCreate(&ctx->stream[3]));

#ifdef HST_FILTER2
    CUFFT_SAFE_CALL(cufftPlan1d(&ctx->fft_plan[0], setup->dim_fft, CUFFT_C2C, SLICE_BLOCK * (ctx->fft_batch_size/2)));
    CUFFT_SAFE_CALL(cufftPlan1d(&ctx->fft_plan[1], setup->dim_fft, CUFFT_C2C, SLICE_BLOCK * (ctx->fft_batch_size/2)));
#else
    int dim_fft = setup->dim_fft;

	// In Fourier domain, the parameters are given in cufftComplex. I.e. they are double size
    CUFFT_SAFE_CALL(cufftPlanMany(&ctx->fft_plan[0], 1, &dim_fft, &dim_fft, 1, dim_fft, &dim_fft, 1, dim_fft, CUFFT_R2C, SLICE_BLOCK * (ctx->fft_batch_size)));
    CUFFT_SAFE_CALL(cufftPlanMany(&ctx->fft_plan[1], 1, &dim_fft, &dim_fft, 1, dim_fft, &dim_fft, 1, dim_fft, CUFFT_R2C, SLICE_BLOCK * (ctx->fft_batch_size)));
    CUFFT_SAFE_CALL(cufftPlanMany(&ctx->ifft_plan[0], 1, &dim_fft, &dim_fft, 1, dim_fft, &dim_fft, 1, dim_fft, CUFFT_C2R, SLICE_BLOCK * (ctx->fft_batch_size)));
    CUFFT_SAFE_CALL(cufftPlanMany(&ctx->ifft_plan[1], 1, &dim_fft, &dim_fft, 1, dim_fft, &dim_fft, 1, dim_fft, CUFFT_C2R, SLICE_BLOCK * (ctx->fft_batch_size)));
#endif

#ifndef BUGGY_CUFFT_SET_STREAM
    CUFFT_SAFE_CALL(cufftSetStream(ctx->fft_plan[0], ctx->stream[0]));
    CUFFT_SAFE_CALL(cufftSetStream(ctx->fft_plan[1], ctx->stream[1]));
# ifndef HST_FILTER2
    CUFFT_SAFE_CALL(cufftSetStream(ctx->ifft_plan[0], ctx->stream[0]));
    CUFFT_SAFE_CALL(cufftSetStream(ctx->ifft_plan[1], ctx->stream[1]));
# endif
#endif /* ! BUGGY_CUFFT_SET_STREAM */
    
    /* 
	Shall we create a smaller plan for the last iteration?
    */

	// NOTE(review): the return codes of the configuration calls below are not checked;
	// presumably they are treated as best-effort tuning hints - confirm before adding checks
#ifdef HST_TEX_KERNEL
/*# if SLICE_BLOCK == 1
    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
# else
# endif
*/
    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
#else
//    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
#endif


    cudaFuncSetCacheConfig(hst_cuda_kernel, cudaFuncCachePreferL1);

#if defined(HST_KEPLER_PREFER_L1_EQUAL)
    printf(" => 32KB L1 / 32 KB Shared Memory\n");
    cudaFuncSetCacheConfig(hst_kepler_kernel, cudaFuncCachePreferEqual);
#elif defined(HST_KEPLER_PREFER_L1)
    printf(" => Prefering L1 over SharedMemory\n");
	// We expect register spills and need L1 to overcome
    cudaFuncSetCacheConfig(hst_kepler_kernel, cudaFuncCachePreferL1);
#else
    printf(" => Standard configuration, 48KB of Shared Memory on Kepler\n");
#endif /* HST_KEPLER_PREFER_L1 */


    cudaFuncSetCacheConfig(hst_cuda_alu_linear, cudaFuncCachePreferShared);
    cudaFuncSetCacheConfig(hst_cuda_alu_oversample4, cudaFuncCachePreferShared);
    cudaFuncSetCacheConfig(hst_cuda_alu_nn, cudaFuncCachePreferShared);


#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    ctx->fbp_timer = g_timer_new();
    g_timer_start(ctx->fbp_timer);

    return 0;
}

/**
  * Release the GPU resources held by a context and reset its GPU-specific state
  *
  * When the context is flagged as initialized, the cuFFT plans, the texture/array
  * objects (HST_CUDA_ARRAY builds only), the four CUDA streams, the device buffers
  * and the FBP statistics timer are released. Regardless of that flag, the tail of
  * the GPUContext structure (everything past the embedded HSTReconstructorContext
  * plus GPU_CONTEXT_MEMSET_OFFSET bytes) is zeroed afterwards.
  *
  * @param rctx is the reconstructor context embedding the GPU context (must not be NULL)
  */
static void hst_cuda_free_context(HSTReconstructorContext *rctx) {
    GPUContext *ctx;

    assert(rctx);
    ctx = GPU_CONTEXT(rctx);
    assert(ctx);

    if (ctx->initialized) {
        hst_cuda_heatup(ctx, 0);

#ifdef PYHST_MEASURE_TIMINGS
            // The tear-down time is accounted to the initialization timer
        g_timer_continue(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */
        if (ctx->fft_plan[1]) CUFFT_SAFE_CALL(cufftDestroy(ctx->fft_plan[1]));
        if (ctx->fft_plan[0]) CUFFT_SAFE_CALL(cufftDestroy(ctx->fft_plan[0]));
#ifndef HST_FILTER2
            // Dedicated inverse plans only exist when the R2C/C2R filtering path is compiled in
        if (ctx->ifft_plan[1]) CUFFT_SAFE_CALL(cufftDestroy(ctx->ifft_plan[1]));
        if (ctx->ifft_plan[0]) CUFFT_SAFE_CALL(cufftDestroy(ctx->ifft_plan[0]));
#endif

#ifdef HST_CUDA_ARRAY
# ifdef HST_CREATE_TEXTURE
        cudaDestroyTextureObject(ctx->tex);
# else /*  HST_CREATE_TEXTURE */
        cudaUnbindTexture(array_projes);
# endif /* HST_CREATE_TEXTURE */
        cudaFreeArray(ctx->gpu_array);
#endif /* HST_CUDA_ARRAY */

        if (ctx->stream[3]) cudaStreamDestroy(ctx->stream[3]);
        if (ctx->stream[2]) cudaStreamDestroy(ctx->stream[2]);
        if (ctx->stream[1]) cudaStreamDestroy(ctx->stream[1]);
        if (ctx->stream[0]) cudaStreamDestroy(ctx->stream[0]);

# if SLICE_BLOCK > 1
        if (ctx->gpu_output4[1]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_output4[1]));
        if (ctx->gpu_output4[0]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_output4[0]));
        if (ctx->gpu_input4[1]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_input4[1]));
        if (ctx->gpu_input4[0]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_input4[0]));
#endif /* HST_CUDA_4SLICE_MODE */

        if (ctx->gpu_output[1]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_output[1]));
        if (ctx->gpu_output[0]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_output[0]));
        if (ctx->gpu_limits) CUDA_SAFE_CALL(cudaFree(ctx->gpu_limits));
        if (ctx->gpu_filter) CUDA_SAFE_CALL(cudaFree(ctx->gpu_filter));
        if (ctx->fft_buffer) CUDA_SAFE_CALL(cudaFree(ctx->fft_buffer));
        if (ctx->gpu_buffer) CUDA_SAFE_CALL(cudaFree(ctx->gpu_buffer));
        if (ctx->gpu_input[1]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_input[1]));
        if (ctx->gpu_input[0]) CUDA_SAFE_CALL(cudaFree(ctx->gpu_input[0]));
        if (ctx->gpu_const) CUDA_SAFE_CALL(cudaFree(ctx->gpu_const));

            // fbp_timer is allocated with g_timer_new() by every context initialization,
            // so it has to be released here or each init/free cycle leaks one GTimer.
            // The pointer is cleared explicitly in case it lies outside the region zeroed below.
        if (ctx->fbp_timer) {
            g_timer_destroy(ctx->fbp_timer);
            ctx->fbp_timer = NULL;
        }
#ifdef PYHST_MEASURE_TIMINGS
        g_timer_stop(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */
    }

        // Reset the GPU-specific state; the leading GPU_CONTEXT_MEMSET_OFFSET bytes of that state are preserved
    memset(((char*)ctx) + sizeof(HSTReconstructorContext) + GPU_CONTEXT_MEMSET_OFFSET, 0, sizeof(GPUContext) - sizeof(HSTReconstructorContext) - GPU_CONTEXT_MEMSET_OFFSET);
}

/**
  * Release all resources of a GPU context and free the context itself
  *
  * The GPU resources and the generic reconstructor resources are released first;
  * the profiling timers (PYHST_MEASURE_TIMINGS builds only) are destroyed after
  * that and, finally, the memory of the context is returned with free().
  *
  * @param rctx is the reconstructor context to destroy; it must not be used afterwards
  */
static void hst_cuda_destroy_context(HSTReconstructorContext *rctx) {
    hst_cuda_free_context(rctx);
    hst_reconstructor_free_context(rctx);

#ifdef PYHST_MEASURE_TIMINGS
    {
            // The timers are still valid at this point and are only released here
        GPUContext *gpu = GPU_CONTEXT(rctx);

        g_timer_destroy(gpu->texture_timer);
        g_timer_destroy(gpu->fromgpu_timer);
        g_timer_destroy(gpu->init_timer);
        g_timer_destroy(gpu->togpu_timer);
        g_timer_destroy(gpu->pre_timer);
        g_timer_destroy(gpu->main_timer);
    }
#endif /* PYHST_MEASURE_TIMINGS */

    free(rctx);
}


/**
  * Propagate changed setup parameters to the GPU
  *
  * On HST_FILTER_CHANGED the filter (2 * dim_fft floats) is copied from the setup
  * to the device. On HST_LIMITS_CHANGED the limits are reconfigured, but only if
  * the fai360 mode is enabled in the setup.
  *
  * @param rctx is the reconstructor context embedding the GPU context (must not be NULL)
  * @param what_changed is a bit mask of the HST_*_CHANGED flags
  * @return 0 on success; a failure of hst_cuda_configure_limits is handled by check_code
  */
static int hst_cuda_configure(HSTReconstructorContextPtr rctx, HSTChanged what_changed) {
    assert(rctx);

    GPUContext *ctx = GPU_CONTEXT(rctx);
    HSTSetup *setup = rctx->setup;

    const int filter_changed = (what_changed & HST_FILTER_CHANGED) != 0;
    const int limits_changed = (what_changed & HST_LIMITS_CHANGED) != 0;

    if (filter_changed) {
#ifdef PYHST_MEASURE_TIMINGS
        g_timer_continue(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */
        CUDA_SAFE_CALL(cudaMemcpy(ctx->gpu_filter, setup->filter, (2 * setup->dim_fft)*sizeof(float), cudaMemcpyHostToDevice));
#ifdef PYHST_MEASURE_TIMINGS
        g_timer_stop(ctx->init_timer);
#endif /* PYHST_MEASURE_TIMINGS */
    }

        // The limits are only uploaded in the fai360 mode
    if (limits_changed && setup->fai360) {
        check_code(hst_cuda_configure_limits(ctx, setup), "hst_cuda_configure_limit");
    }

    return 0;
}

/**
  * Get the name of the CUDA device the context is bound to
  *
  * @param rctx is the reconstructor context embedding the GPU context (must not be NULL)
  * @return the name stored in the cached device properties of the context's device
  */
static HSTConstString hst_cuda_get_title(HSTReconstructorConstContextPtr rctx) {
    assert(rctx);

    cudaDeviceProp *properties = &hst_cuda_device_prop[GPU_CONTEXT(rctx)->device];
    return properties->name;
}


/**
  * Report the accumulated profiling timers
  *
  * In PYHST_MEASURE_TIMINGS builds, and only if timers is not NULL, slots 1..6
  * receive the elapsed seconds of the togpu, fromgpu, texture, pre, main and
  * init timers (in this order) and slot 0 receives the sum of slots 1..5, i.e.
  * the initialization time is not part of the total. In other builds the array
  * is left untouched.
  *
  * @param rctx is the reconstructor context embedding the GPU context
  * @param timers is an array with at least 7 elements to fill, or NULL
  * @return the table of timer names (hst_cuda_timers)
  */
static HSTConstString *hst_cuda_get_timers(HSTReconstructorConstContextPtr rctx, double *timers) {
#ifdef PYHST_MEASURE_TIMINGS
    assert(rctx);

    if (timers != NULL) {
        const GPUContext *gpu = GPU_CONTEXT(rctx);
        GTimer *const sources[6] = {
            gpu->togpu_timer,
            gpu->fromgpu_timer,
            gpu->texture_timer,
            gpu->pre_timer,
            gpu->main_timer,
            gpu->init_timer
        };
        double total = 0;
        int slot;

        for (slot = 1; slot <= 6; ++slot) {
            timers[slot] = g_timer_elapsed(sources[slot - 1], NULL);
                // no init included
            if (slot < 6) total += timers[slot];
        }

        timers[0] = total;
    }
#endif /* PYHST_MEASURE_TIMINGS */

    return hst_cuda_timers;
}

/**
  * Stage sinograms on the GPU using the two alternating input buffers
  *
  * If a buffer was sent before (current_buffer is not zero), the function first
  * waits for stream[2], makes that buffer (and the matching output buffer) the
  * active gpu_data/gpu_result pair and flags the context as synchronized.
  * Afterwards, if data is supplied, the other buffer slot is selected and the
  * data is copied into it: synchronously if the global blocking flag is set,
  * otherwise asynchronously on stream[2]. Without data the slot index is reset.
  *
  * @param rctx is the reconstructor context embedding the GPU context
  * @param data is the host buffer with the sinograms, or NULL to only finish the pending transfer
  * @return always 0; failures of the CUDA calls are handled by CUDA_SAFE_CALL
  */
static int hst_cuda_send(HSTReconstructorContext *rctx, const float *data) {
    GPUContext *ctx = GPU_CONTEXT(rctx);
    HSTSetup *setup = rctx->setup;

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->togpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

/*
     // CUDA 4.2 profiler will show nothing if there is not enough runtime on default stream
    if ((!ctx->current_buffer)||(!data)) {
        int i;
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid(1, 1);
        for (i = 0; i<128;i++) {
            hst_cuda_pack_kernel_zpad<<<dimGrid,dimBlock>>>(ctx->gpu_buffer, setup->dim_fft, ctx->gpu_data, ctx->bin_pitch_size, setup->num_bins);
        }
    }
*/

        // Finish the transfer started by the previous call and activate its buffers
    if (ctx->current_buffer) {
        const int ready = ctx->current_buffer - 1;

        cudaStreamSynchronize(ctx->stream[2]);

        ctx->gpu_data = ctx->gpu_input[ready];
        ctx->gpu_result = ctx->gpu_output[ready];
#if SLICE_BLOCK > 1
        ctx->gpu_data4 = ctx->gpu_input4[ready];
        ctx->gpu_result4 = ctx->gpu_output4[ready];
#endif
        ctx->synchronized = 1;
    }

    if (!data) {
            // Nothing more to send: no transfer is pending any more
        ctx->current_buffer = 0;
    } else {
            // Slots are numbered 1 and 2 (0 means no pending transfer)
        ctx->current_buffer = (ctx->current_buffer >= 2) ? 1 : (ctx->current_buffer + 1);

#if SLICE_BLOCK > 1
            // Only simple processing
#else /* HST_CUDA_4SLICE_MODE */
            // With an odd number of projections, the row right after the last projection is zeroed
        if ((setup->num_projections % 2) != 0) {
            CUDA_SAFE_CALL(cudaMemset(ctx->gpu_input[ctx->current_buffer - 1] + setup->num_projections * ctx->bin_pitch_size, 0, ctx->bin_pitch_size*sizeof(float)));
        }
#endif /* HST_CUDA_4SLICE_MODE */

#ifndef PYHST_BP_BENCHMARK
/*
        for (int i = 0; i < setup->num_projections; i++) {
            for (int j = 0; j < setup->num_bins; j++) {
                printf("%.3f ", data[i * ctx->bin_pitch_size + j]);
            }
            printf("\n");
        }
        printf("\n\n\n");*/
        if (!blocking) {
            CUDA_SAFE_CALL(cudaMemcpy2DAsync(ctx->gpu_input[ctx->current_buffer - 1], ctx->bin_pitch_size * sizeof(float), data, setup->num_bins * sizeof(float), setup->num_bins * sizeof(float), SLICE_BLOCK * setup->num_projections, cudaMemcpyHostToDevice, ctx->stream[2]));
        } else {
            CUDA_SAFE_CALL(cudaMemcpy2D(ctx->gpu_input[ctx->current_buffer - 1], ctx->bin_pitch_size * sizeof(float), data, setup->num_bins * sizeof(float), setup->num_bins * sizeof(float), SLICE_BLOCK * setup->num_projections, cudaMemcpyHostToDevice));
//          printf("%u %u\n", setup->num_bins * sizeof(float), SLICE_BLOCK * setup->num_projections);
        }
#endif
    }

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->togpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    return 0;
}

/**
  * Block until all work queued on stream[3] has completed
  *
  * stream[3] carries the asynchronous device-to-host copy of the reconstructed
  * slice issued by hst_cuda_reconstruct in the non-blocking mode. Errors of such
  * asynchronous operations are only reported by the synchronization call, so its
  * result is checked here instead of being discarded.
  *
  * @param rctx is the reconstructor context embedding the GPU context
  * @return 0 on success; a CUDA failure is handled by CUDA_SAFE_CALL
  */
static int hst_cuda_wait(HSTReconstructorContext *rctx) {
    GPUContext *ctx;

    ctx = GPU_CONTEXT(rctx);

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->fromgpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    CUDA_SAFE_CALL(cudaStreamSynchronize(ctx->stream[3]));

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->fromgpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    return 0;
}

/**
  * Print a summary of the back-projection configuration when the kernel changes
  *
  * The report is emitted only if the kernel label differs from the one remembered
  * in ctx->last_kernel. The comparison is done on the pointers, not on the string
  * contents. Most of the reported values are compile-time options, therefore the
  * argument lists of the printf calls are assembled by the preprocessor; adjacent
  * string literals without a comma in between are concatenated into one argument.
  *
  * @param ctx is the GPU context (device index, texture mode and last reported kernel are used)
  * @param kernel is the label of the kernel about to be launched; labels containing "alu" enable the ALU-specific fields
  * @param ppt is the number of points per thread (printed as ppt x ppt and used to scale the reported dimensions)
  * @param grid is the launch grid of the kernel
  * @param block is the launch block of the kernel
  */
static void hst_report_kernel(GPUContext *ctx, const char *kernel, size_t ppt, dim3 grid, dim3 block) {
    if (ctx->last_kernel != kernel) {
        printf("  Device: %s - Data: %s, Kernel: %s%s%s %s, Instr: %s, SLICES: %u, PPT: %lux%lu (%s), Block: %ix%i, Dim: %lux%lu, Texture: %s/%s, Consts: %s, Benchmark: %s [%s] \n",
            hst_cuda_device_prop[ctx->device].name,
# ifdef HST_HALF_MODE
            "half",
# else
            "float",
# endif
# ifdef HST_HYBRID
            "[Hybrid] ",
# else
            "",
# endif
            kernel,
#ifdef HST_LINEAR_MPLINEAR
            (strstr(kernel, "alu")?"/mp":""),
#else
            (strstr(kernel, "alu")?"/simd":""),
#endif
#if (HST_LINEAR_ASSYMETRY > 1)||(HST_OVERS_ASSYMETRY > 1)
            (strstr(kernel, "alu")?" (assymetric)":""),
#else
            "",
#endif
#if !defined(HW_IGNORE_OLD_HARDWARE)
            "GT200",
#elif defined(HST_OPTIMIZE_KEPLER)
            "Kepler+",
#else
            "Fermi",
#endif
            SLICE_BLOCK, ppt, ppt, 
#ifdef HST_SQUARE_PPT
            "square",
#elif defined(HST_ZMAP)&&(ASSYMETRY < 2)
            "Y/ZZ",
#elif defined(HST_LMAP)&&(ASSYMETRY < 2)
            "Y/Lin",
#else
            "Y/SQ",
#endif
                // Block is reported as YxX; Dim is the number of points covered by the whole grid
            block.y, block.x, ppt * grid.y * block.y, ppt * grid.x * block.x,
            (ctx->texlin?"linear":"nn"),
#ifdef HST_CREATE_TEXTURE
            "create",
#else
            "bind",
#endif
#if defined(HST_GMEM_CONST)&&defined(HST_CACHE_MINH)
            (strstr(kernel, "alu")?"gmem":"cmem"),
#else
            "cmem"
#ifdef HST_C_TRIG
            "/c_trig"
#endif
            ,
#endif

#if defined(PYHST_BP_BENCHMARK)
            "BP",
#elif defined(PYHST_RECON_BENCHMARK)
            "RECON",
#else
            "-",
#endif
            (blocking?"Blocking":"Non-blocking")
        );


#ifdef HST_NEWTEX4
        const int  newtex_proj =  4;
#else
        const int  newtex_proj =  16;
#endif

        printf("   Base/%s - %s/%u ppt\n",
#ifdef HST_HALF_MODE
            "half",
#else
            "float",
#endif
#ifdef HST_BASE_REMAP
            "remap",
# ifdef HST_BASE_PPT
            HST_BASE_PPT
# else
            0
# endif
#else
            "simple", 1
#endif
        );

        printf("   NewTex/%s - Mode: %s/%u %s, Cache: %s, Loops: %s, Cache Block: %u, NewTex PPT: %u, Balance: (%u alu, %u tex), Bounds: (NewTex: %u, Hybrid: %u)\n",
#ifdef HST_HALF_MODE
            "half",
#else
            "float",
#endif
#ifdef HST_HYBRID
# ifdef HST_HYBRID_NEWTEX
            "hybrid/newtex", newtex_proj, "",
# else
            "hybrid/simpletex", HST_LINEAR_PPT, "ppt",
# endif
#else
# ifdef HST_NEWTEX_PROJ_MAJOR
            "newtex", newtex_proj, "projmajor",
# else
            "newtex", newtex_proj, "projwarp",
# endif
#endif 

#ifdef HST_CACHE_CONST
            "const"
                // The macro was spelled ST_CCACHE_KEPLER here (missing the leading 'H' used by
                // the sibling HST_CCACHE_LD128), so the "/Kepler" suffix could never be reported.
                // NOTE(review): confirm the spelling against the configuration header.
# if defined(HST_CCACHE_KEPLER)
            "/Kepler"
# elif defined(HST_CCACHE_LD128)
            "/LD128"
# elif defined(HST_NEWTEX_REUSE_BUFS)
            "/combined"
# endif
            ,
#else
            "-",
#endif
#if defined(HST_FLOAT_LOOPS)
            "float",
#else
            "int",
#endif
            HST_NEWTEX_CACHE_BLOCK,
#ifdef HST_NEWTEX4_PPT
            HST_NEWTEX4_PPT,
#else
            0,
#endif
#ifdef HST_HYBRID
# ifdef HST_HYBRID_BALANCE_ALU
            HST_HYBRID_BALANCE_ALU, HST_NEWTEX_MIN_BLOCKS - HST_HYBRID_BALANCE_ALU,
# else
            1, 1,
# endif
#else
            0, 0,
#endif

#ifdef HST_SET_BOUNDS
            HST_NEWTEX_MIN_BLOCKS,
# ifdef HST_HYBRID
            HST_HYBRID_MIN_BLOCKS
# else
            0
# endif
#else
        0, 0 
#endif
        );

        printf("   Linear Options - Round: %s, Index: %s, Shmem: %s, Read: %s, Cache: %s ProjBlock: %u/%u, PPT: %ux%u Bound: %u\n",
#ifdef HST_LINEAR_FANCY_FLOOR
            "fancy",
#else
            "floor",
#endif
#ifdef HST_LINEAR_FANCY_INDEX
            "fancy",
#else
            "imul",
#endif
#if defined(HST_OPTIMIZE_KEPLER)
            "float*",
#elif (SLICE_BLOCK == 4)
            "float*/2",
#else
            "float*",
#endif
#ifdef HST_MEM_FETCHES
            "[]",
#else
            "tex",
#endif
                // The literals up to the terminating "" form the single "Cache" argument
#ifdef HST_HALF_CACHE
            "half + "
#else
            "float + "
#endif
#ifdef HST_CACHE_Y
            "Y,"
#endif
#ifdef HST_CACHE_MINH
            "min,"
#endif
#ifdef HST_CACHE_SUBH
# ifdef HST_CACHE_SUBH_X
            "hx,"
# else
            "h,"
# endif
#endif
#ifdef HST_CACHE_SIN
            "sin/cos,"
#endif
            "",
            min(HST_LINEAR_CACHE_BLOCK,  HST_LINEAR_BLOCK *  HST_LINEAR_BLOCK / HST_LINEAR_ASSYMETRY), HST_LINEAR_PROJ_BLOCK,
            HST_LINEAR_PPT, HST_LINEAR_PPT,
#ifdef HST_SET_BOUNDS
            HST_LINEAR_MIN_BLOCKS
#else
            0
#endif
        );
 
        printf("   Overs. Options - Round: %s, Index: %s, Shmem: %s, Read: %s, Cache: %s ProjBlock: %u/%u, PPT: %ux%u Bounds: %u\n",
#ifdef HST_OVERS_FANCY_ROUND
            "fancy",
#else
            "rint",
#endif
#ifdef HST_OVERS_FANCY_INDEX
            "fancy",
#else
            "imul",
#endif
#if SLICE_BLOCK == 4
            "float*/2",
#elif defined(HST_SHMEM64)
            "float*/warp",
#else
            "float*",
#endif
            "tex",
                // The literals up to the terminating "" form the single "Cache" argument
#ifdef HST_CACHE_MINH
            "min,"
#endif
#ifdef HST_CACHE_SUBH
# ifdef HST_CACHE_SUBH_X
            "hx,"
# else
            "h,"
# endif
#endif
#ifdef HST_CACHE_SIN
            "sin/cos,"
#endif
            "",
            min(HST_LINEAR_CACHE_BLOCK,  HST_LINEAR_BLOCK *  HST_LINEAR_BLOCK / HST_OVERS_ASSYMETRY), HST_OVERS_PROJ_BLOCK,
            HST_OVERS_PPT, HST_OVERS_PPT,
#ifdef HST_SET_BOUNDS
            HST_OVERS_MIN_BLOCKS
#else
            0
#endif
        );


            // Remember the label to suppress the report until another kernel is used
        ctx->last_kernel = kernel;
    }
    
}


/**
  * Reconstruct a slice (or SLICE_BLOCK slices) from sinograms by filtered back-projection
  *
  * The function proceeds in the following stages:
  *  1. If sinograms were pre-loaded with hst_cuda_send and are not activated yet,
  *     hst_cuda_send(rctx, NULL) is called to wait for them (ctx->synchronized gets set).
  *  2. Filtering: in batches, the projections are uploaded (unless pre-loaded),
  *     packed/padded into gpu_buffer, transformed with cuFFT, multiplied with the
  *     filter, transformed back and unpacked into gpu_data. The batches alternate
  *     between stream[0] and stream[1].
  *  3. The filtered projections are bound to a texture or copied into a CUDA array.
  *  4. Back-projection: in batches of ctx->bp_batch_size lines, the kernel selected
  *     by the context flags is launched. Unless the data was pre-loaded, each batch
  *     is copied back to the host right away.
  *  5. For pre-loaded data, the complete result is copied back at the end; in the
  *     non-blocking mode this copy is asynchronous on stream[3] and the caller has
  *     to use hst_cuda_wait before reading SLICE.
  *  6. The per-call timings are stored in ctx->stats / ctx->fbp_stats.
  *
  * @param rctx is the reconstructor context embedding the GPU context (must not be NULL)
  * @param SLICE is the host buffer receiving the reconstructed slice(s)
  * @param SINOGRAMS is the host buffer with the sinograms; it is not read if the data was pre-loaded with hst_cuda_send
  * @return always 0; failures of the checked CUDA/cuFFT calls are handled by CUDA_SAFE_CALL / CUFFT_SAFE_CALL
  */
static int hst_cuda_reconstruct(HSTReconstructorContext *rctx, float *SLICE, const float *SINOGRAMS) {
    int i = 0;
    
    int ppt;
    
    int batch_size;//, nextbatch;
    int batch;

    GPUContext *ctx;
    HSTSetup *setup;
    
    float *gpu_data;
    float *gpu_buffer;

    int mid;

    assert(rctx);
    ctx = GPU_CONTEXT(rctx);
    setup = rctx->setup;

    assert(ctx);
    assert(setup);

    ppt = ctx->points_per_thread;

    gpu_buffer = ctx->gpu_buffer;

    int dim_fft = setup->dim_fft;
    int num_proj = setup->num_projections;
    int num_bins = setup->num_bins;
    int num_y = setup->num_y;
    float axis_position = setup->axis_position;

#ifndef PYHST_BP_BENCHMARK
    int num_x = setup->num_x;
#endif

        // Passed to the edge-padding kernel only
    mid = (num_bins + dim_fft) / 2;

    dim3 dimFullGrid(ctx->bp_grid_columns, ctx->bp_grid_lines);
    dim3 dimPadGrid(setup->dim_fft / BLOCK_SIZE,  SLICE_BLOCK * ctx->projection_grid_size);
    dim3 dimInputGrid(ctx->bin_grid_size, SLICE_BLOCK * ctx->projection_grid_size);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);



    batch_size = SLICE_BLOCK * ctx->fft_batch_size;	// Could be shorter on the last iteration

#ifdef HST_MEASURE_GPUBOOST
        // Benchmark mode: the same sinograms are reconstructed in an endless loop (closed at the end of the function)
    while (1) {
        hst_cuda_send(rctx, SINOGRAMS);
        if ((ctx->bp_runs%100)==0) {
            printf("BP runs: %u\n", ctx->bp_runs);
        }
#else
        // Pre-loaded data is pending: wait for it and activate the corresponding buffers
    if ((ctx->current_buffer)&&(!ctx->synchronized)) {
        hst_cuda_send(rctx, NULL);
    } 
#endif

/*
#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->mem_timer);
#endif 
    CUDA_SAFE_CALL(cudaMemcpy2DAsync(ctx->gpu_data, ctx->bin_pitch_size * sizeof(float), SINOGRAMS, num_bins * sizeof(float), num_bins * sizeof(float), batch_size, cudaMemcpyHostToDevice, ctx->stream[0]));
#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->mem_timer);
#endif 
*/

        // Debug: Clean to prevent accumulation of error
    //CUDA_SAFE_CALL(cudaMemset(ctx->gpu_data, 0, 2*hst_cuda_calc_blocks(num_proj, 2, NULL)*ctx->bin_pitch_size*sizeof(float)));

        /* In the case of an odd number of projections, our pad function will not handle the second projection of the last
        pair, it will be simply copied from the gpu_data. However, even if gpu_data is originally zeroed, the convolution
        may introduce small numbers. Over multiple slices, these small numbers may accumulate to a significant value
        affecting the precision of the convolution and, hence, the value of the first vector of the pair. This effect
        could be easily seen if the fai360 mode is on */
    if ((num_proj%2)&&(!ctx->synchronized)) {
#ifdef PYHST_MEASURE_TIMINGS
        g_timer_continue(ctx->togpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */
        CUDA_SAFE_CALL(cudaMemset(ctx->gpu_data + num_proj * ctx->bin_pitch_size, 0, ctx->bin_pitch_size*sizeof(float)));
#ifdef PYHST_MEASURE_TIMINGS
        g_timer_stop(ctx->togpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */
    }

        // Filtering stage; the iteration counter i selects the stream and the cuFFT plan (i%2)
    for (batch = 0, i = 0; batch < SLICE_BLOCK * num_proj; batch += SLICE_BLOCK * batch_size, ++i) {
        gpu_data = ctx->gpu_data + batch * ctx->bin_pitch_size;
#ifndef PYHST_BP_BENCHMARK
        const float *data = SINOGRAMS + batch * num_bins;
#endif

        if (batch + batch_size >= num_proj) {
            batch_size = num_proj - batch;
        }

#ifdef PYHST_MEASURE_TIMINGS
        g_timer_continue(ctx->togpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

#ifndef PYHST_BP_BENCHMARK
            // Pre-loaded data is already on the GPU, otherwise the batch is uploaded now
        if (!ctx->synchronized) {
            if (blocking) {
                CUDA_SAFE_CALL(cudaMemcpy2D(gpu_data, ctx->bin_pitch_size * sizeof(float), data, num_bins * sizeof(float), num_bins * sizeof(float), SLICE_BLOCK * batch_size, cudaMemcpyHostToDevice));
            } else {
                CUDA_SAFE_CALL(cudaMemcpy2DAsync(gpu_data, ctx->bin_pitch_size * sizeof(float), data, num_bins * sizeof(float), num_bins * sizeof(float), SLICE_BLOCK * batch_size, cudaMemcpyHostToDevice, ctx->stream[i%2]));
            }
        }
#endif

            // gpu_buffer is shared by both streams, so the batch running on the other stream has to finish first.
            // Instead we can allocate dedicated gpu_buffers...
        cudaStreamSynchronize(ctx->stream[(i+1)%2]);
#ifdef PYHST_MEASURE_TIMINGS
        g_timer_stop(ctx->togpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

#ifdef PYHST_MEASURE_TIMINGS
        g_timer_continue(ctx->pre_timer);
#endif /* PYHST_MEASURE_TIMINGS */

#ifdef HST_FILTER2
        int dim_fft_pitch = dim_fft;
#else
        int dim_fft_pitch = 1 * dim_fft;
#endif

            // Copy the projections into the FFT buffer, padding them if the FFT is longer than a projection
        if (dim_fft > num_bins) {
            if (setup->zero_padding) {
                hst_cuda_pack_kernel_zpad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft_pitch, gpu_data, ctx->bin_pitch_size, num_bins);
            } else {
                hst_cuda_pack_kernel_epad<<<dimPadGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft_pitch, gpu_data, ctx->bin_pitch_size, num_bins, mid);
            }
        } else {
            hst_cuda_pack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_buffer, dim_fft_pitch, gpu_data, ctx->bin_pitch_size);
        }

            // Forward transform, multiplication with the filter, inverse transform
#ifdef HST_FILTER2
        dim3 dimFilterGrid(ctx->filter_grid_size, SLICE_BLOCK * ctx->projection_grid_size);
        CUFFT_SAFE_CALL(cufftExecC2C(ctx->fft_plan[i%2], (cufftComplex*)gpu_buffer, (cufftComplex*)gpu_buffer, CUFFT_FORWARD));
        hst_cuda_filter_kernel<<<dimFilterGrid,dimBlock,0,ctx->stream[i%2]>>>(2*dim_fft, gpu_buffer, ctx->gpu_filter);
        CUFFT_SAFE_CALL(cufftExecC2C(ctx->fft_plan[i%2], (cufftComplex*)gpu_buffer, (cufftComplex*)gpu_buffer, CUFFT_INVERSE));
#else
        dim3 dimFilterGrid(ctx->filter_grid_size, 2 * SLICE_BLOCK * ctx->projection_grid_size);	// i.e. (2 * dim_fft) x (2 * (num_proj/2) * SLICE_BLOCK)
        CUFFT_SAFE_CALL(cufftExecR2C(ctx->fft_plan[i%2], (cufftReal*)gpu_buffer, (cufftComplex*)ctx->fft_buffer));
        hst_cuda_filter_kernel<<<dimFilterGrid,dimBlock,0,ctx->stream[i%2]>>>(2 * dim_fft, ctx->fft_buffer, ctx->gpu_filter);
        CUFFT_SAFE_CALL(cufftExecC2R(ctx->ifft_plan[i%2], (cufftComplex*)ctx->fft_buffer, (cufftReal*)gpu_buffer));
#endif

            // Copy the filtered projections back from the FFT buffer into gpu_data
        if (setup->fai360) {
#ifdef PYHST_ASTRA_SCALING
            printf("Astra scaling is not supported in FAI360 mode yet\n");
            exit(1);
#endif
            hst_cuda_unpack_kernel_fai360<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft_pitch, ctx->bin_grid_size * BLOCK_SIZE / 2, ctx->gpu_limits + 2 * batch, batch);
        } else {
#ifdef HST_FLOAT16
# ifdef PYHST_ASTRA_SCALING
            printf("Astra scaling is not supported in HALF mode yet\n");
            exit(1);
# endif
            hst_cuda_unpack_to_half_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>((half*)gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft_pitch);
#else /* HST_FLOAT16 */
# ifdef PYHST_ASTRA_SCALING
            const float scaling = M_PI_F / (2 * setup->num_projections * dim_fft);
# else
            const float scaling = 1.f;
# endif
            hst_cuda_unpack_kernel<<<dimInputGrid,dimBlock,0,ctx->stream[i%2]>>>(gpu_data, ctx->bin_pitch_size, gpu_buffer, dim_fft_pitch, scaling);
#endif /* HST_FLOAT16 */
        }

#ifdef PYHST_MEASURE_TIMINGS
        g_timer_stop(ctx->pre_timer);
#endif /* PYHST_MEASURE_TIMINGS */
    }	

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->pre_timer);
#endif /* PYHST_MEASURE_TIMINGS */
        // i was incremented once more on loop exit, so (i+1)%2 is the stream used by the last batch
    cudaStreamSynchronize(ctx->stream[(i+1)%2]);
#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->pre_timer);
#endif /* PYHST_MEASURE_TIMINGS */

/*
    // Debug: Check filtered projections itself
    CUDA_SAFE_CALL(cudaMemcpy2D(SLICE, num_x * sizeof(float), ctx->gpu_data, ctx->bin_pitch_size * sizeof(float), num_x * sizeof(float), 652, cudaMemcpyDeviceToHost));
    return 0;
*/

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->texture_timer);
#endif /* PYHST_MEASURE_TIMINGS */

        // Make the filtered projections available to the back-projection kernels
#ifdef HST_CUDA_ARRAY
    dim3 dimPackGrid(ctx->filter_grid_size, 2 * ctx->projection_grid_size);
# if SLICE_BLOCK == 4
    hst_cuda_slice2array_4<<<dimPackGrid, dimBlock, 0, ctx->stream[i%2]>>>(ctx->gpu_data4, ctx->bin_pitch_size, num_proj, ctx->gpu_data, ctx->bin_pitch_size, num_proj);
# elif SLICE_BLOCK == 2
    hst_cuda_slice2array_2<<<dimPackGrid, dimBlock, 0, ctx->stream[i%2]>>>(ctx->gpu_data4, ctx->bin_pitch_size, num_proj, ctx->gpu_data, ctx->bin_pitch_size, num_proj);
# endif /* HST_CUDA_4SLICE_MODE */
# if SLICE_BLOCK > 1
#  ifdef HST_HALF_MODE
    CUDA_SAFE_CALL( cudaMemcpy2DToArray(ctx->gpu_array, 0, 0, ctx->gpu_data4, ctx->bin_pitch_size * SLICE_BLOCK * sizeof(float) / 2, num_bins * SLICE_BLOCK * sizeof(float) / 2, num_proj, cudaMemcpyDeviceToDevice));
#  else
    CUDA_SAFE_CALL( cudaMemcpy2DToArray(ctx->gpu_array, 0, 0, ctx->gpu_data4, ctx->bin_pitch_size * SLICE_BLOCK * sizeof(float), num_bins * SLICE_BLOCK * sizeof(float), num_proj, cudaMemcpyDeviceToDevice));
#  endif
# else
    CUDA_SAFE_CALL( cudaMemcpy2DToArray(ctx->gpu_array, 0, 0, ctx->gpu_data, ctx->bin_pitch_size * SLICE_BLOCK * sizeof(float), num_bins * sizeof(float), num_proj, cudaMemcpyDeviceToDevice));
# endif 
#else /* HST_CUDA_ARRAY */
# ifdef HST_FLOAT16
    CUDA_SAFE_CALL( cudaBindTexture2D(NULL, tex_projes, ctx->gpu_data, ctx->half_desc, num_bins, num_proj, ctx->bin_pitch_size * sizeof(half) ) );
# else /* HST_FLOAT16 */
    CUDA_SAFE_CALL( cudaBindTexture2D(NULL, tex_projes, ctx->gpu_data, ctx->float_desc, num_bins, num_proj, ctx->bin_pitch_size * sizeof(float)) );
# endif /* HST_FLOAT16 */
#endif /* HST_CUDA_ARRAY */

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->texture_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    dim3 dimBPBlock(ctx->block_size_x, ctx->block_size_y);
    dim3 dimGrid(ctx->bp_grid_columns/ppt, ctx->bp_grid_lines/ppt );

#ifndef PYHST_BP_BENCHMARK
        // Width of a line in the GPU result buffer
    int dimrecx = ctx->bp_grid_columns * ctx->block_size_x;
#endif

    batch_size = ctx->bp_batch_size;	// Could be shorter on the last iteration

        // Back-projection stage; again the streams are alternated between the batches
    for (batch = 0, i = 0; batch < num_y; batch += batch_size, ++i) {
        if (batch + batch_size >= num_y) {
            batch_size = num_y - batch;
        }

#ifdef PYHST_MEASURE_TIMINGS
        g_timer_continue(ctx->main_timer);
#endif /* PYHST_MEASURE_TIMINGS */

/*        printf("Args (%f, %f), Offset (%f, %f), Axis (%f)\n",
            setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f,
            setup->offset_x, setup->offset_y,
            axis_position
        );*/
        if (ctx->base_kernel) {
            hst_report_kernel(ctx, "base", ppt, dimGrid, dimBPBlock); 
#ifndef HST_HYBRID
# if defined(HST_BASE_REMAP)&&defined(HST_BASE_PPT)
            hst_cuda_linear_companion<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
# else
            hst_cuda_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
# endif
#endif
        } else if (ctx->tex_kernel) {
            hst_report_kernel(ctx, "tex", ppt, dimGrid, dimBPBlock); 
            hst_kepler_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
//	    hst_kepler_orig_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, num_proj, num_bins, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
        } else if (ctx->linear_kernel) {

                // The ALU kernel is selected by the oversampling setting: 1 - nearest neighbour, other non-zero value - oversampling, 0 - linear
            if (setup->oversampling == 1) {
#ifdef HST_HYBRID
# if HST_LINEAR_ASSYMETRY > 1
#   error "Assymetry is not supported with hybrid kernels"
# endif
                hst_report_kernel(ctx, "alu_hybrid", ppt, dimGrid, dimBPBlock); 
                hst_cuda_nn_hybrid<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
#else
                dim3 dimBPBlockA(ctx->block_size_x, ctx->block_size_y/HST_NN_ASSYMETRY);
                hst_report_kernel(ctx, "alu_nn", ppt, dimGrid, dimBPBlockA); 
                hst_cuda_alu_nn<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
#endif
            } else if (setup->oversampling) {
                dim3 dimBPBlockA(ctx->block_size_x, ctx->block_size_y/HST_OVERS_ASSYMETRY);
                hst_report_kernel(ctx, "alu_oversample4", ppt, dimGrid, dimBPBlockA); 
                hst_cuda_alu_oversample4<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
            } else {
#ifdef HST_HYBRID
# if HST_LINEAR_ASSYMETRY > 1
#   error "Assymetry is not supported with hybrid kernels"
# endif
                hst_report_kernel(ctx, "alu_hybrid", ppt, dimGrid, dimBPBlock); 
                hst_cuda_linear_hybrid<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
#else
                dim3 dimBPBlockA(ctx->block_size_x, ctx->block_size_y/HST_LINEAR_ASSYMETRY);
                hst_report_kernel(ctx, "alu_linear", ppt, dimGrid, dimBPBlockA); 
                hst_cuda_alu_linear<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (ctx->tex, ctx->gpu_const, GPU_SINO(ctx), num_proj, num_bins, ctx->bin_pitch_size, GPU_RESULT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
#endif
            }
/*	} else if (ctx->mplinear_kernel) {
            if (setup->oversampling) {
                hst_report_kernel(ctx, "mpoversample4", ppt, dimGrid, dimBPBlock); 
                hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
            } else {
#ifdef HST_MP_ASYMMETRIC
                dim3 dimBPBlockA(ctx->block_size_x, ctx->block_size_y/2);
                hst_report_kernel(ctx, "mplinear", ppt, dimGrid, dimBPBlockA); 
                hst_cuda_mplinear_kernel<<<dimGrid, dimBPBlockA, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
#else
                hst_report_kernel(ctx, "mplinear/symmetric", ppt, dimGrid, dimBPBlock); 
                hst_cuda_mpoversample4_kernel<<<dimGrid, dimBPBlock, 0, ctx->stream[i%2]>>> (num_proj, num_bins, GPU_RESULT_COMPAT(ctx), setup->offset_x  -  axis_position + 0.5f, setup->offset_y - axis_position + 0.5f, batch);
#endif // HST_MP_ASYMMETRIC
            }*/
        } else {
            pyhst_error("No back-projection kernel is configured");
        }
        
#ifdef PYHST_MEASURE_TIMINGS
        g_timer_stop(ctx->main_timer);
        g_timer_continue(ctx->fromgpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

#ifndef PYHST_BP_BENCHMARK
            // Without pre-loading, each batch of lines is returned to the host right away
        if (!ctx->synchronized) {
            if (blocking) {
                hst_cuda_array2slice(0, ctx->gpu_result4, dimrecx, num_y, ctx->gpu_result, dimrecx, num_y);
                CUDA_SAFE_CALL(cudaMemcpy2D(SLICE + num_x * batch, num_x*sizeof(float), ctx->gpu_result + dimrecx * batch, dimrecx * sizeof(float), num_x * sizeof(float), SLICE_BLOCK * batch_size, cudaMemcpyDeviceToHost));
            } else {
                hst_cuda_array2slice(ctx->stream[i%2], ctx->gpu_result4, dimrecx, num_y, ctx->gpu_result, dimrecx, num_y);
                CUDA_SAFE_CALL(cudaMemcpy2DAsync(SLICE + num_x * batch, num_x*sizeof(float), ctx->gpu_result + dimrecx * batch, dimrecx * sizeof(float), num_x * sizeof(float), SLICE_BLOCK * batch_size, cudaMemcpyDeviceToHost, ctx->stream[i%2]));
            }
        }
#endif

#ifdef PYHST_MEASURE_TIMINGS
        g_timer_stop(ctx->fromgpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */
    }


#ifdef PYHST_MEASURE_TIMINGS
    g_timer_continue(ctx->main_timer);
#endif /* PYHST_MEASURE_TIMINGS */

        // With pre-loaded data, the result is downloaded below and both computing streams have to finish first
    if (ctx->synchronized) {
        cudaStreamSynchronize(ctx->stream[0]);
        cudaStreamSynchronize(ctx->stream[1]);
    }

#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->main_timer);
    g_timer_continue(ctx->texture_timer);
#endif /* PYHST_MEASURE_TIMINGS */

#ifndef HST_CUDA_ARRAY
    cudaUnbindTexture(tex_projes);
#endif /* ! HST_CUDA_ARRAY */
    
#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->texture_timer);
    g_timer_continue(ctx->fromgpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

    if (ctx->synchronized) {
#ifdef PYHST_BP_BENCHMARK
            // cudaDeviceSynchronize() is the functionally identical replacement of the deprecated cudaThreadSynchronize()
        cudaDeviceSynchronize();
#else

        if (blocking) {
            hst_cuda_array2slice(0, ctx->gpu_result4, dimrecx, num_y, ctx->gpu_result, dimrecx, num_y);
            CUDA_SAFE_CALL(cudaMemcpy2D(SLICE, num_x * sizeof(float), ctx->gpu_result, dimrecx * sizeof(float), num_x * sizeof(float), SLICE_BLOCK * num_y, cudaMemcpyDeviceToHost));
        } else {
                // Wait for the download of the previous slice before queuing the next one on stream[3];
                // the copy issued here is completed by hst_cuda_wait
            cudaStreamSynchronize(ctx->stream[3]);
            hst_cuda_array2slice(ctx->stream[3], ctx->gpu_result4, dimrecx, num_y, ctx->gpu_result, dimrecx, num_y);
            CUDA_SAFE_CALL(cudaMemcpy2DAsync(SLICE, num_x * sizeof(float), ctx->gpu_result, dimrecx * sizeof(float), num_x * sizeof(float), SLICE_BLOCK * num_y, cudaMemcpyDeviceToHost, ctx->stream[3]));
        }
#endif
    } else {
            // cudaDeviceSynchronize() is the functionally identical replacement of the deprecated cudaThreadSynchronize()
        cudaDeviceSynchronize();
    }
    
#ifdef PYHST_MEASURE_TIMINGS
    g_timer_stop(ctx->fromgpu_timer);
#endif /* PYHST_MEASURE_TIMINGS */

        // The pre-loaded data is consumed
    ctx->synchronized = 0;

        // Record how much the main and fbp timers have advanced since the previous call
    {
        double total_elapsed = g_timer_elapsed(GPU_CONTEXT(rctx)->main_timer, NULL);
        double fbp_elapsed = g_timer_elapsed(GPU_CONTEXT(rctx)->fbp_timer, NULL);
        if (ctx->bp_runs < CUDA_MAX_STATS) {
            ctx->stats[ctx->bp_runs] = total_elapsed - ctx->last_elapsed;
            ctx->fbp_stats[ctx->bp_runs] = fbp_elapsed - ctx->last_fbp_elapsed;

            ctx->bp_runs++;
        } else {
#ifdef HST_MEASURE_GPUBOOST
                // The statistics buffers are full: print them (in milliseconds) and start over
            for (int i = 0; i < ctx->bp_runs; i++) {
                if ((i%10)==0) printf("\n BP Stats: ");
                printf("% 6.3lf   ", ctx->stats[i] * 1000);
            }
            printf("\n");

            for (int i = 0; i < ctx->bp_runs; i++) {
                if ((i%10)==0) printf("\nFBP Stats: ");
                printf("% 6.3lf   ", ctx->fbp_stats[i] * 1000);
            }
            printf("\n\n");
            ctx->bp_runs = 0;
#endif
        }
        ctx->last_elapsed = total_elapsed;
        ctx->last_fbp_elapsed = fbp_elapsed;
    }


#ifdef HST_MEASURE_GPUBOOST
    }
#endif

    return 0;
}


/*
 * Descriptor of the CUDA reconstruction backend; a pointer to it is handed
 * out by hst_cuda_init().
 *
 * The initializer is positional: every entry must stay in the order in which
 * the members of HSTReconstructor are declared (the declaration is not in
 * this file -- presumably hst_reconstructor.h; verify before reordering).
 */
static HSTReconstructor hst_gpu_info = {
    0,					/* numeric member, zero until set up; presumably `devices`, which hst_cuda_init() assigns -- TODO confirm against the struct declaration */
    hst_cuda_get_title,
    hst_cuda_create_context,
    hst_cuda_init_context,
    hst_cuda_free_context,
    hst_cuda_destroy_context,
    hst_cuda_configure,
    hst_cuda_send,
    NULL,				/* callback slot intentionally left unset; which hook this is depends on the HSTReconstructor declaration */
    hst_cuda_reconstruct,
    hst_reconstructor_postprocess_slice,	/* not a hst_cuda_* function: presumably the generic implementation shared with other backends -- confirm */
    hst_cuda_wait,
    hst_cuda_get_timers
};


/**
 * Enumerates the CUDA devices of the system and fills in the module-level
 * device tables (hst_cuda_device_prop / hst_cuda_device_num).
 *
 * Devices are skipped if their compute mode is "prohibited", if they report
 * the 9999.9999 compute capability (the emulation pseudo-device of old CUDA
 * releases) or, with HW_IGNORE_OLD_HARDWARE defined, if their major compute
 * capability is below 2. At most HST_CUDA_MAX_DEVICES devices are examined.
 *
 * Environment:
 *  - CUDA_DEVICE: if set and non-empty, only the device with this CUDA
 *    ordinal is used (parsed with atoi(), so a non-numeric value means 0).
 *  - CUDA_LAUNCH_BLOCKING: if set to anything not starting with '0', the
 *    module-level `blocking` flag is raised.
 *
 * @param flags is currently not used by this function
 * @return pointer to the static reconstructor descriptor; its `devices`
 *  member is set to the number of accepted devices times PARALLEL_PER_GPU
 */
HSTReconstructor *hst_cuda_init(int flags) {
    int i, dev, selected_device = -1;
    char *stmp;
    int device_count;

    CUDA_SAFE_CALL(cudaGetDeviceCount(&device_count));

    if (device_count > HST_CUDA_MAX_DEVICES) {
        pyhst_warning("There is %u CUDA-enabled devices detected in the system, but hst_cuda is configured to use only %u", device_count, HST_CUDA_MAX_DEVICES);
        device_count = HST_CUDA_MAX_DEVICES;
    }

    stmp = getenv("CUDA_DEVICE");
    if ((stmp)&&(stmp[0])) selected_device = atoi(stmp);

    pyhst_info("NVIDIA devices: %u", device_count);

    /* `i` is the CUDA ordinal, `dev` is the next free slot in our tables.
     * The properties are read straight into slot `dev`; the slot is only
     * kept (dev advanced) if the device is accepted, otherwise the next
     * candidate overwrites it. */
    for (i = 0, dev = 0; i < device_count; ++i) {
        CUDA_SAFE_CALL(cudaGetDeviceProperties(&hst_cuda_device_prop[dev], i));

        if ((hst_cuda_device_prop[dev].computeMode == cudaComputeModeProhibited)||((hst_cuda_device_prop[dev].major == 9999 && hst_cuda_device_prop[dev].minor == 9999))) continue;
#ifdef HW_IGNORE_OLD_HARDWARE
        if (hst_cuda_device_prop[dev].major < 2) continue;
#endif /* HW_IGNORE_OLD_HARDWARE */

        /* clockRate is reported in kHz, hence 1e-3 to get MHz. totalGlobalMem
         * is a size_t and has to be cast to match the printf-style format. */
        pyhst_debug(" * Device %d: %s v. %d.%d: %d SM, %.2f MHZ, %lu MB%s", dev, hst_cuda_device_prop[dev].name, hst_cuda_device_prop[dev].major, hst_cuda_device_prop[dev].minor, hst_cuda_device_prop[dev].multiProcessorCount, hst_cuda_device_prop[dev].clockRate * 1e-3, (unsigned long)(hst_cuda_device_prop[dev].totalGlobalMem / 1048576), hst_cuda_device_prop[dev].deviceOverlap?", overlap":"");

        if ((selected_device < 0)||(i == selected_device)) {
            hst_cuda_device_num[dev++] = i;
        }
    }

    /* An explicit selection which matched nothing would otherwise silently
     * leave us without any GPU. */
    if ((selected_device >= 0)&&(!dev)) {
        pyhst_warning("CUDA_DEVICE=%d is requested, but no usable CUDA device with this number is found", selected_device);
    }

    stmp = getenv("CUDA_LAUNCH_BLOCKING");
    if ((stmp)&&(stmp[0])&&(stmp[0] != '0')) blocking = 1;

    hst_gpu_info.devices = dev * PARALLEL_PER_GPU;
    return &hst_gpu_info;
}

/**
 * Currently a no-op: the body is empty, so nothing is released here.
 * NOTE(review): by its name this looks like the teardown counterpart of
 * hst_cuda_init() -- confirm with the callers before adding any cleanup.
 */
void hst_cuda_free() {
}

void *hst_cuda_host_malloc(size_t size, HSTPinnedAccess access) {
    cudaError_t err;
    void *tmp;

    if (hst_gpu_info.devices) {
	err = cudaHostAlloc(&tmp, size, cudaHostAllocPortable);//4);//cudaHostAllocWriteCombined);
	if (err) return NULL;
    } else {
	tmp = malloc(size);
    }
    return tmp;
}

/**
 * Releases a buffer obtained from hst_cuda_host_malloc().
 *
 * The release routine is picked by the current value of
 * hst_gpu_info.devices, the same test hst_cuda_host_malloc() uses to pick
 * the allocator: free() when it is zero, cudaFreeHost() otherwise.
 *
 * @param ptr is the buffer to release
 */
void hst_cuda_host_free(void *ptr) {
    if (!hst_gpu_info.devices) {
        free(ptr);
        return;
    }

    cudaFreeHost(ptr);
}