/tomo/pyhst : contents of dfi_cuda/dfi_cuda_sinc

: (revision 276)

To get this branch, use:

bzr branch
http://darksoft.org/webbzr/tomo/pyhst

/*
 * The PyHST program is Copyright (C) 2002-2011 of the
 * European Synchrotron Radiation Facility (ESRF) and
 * Karlsruhe Institute of Technology (KIT).
 *
 * PyHST is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * hst is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

texture<cufftComplex, cudaTextureType2D, cudaReadModeElementType> tex_projections;
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_ktbl;

static __device__ inline cufftComplex ComplexMul(cufftComplex a, cufftComplex b)
{
    cufftComplex c;
    c.x = a.x * b.x - a.y * b.y;
    c.y = a.x * b.y + a.y * b.x;
    return c;
}

static __device__ inline cufftComplex ComplexScale(cufftComplex a, float s)
{
    cufftComplex c;
    c.x = s * a.x;
    c.y = s * a.y;
    return c;
}

__global__ static void  dfi_cuda_zeropadding_complex(cufftReal *input, int sino_width, cufftComplex *output) {
    
    const int dim_x = gridDim.x * blockDim.x;
    int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    long out_idx = idy * dim_x + idx;

    int lpart = sino_width/2;
    int rpart = sino_width/2;
    int dim_x2 = dim_x/2;
    int len_to_lpart = dim_x - lpart;

    output[out_idx].x = 
            (idx < rpart) ? input[idy * sino_width + (lpart + idx)] : 
                            (idx < (dim_x2 + (dim_x2 - lpart))) ? 0.0f : 
                                                                  input[idy * sino_width + (idx - len_to_lpart)];
    output[out_idx].y = 0.0f;                                                              
}

__global__ static void  dfi_cuda_zeropadding_real(cufftReal *input, int sino_width, int dim2_fft, cufftReal *output) {
    
    const int dim_x = gridDim.x * blockDim.x;
    int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    long out_idx = idy * dim2_fft + idx;

    int lpart = sino_width/2;
    int rpart = sino_width/2;
    int dim_x2 = dim_x/2;
    int len_to_lpart = dim_x - lpart;

    output[out_idx] = 
            (idx < rpart) ? input[idy * sino_width + (lpart + idx)] : 
                            (idx < (dim_x2 + (dim_x2 - lpart))) ? 0.0f : 
                                                                  input[idy * sino_width + (idx - len_to_lpart)];                                                          
}

__global__ static void  dfi_cuda_interpolation_sinc(int spectrum_offset,
                                                    float L2, 
                                                    int ktbl_len2, 
                                                    int raster_size, 
                                                    int raster_size2, 
                                                    float table_spacing, 
                                                    float angle_step_rad,
                                                    float theta_max,
                                                    float rho_max,
                                                    int dim_x,
                                                    float max_radius,
                                                    cufftComplex *output) {
    //variables
    float2 out_coord, norm_gl_coord, in_coord;
    int iul, iuh, ivl, ivh, sign, i, j, k;
    float res_real, res_imag, weight, kernel_x_val;
    long out_idx;

    sign = 1;
    res_real = 0.0f, res_imag = 0.0f;
    out_idx = 0;

    out_coord.x = (blockIdx.x * BLOCK_SIZE + threadIdx.x);
    out_coord.y = (blockIdx.y * BLOCK_SIZE + threadIdx.y) + spectrum_offset;

    out_idx = out_coord.y * dim_x + out_coord.x;
    
    //calculate coordinates
    norm_gl_coord.x = out_coord.x - 1;
    norm_gl_coord.y = out_coord.y - raster_size2;

    float radius = sqrt(norm_gl_coord.x * norm_gl_coord.x + norm_gl_coord.y * norm_gl_coord.y);

    if (radius <= max_radius) {
    	in_coord.y = atan2(norm_gl_coord.y,norm_gl_coord.x);
    	in_coord.y = -in_coord.y; // spike here! (mirroring along y-axis)

        sign = (in_coord.y < 0.0) ? -1 : 1;
            
        in_coord.y = (in_coord.y < 0.0f) ? in_coord.y+= M_PI : in_coord.y;
        in_coord.y = (float) min(1.0f + in_coord.y/angle_step_rad, theta_max - 1);

        in_coord.x = (float) min(radius, (float)raster_size2);

    	//sinc interpoaltion
        iul = (int)ceil(in_coord.x - L2);
        iul = (iul < 0) ? 0 : iul;

        iuh = (int)floor(in_coord.x + L2);
        iuh = (iuh > rho_max - 1) ? iuh = rho_max - 1 : iuh;

        ivl = (int)ceil(in_coord.y - L2);
        ivl = (ivl < 0) ? 0 : ivl;

        ivh = (int)floor(in_coord.y + L2);
        ivh = (ivh > theta_max - 1) ? ivh = theta_max - 1 : ivh;
                 
        float kernel_y[20];
        for (i = ivl, j = 0; i <= ivh; ++i, ++j) {
        	kernel_y[j] = tex2D(tex_ktbl, ktbl_len2 + (in_coord.y - (float)i) * table_spacing, 0.5f);
        }

        for (i = iul; i <= iuh; ++i) {
        	kernel_x_val = tex2D(tex_ktbl, ktbl_len2 + (in_coord.x - (float)i) * table_spacing, 0.5f);

                for (k = ivl, j = 0; k <= ivh; ++k, ++j) {
                    weight = kernel_y[j] * kernel_x_val;

                    cufftComplex data_val = tex2D(tex_projections, (float)i, (float)k);

                    res_real += data_val.x * weight;
                    res_imag += data_val.y * weight;
                }
        }

        output[out_idx].x = res_real;
        output[out_idx].y = sign * res_imag;
    }
}

__global__ void dfi_cuda_swap_quadrants_complex(cufftComplex *input, cufftComplex *output, int dim_x)
{
    int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;

    const int dim_y = gridDim.y * blockDim.y; //a half of real length

    output[idy * dim_x + idx] = input[(dim_y + idy) * dim_x + idx + 1];
    output[(dim_y + idy) * dim_x + idx] = input[idy * dim_x + idx + 1];
}

__global__ void dfi_cuda_swap_quadrants_real(cufftReal *output)
{
    int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;

    const int dim_x = gridDim.x * blockDim.x;
    int dim_x2 = dim_x/2, dim_y2 = dim_x2;
    long sw_idx1, sw_idx2;

    sw_idx1 = idy * dim_x + idx;

    cufftReal temp = output[sw_idx1];
    if (idx < dim_x2) {
        sw_idx2 = (dim_y2 + idy) * dim_x + (dim_x2 + idx);
        output[sw_idx1] = output[sw_idx2];
        output[sw_idx2] = temp;
    }
    else {
        sw_idx2 = (dim_y2 + idy) * dim_x + (idx - dim_x2);
        output[sw_idx1] = output[sw_idx2];
        output[sw_idx2] = temp;
    }
}

__global__ void dfi_cuda_crop_roi(cufftReal *input, int x, int y, int roi_x, int roi_y, int raster_size, float scale, cufftReal *output)
{
    const int dim_x = gridDim.x * blockDim.x;
    int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;

    if (idx < roi_x && idy < roi_y) {
        output[idy * roi_x + idx] = input[(idy + y) * raster_size + (idx + x)] * scale;
    }
}