/tomo/pyhst

To get this branch, use:
bzr branch http://darksoft.org/webbzr/tomo/pyhst
/*
 * The PyHST program is Copyright (C) 2002-2011 of the
 * European Synchrotron Radiation Facility (ESRF) and
 * Karlsruhe Institute of Technology (KIT).
 *
 * PyHST is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * hst is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

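/*
 * Texture-mode configuration: HST_TEXTURE16 switches the backprojection
 * kernels below to a 16-bit integer projection texture, and
 * HST_TEXTURE16_PACKED packs two 16-bit bins into each 32-bit texel.
 * HST_TEXTURE16 itself is left commented out here, so the guarded block
 * below is only compiled if the macro is defined elsewhere (presumably by
 * the build configuration).
 */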
#if CUDA_VERSION_MAJOR > 3
//# define HST_TEXTURE16
# define HST_TEXTURE16_PACKED
#endif /* CUDA_VERSION_MAJOR > 3 */

#ifdef HST_TEXTURE16
# ifdef HST_TEXTURE16_PACKED
texture<uint32_t, 2, cudaReadModeElementType> int_projes;
# else /* HST_TEXTURE16_PACKED */
texture<uint16_t, 2, cudaReadModeElementType> int_projes;
# endif /* HST_TEXTURE16_PACKED */

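/* Rescale a float projection buffer to 16-bit integers: every value is mapped
 * linearly from [params[0], params[1]] (global minimum/maximum) onto
 * [0, 65535] and rounded. */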
__global__ static void  hst_cuda_normalize_kernel(unsigned short *res, float *data, int pitch, float *params) {

    int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    
    float min_val = params[0];
    float max_val = params[1];
    
    int pos = idy * pitch + idx;

    res[pos] = round(65535. * (data[pos] - min_val) / (max_val - min_val));
}

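/* Convert the accumulated integer slice back to float, in place: each pixel
 * holds a sum over num_proj projections of 16-bit-normalized values, so the
 * result is num_proj * min_val + sum * (max_val - min_val) / 65535. */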
__global__ static void  hst_cuda_denormalize_kernel(float *data, float *params, int num_proj) {

    int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    
    float min_val = params[0];
    float max_val = params[1];
    
    int pos = BLOCK_SIZE_X*gridDim.x * idy + idx;
    data[pos] = num_proj * min_val + ((uint32_t*)data)[pos] * (max_val - min_val) / 65535.;
}

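/* Direct backprojection: one thread per slice pixel, one texture fetch per
 * projection at bin position h = all.z + x * all.x - y * all.y, with the
 * per-projection constants taken from c_all. */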
__global__ static void hst_cuda_intkernel_esrf(int num_proj, int num_bins, uint32_t *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
    const int idx = blockIdx.x * BLOCK_SIZE_X + threadIdx.x;
    const int idy = blockIdx.y * BLOCK_SIZE_Y + threadIdx.y + batch;

    float h;
    int res = 0;

    const float bx = idx + apos_off_x;
    const float by = idy + apos_off_y;

#pragma unroll 8
    for (int proje=0; proje<num_proj; proje++) {
        const float4 all = c_all[proje];
        h = all.z + bx * all.x - by * all.y;

        res += tex2D(int_projes, h, proje + 0.5f);
    }

    d_SLICE[ BLOCK_SIZE_X*gridDim.x*idy + idx ] = res;
}

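/* Shared-memory backprojection for the packed texture: each block processes
 * BLOCK_SIZE_Y projections per iteration, staging a window of packed texels
 * in `cache` and the per-projection offsets and sines in `cache_subh` and
 * `cache_sin`, then unpacking the 16-bit bins from shared memory instead of
 * issuing one texture fetch per pixel and projection.  The loop assumes
 * num_proj is a multiple of BLOCK_SIZE_Y. */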
__global__ static void hst_cuda_intkernel(int num_proj, int num_bins, uint32_t *d_SLICE, float apos_off_x, float apos_off_y, int batch) {
    int i;
    
    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;

    const int bidx = blockIdx.x * BLOCK_SIZE_X;
    const int bidy = batch + blockIdx.y * BLOCK_SIZE_Y;

    const int idx = bidx + threadIdx.x;
    const int idy = bidy + threadIdx.y;

    float4 all;

    uint32_t res = 0;

    const float bx = bidx + apos_off_x;
    const float by = bidy + apos_off_y;

    const float x = bx + tidx;
    const float y = by + tidy;

    float h;
    int minh;

    __shared__  float   cache_subh[BLOCK_SIZE_X][BLOCK_SIZE_Y + 1];
    __shared__  float   cache_sin[BLOCK_SIZE_Y];
    __shared__  uint32_t   cache[BLOCK_SIZE_Y][BLOCK_SIZE_X + 1];
    
    for (int proje = 0; proje < num_proj; proje += BLOCK_SIZE_Y) {
        int proj = proje + tidy;

        all = c_all[proj];
        h = all.z + bx * all.x - by * all.y;
        minh = (int)(h + all.w);
        minh = (minh >> 1);

        cache_subh[tidx][tidy] = all.z + x * all.x - (minh << 1);
        if (tidx == 0) cache_sin[tidy] = all.y;

        cache[tidy][tidx] = tex2D(int_projes, minh + tidx, proj);

        __syncthreads();

//#pragma unroll 16
        for (i = 0; i < BLOCK_SIZE_Y; i++) {
            uint16_t *cache_row = (uint16_t*)(&cache[i]);

            minh = cache_subh[tidx][i] - y * cache_sin[i];
            res += cache_row[minh];
        }

        __syncthreads();
    }

    d_SLICE[ BLOCK_SIZE_X*gridDim.x*idy + idx ] = res;
}

#endif /* HST_TEXTURE16 */
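
The listing above contains only the device code; the host-side setup (normalization, texture binding, kernel launches) lives elsewhere in PyHST and is not shown here. As a rough orientation, a minimal driver for the packed-texture path might look like the sketch below. It is an illustration, not PyHST code: the helper name hst_cuda_backproject_sketch, the buffer layout and the error handling are assumptions; only the kernel, texture and macro names (BLOCK_SIZE_X, BLOCK_SIZE_Y, c_all) are taken from the listing, and the function is assumed to live in the same .cu file as the texture reference, using the pre-CUDA-12 texture-reference API.

/* Hypothetical host-side driver (illustration only). Assumes d_proj holds the
 * 16-bit-normalized sinogram with two bins packed per uint32_t texel, that
 * c_all has already been filled with the per-projection constants, and that
 * num_x, num_y and num_proj are multiples of the block dimensions. */
static int hst_cuda_backproject_sketch(uint32_t *d_proj, size_t proj_pitch_bytes,
                                       uint32_t *d_slice,
                                       int num_proj, int num_bins,
                                       int num_x, int num_y,
                                       float off_x, float off_y) {
    size_t tex_offset = 0;
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<uint32_t>();

    /* Legacy texture-reference binding: the texture width is in packed texels,
     * i.e. half the number of bins. */
    if (cudaBindTexture2D(&tex_offset, int_projes, d_proj, desc,
                          num_bins / 2, num_proj, proj_pitch_bytes) != cudaSuccess)
        return -1;

    /* One thread per reconstructed pixel. */
    dim3 block(BLOCK_SIZE_X, BLOCK_SIZE_Y);
    dim3 grid(num_x / BLOCK_SIZE_X, num_y / BLOCK_SIZE_Y);

    hst_cuda_intkernel<<<grid, block>>>(num_proj, num_bins, d_slice,
                                        off_x, off_y, 0);
    cudaDeviceSynchronize();

    cudaUnbindTexture(int_projes);
    return 0;
}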