/tomo/pyhst : contents of hst_cuda/hst_cuda

: (revision 276)

To get this branch, use:

bzr branch
http://darksoft.org/webbzr/tomo/pyhst

/*
 * The PyHST program is Copyright (C) 2002-2011 of the
 * European Synchrotron Radiation Facility (ESRF) and
 * Karlsruhe Institute of Technology (KIT).
 *
 * PyHST is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * hst is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
#ifdef HST_HALF_MODE
# include "cuda_fp16.h"
#endif

__global__ void hst_cuda_heatup_kernel(float seed, float *g_data) {
	int idx = blockIdx.x*blockDim.x + threadIdx.x;

	float  tmps[CUDA_HEATUP_ELEMENTS] = {0};
	for(int i = 0; i < CUDA_HEATUP_KERNEL_ITERATIONS; i++){	
#pragma unroll
		for(int j=0; j<CUDA_HEATUP_ELEMENTS; j++){
			tmps[j] = tmps[j]*tmps[j]+seed;
		}
	}
		// Multiply add reduction
	float sum = 0.f;
#pragma unroll
	for(int j=0; j<CUDA_HEATUP_ELEMENTS; j+=2)
		sum += tmps[j]*tmps[j+1];

	if ( sum == -1.f ) 
		g_data[idx] = sum;
}


#ifdef HST_HALF_MODE
__global__ static void hst_cuda_slice2array_4(float2 *gpu_array, int num_bin, int num_proj, float *gpu_data, int pad_bin, int pad_proj) {
#else
__global__ static void hst_cuda_slice2array_4(float4 *gpu_array, int num_bin, int num_proj, float *gpu_data, int pad_bin, int pad_proj) {
#endif
    int step = pad_bin * pad_proj;
    int  idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int  idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;

#ifdef HST_HALF_MODE
	union {
	    float f;
	    half2 h;
	} u1, u2;
	
	float2 f1 = { gpu_data[           idy * pad_bin + idx], gpu_data[    step + idy * pad_bin + idx] };
	float2 f2 = { gpu_data[2 * step + idy * pad_bin + idx], gpu_data[3 * step + idy * pad_bin + idx] };
	u1.h = __float22half2_rn(f1);
	u2.h = __float22half2_rn(f2);
	
	gpu_array[idy * num_bin + idx]  = (float2){ u1.f, u2.f };
#else
    gpu_array[idy * num_bin + idx].x = gpu_data[           idy * pad_bin + idx];
    gpu_array[idy * num_bin + idx].y = gpu_data[    step + idy * pad_bin + idx];
    gpu_array[idy * num_bin + idx].z = gpu_data[2 * step + idy * pad_bin + idx];
    gpu_array[idy * num_bin + idx].w = gpu_data[3 * step + idy * pad_bin + idx];
#endif
}

__global__ static void hst_cuda_array2slice_4(float4 *gpu_array, int num_bin, int num_proj, float *gpu_data, int pad_bin, int pad_proj) {
    int step = pad_bin * pad_proj;
    int  idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int  idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;

    if (idy < num_proj) {
	gpu_data[           idy * pad_bin + idx] = gpu_array[idy * num_bin + idx].x;
	gpu_data[    step + idy * pad_bin + idx] = gpu_array[idy * num_bin + idx].y;
	gpu_data[2 * step + idy * pad_bin + idx] = gpu_array[idy * num_bin + idx].z;
	gpu_data[3 * step + idy * pad_bin + idx] = gpu_array[idy * num_bin + idx].w;
    }
}


__global__ static void hst_cuda_slice2array_2(float2 *gpu_array, int num_bin, int num_proj, float *gpu_data, int pad_bin, int pad_proj) {
    int step = pad_bin * pad_proj;
    int  idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int  idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;

    gpu_array[idy * num_bin + idx].x = gpu_data[           idy * pad_bin + idx];
    gpu_array[idy * num_bin + idx].y = gpu_data[    step + idy * pad_bin + idx];
}

__global__ static void hst_cuda_array2slice_2(float2 *gpu_array, int num_bin, int num_proj, float *gpu_data, int pad_bin, int pad_proj) {
    int step = pad_bin * pad_proj;
    int  idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int  idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;

    if (idy < num_proj) {
	gpu_data[           idy * pad_bin + idx] = gpu_array[idy * num_bin + idx].x;
	gpu_data[    step + idy * pad_bin + idx] = gpu_array[idy * num_bin + idx].y;
    }
}




__global__ static void  hst_cuda_pack_kernel(cufftReal *out, int dpitch, cufftReal *data, int spitch) {
    float tmp1, tmp2;

    int  idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int  idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;

    int pos = 2 * idy * spitch + idx;

    tmp1 = data[pos];
    tmp2 = data[pos + spitch];

#ifdef HST_FILTER2
    pos = 2 * (idy * dpitch + idx);
    out[pos] = tmp1;
    out[pos + 1] = tmp2;
#else
    pos = (2 * idy * dpitch + idx);
    out[pos] = tmp1;
    out[pos + dpitch] = tmp2;
#endif
}

__global__ static void  hst_cuda_pack_kernel_zpad(cufftReal *out, int dpitch, cufftReal *data, int spitch, int num_bins) {
    float tmp1, tmp2;

    int  idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int  idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;

    int pos;

    if (idx < num_bins) {
        pos = 2 * idy * spitch + idx;

        tmp1 = data[pos];
        tmp2 = data[pos + spitch];
    } else {
        tmp1 = 0;
        tmp2 = 0;
    }

#ifdef HST_FILTER2
    pos = 2 * (idy * dpitch + idx);
    out[pos] = tmp1;
    out[pos + 1] = tmp2;
#else
    pos = (2 * idy * dpitch + idx);
    out[pos] = tmp1;
    out[pos + dpitch] = tmp2;
#endif
}

__global__ static void  hst_cuda_pack_kernel_epad(cufftReal *out, int dpitch, cufftReal *data, int spitch, int num_bins, int mid) {
    float tmp1, tmp2;

    int  idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int  idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    int pos;

    if (idx < num_bins) {
        pos = 2 * idy * spitch + idx;

        tmp1 = data[pos];
        tmp2 = data[pos + spitch];
    } else if (idx < mid) {
        pos = 2 * idy * spitch + num_bins - 1;

        tmp1 = data[pos];
        tmp2 = data[pos + spitch];
    } else {
        pos = 2 * idy * spitch;

        tmp1 = data[pos];
        tmp2 = data[pos + spitch];
    }


#ifdef HST_FILTER2
    pos = 2 * (idy * dpitch + idx);
    out[pos] = tmp1;
    out[pos + 1] = tmp2;
#else
    pos = (2 * idy * dpitch + idx);
    out[pos] = tmp1;
    out[pos + dpitch] = tmp2;
#endif
}


__global__ static void  hst_cuda_filter_kernel(int vector_size, float *data, const float *filter) {
    int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int id = idx + (blockIdx.y * BLOCK_SIZE + threadIdx.y)*vector_size;

    data[id] *= filter[idx];
}

/*
 left_side:
    flat_zone_start = (axis_position_corr - flat_zone)
    dx = flat_zone_start - i
    multiplier = (2 * prof_shift + prof_fact)  + dx * (prof_fact / pente_zone)

    (2 * prof_shift + prof_fact) is always 1
    prof_fact is -1 or 1

    1 + prof_fact * min(1, (flat_zone_start - i) / pente_zone)
    prof_fact < 0
        param2 is prof_fact / pente_zone
	param1 is flat_zone_start + 1/param2
	multiplier = max(0, (param1 - i)*param2)
    else
        param2 is prof_fact / pente_zone
	param1 is flat_zone_start + 1/param2
	multiplier = min(2, (param1 - i)*param2)

 right_side:
    flat_zone_start = (axis_position_corr + flat_zone)
    dx = flat_zone_start - i

    multiplier = (2 * prof_shift + prof_fact)  + dx * (prof_fact / pente_zone)

    (2 * prof_shift + prof_fact) is always 1
    prof_fact is -1 or 1

    1 + prof_fact * max(-1, (flat_zone_start - i) / pente_zone)
    prof_fact < 0
        param2 is prof_fact / pente_zone
	param1 is flat_zone_start + 1/param2
	multiplier = min(2, (param1 - i)*param2)
    else
        param2 is prof_fact / pente_zone
	param1 is flat_zone_start + 1/param2
	multiplier = max(0, (param1 - i)*param2)

    __global__ static void  limit_kernel_max(float *data, float param1, float param2) {
	int  i = blockIdx.x * BLOCK_SIZE + threadIdx.x;

	data[i] *= max(0., (param1 - i) * param2);
    }

    __global__ static void  limit_kernel_min(float *data, float param1, float param2) {
	int  i = blockIdx.x * BLOCK_SIZE + threadIdx.x;

	data[i] *= min(2., (param1 - i) * param2);
    }

 the next functions are joint

    __global__ static void  limit_kernel_positive(float *data, int first_right, float param1a, float param1b, float param2) {
	int  i = blockIdx.x * BLOCK_SIZE + threadIdx.x;
	int flag = i < first_right;

	data[i] *= min(2., max(flag?1.:0., ((flag?param1a:param1b) - i) * param2));
    }

    __global__ static void  limit_kernel_negative(float *data, int first_right, float param1a, float param1b, float param2) {
	int  i = blockIdx.x * BLOCK_SIZE + threadIdx.x;
	int flag = i < first_right;

	data[i] *= max(0., min(flag?1.:2., ((flag?param1a:param1b) - i) * param2));
    }
 and now completely blocked and joint:
__global__ static void  hst_cuda_limit_kernel(float *data, int pitch, int ystart, float *params) {
    int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int idy = blockIdx.y * BLOCK_SIZE + threadIdx.y + ystart;
    int i = idy * pitch + idx;

    float axis_position_corr = c_axiss[idy];
    float flat_zone = params[idy * 2];
    float param = params[idy * 2 + 1];

    int flag = idx < (axis_position_corr + flat_zone);

    float multiplier = ((flag?((axis_position_corr - flat_zone) + param):((axis_position_corr + flat_zone) + param)) - idx)/param;


    if (param > 0) {
	data[i] *= min(2., max(flag?1.:0., multiplier));
    } else {
	data[i] *= max(0., min(flag?1.:2., multiplier));
    }
}

*/

__global__ static void  hst_cuda_unpack_kernel(cufftReal *out, int dpitch, cufftReal *data, int spitch, float scaling) {
    float tmp1, tmp2;

    int  idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int  idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;

#ifdef HST_FILTER2
    int pos = 2 * (idy * spitch + idx);
    tmp1 = data[pos];
    tmp2 = data[pos + 1];
#else
    int pos = (2 * idy * spitch + idx);
    tmp1 = data[pos];
    tmp2 = data[pos + spitch];
#endif

    pos = 2 * idy * dpitch + idx ;

    out[pos] = tmp1 * scaling;
    out[pos + dpitch] = tmp2 * scaling;
}



/*
 This is backup variant. Theoretically, the first function should work slightly faster due to the reduced amount
 of memory accesses. However, it hardly could be seen!
__global__ static void  hst_cuda_unpack_kernel_fai360(cufftReal *out, int dpitch, cufftReal *data, int spitch, int half, float *params, int batch) {
    float tmp1, tmp2;

    int  idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int  idy = blockIdx.y * BLOCK_SIZE + threadIdx.y;

    int pos = 2 * (idy * spitch + idx);

    tmp1 = data[pos];
    tmp2 = data[pos + 1];

    pos = 2 * idy * dpitch + idx ;


    float axis_position_corr = c_axiss[batch + 2*idy];
    float flat_zone = params[idy * 4];
    float param = params[idy * 4 + 1];

    int flag = idx < (axis_position_corr + flat_zone);
    float multiplier = ((flag?((axis_position_corr - flat_zone) + param):((axis_position_corr + flat_zone) + param)) - idx)/param;


    if (param > 0) {
	out[pos] = tmp1 * min(2., max(flag?1.:0., multiplier));
    } else {
	out[pos] = tmp1 * max(0., min(flag?1.:2., multiplier));
    }

    axis_position_corr = c_axiss[batch + (++idy)];
    flat_zone = params[idy * 2];
    param = params[idy * 2 + 1];

    flag = idx < (axis_position_corr + flat_zone);
    multiplier = ((flag?((axis_position_corr - flat_zone) + param):((axis_position_corr + flat_zone) + param)) - idx)/param;

    pos += dpitch;
    if (param > 0) {
	out[pos] = tmp2 * min(2., max(flag?1.:0., multiplier));
    } else {
	out[pos] = tmp2 * max(0., min(flag?1.:2., multiplier));
    }

}
*/