diff options
Diffstat (limited to 'src/kernels/stacked-backproject.cl')
-rw-r--r-- | src/kernels/stacked-backproject.cl | 287 |
1 files changed, 255 insertions, 32 deletions
diff --git a/src/kernels/stacked-backproject.cl b/src/kernels/stacked-backproject.cl index d6fff5f..c9938cb 100644 --- a/src/kernels/stacked-backproject.cl +++ b/src/kernels/stacked-backproject.cl @@ -23,7 +23,7 @@ constant sampler_t volumeSampler_single = CLK_NORMALIZED_COORDS_FALSE | constant sampler_t volumeSampler_half = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + CLK_FILTER_LINEAR; constant sampler_t volumeSampler_int8 = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | @@ -47,6 +47,34 @@ interleave_single ( global float *sinogram, write_imagef(interleaved_sinograms, (int4)(idx, idy, idz, 0),(float4)(x,y,0.0f,0.0f)); } +/*kernel void +texture_single (read_only image2d_array_t sinogram, + global float2 *reconstructed_buffer, + constant float *sin_lut, + constant float *cos_lut, + const unsigned int x_offset, + const unsigned int y_offset, + const unsigned int angle_offset, + const unsigned int n_projections, + const float axis_pos, + unsigned long size) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + const float bx = idx - axis_pos + x_offset + 0.5f; + const float by = idy - axis_pos + y_offset + 0.5f; + float2 sum = {0.0f, 0.0f}; + + for(int proj = 0; proj < n_projections; proj++) { + float h = -by * sin_lut[angle_offset + proj] + bx * cos_lut[angle_offset + proj] + axis_pos; + sum += read_imagef (sinogram, volumeSampler_single, (float4)(h, proj + 0.5f,idz,0.0f)).xy; + } + + reconstructed_buffer[idx + idy*size + idz*size*size] = sum; +}*/ + + kernel void texture_single ( read_only image2d_array_t sinogram, @@ -73,12 +101,12 @@ texture_single ( int global_sizex = get_global_size(0); int global_sizey = get_global_size(1); - /* Computing sequential numbers of 4x4 square, quadrant, and pixel within quadrant */ + // Computing sequential numbers of 4x4 square, quadrant, and pixel within quadrant int square = local_idy%4; int quadrant = local_idx/4; int pixel = local_idx%4; - /* Computing projection and pixel offsets */ + // Computing projection and pixel offsets int projection_index = local_idy/4; int2 remapped_index_local = {(4*square + 2*(quadrant%2) + (pixel%2)), @@ -94,6 +122,27 @@ texture_single ( __local float2 shared_mem[64][4]; __local float2 reconstructed_cache[16][16]; +/*#ifdef DEVICE_TESLA_K20XM +#pragma unroll 4 +#endif +#ifdef DEVICE_TESLA_P100_PCIE_16GB +#pragma unroll 2 +#endif +#ifdef DEVICE_GEFORCE_GTX_TITAN_BLACK +#pragma unroll 8 +#endif +#ifdef DEVICE_GEFORCE_GTX_TITAN +#pragma unroll 14 +#endif +#ifdef DEVICE_GEFORCE_GTX_1080_TI +#pragma unroll 10 +#endif +#ifdef DEVICE_QUADRO_M6000 +#pragma unroll 2 +#endif +#ifdef DEVICE_GFX1010 +#pragma unroll 4 +#endif*/ for(int proj = projection_index; proj < n_projections; proj+=4) { float sine_value = sin_lut[angle_offset + proj]; @@ -106,7 +155,7 @@ texture_single ( int2 remapped_index = {(local_idx%4), (4*local_idy + (local_idx/4))}; for(int q=0; q<4;q+=1){ - /* Moving partial sums to shared memory */ + // Moving partial sums to shared memory shared_mem[(local_sizex*remapped_index_local.y + remapped_index_local.x)][projection_index] = sum[q]; barrier(CLK_LOCAL_MEM_FENCE); // syncthreads @@ -124,7 +173,7 @@ texture_single ( barrier(CLK_LOCAL_MEM_FENCE); // syncthreads } - reconstructed_buffer[global_idx + global_idy*size + idz*size*size] = reconstructed_cache[local_idy][local_idx] * M_PI_F / n_projections; + reconstructed_buffer[global_idx + global_idy*size + idz*size*size] = reconstructed_cache[local_idy][local_idx]; } kernel void @@ -163,6 +212,54 @@ interleave_half (global float *sinogram, write_imagef(interleaved_sinograms, (int4)(idx, idy, idz, 0),(float4)(b)); } +/*kernel void +texture_half (read_only image2d_array_t sinogram, + global float4 *reconstructed_buffer, + constant float *sin_lut, + constant float *cos_lut, + const unsigned int x_offset, + const unsigned int y_offset, + const unsigned int angle_offset, + const unsigned int n_projections, + const float axis_pos, + unsigned long size) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + const float bx = idx - axis_pos + x_offset + 0.5f; + const float by = idy - axis_pos + y_offset + 0.5f; + float4 sum = {0.0f, 0.0f, 0.0f, 0.0f}; + +#ifdef DEVICE_TESLA_K20XM +#pragma unroll 4 +#endif +#ifdef DEVICE_TESLA_P100_PCIE_16GB +#pragma unroll 2 +#endif +#ifdef DEVICE_GEFORCE_GTX_TITAN_BLACK +#pragma unroll 8 +#endif +#ifdef DEVICE_GEFORCE_GTX_TITAN +#pragma unroll 14 +#endif +#ifdef DEVICE_GEFORCE_GTX_1080_TI +#pragma unroll 10 +#endif +#ifdef DEVICE_QUADRO_M6000 +#pragma unroll 2 +#endif +#ifdef DEVICE_GFX1010 +#pragma unroll 4 +#endif + for(int proj = 0; proj < n_projections; proj++) { + float h = -by * sin_lut[angle_offset + proj] + bx * cos_lut[angle_offset + proj] + axis_pos; + sum += read_imagef (sinogram, volumeSampler_half, (float4)(h, proj + 0.5f,idz,0.0f)); + } + + reconstructed_buffer[idx + idy*size + idz*size*size] = sum; +}*/ + kernel void texture_half ( read_only image2d_array_t sinogram, @@ -189,12 +286,12 @@ texture_half ( int global_sizex = get_global_size(0); int global_sizey = get_global_size(1); - /* Computing sequential numbers of 4x4 square, quadrant, and pixel within quadrant */ + // Computing sequential numbers of 4x4 square, quadrant, and pixel within quadrant int square = local_idy%4; int quadrant = local_idx/4; int pixel = local_idx%4; - /* Computing projection and pixel offsets */ + // Computing projection and pixel offsets int projection_index = local_idy/4; int2 remapped_index_local = {(4*square + 2*(quadrant%2) + (pixel%2)),(2* (quadrant/2) + (pixel/2))}; int2 remapped_index_global = {(get_group_id(0)*get_local_size(0)+remapped_index_local.x), @@ -206,6 +303,27 @@ texture_half ( __local float4 shared_mem[64][4]; __local float4 reconstructed_cache[16][16]; +#ifdef DEVICE_TESLA_K20XM +#pragma unroll 4 +#endif +#ifdef DEVICE_TESLA_P100_PCIE_16GB +#pragma unroll 2 +#endif +#ifdef DEVICE_GEFORCE_GTX_TITAN_BLACK +#pragma unroll 8 +#endif +#ifdef DEVICE_GEFORCE_GTX_TITAN +#pragma unroll 14 +#endif +#ifdef DEVICE_GEFORCE_GTX_1080_TI +#pragma unroll 10 +#endif +#ifdef DEVICE_QUADRO_M6000 +#pragma unroll 2 +#endif +#ifdef DEVICE_GFX1010 +#pragma unroll 4 +#endif for(int proj = projection_index; proj < n_projections; proj+=4) { float sine_value = sin_lut[angle_offset + proj]; @@ -218,7 +336,7 @@ texture_half ( int2 remapped_index = {(local_idx%4), (4*local_idy + (local_idx/4))}; for(int q=0; q<4;q+=1){ - /* Moving partial sums to shared memory */ + // Moving partial sums to shared memory shared_mem[(local_sizex*remapped_index_local.y + remapped_index_local.x)][projection_index] = sum[q]; barrier(CLK_LOCAL_MEM_FENCE); // syncthreads @@ -235,7 +353,7 @@ texture_half ( } barrier(CLK_LOCAL_MEM_FENCE); // syncthreads } - reconstructed_buffer[global_idx + global_idy*size + idz*size*size] = reconstructed_cache[local_idy][local_idx] * M_PI_F / n_projections; + reconstructed_buffer[global_idx + global_idy*size + idz*size*size] = reconstructed_cache[local_idy][local_idx]; } kernel void @@ -258,6 +376,11 @@ uninterleave_half (global float4 *reconstructed_buffer, output[idx + idy*sizex + (output_offset+3)*sizex*sizey] = b.w; } +union converter { + uint2 storage; + uchar8 a; +}; + kernel void interleave_uint (global float *sinogram, write_only image2d_array_t interleaved_sinograms, @@ -270,22 +393,85 @@ interleave_uint (global float *sinogram, const int sizex = get_global_size(0); const int sizey = get_global_size(1); - int sinogram_offset = idz*4; + int sinogram_offset = idz*8; const float scale = 255.0f / (max - min); - uint4 b = {(sinogram[idx + idy * sizex + (sinogram_offset) * sizex * sizey] - min)*scale, - (sinogram[idx + idy * sizex + (sinogram_offset+1) * sizex * sizey] - min)*scale, - (sinogram[idx + idy * sizex + (sinogram_offset+2) * sizex * sizey] - min)*scale, - (sinogram[idx + idy * sizex + (sinogram_offset+3) * sizex * sizey] - min)*scale}; - - write_imageui(interleaved_sinograms, (int4)(idx, idy, idz, 0),(uint4)(b)); + union converter il; + il.a.s0 = (sinogram[idx + idy * sizex + (sinogram_offset) * sizex * sizey] - min)*scale; + il.a.s1 = (sinogram[idx + idy * sizex + (sinogram_offset+1) * sizex * sizey] - min)*scale; + il.a.s2 = (sinogram[idx + idy * sizex + (sinogram_offset+2) * sizex * sizey] - min)*scale; + il.a.s3 = (sinogram[idx + idy * sizex + (sinogram_offset+3) * sizex * sizey] - min)*scale; + il.a.s4 = (sinogram[idx + idy * sizex + (sinogram_offset+4) * sizex * sizey] - min)*scale; + il.a.s5 = (sinogram[idx + idy * sizex + (sinogram_offset+5) * sizex * sizey] - min)*scale; + il.a.s6 = (sinogram[idx + idy * sizex + (sinogram_offset+6) * sizex * sizey] - min)*scale; + il.a.s7 = (sinogram[idx + idy * sizex + (sinogram_offset+7) * sizex * sizey] - min)*scale; + + write_imageui(interleaved_sinograms, (int4)(idx, idy, idz, 0),(uint4)((uint)il.storage.x,(uint)il.storage.y,0,0)); } +/*kernel void +texture_uint (read_only image2d_array_t sinogram, + global uint8 *reconstructed_buffer, + constant float *sin_lut, + constant float *cos_lut, + const unsigned int x_offset, + const unsigned int y_offset, + const unsigned int angle_offset, + const unsigned int n_projections, + const float axis_pos, + unsigned long size) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + const float bx = idx - axis_pos + x_offset + 0.5f; + const float by = idy - axis_pos + y_offset + 0.5f; + uint8 sum = {0,0,0,0,0,0,0,0}; + +#ifdef DEVICE_TESLA_K20XM +#pragma unroll 4 +#endif +#ifdef DEVICE_TESLA_P100_PCIE_16GB +#pragma unroll 2 +#endif +#ifdef DEVICE_GEFORCE_GTX_TITAN_BLACK +#pragma unroll 8 +#endif +#ifdef DEVICE_GEFORCE_GTX_TITAN +#pragma unroll 14 +#endif +#ifdef DEVICE_GEFORCE_GTX_1080_TI +#pragma unroll 10 +#endif +#ifdef DEVICE_QUADRO_M6000 +#pragma unroll 2 +#endif +#ifdef DEVICE_GFX1010 +#pragma unroll 4 +#endif + union converter tex; + for(int proj = 0; proj < n_projections; proj++) { + float h = -by * sin_lut[angle_offset + proj] + bx * cos_lut[angle_offset + proj] + axis_pos; + tex.storage = read_imageui (sinogram, volumeSampler_int8, (float4)(h, proj + 0.5f,idz,0.0f)).xy; + + sum.s0 += (uint)tex.a.s0; + sum.s1 += (uint)tex.a.s1; + sum.s2 += (uint)tex.a.s2; + sum.s3 += (uint)tex.a.s3; + sum.s4 += (uint)tex.a.s4; + sum.s5 += (uint)tex.a.s5; + sum.s6 += (uint)tex.a.s6; + sum.s7 += (uint)tex.a.s7; + } + + reconstructed_buffer[idx + idy*size + idz*size*size] = sum; +}*/ + kernel void texture_uint ( read_only image2d_array_t sinogram, - global uint4 *reconstructed_buffer, + global uint8 *reconstructed_buffer, constant float *sin_lut, constant float *cos_lut, const unsigned int x_offset, @@ -308,12 +494,12 @@ texture_uint ( int global_sizex = get_global_size(0); int global_sizey = get_global_size(1); - /* Computing sequential numbers of 4x4 square, quadrant, and pixel within quadrant */ + // Computing sequential numbers of 4x4 square, quadrant, and pixel within quadrant int square = local_idy%4; int quadrant = local_idx/4; int pixel = local_idx%4; - /* Computing projection and pixel offsets */ + // Computing projection and pixel offsets int projection_index = local_idy/4; int2 remapped_index_local = {(4*square + 2*(quadrant%2) + (pixel%2)),(2* (quadrant/2) + (pixel/2))}; int2 remapped_index_global = {(get_group_id(0)*get_local_size(0)+remapped_index_local.x), @@ -321,23 +507,56 @@ texture_uint ( float2 pixel_coord = {(remapped_index_global.x-axis_pos+x_offset+0.5f), (remapped_index_global.y-axis_pos+y_offset+0.5f)}; //bx and by - uint4 sum[4] = {0,0,0,0}; - __local uint4 shared_mem[64][4]; - __local uint4 reconstructed_cache[16][16]; + uint8 sum[4] = {0,0,0,0}; + __local uint8 shared_mem[64][4]; + __local uint8 reconstructed_cache[16][16]; + + union converter tex; + +#ifdef DEVICE_TESLA_K20XM +#pragma unroll 4 +#endif +#ifdef DEVICE_TESLA_P100_PCIE_16GB +#pragma unroll 2 +#endif +#ifdef DEVICE_GEFORCE_GTX_TITAN_BLACK +#pragma unroll 8 +#endif +#ifdef DEVICE_GEFORCE_GTX_TITAN +#pragma unroll 14 +#endif +#ifdef DEVICE_GEFORCE_GTX_1080_TI +#pragma unroll 10 +#endif +#ifdef DEVICE_QUADRO_M6000 +#pragma unroll 2 +#endif +#ifdef DEVICE_GFX1010 +#pragma unroll 4 +#endif for(int proj = projection_index; proj < n_projections; proj+=4) { float sine_value = sin_lut[angle_offset + proj]; float h = pixel_coord.x * cos_lut[angle_offset + proj] - pixel_coord.y * sin_lut[angle_offset + proj] + axis_pos; for(int q=0; q<4; q+=1){ - sum[q] += read_imageui(sinogram, volumeSampler_int8, (float4)(h-4*q*sine_value, proj + 0.5f,idz, 0.0)); + tex.storage = read_imageui(sinogram, volumeSampler_int8, (float4)(h-4*q*sine_value, proj + 0.5f,idz, 0.0)).xy; + + sum[q].s0 += (uint)tex.a.s0; + sum[q].s1 += (uint)tex.a.s1; + sum[q].s2 += (uint)tex.a.s2; + sum[q].s3 += (uint)tex.a.s3; + sum[q].s4 += (uint)tex.a.s4; + sum[q].s5 += (uint)tex.a.s5; + sum[q].s6 += (uint)tex.a.s6; + sum[q].s7 += (uint)tex.a.s7; + } } - int2 remapped_index = {(local_idx%4), (4*local_idy + (local_idx/4))}; for(int q=0; q<4;q+=1){ - /* Moving partial sums to shared memory */ + // Moving partial sums to shared memory shared_mem[(local_sizex*remapped_index_local.y + remapped_index_local.x)][projection_index] = sum[q]; barrier(CLK_LOCAL_MEM_FENCE); // syncthreads @@ -359,7 +578,7 @@ texture_uint ( } kernel void -uninterleave_uint (global uint4 *reconstructed_buffer, +uninterleave_uint (global uint8 *reconstructed_buffer, global float *output, const float min, const float max, @@ -371,11 +590,15 @@ uninterleave_uint (global uint4 *reconstructed_buffer, const int sizex = get_global_size(0); const int sizey = get_global_size(1); - int output_offset = idz*4; + int output_offset = idz*8; float scale = (max-min)/255.0f; - output[idx + idy*sizex + (output_offset)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].x)*scale+min)* M_PI_F / n_projections; - output[idx + idy*sizex + (output_offset+1)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].y)*scale+min)* M_PI_F / n_projections; - output[idx + idy*sizex + (output_offset+2)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].z)*scale+min)* M_PI_F / n_projections; - output[idx + idy*sizex + (output_offset+3)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].w)*scale+min)* M_PI_F / n_projections; -}
\ No newline at end of file + output[idx + idy*sizex + (output_offset)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].s0)*scale+min) ; + output[idx + idy*sizex + (output_offset+1)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].s1)*scale+min) ; + output[idx + idy*sizex + (output_offset+2)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].s2)*scale+min) ; + output[idx + idy*sizex + (output_offset+3)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].s3)*scale+min) ; + output[idx + idy*sizex + (output_offset+4)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].s4)*scale+min) ; + output[idx + idy*sizex + (output_offset+5)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].s5)*scale+min) ; + output[idx + idy*sizex + (output_offset+6)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].s6)*scale+min) ; + output[idx + idy*sizex + (output_offset+7)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].s7)*scale+min) ; +} |