1 files changed, 255 insertions, 32 deletions
diff --git a/src/kernels/stacked-backproject.cl b/src/kernels/stacked-backproject.cl
index d6fff5f..c9938cb 100644
--- a/src/kernels/stacked-backproject.cl
+++ b/src/kernels/stacked-backproject.cl
@@ -23,7 +23,7 @@ constant sampler_t volumeSampler_single = CLK_NORMALIZED_COORDS_FALSE |
 
 constant sampler_t volumeSampler_half = CLK_NORMALIZED_COORDS_FALSE |
                                           CLK_ADDRESS_CLAMP |
-                                          CLK_FILTER_NEAREST;
+                                          CLK_FILTER_LINEAR;
 
 constant sampler_t volumeSampler_int8 = CLK_NORMALIZED_COORDS_FALSE |
                                       CLK_ADDRESS_CLAMP_TO_EDGE |
@@ -47,6 +47,34 @@ interleave_single ( global float *sinogram,
     write_imagef(interleaved_sinograms, (int4)(idx, idy, idz, 0),(float4)(x,y,0.0f,0.0f));
 }
 
+/*kernel void
+texture_single (read_only image2d_array_t sinogram,
+                 global float2 *reconstructed_buffer,
+                 constant float *sin_lut,
+                 constant float *cos_lut,
+                 const unsigned int x_offset,
+                 const unsigned int y_offset,
+                 const unsigned int angle_offset,
+                 const unsigned int n_projections,
+                 const float axis_pos,
+                 unsigned long size)
+{
+    const int idx = get_global_id(0);
+    const int idy = get_global_id(1);
+    const int idz = get_global_id(2);
+    const float bx = idx - axis_pos + x_offset + 0.5f;
+    const float by = idy - axis_pos + y_offset + 0.5f;
+    float2 sum = {0.0f, 0.0f};
+
+    for(int proj = 0; proj < n_projections; proj++) {
+        float h = -by * sin_lut[angle_offset + proj] + bx * cos_lut[angle_offset + proj] + axis_pos;
+        sum += read_imagef (sinogram, volumeSampler_single, (float4)(h, proj + 0.5f,idz,0.0f)).xy;
+    }
+
+    reconstructed_buffer[idx + idy*size + idz*size*size] = sum;
+}*/
+
+
 kernel void
 texture_single (
         read_only image2d_array_t sinogram,
@@ -73,12 +101,12 @@ texture_single (
         int global_sizex = get_global_size(0);
         int global_sizey = get_global_size(1);
 
-        /* Computing sequential numbers of 4x4 square, quadrant, and pixel within quadrant */
+        // Computing sequential numbers of 4x4 square, quadrant, and pixel within quadrant
         int square = local_idy%4;
         int quadrant = local_idx/4;
         int pixel = local_idx%4;
 
-        /* Computing projection and pixel offsets */
+        // Computing projection and pixel offsets
         int projection_index = local_idy/4;
 
         int2 remapped_index_local   = {(4*square + 2*(quadrant%2) + (pixel%2)),
@@ -94,6 +122,27 @@ texture_single (
         __local float2 shared_mem[64][4];
         __local float2 reconstructed_cache[16][16];
 
+/*#ifdef DEVICE_TESLA_K20XM
+#pragma unroll 4
+#endif
+#ifdef DEVICE_TESLA_P100_PCIE_16GB
+#pragma unroll 2
+#endif
+#ifdef DEVICE_GEFORCE_GTX_TITAN_BLACK
+#pragma unroll 8
+#endif
+#ifdef DEVICE_GEFORCE_GTX_TITAN
+#pragma unroll 14
+#endif
+#ifdef DEVICE_GEFORCE_GTX_1080_TI
+#pragma unroll 10
+#endif
+#ifdef DEVICE_QUADRO_M6000
+#pragma unroll 2
+#endif
+#ifdef DEVICE_GFX1010
+#pragma unroll 4
+#endif*/
 
         for(int proj = projection_index; proj < n_projections; proj+=4) {
             float sine_value = sin_lut[angle_offset + proj];
@@ -106,7 +155,7 @@ texture_single (
         int2 remapped_index = {(local_idx%4), (4*local_idy + (local_idx/4))};
 
         for(int q=0; q<4;q+=1){
-            /* Moving partial sums to shared memory */
+            // Moving partial sums to shared memory
             shared_mem[(local_sizex*remapped_index_local.y + remapped_index_local.x)][projection_index] = sum[q];
 
             barrier(CLK_LOCAL_MEM_FENCE); // syncthreads
@@ -124,7 +173,7 @@ texture_single (
             barrier(CLK_LOCAL_MEM_FENCE); // syncthreads
         }
 
-        reconstructed_buffer[global_idx + global_idy*size + idz*size*size] = reconstructed_cache[local_idy][local_idx] * M_PI_F / n_projections;
+        reconstructed_buffer[global_idx + global_idy*size + idz*size*size] = reconstructed_cache[local_idy][local_idx];
 }
 
 kernel void
@@ -163,6 +212,54 @@ interleave_half (global float *sinogram,
     write_imagef(interleaved_sinograms, (int4)(idx, idy, idz, 0),(float4)(b));
 }
 
+/*kernel void
+texture_half (read_only image2d_array_t sinogram,
+                 global float4 *reconstructed_buffer,
+                 constant float *sin_lut,
+                 constant float *cos_lut,
+                 const unsigned int x_offset,
+                 const unsigned int y_offset,
+                 const unsigned int angle_offset,
+                 const unsigned int n_projections,
+                 const float axis_pos,
+                 unsigned long size)
+{
+    const int idx = get_global_id(0);
+    const int idy = get_global_id(1);
+    const int idz = get_global_id(2);
+    const float bx = idx - axis_pos + x_offset + 0.5f;
+    const float by = idy - axis_pos + y_offset + 0.5f;
+    float4 sum = {0.0f, 0.0f, 0.0f, 0.0f};
+
+#ifdef DEVICE_TESLA_K20XM
+#pragma unroll 4
+#endif
+#ifdef DEVICE_TESLA_P100_PCIE_16GB
+#pragma unroll 2
+#endif
+#ifdef DEVICE_GEFORCE_GTX_TITAN_BLACK
+#pragma unroll 8
+#endif
+#ifdef DEVICE_GEFORCE_GTX_TITAN
+#pragma unroll 14
+#endif
+#ifdef DEVICE_GEFORCE_GTX_1080_TI
+#pragma unroll 10
+#endif
+#ifdef DEVICE_QUADRO_M6000
+#pragma unroll 2
+#endif
+#ifdef DEVICE_GFX1010
+#pragma unroll 4
+#endif
+    for(int proj = 0; proj < n_projections; proj++) {
+        float h = -by * sin_lut[angle_offset + proj] + bx * cos_lut[angle_offset + proj] + axis_pos;
+        sum += read_imagef (sinogram, volumeSampler_half, (float4)(h, proj + 0.5f,idz,0.0f));
+    }
+
+    reconstructed_buffer[idx + idy*size + idz*size*size] = sum;
+}*/
+
 kernel void
 texture_half (
         read_only image2d_array_t sinogram,
@@ -189,12 +286,12 @@ texture_half (
     int global_sizex = get_global_size(0);
     int global_sizey = get_global_size(1);
 
-    /* Computing sequential numbers of 4x4 square, quadrant, and pixel within quadrant */
+    // Computing sequential numbers of 4x4 square, quadrant, and pixel within quadrant 
     int square = local_idy%4;
     int quadrant = local_idx/4;
     int pixel = local_idx%4;
 
-    /* Computing projection and pixel offsets */
+    // Computing projection and pixel offsets 
     int projection_index = local_idy/4;
     int2 remapped_index_local   = {(4*square + 2*(quadrant%2) + (pixel%2)),(2* (quadrant/2) + (pixel/2))};
     int2 remapped_index_global  = {(get_group_id(0)*get_local_size(0)+remapped_index_local.x),
@@ -206,6 +303,27 @@ texture_half (
     __local float4 shared_mem[64][4];
     __local float4 reconstructed_cache[16][16];
 
+#ifdef DEVICE_TESLA_K20XM
+#pragma unroll 4
+#endif
+#ifdef DEVICE_TESLA_P100_PCIE_16GB
+#pragma unroll 2
+#endif
+#ifdef DEVICE_GEFORCE_GTX_TITAN_BLACK
+#pragma unroll 8
+#endif
+#ifdef DEVICE_GEFORCE_GTX_TITAN
+#pragma unroll 14
+#endif
+#ifdef DEVICE_GEFORCE_GTX_1080_TI
+#pragma unroll 10
+#endif
+#ifdef DEVICE_QUADRO_M6000
+#pragma unroll 2
+#endif
+#ifdef DEVICE_GFX1010
+#pragma unroll 4
+#endif
 
     for(int proj = projection_index; proj < n_projections; proj+=4) {
         float sine_value = sin_lut[angle_offset + proj];
@@ -218,7 +336,7 @@ texture_half (
     int2 remapped_index = {(local_idx%4), (4*local_idy + (local_idx/4))};
 
     for(int q=0; q<4;q+=1){
-        /* Moving partial sums to shared memory */
+        // Moving partial sums to shared memory
         shared_mem[(local_sizex*remapped_index_local.y + remapped_index_local.x)][projection_index] = sum[q];
 
         barrier(CLK_LOCAL_MEM_FENCE); // syncthreads
@@ -235,7 +353,7 @@ texture_half (
         }
         barrier(CLK_LOCAL_MEM_FENCE); // syncthreads
     }
-    reconstructed_buffer[global_idx + global_idy*size + idz*size*size] = reconstructed_cache[local_idy][local_idx] * M_PI_F / n_projections;
+    reconstructed_buffer[global_idx + global_idy*size + idz*size*size] = reconstructed_cache[local_idy][local_idx];
 }
 
 kernel void
@@ -258,6 +376,11 @@ uninterleave_half (global float4 *reconstructed_buffer,
     output[idx + idy*sizex + (output_offset+3)*sizex*sizey] = b.w;
 }
 
+union converter {
+    uint2 storage;
+    uchar8 a;
+};
+
 kernel void
 interleave_uint (global float *sinogram,
             write_only image2d_array_t interleaved_sinograms,
@@ -270,22 +393,85 @@ interleave_uint (global float *sinogram,
     const int sizex = get_global_size(0);
     const int sizey = get_global_size(1);
 
-    int sinogram_offset = idz*4;
+    int sinogram_offset = idz*8;
 
     const float scale = 255.0f / (max - min);
 
-    uint4 b = {(sinogram[idx + idy * sizex + (sinogram_offset) * sizex * sizey] - min)*scale,
-               (sinogram[idx + idy * sizex + (sinogram_offset+1) * sizex * sizey] - min)*scale,
-               (sinogram[idx + idy * sizex + (sinogram_offset+2) * sizex * sizey] - min)*scale,
-               (sinogram[idx + idy * sizex + (sinogram_offset+3) * sizex * sizey] - min)*scale};
-
-   write_imageui(interleaved_sinograms, (int4)(idx, idy, idz, 0),(uint4)(b));
+    union converter il;
+    il.a.s0 = (sinogram[idx + idy * sizex + (sinogram_offset) * sizex * sizey] - min)*scale;
+    il.a.s1 = (sinogram[idx + idy * sizex + (sinogram_offset+1) * sizex * sizey] - min)*scale;
+    il.a.s2 = (sinogram[idx + idy * sizex + (sinogram_offset+2) * sizex * sizey] - min)*scale;
+    il.a.s3 = (sinogram[idx + idy * sizex + (sinogram_offset+3) * sizex * sizey] - min)*scale;
+    il.a.s4 = (sinogram[idx + idy * sizex + (sinogram_offset+4) * sizex * sizey] - min)*scale;
+    il.a.s5 = (sinogram[idx + idy * sizex + (sinogram_offset+5) * sizex * sizey] - min)*scale;
+    il.a.s6 = (sinogram[idx + idy * sizex + (sinogram_offset+6) * sizex * sizey] - min)*scale;
+    il.a.s7 = (sinogram[idx + idy * sizex + (sinogram_offset+7) * sizex * sizey] - min)*scale;
+
+    write_imageui(interleaved_sinograms, (int4)(idx, idy, idz, 0),(uint4)((uint)il.storage.x,(uint)il.storage.y,0,0));
 }
 
+/*kernel void
+texture_uint (read_only image2d_array_t sinogram,
+                 global uint8 *reconstructed_buffer,
+                 constant float *sin_lut,
+                 constant float *cos_lut,
+                 const unsigned int x_offset,
+                 const unsigned int y_offset,
+                 const unsigned int angle_offset,
+                 const unsigned int n_projections,
+                 const float axis_pos,
+                 unsigned long size)
+{
+    const int idx = get_global_id(0);
+    const int idy = get_global_id(1);
+    const int idz = get_global_id(2);
+    const float bx = idx - axis_pos + x_offset + 0.5f;
+    const float by = idy - axis_pos + y_offset + 0.5f;
+    uint8 sum = {0,0,0,0,0,0,0,0};
+
+#ifdef DEVICE_TESLA_K20XM
+#pragma unroll 4
+#endif
+#ifdef DEVICE_TESLA_P100_PCIE_16GB
+#pragma unroll 2
+#endif
+#ifdef DEVICE_GEFORCE_GTX_TITAN_BLACK
+#pragma unroll 8
+#endif
+#ifdef DEVICE_GEFORCE_GTX_TITAN
+#pragma unroll 14
+#endif
+#ifdef DEVICE_GEFORCE_GTX_1080_TI
+#pragma unroll 10
+#endif
+#ifdef DEVICE_QUADRO_M6000
+#pragma unroll 2
+#endif
+#ifdef DEVICE_GFX1010
+#pragma unroll 4
+#endif
+    union converter tex;
+    for(int proj = 0; proj < n_projections; proj++) {
+        float h = -by * sin_lut[angle_offset + proj] + bx * cos_lut[angle_offset + proj] + axis_pos;
+        tex.storage = read_imageui (sinogram, volumeSampler_int8, (float4)(h, proj + 0.5f,idz,0.0f)).xy;
+
+        sum.s0 += (uint)tex.a.s0;
+        sum.s1 += (uint)tex.a.s1;
+        sum.s2 += (uint)tex.a.s2;
+        sum.s3 += (uint)tex.a.s3;
+        sum.s4 += (uint)tex.a.s4;
+        sum.s5 += (uint)tex.a.s5;
+        sum.s6 += (uint)tex.a.s6;
+        sum.s7 += (uint)tex.a.s7;
+    }
+
+    reconstructed_buffer[idx + idy*size + idz*size*size] = sum;
+}*/
+
 kernel void
 texture_uint (
         read_only image2d_array_t sinogram,
-        global uint4 *reconstructed_buffer,
+        global uint8 *reconstructed_buffer,
         constant float *sin_lut,
         constant float *cos_lut,
         const unsigned int x_offset,
@@ -308,12 +494,12 @@ texture_uint (
         int global_sizex = get_global_size(0);
         int global_sizey = get_global_size(1);
 
-        /* Computing sequential numbers of 4x4 square, quadrant, and pixel within quadrant */
+        // Computing sequential numbers of 4x4 square, quadrant, and pixel within quadrant
         int square = local_idy%4;
         int quadrant = local_idx/4;
         int pixel = local_idx%4;
 
-        /* Computing projection and pixel offsets */
+        // Computing projection and pixel offsets
         int projection_index = local_idy/4;
         int2 remapped_index_local   = {(4*square + 2*(quadrant%2) + (pixel%2)),(2* (quadrant/2) + (pixel/2))};
         int2 remapped_index_global  = {(get_group_id(0)*get_local_size(0)+remapped_index_local.x),
@@ -321,23 +507,56 @@ texture_uint (
 
         float2 pixel_coord = {(remapped_index_global.x-axis_pos+x_offset+0.5f), (remapped_index_global.y-axis_pos+y_offset+0.5f)}; //bx and by
 
-        uint4 sum[4] = {0,0,0,0};
-        __local uint4 shared_mem[64][4];
-        __local uint4 reconstructed_cache[16][16];
+        uint8 sum[4] = {0,0,0,0};
+        __local uint8 shared_mem[64][4];
+        __local uint8 reconstructed_cache[16][16];
+
+        union converter tex;
+
+#ifdef DEVICE_TESLA_K20XM
+#pragma unroll 4
+#endif
+#ifdef DEVICE_TESLA_P100_PCIE_16GB
+#pragma unroll 2
+#endif
+#ifdef DEVICE_GEFORCE_GTX_TITAN_BLACK
+#pragma unroll 8
+#endif
+#ifdef DEVICE_GEFORCE_GTX_TITAN
+#pragma unroll 14
+#endif
+#ifdef DEVICE_GEFORCE_GTX_1080_TI
+#pragma unroll 10
+#endif
+#ifdef DEVICE_QUADRO_M6000
+#pragma unroll 2
+#endif
+#ifdef DEVICE_GFX1010
+#pragma unroll 4
+#endif
 
         for(int proj = projection_index; proj < n_projections; proj+=4) {
             float sine_value = sin_lut[angle_offset + proj];
             float h = pixel_coord.x * cos_lut[angle_offset + proj] - pixel_coord.y * sin_lut[angle_offset + proj] + axis_pos;
             for(int q=0; q<4; q+=1){
-               sum[q] += read_imageui(sinogram, volumeSampler_int8, (float4)(h-4*q*sine_value, proj + 0.5f,idz, 0.0));
+               tex.storage = read_imageui(sinogram, volumeSampler_int8, (float4)(h-4*q*sine_value, proj + 0.5f,idz, 0.0)).xy;
+
+               sum[q].s0 += (uint)tex.a.s0;
+               sum[q].s1 += (uint)tex.a.s1;
+               sum[q].s2 += (uint)tex.a.s2;
+               sum[q].s3 += (uint)tex.a.s3;
+               sum[q].s4 += (uint)tex.a.s4;
+               sum[q].s5 += (uint)tex.a.s5;
+               sum[q].s6 += (uint)tex.a.s6;
+               sum[q].s7 += (uint)tex.a.s7;
+
             }
         }
 
-
         int2 remapped_index = {(local_idx%4), (4*local_idy + (local_idx/4))};
 
         for(int q=0; q<4;q+=1){
-            /* Moving partial sums to shared memory */
+            // Moving partial sums to shared memory
             shared_mem[(local_sizex*remapped_index_local.y + remapped_index_local.x)][projection_index] = sum[q];
 
             barrier(CLK_LOCAL_MEM_FENCE); // syncthreads
@@ -359,7 +578,7 @@ texture_uint (
 }
 
 kernel void
-uninterleave_uint (global uint4 *reconstructed_buffer,
+uninterleave_uint (global uint8 *reconstructed_buffer,
                 global float *output,
                 const float min,
                 const float max,
@@ -371,11 +590,15 @@ uninterleave_uint (global uint4 *reconstructed_buffer,
     const int sizex = get_global_size(0);
     const int sizey = get_global_size(1);
 
-    int output_offset = idz*4;
+    int output_offset = idz*8;
     float scale = (max-min)/255.0f;
 
-    output[idx + idy*sizex + (output_offset)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].x)*scale+min)* M_PI_F / n_projections;
-    output[idx + idy*sizex + (output_offset+1)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].y)*scale+min)* M_PI_F / n_projections;
-    output[idx + idy*sizex + (output_offset+2)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].z)*scale+min)* M_PI_F / n_projections;
-    output[idx + idy*sizex + (output_offset+3)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].w)*scale+min)* M_PI_F / n_projections;
-}
-\ No newline at end of file
+    output[idx + idy*sizex + (output_offset)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].s0)*scale+min)   ;
+    output[idx + idy*sizex + (output_offset+1)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].s1)*scale+min) ;
+    output[idx + idy*sizex + (output_offset+2)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].s2)*scale+min) ;
+    output[idx + idy*sizex + (output_offset+3)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].s3)*scale+min) ;
+    output[idx + idy*sizex + (output_offset+4)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].s4)*scale+min) ;
+    output[idx + idy*sizex + (output_offset+5)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].s5)*scale+min) ;
+    output[idx + idy*sizex + (output_offset+6)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].s6)*scale+min) ;
+    output[idx + idy*sizex + (output_offset+7)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].s7)*scale+min) ;
+}