/*
 * /perf/fdk: contents of process.c at revision 2
 *
 * To get this branch, use:
 *
 *     bzr branch http://darksoft.org/webbzr/perf/fdk
 */

#include <stdio.h>
#include <stdlib.h>

#ifndef __USE_BSD
# define __USE_BSD
#endif
#include <math.h>

#include <pthread.h>
#include <stdint.h>
#include <string.h>

#include <limits.h>

#include <ippi.h>
#include <ipps.h>
#include <ippcore.h>

#include "process.h"

/* global variables */
/* Index of the next slice to be claimed; process() reads and advances it
 * only while holding `mutex`. */
int counter = 0;
/* Serializes access to `counter` between threads running process(). */
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;


/*
 * Print an IPP status code together with its description on stdout.
 * A status equal to zero is silently ignored.
 */
void statusinfo(IppStatus st)
{
    if ((int)st == 0)
    {
        return;
    }

    printf("%d : %s\n", st, ippGetStatusString(st));
}




/*
 * Canonical product of two 4x4 row-major matrices.
 *
 * The product a * b is ADDED to c, so the caller must clear c beforehand
 * if a plain product is wanted.  Always returns 0.
 */
int mult(Ipp32f *a, Ipp32f *b, Ipp32f *c)
{
    enum { DIM = 4 };

    for (int inner = 0; inner < DIM; ++inner)
    {
        for (int row = 0; row < DIM; ++row)
        {
            Ipp32f *out_row = c + row * DIM;

            for (int col = 0; col < DIM; ++col)
            {
                out_row[col] += a[row * DIM + inner] * b[inner * DIM + col];
            }
        }
    }

    return 0;
} /* mult */


void *process (void *args) 
{
    int err;
    int i, j;

    struct thread_info *t_info = (struct thread_info*) args;

    Ipp32f *px_map = NULL, *py_map = NULL;
    Ipp32f *tmp_2 = NULL;
    Ipp32f *current_slice = NULL;

    Ipp32f angle, z;

    IppiSize im_size = {t_info->n_elements, t_info->n_elements};
    IppiRect im_roi_size = {0, 0, t_info->n_elements, t_info->n_elements};

    int imStepBytes, ippStepBytes;

    int i_angle, slice_number;

    /* step in bytes */
    imStepBytes = t_info->n_elements * sizeof(float);

    /* allocate temporal array */
    tmp_2 = ippiMalloc_32f_C1(t_info->n_elements, t_info->n_elements, &ippStepBytes);
    if (tmp_2 == NULL) 
    {
        printf("Cannot allocate tmp_2");
	exit(-1);
    }

    current_slice = ippiMalloc_32f_C1(t_info->n_elements, t_info->n_elements, &ippStepBytes);
    if (current_slice == NULL) 
    {
        printf("Cannot allocate current_slice");
	exit(-1);
    }
    
    /* allocate interpolation maps */
    px_map = ippiMalloc_32f_C1(t_info->n_elements, t_info->n_elements, &ippStepBytes);
    if (px_map == NULL) 
    {
        printf("Cannot allocate px_map");
	exit(-1);
    }
    
    py_map = ippiMalloc_32f_C1(t_info->n_elements, t_info->n_elements, &ippStepBytes);
    if (py_map == NULL) 
    {
        printf("Cannot allocate py_map");
	exit(-1);
    }
        
    
    while (1)
    {
        /* counter increment */
retry: 
        if (pthread_mutex_lock(&mutex)) {
	    printf("Retrying\n");
	    goto retry;
	}

        slice_number = counter;

        if (slice_number >= t_info->n_elements) 
        {
            pthread_mutex_unlock(&mutex);
            break;
        }

        counter += t_info->slices_per_iter;

        pthread_mutex_unlock(&mutex);

        /* z coordinate of current slice */
        z = t_info->slice_coord_z[slice_number];

        /* set current slice to zero */ 
        statusinfo(ippiSet_32f_C1R((Ipp32f)0, current_slice, ippStepBytes, im_size));
        
        for (i_angle = 0; i_angle < t_info->n_proj; i_angle++)
        {    
            /* set temporal variable to zero */
            statusinfo(ippiSet_32f_C1R((Ipp32f)0, tmp_2, ippStepBytes, im_size));
            
            /* current rotation angle */
            angle = 2*M_PI/t_info->n_proj*i_angle;
            
            /* set some matrices to calculate forward projection matrix operator in homogeneous coordinates*/
            Ipp32f P1[4 * 4] = {(t_info->d + t_info->detector_offset_z) / t_info->pixel_size, 0, (t_info->detector_size / 2 - t_info->detector_offset_u) / t_info->pixel_size, 0,
                0, -(t_info->d + t_info->detector_offset_z) / t_info->pixel_size, (t_info->detector_size / 2 - t_info->detector_offset_v) / t_info->pixel_size, 0,
                0, 0, 1, 0,
                0, 0, 0, 1};
        
            Ipp32f P2[4 * 4] = {t_info->u_detector[0], t_info->u_detector[1], t_info->u_detector[2], 0,
                t_info->v_detector[0], t_info->v_detector[1], t_info->v_detector[2], 0,
                t_info->n_detector[0], t_info->n_detector[1], t_info->n_detector[2], 0,
                0, 0, 0, 1};
        
            Ipp32f P3[4 * 4] = {1, 0, 0, -t_info->source[0],
                0, 1, 0, -t_info->source[1],
                0, 0, 1, -t_info->source[2],
                0, 0, 0, 1};
        
            Ipp32f P4[4 * 4] = {1, 0, 0, 0,
                0, 1, 0, 0,
                0, 0, 1, t_info->cor + t_info->cor_offset,
                0, 0, 0, 1};
        
            Ipp32f P5[4 * 4] = {(Ipp32f)(cosf(angle)), 0, (Ipp32f)(-sinf(angle)), 0,
                0, 1, 0, 0,
                (Ipp32f)(sinf(angle)), 0, (Ipp32f)(cosf(angle)), 0,
                0, 0, 0, 1};
            
            /* set to zero temporal arrays */
            Ipp32f P_tmp_1[4 * 4] = {0};
            Ipp32f P_tmp_2[4 * 4] = {0};
            Ipp32f P_tmp_3[4 * 4] = {0};
            Ipp32f P[4 * 4] = {0};
            
            /* forward projection operator */
            mult(P1, P2, P_tmp_1);
            mult(P_tmp_1, P3, P_tmp_2);
            mult(P_tmp_2, P4, P_tmp_3);
            mult(P_tmp_3, P5, P);
            
            int n_elements = t_info->n_elements;
            for (i = 0; i < n_elements; i++)
            {
#pragma simd 
                for (j = 0; j < n_elements; j++)
                {
                    int idx = i * n_elements + j;
                    
                    float w_x = P[0] * t_info->slice_x[idx] + P[1] * t_info->slice_y[idx] + P[2] * z + P[3];
                    float w_y = P[4] * t_info->slice_x[idx] + P[5] * t_info->slice_y[idx] + P[6] * z + P[7];
                    float w_z = P[8] * t_info->slice_x[idx] + P[9] * t_info->slice_y[idx] + P[10] * z + P[11];
                    
                    px_map[idx] =  w_x / w_z;
                    py_map[idx] =  w_y / w_z;
                }
            }
            
            /* interpolate */
            statusinfo(ippiRemap_32f_C1R(t_info->projections + (ippStepBytes / sizeof(float) * (long)t_info->n_elements * i_angle), im_size, ippStepBytes, im_roi_size, px_map, ippStepBytes, py_map, ippStepBytes, tmp_2, ippStepBytes, im_size, IPPI_INTER_LINEAR));

            /* accumulate */
            /* in-place addition of two matrices */
            statusinfo(ippiAdd_32f_C1IR(tmp_2, ippStepBytes, current_slice, ippStepBytes, im_size));
        }
        
        /* write current slice to final array */
        //memcpy(t_info->out_volume + (t_info->n_elements * t_info->n_elements * slice_number), current_slice, t_info->n_elements * t_info->n_elements * sizeof(float));
        statusinfo(ippiCopy_32f_C1R(current_slice, ippStepBytes, t_info->out_volume + ((long)t_info->n_elements * t_info->n_elements * slice_number), imStepBytes, im_size));
    }
    
    /* free memory */
    ippiFree(px_map);
    ippiFree(py_map);
    ippiFree(tmp_2);
    ippiFree(current_slice); 
} /* process */