/perf/fdk

To get this branch, use:
bzr branch http://darksoft.org/webbzr/perf/fdk
1 by Suren A. Chilingaryan
Initial
1
#include <stdio.h>
2
#include <stdlib.h>
3
2 by Suren A. Chilingaryan
Intel compiler
4
#ifndef __USE_BSD
5
# define __USE_BSD
6
#endif
1 by Suren A. Chilingaryan
Initial
7
#include <math.h>
8
9
#include <pthread.h>
10
#include <stdint.h>
11
#include <string.h>
12
13
#include <limits.h>
14
15
#include <ippi.h>
16
#include <ipps.h>
17
#include <ippcore.h>
18
19
#include "process.h"
20
21
/* global variables */
22
int counter = 0;
23
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
24
25
26
void statusinfo(IppStatus st) 
27
{
28
    if ((int)st != 0)
29
    {
30
        printf("%d : %s\n", st, ippGetStatusString(st));
31
    }
32
}  
33
34
35
36
37
/* canonical multiplication of square matrices */
38
int mult(Ipp32f *a, Ipp32f *b, Ipp32f *c) 
39
{
40
    int i, j, k;
41
	
42
    for (k = 0; k < 4; k++) 
43
    {
44
        for (i = 0; i < 4; i++) 
45
        {
46
            for (j = 0; j < 4; j++) 
47
            {
48
                c[i * 4 + j] += a[i * 4 + k] * b[k * 4 + j];
49
            }
50
        }
51
    }
52
53
    return 0;
54
} /* mult */
55
56
57
void *process (void *args) 
58
{
59
    int err;
2 by Suren A. Chilingaryan
Intel compiler
60
    int i, j;
1 by Suren A. Chilingaryan
Initial
61
62
    struct thread_info *t_info = (struct thread_info*) args;
63
64
    Ipp32f *px_map = NULL, *py_map = NULL;
65
    Ipp32f *tmp_2 = NULL;
66
    Ipp32f *current_slice = NULL;
67
2 by Suren A. Chilingaryan
Intel compiler
68
    Ipp32f angle, z;
1 by Suren A. Chilingaryan
Initial
69
70
    IppiSize im_size = {t_info->n_elements, t_info->n_elements};
71
    IppiRect im_roi_size = {0, 0, t_info->n_elements, t_info->n_elements};
72
73
    int imStepBytes, ippStepBytes;
74
75
    int i_angle, slice_number;
76
77
    /* step in bytes */
78
    imStepBytes = t_info->n_elements * sizeof(float);
79
80
    /* allocate temporal array */
81
    tmp_2 = ippiMalloc_32f_C1(t_info->n_elements, t_info->n_elements, &ippStepBytes);
82
    if (tmp_2 == NULL) 
83
    {
84
        printf("Cannot allocate tmp_2");
85
	exit(-1);
86
    }
87
88
    current_slice = ippiMalloc_32f_C1(t_info->n_elements, t_info->n_elements, &ippStepBytes);
89
    if (current_slice == NULL) 
90
    {
91
        printf("Cannot allocate current_slice");
92
	exit(-1);
93
    }
94
    
95
    /* allocate interpolation maps */
96
    px_map = ippiMalloc_32f_C1(t_info->n_elements, t_info->n_elements, &ippStepBytes);
97
    if (px_map == NULL) 
98
    {
99
        printf("Cannot allocate px_map");
100
	exit(-1);
101
    }
102
    
103
    py_map = ippiMalloc_32f_C1(t_info->n_elements, t_info->n_elements, &ippStepBytes);
104
    if (py_map == NULL) 
105
    {
106
        printf("Cannot allocate py_map");
107
	exit(-1);
108
    }
109
        
110
    
111
    while (1)
112
    {
113
        /* counter increment */
114
retry: 
115
        if (pthread_mutex_lock(&mutex)) {
116
	    printf("Retrying\n");
117
	    goto retry;
118
	}
119
2 by Suren A. Chilingaryan
Intel compiler
120
        slice_number = counter;
121
122
        if (slice_number >= t_info->n_elements) 
1 by Suren A. Chilingaryan
Initial
123
        {
124
            pthread_mutex_unlock(&mutex);
125
            break;
126
        }
2 by Suren A. Chilingaryan
Intel compiler
127
128
        counter += t_info->slices_per_iter;
129
1 by Suren A. Chilingaryan
Initial
130
        pthread_mutex_unlock(&mutex);
2 by Suren A. Chilingaryan
Intel compiler
131
1 by Suren A. Chilingaryan
Initial
132
        /* z coordinate of current slice */
133
        z = t_info->slice_coord_z[slice_number];
2 by Suren A. Chilingaryan
Intel compiler
134
1 by Suren A. Chilingaryan
Initial
135
        /* set current slice to zero */ 
136
        statusinfo(ippiSet_32f_C1R((Ipp32f)0, current_slice, ippStepBytes, im_size));
137
        
138
        for (i_angle = 0; i_angle < t_info->n_proj; i_angle++)
139
        {    
140
            /* set temporal variable to zero */
141
            statusinfo(ippiSet_32f_C1R((Ipp32f)0, tmp_2, ippStepBytes, im_size));
142
            
143
            /* current rotation angle */
144
            angle = 2*M_PI/t_info->n_proj*i_angle;
145
            
146
            /* set some matrices to calculate forward projection matrix operator in homogeneous coordinates*/
147
            Ipp32f P1[4 * 4] = {(t_info->d + t_info->detector_offset_z) / t_info->pixel_size, 0, (t_info->detector_size / 2 - t_info->detector_offset_u) / t_info->pixel_size, 0,
148
                0, -(t_info->d + t_info->detector_offset_z) / t_info->pixel_size, (t_info->detector_size / 2 - t_info->detector_offset_v) / t_info->pixel_size, 0,
149
                0, 0, 1, 0,
150
                0, 0, 0, 1};
151
        
152
            Ipp32f P2[4 * 4] = {t_info->u_detector[0], t_info->u_detector[1], t_info->u_detector[2], 0,
153
                t_info->v_detector[0], t_info->v_detector[1], t_info->v_detector[2], 0,
154
                t_info->n_detector[0], t_info->n_detector[1], t_info->n_detector[2], 0,
155
                0, 0, 0, 1};
156
        
157
            Ipp32f P3[4 * 4] = {1, 0, 0, -t_info->source[0],
158
                0, 1, 0, -t_info->source[1],
159
                0, 0, 1, -t_info->source[2],
160
                0, 0, 0, 1};
161
        
162
            Ipp32f P4[4 * 4] = {1, 0, 0, 0,
163
                0, 1, 0, 0,
164
                0, 0, 1, t_info->cor + t_info->cor_offset,
165
                0, 0, 0, 1};
166
        
167
            Ipp32f P5[4 * 4] = {(Ipp32f)(cosf(angle)), 0, (Ipp32f)(-sinf(angle)), 0,
168
                0, 1, 0, 0,
169
                (Ipp32f)(sinf(angle)), 0, (Ipp32f)(cosf(angle)), 0,
170
                0, 0, 0, 1};
171
            
172
            /* set to zero temporal arrays */
173
            Ipp32f P_tmp_1[4 * 4] = {0};
174
            Ipp32f P_tmp_2[4 * 4] = {0};
175
            Ipp32f P_tmp_3[4 * 4] = {0};
176
            Ipp32f P[4 * 4] = {0};
177
            
178
            /* forward projection operator */
179
            mult(P1, P2, P_tmp_1);
180
            mult(P_tmp_1, P3, P_tmp_2);
181
            mult(P_tmp_2, P4, P_tmp_3);
182
            mult(P_tmp_3, P5, P);
183
            
2 by Suren A. Chilingaryan
Intel compiler
184
            int n_elements = t_info->n_elements;
185
            for (i = 0; i < n_elements; i++)
1 by Suren A. Chilingaryan
Initial
186
            {
2 by Suren A. Chilingaryan
Intel compiler
187
#pragma simd 
188
                for (j = 0; j < n_elements; j++)
1 by Suren A. Chilingaryan
Initial
189
                {
2 by Suren A. Chilingaryan
Intel compiler
190
                    int idx = i * n_elements + j;
1 by Suren A. Chilingaryan
Initial
191
                    
2 by Suren A. Chilingaryan
Intel compiler
192
                    float w_x = P[0] * t_info->slice_x[idx] + P[1] * t_info->slice_y[idx] + P[2] * z + P[3];
193
                    float w_y = P[4] * t_info->slice_x[idx] + P[5] * t_info->slice_y[idx] + P[6] * z + P[7];
194
                    float w_z = P[8] * t_info->slice_x[idx] + P[9] * t_info->slice_y[idx] + P[10] * z + P[11];
1 by Suren A. Chilingaryan
Initial
195
                    
196
                    px_map[idx] =  w_x / w_z;
197
                    py_map[idx] =  w_y / w_z;
198
                }
199
            }
200
            
201
            /* interpolate */
202
            statusinfo(ippiRemap_32f_C1R(t_info->projections + (ippStepBytes / sizeof(float) * (long)t_info->n_elements * i_angle), im_size, ippStepBytes, im_roi_size, px_map, ippStepBytes, py_map, ippStepBytes, tmp_2, ippStepBytes, im_size, IPPI_INTER_LINEAR));
203
204
            /* accumulate */
205
            /* in-place addition of two matrices */
206
            statusinfo(ippiAdd_32f_C1IR(tmp_2, ippStepBytes, current_slice, ippStepBytes, im_size));
207
        }
208
        
209
        /* write current slice to final array */
210
        //memcpy(t_info->out_volume + (t_info->n_elements * t_info->n_elements * slice_number), current_slice, t_info->n_elements * t_info->n_elements * sizeof(float));
211
        statusinfo(ippiCopy_32f_C1R(current_slice, ippStepBytes, t_info->out_volume + ((long)t_info->n_elements * t_info->n_elements * slice_number), imStepBytes, im_size));
212
    }
213
    
214
    /* free memory */
215
    ippiFree(px_map);
216
    ippiFree(py_map);
217
    ippiFree(tmp_2);
218
    ippiFree(current_slice); 
219
} /* process */
220