/articles/toma

To get this branch, use:
bzr branch http://darksoft.org/webbzr/articles/toma
47 by Suren A. Chilingaryan
The conceptual part of transaction paper
1
\begin{algorithm}[htb]
2
\DontPrintSemicolon
3
\caption{\label{alg:alurec} ALU-based implementation of the back-projection kernel}
4
%\begin{algorithmic}
5
\KwIn {Texture, projection constants~($\cmem{c}_*$),  dimensions~($n_*$), cache sizes~($s_*$), and other parameters~($v_*$)}
6
\KwAssume {$n_s=32$, $n_q=4$, $s_t=16$, $s_d=16$}
7
\KwShMem {$\shmem{\vfloat{d}}[s_d][\frac{3}{2} \mul n_s]$, $\shmem{\vfloat{h}}_m[s_d]$}
8
\KwOut {Reconstructed slice $\gmem{\vfloat{r}}$}
9
\Begin {
10
%$\vdata{m}_g'' \eq \vdata{m}_b \vmul \vdata{n}_t + \vdata{m}_t''$ \;
11
%$\vdata{m} \eq \vdata{m}_g'' - \vlist{v_a - \fconst{0.5}, v_a - \fconst{0.5}}$ \;
12
%$m_l \eq \vy{m_t} \cdot \vx{n_t} + \vx{m_t}$ \;
13
14
\tcc{Caching mapping for $s_t=16$ and $s_d=16$}
15
$\vlist{m_d, m_p} \eq \vdata{m_t}$ \;
16
$\vdata{f}_b \eq \vdata{m}_b - \vdata{v}_a$ \;
17
18
\tcc{Reconstruction mapping for $n_s=32$}
19
$\mathit{quadrant} \eq \vx{m_t} \idiv 4$ \;
20
$\mathit{pixel} \eq \vx{m_t} \imod 4$ \;
21
$\vx{m_t'} \eq 4 \mul (\vy{m_t} \imod 8) + 2 \mul (\mathit{quadrant} \imod 2) + (\mathit{pixel} \imod 2)$ \;
22
$\vy{m_t'} \eq 4 \mul (\vy{m_t} \idiv 8) + 2 \mul (\mathit{quadrant} \idiv 2) + (\mathit{pixel} \idiv 2)$ \;
23
$\vdata{m}'_g \eq n_s \vmul \vdata{m}_b + \vdata{m}'_t$ \;
24
$\vdata{f}'_g \eq \vdata{m}'_g - \vdata{v}_a$ \;
25
26
\tcc{Set accumulators to 0 and run projection loop}
27
$\vfloat{s}[n_q] \eq \{0\}$ \;
28
\ForToBy{p_b}{0}{n_p}{s_d}{
29
            \tcc{Compute the minimal required bin}
30
     $h_b \eq \cmem{c_a}[p_b + m_p] + \vx{f_b} \mul \cmem{c_c}[p_b + m_p] - \vy{f_b} \mul \cmem{c_s}[p_b + m_p]$\;
31
     $h_m \eq $ \KwFloor{$h_b + \cmem{c_m}[p_b + m_p]$} \;
32
   
33
        \tcc{Cache it in the shared memory}
34
     \If{$m_d == 0$}{
35
        $\shmem{h}_m[m_p] \eq \cmem{c}_a[p_b + m_p] - h_m$ \;
36
     }
37
    
38
        \tcc{Cache the data in the shared memory}
39
     \ForTo{i}{0}{3}{
40
       $h \eq i \mul s_t + m_d$ \;
41
       $\shmem{\vfloat{d}}[m_p][h] \eq $ \KwTex{$h_m + h + \fconstf{0.5}$, $p_b + m_p + \fconstf{0.5}$} \;
42
     }
43
44
    \KwSyncThreads \;
45
46
    \ForTo{p_i}{0}{s_d}{
47
        $p \eq p_b + p_i$ \;
48
        $c_s \eq \cmem{c_s}[p]$ \;
49
%        $h_b \eq \cmem{c_a}[p] + \vx{f_b} \mul \cmem{c_c}[p] - \vy{f_b} \mul \cmem{c_s}[p] + \fconst{0.5}$ \;
50
%        $h_m \eq floor(h_b + \cmem{c_m})$ \;
51
        
52
        $h \eq \shmem{h}_m[p_i] + \vx{f'_g} \mul \cmem{c_c}[p] - \vy{f'_g} \mul \cmem{c_s}[p]$ \;
53
54
        \ForTo{q}{0}{n_q}{
55
            \tcc{Compute the offset in cache}
56
            $h_i \eq $ \KwFloor{$h$} \;
57
            $h_l \eq h - h_i$ \;
58
59
            \tcc{Interpolate}
60
            $\vfloat{d_1} \eq \shmem{\vfloat{d}}[p_i][h_i]$ \;
61
            $\vfloat{d_2} \eq \shmem{\vfloat{d}}[p_i][h_i + 1] - \vfloat{d_1}$ \;
62
            
63
            $\vfloat{s}[q] \aeq \vfloat{d_1} + h_l \mul \vfloat{d_2}$ \;
64
65
            \tcc{Move to the next position}
66
            $h \seq (n_s \idiv n_q) \mul c_s$ \;
67
        }
68
    }
69
70
    \KwSyncThreads \;
71
}
72
73
\tcc{Save the results to global memory}
74
\ForTo{q}{0}{n_q}{
75
    $\gmem{\vfloat{r}}[\vy{m_g'} + 8 \mul q][\vx{m_g'}] \eq \vfloat{s}[q]$ \;
76
}
77
78
}
79
\end{algorithm}
80