bzr branch
http://darksoft.org/webbzr/articles/toma
47
by Suren A. Chilingaryan
The conceptual part of transaction paper |
1 |
\begin{algorithm}[htb] |
2 |
\DontPrintSemicolon
|
|
3 |
\caption{\label{alg:alurec} ALU-based implementation of the back-projection kernel} |
|
4 |
%\begin{algorithmic}
|
|
5 |
\KwIn {Texture, projection constants~($\cmem{c}_*$), dimensions~($n_*$), cache sizes~($s_*$), and other parameters~($v_*$)} |
|
6 |
\KwAssume {$n_s=32$, $n_q=4$, $s_t=16$, $s_d=16$} |
|
7 |
\KwShMem {$\shmem{\vfloat{d}}[s_d][\frac{3}{2} \mul n_s]$, $\shmem{\vfloat{h}}_m[s_d]$} |
|
8 |
\KwOut {Reconstructed slice $\gmem{\vfloat{r}}$} |
|
9 |
\Begin { |
|
10 |
%$\vdata{m}_g'' \eq \vdata{m}_b \vmul \vdata{n}_t + \vdata{m}_t''$ \;
|
|
11 |
%$\vdata{m} \eq \vdata{m}_g'' - \vlist{v_a - \fconst{0.5}, v_a - \fconst{0.5}}$ \;
|
|
12 |
%$m_l \eq \vy{m_t} \cdot \vx{n_t} + \vx{m_t}$ \;
|
|
13 |
||
14 |
\tcc{Caching mapping for $s_t=16$ and $s_d=16$} |
|
15 |
$\vlist{m_d, m_p} \eq \vdata{m_t}$ \; |
|
16 |
$\vdata{f}_b \eq \vdata{m}_b - \vdata{v}_a$ \; |
|
17 |
||
18 |
\tcc{Reconstruction mapping for $n_s=32$} |
|
19 |
$quadrant \eq \vx{m_t} \idiv 4$ \; |
|
20 |
$pixel \eq \vx{m_t} \imod 4$ \; |
|
21 |
$\vx{m_t'} \eq 4 \mul (\vy{m_t} \imod 8) + 2 \mul (quadrant \imod 2) + (pixel \imod 2) $ \; |
|
22 |
$\vy{m_t'} \eq 4 \mul (\vy{m_t} \idiv 8) + 2 \mul (quadrant \idiv 2) + (pixel \idiv 2) $ \; |
|
23 |
$\vdata{m}'_g \eq n_s \vmul \vdata{m}_b + \vdata{m}'_t$ \; |
|
24 |
$\vdata{f}'_g \eq \vdata{m}'_g - \vdata{v}_a$ \; |
|
25 |
||
26 |
\tcc{Set accumulators to 0 and run projection loop} |
|
27 |
$\vfloat{s}[n_q] \eq \{0\}$ \; |
|
28 |
\ForToBy{p_b}{0}{n_p}{s_d}{ |
|
29 |
\tcc{Compute the minimal required bin} |
|
30 |
$h_b \eq \cmem{c_a}[p_b + m_p] + \vx{f_b} \mul \cmem{c_c}[p_b + m_p] - \vy{f_b} \mul \cmem{c_s}[p_b + m_p]$\; |
|
31 |
$h_m \eq $ \KwFloor{$h_b + \cmem{c_m}[p_b + m_p]$} \; |
|
32 |
||
33 |
\tcc{Cache it in the shared memory} |
|
34 |
\If{$m_d == 0$}{ |
|
35 |
$\shmem{h}_m[m_p] \eq \cmem{c}_a[p_b + m_p] - h_m$ \; |
|
36 |
}
|
|
37 |
||
38 |
\tcc{Cache the data in the shared memory} |
|
39 |
\ForTo{i}{0}{3}{ |
|
40 |
$h \eq i \mul s_t + m_d$ \; |
|
41 |
$\shmem{\vfloat{d}}[m_p][h] \eq $ \KwTex{$h_m + h + \fconstf{0.5}$, $p_b + m_p + \fconstf{0.5}$} \; |
|
42 |
}
|
|
43 |
||
44 |
\KwSyncThreads \; |
|
45 |
||
46 |
\ForTo{p_i}{0}{s_d}{ |
|
47 |
$p \eq p_b + p_i$ \; |
|
48 |
$c_s \eq \cmem{c_s}[p]$ \; |
|
49 |
% $h_b \eq \cmem{c_a}[p] + \vx{f_b} \mul \cmem{c_c}[p] - \vy{f_b} \mul \cmem{c_s}[p] + \fconst{0.5}$ \;
|
|
50 |
% $h_m \eq floor(h_b + \cmem{c_m})$ \;
|
|
51 |
||
52 |
$h \eq \shmem{h}_m[p_i] + \vx{f'_g} \mul \cmem{c_c}[p] - \vy{f'_g} \mul \cmem{c_s}[p]$ \; |
|
53 |
||
54 |
\ForTo{q}{0}{n_q}{ |
|
55 |
\tcc{Compute the offset in cache} |
|
56 |
$h_i \eq $ \KwFloor{$h$} \; |
|
57 |
$h_l \eq h - h_i$ \; |
|
58 |
||
59 |
\tcc{Interpolate} |
|
60 |
$\vfloat{d_1} \eq \shmem{\vfloat{d}}[p_i][h_i]$ \; |
|
61 |
$\vfloat{d_2} \eq \shmem{\vfloat{d}}[p_i][h_i + 1] - \vfloat{d_1}$ \; |
|
62 |
||
63 |
$\vfloat{s}[q] \aeq \vfloat{d_1} + h_l \mul \vfloat{d_2}$ \; |
|
64 |
||
65 |
\tcc{Move to the next position} |
|
66 |
$h \seq (n_s \idiv n_q) \mul c_s$ \; |
|
67 |
}
|
|
68 |
}
|
|
69 |
||
70 |
\KwSyncThreads \; |
|
71 |
}
|
|
72 |
||
73 |
\tcc{Save the results to global memory} |
|
74 |
\ForTo{q}{0}{n_q}{ |
|
75 |
$\gmem{\vfloat{r}}[\vy{m_g'} + 8 \mul q][\vx{m_g'}] \eq \vfloat{s}[q]$ \; |
|
76 |
}
|
|
77 |
||
78 |
}
|
|
79 |
\end{algorithm} |
|
80 |