bzr branch
http://darksoft.org/webbzr/articles/toma
47
by Suren A. Chilingaryan
The conceptual part of transaction paper |
1 |
\begin{algorithm}[htb] |
2 |
\DontPrintSemicolon
|
|
3 |
\caption{\label{alg:texrec_pascal} Optimized implementation of the back-projection kernel relying on the texture engine to perform interpolation} |
|
4 |
%\begin{algorithmic}
|
|
5 |
\KwIn {Texture, projection constants~($\cmem{c}_*$), dimensions~($n_*$), cache sizes~($s_*$), and other parameters~($v_*$)} |
|
6 |
\KwShMem {$\shmem{\vfloat{s}}[64][4]$, $\shmem{\vfloat{r}}[16][16]$} |
|
7 |
\KwOut {Reconstructed slice $\gmem{\vfloat{r}}$} |
|
8 |
\Begin { |
|
53
by Suren A. Chilingaryan
transact: partially apply proofs by Evelina |
9 |
\tcc{Computing sequential numbers of 4x4 square, quadrant, and pixel within quadrant} |
47
by Suren A. Chilingaryan
The conceptual part of transaction paper |
10 |
$square \eq \vy{m_t} \imod 4$ \; |
11 |
$quadrant \eq \vx{m_t} \idiv 4$ \; |
|
12 |
$pixel \eq \vx{m_t} \imod 4$ \; |
|
13 |
||
53
by Suren A. Chilingaryan
transact: partially apply proofs by Evelina |
14 |
\tcc{Computing projection and pixel offsets} |
47
by Suren A. Chilingaryan
The conceptual part of transaction paper |
15 |
$m_p \eq \vy{m_t} \idiv 4$ \; |
16 |
$\vx{m_t'} \eq 4 \mul square + 2 \mul (quadrant \imod 2) + (pixel \imod 2) $ \; |
|
17 |
$\vy{m_t'} \eq 2 \mul (quadrant \idiv 2) + (pixel \idiv 2) $ \; |
|
18 |
||
53
by Suren A. Chilingaryan
transact: partially apply proofs by Evelina |
19 |
\tcc{Computing pixel coordinates} |
47
by Suren A. Chilingaryan
The conceptual part of transaction paper |
20 |
$\vdata{m}'_g \eq \vdata{m}_b \vmul \vdata{n}_t + \vdata{m}'_t$ \; |
21 |
$\vdata{f}'_g \eq \vdata{m}'_g - \vdata{v}_a$ \; |
|
22 |
||
53
by Suren A. Chilingaryan
transact: partially apply proofs by Evelina |
23 |
\tcc{Computing partial sums} |
47
by Suren A. Chilingaryan
The conceptual part of transaction paper |
24 |
$\vfloat{s}[4] \eq \{0\}$ \; |
25 |
\ForToBy{p}{m_p}{n_p}{4}{ |
|
26 |
$c_s \eq \vy{\cmem{c_s}[p]}$ \; |
|
27 |
$h \eq \cmem{c_a}[p] + \vx{f'_g} \mul \cmem{c_c}[p] - \vy{f'_g} \mul \cmem{c_s}[p] + \fconstf{0.5}$ \; |
|
28 |
||
29 |
\ForTo{q}{0}{4}{ |
|
30 |
$\vfloat{s}[q] \aeq $ \KwTex{$h - 4 \mul q \mul c_s$, $p + \fconstf{0.5}$} \; |
|
31 |
}
|
|
32 |
}
|
|
33 |
||
34 |
\tcc{Reduction} |
|
35 |
$\vdata{m}_t'' \eq \vlist{\vx{m_t} \imod 4, 4 \mul \vy{m_t} + \vx{m_t} \idiv 4}$ \; |
|
36 |
||
37 |
\ForTo{q}{0}{4}{ |
|
38 |
\tcc{Moving partial sums to shared memory} |
|
39 |
$\shmem{\vfloat{s}}[\vx{n_t} \mul \vy{m_t'} + \vx{m_t'}][m_p] \eq \vfloat{s}[q]$ \; |
|
40 |
\KwSyncThreads \; |
|
41 |
||
42 |
\tcc{Performing reduction} |
|
43 |
$\vfloat{r} \eq \shmem{\vfloat{s}}[\vy{m_t''}][\vx{m_t''}]$ \; |
|
44 |
\CFor{i \eq 2}{i \geq 1}{i \deq 2}{ |
|
45 |
$\vfloat{r} \aeq$ \KwShflXor{$\vfloat{r}$, $i$, $4$} \; |
|
46 |
}
|
|
47 |
||
53
by Suren A. Chilingaryan
transact: partially apply proofs by Evelina |
48 |
\tcc{Grouping results in shared memory to coalesce global memory writes} |
47
by Suren A. Chilingaryan
The conceptual part of transaction paper |
49 |
\If{$\vx{m_t''} \ifeq 0$} { |
50 |
$\shmem{\vfloat{r}}[4 \mul q + \vy{m_t''} \idiv 16][\vy{m_t''} \imod 16] \eq\vfloat{r}$ \; |
|
51 |
}
|
|
52 |
\KwSyncThreads \; |
|
53 |
}
|
|
54 |
||
55 |
$\gmem{\vfloat{r}}[\vy{m_g}][\vx{m_g}] \eq \shmem{\vfloat{r}}[\vy{m_t}][\vx{m_t}]$ \; |
|
56 |
}
|
|
57 |
\end{algorithm} |