/articles/toma

To get this branch, use:
bzr branch http://darksoft.org/webbzr/articles/toma
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
% Pseudo-code of the texture-based back-projection kernel.
% Uses algorithm2e-style commands together with project-defined macros
% (\cmem, \shmem, \gmem, \vfloat, \vdata, \vx, \vy, \KwTex, \KwShflXor, ...).
\begin{algorithm}[htb]
\DontPrintSemicolon
\caption{Optimized implementation of the back-projection kernel relying on the texture engine to perform interpolation}
\label{alg:texrec_pascal}
\KwIn {Texture, projection constants~($\cmem{c}_*$), dimensions~($n_*$), cache sizes~($s_*$), and other parameters~($v_*$)}
\KwShMem {$\shmem{\vfloat{s}}[64][4]$, $\shmem{\vfloat{r}}[16][16]$}
\KwOut {Reconstructed slice $\gmem{\vfloat{r}}$}
\Begin {
\tcc{Computing sequential numbers of $4 \times 4$ square, quadrant, and pixel within quadrant}
$square \eq \vy{m_t} \imod 4$ \;
$quadrant \eq \vx{m_t} \idiv 4$ \;
$pixel \eq \vx{m_t} \imod 4$ \;

\tcc{Computing projection and pixel offsets}
% m_p selects which of the 4 interleaved projections this thread starts from;
% (m_t'.x, m_t'.y) is the remapped pixel position handled by the thread.
$m_p \eq \vy{m_t} \idiv 4$ \;
$\vx{m_t'} \eq 4 \mul square + 2 \mul (quadrant \imod 2) + (pixel \imod 2)$ \;
$\vy{m_t'} \eq 2 \mul (quadrant \idiv 2) + (pixel \idiv 2)$ \;

\tcc{Computing pixel coordinates}
$\vdata{m}'_g \eq \vdata{m}_b \vmul \vdata{n}_t + \vdata{m}'_t$ \;
$\vdata{f}'_g \eq \vdata{m}'_g - \vdata{v}_a$ \;

\tcc{Computing partial sums}
% Each thread accumulates 4 partial sums (one per q) over every 4th projection.
$\vfloat{s}[4] \eq \{0\}$ \;
\ForToBy{p}{m_p}{n_p}{4}{
    $c_s \eq \vy{\cmem{c_s}[p]}$ \;
    $h \eq \cmem{c_a}[p] + \vx{f'_g} \mul \cmem{c_c}[p] - \vy{f'_g} \mul \cmem{c_s}[p] + \fconstf{0.5}$ \;

    \ForTo{q}{0}{4}{
        $\vfloat{s}[q] \aeq $ \KwTex{$h - 4 \mul q \mul c_s$, $p + \fconstf{0.5}$} \;
    }
}

\tcc{Reduction}
% m_t'' re-indexes the threads for the reduction: x selects one of the 4
% partial sums of a pixel, y selects the pixel.
$\vdata{m}_t'' \eq \vlist{\vx{m_t} \imod 4, 4 \mul \vy{m_t} + \vx{m_t} \idiv 4}$ \;

\ForTo{q}{0}{4}{
    \tcc{Moving partial sums to shared memory}
    % Row: linear pixel index inside the block; column: projection sub-index m_p.
    $\shmem{\vfloat{s}}[\vx{n_t} \mul \vy{m_t'} + \vx{m_t'}][m_p] \eq \vfloat{s}[q]$ \;
    \KwSyncThreads \;

    \tcc{Performing reduction}
    % The read-back is indexed by m_t'' (not m_t'): its x component is always in
    % 0..3 and thus fits the [..][4] layout of s, and it matches the width-4
    % shuffle and the m_t'' based guard/write below.
    % NOTE(review): the [64] row bound is respected assuming a 16x16 thread
    % block -- confirm against the kernel launch configuration.
    $\vfloat{r} \eq \shmem{\vfloat{s}}[\vy{m_t''}][\vx{m_t''}]$ \;
    \CFor{i \eq 2}{i \geq 1}{i \deq 2}{
        $\vfloat{r} \aeq$ \KwShflXor{$\vfloat{r}$, $i$, $4$} \;
    }

    \tcc{Grouping results in shared memory to coalesce global memory writes}
    \If{$\vx{m_t''} \ifeq 0$} {
        $\shmem{\vfloat{r}}[4 \mul q + \vy{m_t''} \idiv 16][\vy{m_t''} \imod 16] \eq \vfloat{r}$ \;
    }
    \KwSyncThreads \;
}

$\gmem{\vfloat{r}}[\vy{m_g}][\vx{m_g}] \eq \shmem{\vfloat{r}}[\vy{m_t}][\vx{m_t}]$ \;
}
\end{algorithm}