/articles/toma : contents of alg_52_alurec.tex at revision 100

: (revision 100)

To get this branch, use:

bzr branch
http://darksoft.org/webbzr/articles/toma

% Pseudo-code listing (algorithm2e float) of the ALU-based back-projection kernel.
% The notation macros used below (\cmem, \shmem, \gmem, \vfloat, \vdata, \vlist,
% \vx, \vy, \eq, \aeq, \seq, \mul, \idiv, \imod, \fconst) and the extra keywords
% (\KwAssume, \KwShMem, \KwFloor, \KwTex, \KwSync, \ForTo, \ForToBy) are not
% defined in this file; they must be provided by the including document.
\begin{algorithm}[htb]
\DontPrintSemicolon
\caption{\label{alg:alurec} ALU-based implementation of the back-projection kernel}
%\begin{algorithmic}
\KwIn {Texture and the projection constants $\cmem{c}_*$. Dimensions ($n_*$), cache sizes ($s_*$), and parameters ($v_*$) as specified in \tablename~\ref{table:alg_prms}. The used variables are described in \tablename~\ref{table:alg_idxs}~and~\ref{table:alg_vars}.}
\KwAssume {$n_s=32$, $n_q=4$, $s_t=16$, $s_i = 3$}
% With the assumptions above, 3/2 * n_s = 48 = s_i * s_t, i.e. one cache row
% holds exactly the values written by the caching loop below (assuming m_d < s_t).
\KwShMem {$\shmem{\vfloat{d}}[s_d][\frac{3}{2} \mul n_s]$, $\shmem{\vfloat{h}}_m[s_d]$}
\KwOut {Reconstructed slice $\gmem{\vfloat{r}}$}
\Begin {
%$\vdata{m}_g'' \eq \vdata{m}_b \vmul \vdata{n}_t + \vdata{m}_t''$ \;
%$\vdata{m} \eq \vdata{m}_g'' - \vlist{v_a - \fconst{0.5}, v_a - \fconst{0.5}}$ \;
%$m_l \eq \vy{m_t} \cdot \vx{n_t} + \vx{m_t}$ \;

% m_d / m_p: the two components of m_t, used below as the column offset and
% the cache row index while filling the shared-memory cache.
% m_t' folds every second row of m_t into the x direction; m_g' offsets it by
% the block position m_b scaled by n_s.
\tcc{Simplified mapping}
$\vlist{m_d, m_p} \eq \vdata{m_t}$ \;
$m_t' \eq \vlist{ n_t \mul (\vy{m_t} \imod 2) + \vx{m_t}, \vy{m_t} \idiv 2 }$ \;
$m_g' \eq \vlist{n_s \mul \vx{m_b} + \vx{m_t'},  n_s \mul \vy{m_b} + \vy{m_t'} }$ \;

% One accumulator per output value of this thread (n_q of them).
\tcc{Set accumulators to 0 and run projection loop}
$\vfloat{s}[n_q] \eq \{0\}$ \;
% Projections are processed in batches of s_d: first cache, then accumulate.
\ForToBy{p_b}{0}{n_p}{s_d}{
    % Only threads with m_p < s_d take part in filling the cache.
    \If{$m_p < s_d$}{
        \tcc{Compute the minimal required bin}
        $p \eq p_b + m_p$ \;
        $h_b \eq \cmem{c_a}[p] + \vx{f_b} \mul \cmem{c_c}[p] - \vy{f_b} \mul \cmem{c_s}[p]$ \;
        $h_m \eq $ \KwFloor{$h_b + \cmem{c_m}[p]$} \;
   
        % A single thread per cache row (m_d = 0) stores c_a[p] - h_m, so that
        % the accumulation loop obtains positions relative to the cached row.
        \tcc{Cache it in the shared memory}
        \If{$m_d == 0$}{
            $\shmem{h}_m[m_p] \eq \cmem{c_a}[p] - h_m$ \;
        }
    
        % Each participating thread fetches s_i values spaced s_t apart.
        \tcc{Cache the data in the shared memory}
        \ForTo{i}{0}{s_i}{
            $h \eq i \mul s_t + m_d$ \;
            $\shmem{\vfloat{d}}[m_p][h] \eq $ \KwTex{$h_m + h + \fconst{0.5}$, $p + \fconst{0.5}$} \;
        }
    }

    % The cache must be completely written before any thread reads it.
    \KwSync \;

    \ForTo{p_i}{0}{s_d}{
        $p \eq p_b + p_i$ \;
        $c_s \eq \cmem{c_s}[p]$ \;
%        $h_b \eq \cmem{c_a}[p] + \vx{f_b} \mul \cmem{c_c}[p] - \vy{f_b} \mul \cmem{c_s}[p] + \fconst{0.5}$ \;
%        $h_m \eq floor(h_b + \cmem{c_m})$ \;
        
        % Position inside cache row p_i for the first output value of this thread.
        $h \eq \shmem{h}_m[p_i] + \vx{f'_g} \mul \cmem{c_c}[p] - \vy{f'_g} \mul \cmem{c_s}[p]$ \;

        \ForTo{q}{0}{n_q}{
            \tcc{Compute the offset in cache}
            $h_i \eq $ \KwFloor{$h$} \;
            $h_l \eq h - h_i$ \;

            % Linear interpolation between two neighbouring cached values:
            % d_1 + h_l * (d[h_i + 1] - d_1).
            \tcc{Interpolate}
            $\vfloat{d_1} \eq \shmem{\vfloat{d}}[p_i][h_i]$ \;
            $\vfloat{d_2} \eq \shmem{\vfloat{d}}[p_i][h_i + 1] - \vfloat{d_1}$ \;
            
            $\vfloat{s}[q] \aeq \vfloat{d_1} + h_l \mul \vfloat{d_2}$ \;

            % Step h by (n_s / n_q) * c_s for the next output value of this thread.
            \tcc{Move to the next position}
            $h \seq (n_s \idiv n_q) \mul c_s$ \;
        }
    }

    % Keep the cache intact until every thread has finished reading this batch.
    \KwSync \;
}

% The n_q accumulated sums of a thread go to rows n_s / n_q apart
% (8 under the stated assumptions), the same stride used to step h above.
\tcc{Save the results to global memory}
\ForTo{q}{0}{n_q}{
    $\gmem{\vfloat{r}}[\vy{m_g'} + (n_s \idiv n_q) \mul q][\vx{m_g'}] \eq \vfloat{s}[q]$ \;
}

}
\end{algorithm}