% /articles/toma : contents of alg_alurec_pascal.tex at revision 100
%
% (revision 100)
%
% To get this branch, use:
%
%   bzr branch http://darksoft.org/webbzr/articles/toma
%

\begin{algorithm}[htb]
% Pseudo-code float for the ALU-based back-projection kernel (label alg:alurec).
% The keyword macros (\KwAssume, \KwShMem, \KwTex, \KwFloor, \KwSyncThreads,
% \ForTo, \ForToBy) and the notation macros (\cmem, \shmem, \gmem, \vfloat,
% \vdata, \vlist, \vx, \vy, \eq, \aeq, \seq, \mul, \vmul, \idiv, \imod,
% \fconstf) are not defined in this file; presumably they are provided by the
% document preamble -- TODO confirm.
\DontPrintSemicolon
\caption{\label{alg:alurec} ALU-based implementation of the back-projection kernel}
%\begin{algorithmic}
\KwIn {Texture, projection constants~($\cmem{c}_*$),  dimensions~($n_*$), cache sizes~($s_*$), and other parameters~($v_*$)}
\KwAssume {$n_s=32$, $n_q=4$, $s_t=16$, $s_d=16$}
\KwShMem {$\shmem{\vfloat{d}}[s_d][\frac{3}{2} \mul n_s]$, $\shmem{\vfloat{h}}_m[s_d]$}
\KwOut {Reconstructed slice $\gmem{\vfloat{r}}$}
\Begin {
%$\vdata{m}_g'' \eq \vdata{m}_b \vmul \vdata{n}_t + \vdata{m}_t''$ \;
%$\vdata{m} \eq \vdata{m}_g'' - \vlist{v_a - \fconst{0.5}, v_a - \fconst{0.5}}$ \;
%$m_l \eq \vy{m_t} \cdot \vx{n_t} + \vx{m_t}$ \;

% Caching stage: the thread index is split into a bin offset (m_d) and a
% projection offset (m_p).
\tcc{Caching mapping for $s_t=16$ and $s_d=16$}
$\vlist{m_d, m_p} \eq \vdata{m_t}$ \;
$\vdata{f}_b \eq \vdata{m}_b - \vdata{v}_a$ \;

% Reconstruction stage: the thread index is remapped to (m_t'); multi-letter
% identifiers are wrapped in \mathit so they are not typeset as products of
% single-letter variables.
\tcc{Reconstruction mapping for $n_s=32$}
$\mathit{quadrant} \eq \vx{m_t} \idiv 4$ \;
$\mathit{pixel} \eq \vx{m_t} \imod 4$ \;
$\vx{m_t'} \eq 4 \mul (\vy{m_t} \imod 8) + 2 \mul (\mathit{quadrant} \imod 2) +  (\mathit{pixel} \imod 2) $ \;
$\vy{m_t'} \eq 4 \mul (\vy{m_t} \idiv 8) + 2 \mul (\mathit{quadrant} \idiv 2) +  (\mathit{pixel} \idiv 2) $ \;
$\vdata{m}'_g \eq n_s \vmul \vdata{m}_b + \vdata{m}'_t$ \;
$\vdata{f}'_g \eq \vdata{m}'_g - \vdata{v}_a$ \;

\tcc{Set accumulators to 0 and run projection loop}
$\vfloat{s}[n_q] \eq \{0\}$ \;
% Outer loop: projections are processed in batches of s_d, starting at p_b.
\ForToBy{p_b}{0}{n_p}{s_d}{
     \tcc{Compute the minimal required bin}
     $h_b \eq \cmem{c_a}[p_b + m_p] + \vx{f_b} \mul \cmem{c_c}[p_b + m_p] - \vy{f_b} \mul \cmem{c_s}[p_b + m_p]$\;
     $h_m \eq $ \KwFloor{$h_b + \cmem{c_m}[p_b + m_p]$} \;

     % Only the threads with m_d = 0 store the per-projection offset.
     \tcc{Cache it in the shared memory}
     \If{$m_d == 0$}{
        $\shmem{h}_m[m_p] \eq \cmem{c}_a[p_b + m_p] - h_m$ \;
     }

     % Each pass covers s_t bins; the texture row is the projection handled by
     % this thread in the caching stage, i.e. p_b + m_p (the inner-loop
     % variable p is not yet assigned at this point).
     \tcc{Cache the data in the shared memory}
     \ForTo{i}{0}{3}{
       $h \eq i \mul s_t + m_d$ \;
       $\shmem{\vfloat{d}}[m_p][h] \eq $ \KwTex{$h_m + h + \fconstf{0.5}$, $p_b + m_p + \fconstf{0.5}$} \;
     }

    \KwSyncThreads \;

    % Inner loop: accumulate the contribution of every cached projection.
    \ForTo{p_i}{0}{s_d}{
        $p \eq p_b + p_i$ \;
        $c_s \eq \cmem{c_s}[p]$ \;
%        $h_b \eq \cmem{c_a}[p] + \vx{f_b} \mul \cmem{c_c}[p] - \vy{f_b} \mul \cmem{c_s}[p] + \fconst{0.5}$ \;
%        $h_m \eq floor(h_b + \cmem{c_m})$ \;

        $h \eq \shmem{h}_m[p_i] + \vx{f'_g} \mul \cmem{c_c}[p] - \vy{f'_g} \mul \cmem{c_s}[p]$ \;

        \ForTo{q}{0}{n_q}{
            \tcc{Compute the offset in cache}
            $h_i \eq $ \KwFloor{$h$} \;
            $h_l \eq h - h_i$ \;

            \tcc{Interpolate}
            $\vfloat{d_1} \eq \shmem{\vfloat{d}}[p_i][h_i]$ \;
            $\vfloat{d_2} \eq \shmem{\vfloat{d}}[p_i][h_i + 1] - \vfloat{d_1}$ \;

            $\vfloat{s}[q] \aeq \vfloat{d_1} + h_l \mul \vfloat{d_2}$ \;

            \tcc{Move to the next position}
            $h \seq (n_s \idiv n_q) \mul c_s$ \;
        }
    }

    \KwSyncThreads \;
}

% The accumulators s[q] are written out; the row stride 8 equals n_s / n_q for
% the assumed sizes (n_s = 32, n_q = 4).
\tcc{Save the results to global memory}
\ForTo{q}{0}{n_q}{
    $\gmem{\vfloat{r}}[\vy{m_g'} + 8 \mul q][\vx{m_g'}] \eq \vfloat{s}[q]$ \;
}

}
\end{algorithm}