/articles/toma : contents of alg_alurec_pascal.tex at revision 100

: (revision 100)

To get this branch, use:

bzr branch
http://darksoft.org/webbzr/articles/toma

% Pseudocode listing for the ALU-based back-projection kernel (label: alg:alurec).
% Structure: (1) thread-index mappings, (2) loop over projection blocks of s_d rows that
% first fills the shared-memory cache and then accumulates interpolated samples,
% (3) write-back of the n_q accumulators to the output slice.
% NOTE(review): \KwAssume, \KwShMem, \ForTo, \ForToBy, \KwFloor, \KwTex, \KwSyncThreads and
% the notation macros (\cmem, \shmem, \gmem, \vfloat, \vdata, \vlist, \vx, \vy, \eq, \aeq,
% \seq, \mul, \vmul, \idiv, \imod, \fconstf) are not defined in this file; presumably they
% come from the shared preamble -- confirm before compiling stand-alone.
\begin{algorithm}[htb]
\DontPrintSemicolon
\caption{\label{alg:alurec} ALU-based implementation of the back-projection kernel}
%\begin{algorithmic}
\KwIn {Texture, projection constants~($\cmem{c}_*$), dimensions~($n_*$), cache sizes~($s_*$), and other parameters~($v_*$)}
\KwAssume {$n_s=32$, $n_q=4$, $s_t=16$, $s_d=16$}
\KwShMem {$\shmem{\vfloat{d}}[s_d][\frac{3}{2} \mul n_s]$, $\shmem{\vfloat{h}}_m[s_d]$}
\KwOut {Reconstructed slice $\gmem{\vfloat{r}}$}
\Begin {
    % Disabled alternative index mapping, kept for reference.
    %$\vdata{m}_g'' \eq \vdata{m}_b \vmul \vdata{n}_t + \vdata{m}_t''$ \;
    %$\vdata{m} \eq \vdata{m}_g'' - \vlist{v_a - \fconst{0.5}, v_a - \fconst{0.5}}$ \;
    %$m_l \eq \vy{m_t} \cdot \vx{n_t} + \vx{m_t}$ \;

    \tcc{Caching mapping for $s_t=16$ and $s_d=16$}
    % m_d selects the cached column handled by this thread, m_p the cached projection row.
    $\vlist{m_d, m_p} \eq \vdata{m_t}$ \;
    $\vdata{f}_b \eq \vdata{m}_b - \vdata{v}_a$ \;

    \tcc{Reconstruction mapping for $n_s=32$}
    % Multi-letter identifiers are wrapped in \mathit so they are not typeset
    % as a product of single-letter variables.
    $\mathit{quadrant} \eq \vx{m_t} \idiv 4$ \;
    $\mathit{pixel} \eq \vx{m_t} \imod 4$ \;
    $\vx{m_t'} \eq 4 \mul (\vy{m_t} \imod 8) + 2 \mul (\mathit{quadrant} \imod 2) + (\mathit{pixel} \imod 2)$ \;
    $\vy{m_t'} \eq 4 \mul (\vy{m_t} \idiv 8) + 2 \mul (\mathit{quadrant} \idiv 2) + (\mathit{pixel} \idiv 2)$ \;
    $\vdata{m}'_g \eq n_s \vmul \vdata{m}_b + \vdata{m}'_t$ \;
    $\vdata{f}'_g \eq \vdata{m}'_g - \vdata{v}_a$ \;

    \tcc{Set accumulators to 0 and run projection loop}
    $\vfloat{s}[n_q] \eq \{0\}$ \;
    \ForToBy{p_b}{0}{n_p}{s_d}{
        \tcc{Compute the minimal required bin}
        $h_b \eq \cmem{c_a}[p_b + m_p] + \vx{f_b} \mul \cmem{c_c}[p_b + m_p] - \vy{f_b} \mul \cmem{c_s}[p_b + m_p]$ \;
        $h_m \eq $ \KwFloor{$h_b + \cmem{c_m}[p_b + m_p]$} \;

        \tcc{Cache it in the shared memory}
        % Only the threads with m_d == 0 write the per-row offset.
        % Written with \eq like every other assignment in this listing.
        \If{$m_d == 0$}{
            $\shmem{h}_m[m_p] \eq \cmem{c}_a[p_b + m_p] - h_m$ \;
        }

        \tcc{Cache the data in the shared memory}
        % Under the assumed sizes 3 * s_t = 48 = (3/2) * n_s, i.e. the declared cache row width.
        \ForTo{i}{0}{3}{
            $h \eq i \mul s_t + m_d$ \;
            % The projection row is p_b + m_p, the same row used for h_m above;
            % the per-projection index p is only assigned inside the accumulation loop below.
            $\shmem{\vfloat{d}}[m_p][h] \eq $ \KwTex{$h_m + h + \fconstf{0.5}$, $p_b + m_p + \fconstf{0.5}$} \;
        }

        \KwSyncThreads \;

        \ForTo{p_i}{0}{s_d}{
            $p \eq p_b + p_i$ \;
            $c_s \eq \cmem{c_s}[p]$ \;
            % Disabled direct (uncached) bin computation, kept for reference.
%            $h_b \eq \cmem{c_a}[p] + \vx{f_b} \mul \cmem{c_c}[p] - \vy{f_b} \mul \cmem{c_s}[p] + \fconst{0.5}$ \;
%            $h_m \eq floor(h_b + \cmem{c_m})$ \;

            % Position relative to the start of the cached row p_i.
            $h \eq \shmem{h}_m[p_i] + \vx{f'_g} \mul \cmem{c_c}[p] - \vy{f'_g} \mul \cmem{c_s}[p]$ \;

            \ForTo{q}{0}{n_q}{
                \tcc{Compute the offset in cache}
                $h_i \eq $ \KwFloor{$h$} \;
                $h_l \eq h - h_i$ \;

                \tcc{Interpolate}
                % Linear interpolation written as d_1 + h_l * (d_2 - d_1).
                $\vfloat{d_1} \eq \shmem{\vfloat{d}}[p_i][h_i]$ \;
                $\vfloat{d_2} \eq \shmem{\vfloat{d}}[p_i][h_i + 1] - \vfloat{d_1}$ \;

                $\vfloat{s}[q] \aeq \vfloat{d_1} + h_l \mul \vfloat{d_2}$ \;

                \tcc{Move to the next position}
                $h \seq (n_s \idiv n_q) \mul c_s$ \;
            }
        }

        \KwSyncThreads \;
    }

    \tcc{Save the results to global memory}
    % The row stride 8 equals n_s / n_q for the assumed n_s = 32, n_q = 4.
    % The stored values are the accumulators s[q] filled in the projection loop.
    \ForTo{q}{0}{n_q}{
        $\gmem{\vfloat{r}}[\vy{m_g'} + 8 \mul q][\vx{m_g'}] \eq \vfloat{s}[q]$ \;
    }

}
\end{algorithm}


47 by Suren A. Chilingaryan The conceptual part of transaction paper	1	\begin{algorithm}[htb]
	2	\DontPrintSemicolon
	3	\caption{\label{alg:alurec} ALU-based implementation of the back-projection kernel}
	4	%\begin{algorithmic}
	5	\KwIn {Texture, projection constants~($\cmem{c}_*$), dimensions~($n_*$), cache sizes~($s_*$), and other parameters~($v_*$)}
	6	\KwAssume {$n_s=32$; $n_q=4$, $s_t=16$, $s_d=16$}
	7	\KwShMem {$\shmem{\vfloat{d}}[s_d][\frac{3}{2} \mul n_s]$, $\shmem{\vfloat{h}}_m[s_d]$}
	8	\KwOut {Reconstructed slice $\gmem{\vfloat{r}}$}
	9	\Begin {
	10	%$\vdata{m}_g'' \eq \vdata{m}_b \vmul \vdata{n}_t + \vdata{m}_t''$ \;
	11	%$\vdata{m} \eq \vdata{m}_g'' - \vlist{v_a - \fconst{0.5}, v_a - \fconst{0.5}}$ \;
	12	%$m_l \eq \vy{m_t} \cdot \vx{n_t} + \vx{m_t}$ \;
	13
	14	\tcc{Caching mapping for $s_t=16$ and $s_d=16$}
	15	$\vlist{m_d, m_p} \eq \vdata{m_t}$ \;
	16	$\vdata{f}_b \eq \vdata{m}_b - \vdata{v}_a$ \;
	17
	18	\tcc{Reconstruction mapping for $n_s=32$}
	19	$quadrant \eq \vx{m_t} \idiv 4$ \;
	20	$pixel \eq \vx{m_t} \imod 4$ \;
	21	$\vx{m_t'} \eq 4 \mul (\vy{m_t} \imod 8) + 2 \mul (quadrant \imod 2) + (pixel \imod 2) $ \;
	22	$\vy{m_t'} \eq 4 \mul (\vy{m_t} \idiv 8) + 2 \mul (quadrant \idiv 2) + (pixel \idiv 2) $ \;
	23	$\vdata{m}'_g \eq n_s \vmul \vdata{m}_b + \vdata{m}'_t$ \;
	24	$\vdata{f}'_g \eq \vdata{m}'_g - \vdata{v}_a$ \;
	25
	26	\tcc{Set accumulators to 0 and run projection loop}
	27	$\vfloat{s}[n_q] \eq \{0\}$ \;
	28	\ForToBy{p_b}{0}{n_p}{s_d}{
	29	\tcc{Compute the minimal required bin}
	30	$h_b \eq \cmem{c_a}[p_b + m_p] + \vx{f_b} \mul \cmem{c_c}[p_b + m_p] - \vy{f_b} \mul \cmem{c_s}[p_b + m_p]$\;
	31	$h_m \eq $ \KwFloor{$h_b + \cmem{c_m}[p_b + m_p]$} \;
	32
	33	\tcc{Cache it in the shared memory}
	34	\If{$m_d == 0$}{
	35	$\shmem{h}_m[m_p] = \cmem{c}_a[p_b + m_p] - h_m$ \;
	36	}
	37
	38	\tcc{Cache the data in the shared memory}
	39	\ForTo{i}{0}{3}{
	40	$h \eq i \mul s_t + m_d$ \;
	41	$\shmem{\vfloat{d}}[m_p][h] \eq $ \KwTex{$h_m + h + \fconstf{0.5}$, $p + m_p + \fconstf{0.5}$} \;
	42	}
	43
	44	\KwSyncThreads \;
	45
	46	\ForTo{p_i}{0}{s_d}{
	47	$p \eq p_b + p_i$ \;
	48	$c_s \eq \cmem{c_s}[p]$ \;
	49	% $h_b \eq \cmem{c_a}[p] + \vx{f_b} \mul \cmem{c_c}[p] - \vy{f_b} \mul \cmem{c_s}[p] + \fconst{0.5}$ \;
	50	% $h_m \eq floor(h_b + \cmem{c_m})$ \;
	51
	52	$h \eq \shmem{h}_m[p_i] + \vx{f'_g} \mul \cmem{c_c}[p] - \vy{f'_g} \mul \cmem{c_s}[p]$ \;
	53
	54	\ForTo{q}{0}{n_q}{
	55	\tcc{Compute the offset in cache}
	56	$h_i \eq $ \KwFloor{$h$} \;
	57	$h_l \eq h - h_i$ \;
	58
	59	\tcc{Interpolate}
	60	$\vfloat{d_1} \eq \shmem{\vfloat{d}}[p_i][h_i]$ \;
	61	$\vfloat{d_2} \eq \shmem{\vfloat{d}}[p_i][h_i + 1] - \vfloat{d_1}$ \;
	62
	63	$\vfloat{s}[q] \aeq \vfloat{d_1} + h_l \mul \vfloat{d_2}$ \;
	64
65	\tcc{Move to the next position}
66	$h \seq (n_s \idiv n_q) \mul c_s$ \;
67	}
68	}
69
70	\KwSyncThreads \;
71	}
72
73	\tcc{Save the results to global memory}
74	\ForTo{q}{0}{n_q}{
75	$\gmem{\vfloat{r}}[\vy{m_g'} + 8 \mul q][\vx{m_g'}] \eq \vfloat{r}[q]$ \;
76	}
77
78	}
79	\end{algorithm}
80