/articles/toma

To get this branch, use:
bzr branch http://darksoft.org/webbzr/articles/toma
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
% Pseudo-code float for the shuffle-based variant of the reduction loop of
% the algorithm labelled alg:newtex4 (see the caption below).
% NOTE(review): all operators and keywords used here (\ForTo, \CFor, \KwSync,
% \KwShflXor, \shmem, \vfloat, \vx, \vy, \ver, \eq, \aeq, \deq, \mul, \idiv,
% \imod, \ifeq) are project macros defined outside this fragment; the remarks
% on their meaning below are inferred from their names -- confirm against the
% macro definitions.
\begin{algorithm}[htb]
\DontPrintSemicolon
% The label is placed inside the caption argument, so it refers to this algorithm.
\caption{\label{alg:newtex4_shuffle} The reduction loop of \algorithmname~\ref{alg:newtex4} using shuffle instruction}
% The enclosing \Begin{ ... } block is disabled here and at the end of the body.
%\Begin{
  % Outer loop over q with bounds 0 and 4.
  % NOTE(review): whether the upper bound is inclusive depends on \ForTo;
  % the index 4 \mul q used further down suggests four passes -- TODO confirm.
  \ForTo{q}{0}{4}{
    \tcc{Moving partial sums to shared memory}
    % Each thread stores its q-th partial sum s[q] into the shared array s,
    % indexed by [n_t.x * m_t.y + m_t.x][m_p] (m_t in its "version 2" mapping).
    $\shmem{\vfloat{s}}[\vx{n_t} \mul \vy{\ver{2}{m_t}} + \vx{\ver{2}{m_t}}][m_p] \eq \vfloat{s}[q]$ \;
    % Synchronization point between the shared-memory store above and the load below.
    \KwSync \;
  
      \tcc{Performing reduction}
      % Each thread reads one value back from the shared array s into r.
      $\vfloat{r} \eq \shmem{\vfloat{s}}[\vy{\ver{2}{m_t}}][\vx{\ver{2}{m_t}}]$ \;
      % C-style loop: starts at i = 2, runs while i >= 1, updated by "\deq 2".
      % NOTE(review): presumably \deq is "divide-assign", giving i = 2, 1 -- TODO confirm.
      \CFor{i \eq 2}{i \geq 1}{i \deq 2}{
         % r is combined (via \aeq, presumably an accumulating assignment) with the
         % result of a shuffle-xor called with arguments (r, i, 4).
         % NOTE(review): the last argument looks like the shuffle width -- confirm.
         $\vfloat{r} \aeq$ \KwShflXor{$\vfloat{r}$, $i$, $4$} \;   
      }
  
      \tcc{To coalesce global memory writes, the results are grouped in shared memory}
      % Only threads whose x-component of m_t ("version 3" mapping) equals 0
      % write their r into the shared array r, at row 4*q + m_t.y div 16 and
      % column m_t.y mod 16.
      \If{$\vx{\ver{3}{m_t}} \ifeq 0$} {
         $\shmem{\vfloat{r}}[4 \mul q + \vy{\ver{3}{m_t}} \idiv 16][\vy{\ver{3}{m_t}} \imod 16] \eq\vfloat{r}$ \;
      }
      
% Disabled alternative of the store above, indexed through b_x / b_y and the
% unversioned m_t; kept for reference only.
%      \If{$\vx{m_t} \ifeq 0$} {
%         $\shmem{\vfloat{r}}[4 \mul b_y + \vy{m_t} \idiv 4][4 \mul b_x + \vy{m_t} \imod 4] \eq \vfloat{r}$ \;
%      }
      % Synchronization point at the end of every pass of the q loop.
      \KwSync \;
  }
%}
\end{algorithm}