/articles/toma

To get this branch, use:
bzr branch http://darksoft.org/webbzr/articles/toma
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
\begin{algorithm}[htb]
\DontPrintSemicolon
\caption{\label{alg:newtex4remap} Optimizing thread mapping for better cache locality and reduced load on constant memory
% Each GPU thread is associated with a single projection ($m_p$) in 4-projection set and processes a pixel with offset $\ver{2}{\vdata{m}_t}$  during the first iteration. The pixels on the subsequent iterations are computed by adding 4 to $y$-coordinate on each step as elaborated in \algorithmname~\ref{alg:newtex4}.
}
%\begin{algorithmic}
\KwIn {$\vdata{m}_t$ is the original mapping as reported by CUDA/OpenCL}
%\KwOut {$\ver{1}{\vdata{m}_t}$ is a new mapping proposed in \sectionname~\ref{section:remap} to improve locality of the texture fetches. $m_p$ and $\vdata{m}_t^2$ define an alternative mapping allowing also to reduce the load on constant memory as explained in \sectionname~\ref{section:newtex}. Each GPU thread is associated with a single projection ($m_p$) in 4-projection set and processes a pixel with offset $\vdata{m}_t^2$ during the first iteration. The pixels on the subsequent iterations are computed by adding 4 to $y$-coordinate on each step as elaborated in \algorithmname~\ref{alg:newtex4}.}
\KwOut {$\ver{1}{\vdata{m}_t}$ is a new mapping proposed in \sectionname~\ref{section:remap} to improve the locality of texture fetches. $m_p$ and $\ver{2}{\vdata{m}_t}$ define an alternative mapping that also reduces the load on constant memory, as explained in \sectionname~\ref{section:newtex}.}


\Begin {
\tcc{Each thread is responsible for one of 4 pixels lying within a small $2 \times 2$ pixel square, which in turn is one of 4 squares composing a larger $4 \times 4$ pixel block. Here we determine the sequential number of the pixel within the small square, the sequential number of the small square within the larger pixel block, and the sequential number of the block.}
$block_n \eq \vy{m_t} $ \;
$square_n \eq \vx{m_t} \idiv 4$ \;
$pixel_n \eq \vx{m_t} \imod 4$ \;

\tcc{Convert the sequential numbers to $x$,$y$ coordinates.}
$\vdata{block} \eq \vlist{block_n \imod 4, block_n \idiv 4}$ \;
$\vdata{square} \eq \vlist{square_n \imod 2, square_n \idiv 2}$ \;
$\vdata{pixel} \eq \vlist{pixel_n \imod 2, pixel_n \idiv 2}$ \;

\tcc{Compute the actual pixel offset for the first mapping}
$\ver{1}{\vdata{m}_t} \eq  4 \mul \vdata{block} + 2 \mul \vdata{square} + \vdata{pixel}$ \;

\tcc{Compute the projection and pixel offset for the second mapping}
$\ver{2}{\vdata{m}_t} \eq  2 \mul \vdata{square} + \vdata{pixel}$ \;
$\vx{\ver{2}{m_t}} \aeq 4 \mul \vx{block} $ \;
$m_p \eq \vy{block}$ \;
}
\end{algorithm}