1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
% Pseudo-code float: the main loop of the algorithm labelled alg:newtex4, restated so
% that each pass over s_p projections first copies the projection constants from
% global memory into shared memory and then accumulates partial sums from the copy.
% Relies on project macros defined outside this chunk (\ForToBy, \ForTo, \KwShMem,
% \KwSync, \KwTex, \shmem, \gmem, \vdata, \vx, \vy, \eq, \aeq, \seq, \mul, ...).
\begin{algorithm}[htb]
\DontPrintSemicolon
% \label follows \caption so that no stray space is typeset in front of the caption text.
\caption{The main loop of \algorithmname~\ref{alg:newtex4} modified to cache geometrical constants in the shared memory}
\label{alg:newtex4_cc}
\KwIn{Similar to \algorithmname~\ref{alg:newtex4}, but projection constants $\gmem{c}_*$ are provided in global GPU memory}
% Two shared arrays of s_p entries each: the two-component constants and c_a.
\KwShMem{$\shmem{\vdata{c}_{cs}}[s_p]$, $\shmem{c_{a}}[s_p]$}
% Outer loop: p_b walks over the n_p projections in steps of s_p.
\ForToBy{p_b}{0}{n_p}{s_p}{
  \tcc{Caching projection constants in shared memory}
  % m_l is the linear index built from the two components of m_t and the x extent n_t.
  $m_l \eq \vy{m_t} \mul \vx{n_t} + \vx{m_t}$ \;
  % NOTE(review): the two loads below are not guarded. This presumes m_l < s_p and
  % p_b + m_l < n_p (or padded constant arrays) -- confirm against the implementation.
  $\shmem{\vdata{c}_{cs}}[m_l] \eq \vlist{\gmem{c_c}[p_b + m_l], \gmem{c_s}[p_b + m_l]}$ \;
  $\shmem{c_a}[m_l] \eq \gmem{c_a}[p_b + m_l]$ \;
  % Barrier between filling the shared arrays and reading them.
  \KwSync \;
  \tcc{Computing partial sums}
  % Inner loop: starts at m_p, advances by 4, and stops at the end of the cached
  % chunk or at the last projection, whichever comes first (\min is the operator,
  % not the product of the letters m, i and n).
  \ForToBy{p}{m_p}{\min(s_p, n_p - p_b)}{4}{
    % c_s keeps the y component of the cached pair so that the q loop below does
    % not index the shared array again.
    $c_s \eq \vy{\shmem{\vdata{c}_{cs}}[p]}$ \;
    $h \eq \shmem{c_a}[p] + \vx{f'_g} \mul \vx{\shmem{\vdata{c}_{cs}}[p]} - \vy{f'_g} \mul \vy{\shmem{\vdata{c}_{cs}}[p]} + \fconst{0.5}$ \;
    % Four texture fetches per projection, one per entry of s; h is stepped by
    % 4 c_s after each fetch (\aeq / \seq are presumably the project's compound
    % assignment macros -- confirm their rendering).
    \ForTo{q}{0}{4}{
      $\vfloat{s}[q] \aeq $ \KwTex{$h$, $p_b + p + \fconst{0.5}$} \;
      $h \seq 4 \mul c_s$ \;
    }
  }
  % Second barrier; presumably keeps the next iteration from overwriting the
  % shared arrays while they are still being read -- confirm.
  \KwSync \;
}
\end{algorithm}
|