/articles/toma

To get this branch, use:
bzr branch http://darksoft.org/webbzr/articles/toma
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
% Table tbl:shmemconf: per-configuration thread counts and optimizations used to
% avoid shared memory bank conflicts, split into a "Standard Caching Mode" and an
% "Advanced Caching Mode" sub-table that share one tabularx.
\begin{table}[htb] %[htbp]
\begin{threeparttable}
\caption{\label{tbl:shmemconf} The optimal parameters to prevent shared memory bank conflicts in the ALU-based reconstruction kernel}
\centering
\noindent
% Alternative kept for reference: \resizebox{\columnwidth}{!}{\begin{tabular}{} ... \end{tabular}}
% Columns: Area | n_v | Platform | Threads | Optimizations (the X column takes the remaining width).
\begin{tabularx}{\columnwidth}{ | c | l | l | l | X | }
% --- Sub-table 1: standard caching mode ---
\multicolumn{5}{l}{\textbf{Standard Caching Mode} (see \sectionname~\ref{section:alurec_shmem})} \\
\hline
Area & $n_v$ & Platform & Threads & Optimizations \\
\hline
% Each area spans six rows: two platform rows for each of n_v = 1, 2, 4.
\multirow{6}{*}{$32\times32$}
  & \multirow{2}{*}{1} & 32-bit       & 16 & - \\
  &                    & 64-bit       &  8 & write64 \\
\cline{2-5}
  & \multirow{2}{*}{2} & 32-bit       & 16 & - \\
  &                    & 64-bit       & 16 & - \\
\cline{2-5}
  & \multirow{2}{*}{4} & AMD \& Fermi & 16 & double-buffer \\
  &                    & Kepler+      & 16 & - \\
\hline
\multirow{6}{*}{$64\times64$}
  & \multirow{2}{*}{1} & 32-bit       & 32 & - \\
  &                    & 64-bit       & 16 & write64 \\
\cline{2-5}
  & \multirow{2}{*}{2} & 32-bit       & 32 & - \\
  &                    & 64-bit       & 32 & - \\
\cline{2-5}
  & \multirow{2}{*}{4} & AMD \& Fermi & 32 & double-buffer \\
  &                    & Kepler+      & 32 & - \\
\hline
% Empty spanning row: vertical gap between the two sub-tables.
\multicolumn{5}{l}{} \\
% --- Sub-table 2: advanced caching mode ---
\multicolumn{5}{l}{\textbf{Advanced Caching Mode}  (see \sectionname~\ref{section:ycache})} \\
\hline
Area & $n_v$ & Platform & Threads & Optimizations \\
\hline
$32\times32$ & 1 & All & 16 & - \\
\hline
$64\times64$ & 1 & All & 32 & - \\
\hline
\end{tabularx}
\begin{tablenotes}
\item For each considered configuration, the number of threads per projection row and the required optimizations are specified. The \emph{double-buffer} optimization splits the shared memory cache into two parts to prevent bank conflicts on the NVIDIA Fermi and all considered AMD architectures. The \emph{write64} optimization combines two writes to shared memory to use the full bandwidth of Kepler GPUs.
\end{tablenotes}
\end{threeparttable}
\end{table}