/articles/toma

To get this branch, use:
bzr branch http://darksoft.org/webbzr/articles/toma
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
%\vspace{2mm}
\begin{table*}
%[]

\caption{\label{tbl:gpuspec} List and specification of considered GPU architectures}
\centering
%\resizebox{\textwidth}{!}{
\begin{threeparttable}
\begin{tabular} { | l | cccccc | ccc |}
\hline
                                        & \multicolumn{6}{c|}{NVIDIA GeForce~\cite{nvidia2017cudapg}}                                   & \multicolumn{3}{c|}{AMD Radeon~\cite{amd2013openclpg}} \\
					& GTX295\tnote{1}& GTX580	& GTX680	& Titan     	& GTX980    	& Titan X    	& HD5970\tnote{1}& HD7970     	& R9-290	\\
\hline
Architecture	        		& GT200         & Fermi         & Kepler        & Kepler    	& Maxwell   	& Pascal	& Cypress   	& Tahiti     	& Hawaii	\\
Architecture Code Name         		& GT200         & GF110         & GK104         & GK110         & GM204         & GP104         & VLIW5         & GCN1          & GCN2          \\
Release Year   				& 2009		& 2010		& 2012		& 2013		& 2015		& 2016		& 2009		& 2012		& 2013		\\
Reference                               & \cite{nvidia2008gt200} & \cite{nvidia2009gf110} & \cite{nvidia2012gk104} & \cite{nvidia2012gk110} & \cite{nvidia2014gm204} & \cite{nvidia2016gp104} & \cite{amd2008vliw} & \cite{amd2012gcn} & \cite{amd2012gcn} \\
\hline

\hline
\rowhead{Global Memory}
\hline
Global Memory (GB)			& 0.9		& 1.5		& 2		& 6		& 4		& 12		& 1		& 3		& 4		\\
Memory Bandwidth (GB/s)			& 112		& 192		& 192		& 288		& 224		& 480		& 128		& 264		& 320		\\
L2 Cache (KB)				& -		& 768		& 512		& 1536		& 208		& 3072		& 512		& 768		& 1024		\\
L2 Bandwidth (GB/s)			& -		& 296		& 515		& 763		& 641		& 1351		& 371		& 710		& 970		\\
%L1 (KB/SM)				& 0		& 16 - 48	& 16 - 48	& 16 - 48	& 24		& 48		& 8		& 16		& 16		\\
%L1 Bandwidth (GB/s)			& 0		& 1581		& 1030?		& 1779?		& 1282		& 3152		& 928		& 1894		& 2424		\\ 
%L1 Cache Fastest Mechanism		& -		& Auto		& Tex		& L1/asm	& L1/asm	& L1/asm	& Auto		& Auto		& Auto		\\
\hline

\hline
\rowhead{Execution Units}
\hline
Number of SM				& 30		& 16		& 8		& 14		& 16		& 28		& 20		& 32		& 40		\\
ALU Reference Clock (MHz)		& 1242		& 1544		& 1006  	& 837   	& 1126  	& 1417	        & 725		& 925		& 947		\\ 
ALU Max Turbo Clock (MHz)               & -             & -             & 1110          & 1202          & 1392          & 1911          & -             & -             & -             \\
ALU Benchmark Clock (MHz)\tnote{3}	& 1242		& 1544		& 1006		& 993		& 1252		& 1759		& 725		& 925		& 947		\\
Warp Schedulers (per SM)		& 1		& 2		& 4		& 4		& 4		& 4		& 1		& 5		& 5		\\
Max Instructions per Warp		& 2		& 1		& 2		& 2		& 2		& 2		& 5		& 1		& 1		\\
ALU Units (per SM)			& 8	        & 2x16          & 6x32		& 6x32		& 4x32		& 4x32		& 16x4		& 4x16		& 4x16		\\  
SFU Units (per SM) 			& 2	        & 4	        & 32		& 32		& 32		& 32		& 16		& -		& - 		\\
Texture Units (per SM) 			& 2.66\tnote{2} & 4		& 16		& 16		& 8		& 8		& 4		& 4		& 4		\\
ILP Required for Peak GFlops	        & Yes		& No		& Yes		& Yes		& No		& No		& Yes		& No		& No		\\
\hline

\hline
\rowhead{Hardware resources}
\hline
Warp Size				& 32		& 32		& 32		& 32		& 32		& 32		& 64		& 64		& 64		\\
Max Resident Warps (per SM)		& 32		& 48		& 64		& 64		& 64		& 64		& 24		& 40		& 40		\\
Shared Memory (KB/SM)			& 16	    	& 16-48	        & 16-48    	& 16-48 	& 96 		& 96 		& 32 		& 64		& 64    	\\
Registers (KB/SM)			& 64		& 128		& 256		& 256		& 256		& 256		& 256		& 256		& 256		\\
Max 32-bit regs. per thread		& 128		& 63		& 63		& 255		& 255		& 255		& 248	        & 256	        & 256	        \\
Regs. Per Thread at Full Occupancy	& 16		& 21		& 32		& 32		& 32		& 32		& 40		& 25		& 25		\\
%Sh.Mem 32-bit words per thr at 100\%   & 4		& 8		& 6		& 6		& 12		& 12		& 5.3		& 6.4		& 6.4		\\

\hline
\rowhead{Shared \& Constant Memory}
\hline
Shared Memory Banks		        & 16		& 32		& 32		& 32		& 32		& 32		& 32		& 32		& 32		\\
Sh.Mem Bank Width (bits)		& 32		& 32		& 64		& 64		& 32		& 32		& 32		& 32		& 32		\\
Sh.Mem Bank Broadcasts                  & Yes           & Yes           & Yes           & Yes           & Yes           & Yes           & No            & Yes           & Yes           \\
Speed-up using 64-bit Loads\tnote{4}    & -             & -             & 100\%         & 100\%         & -             & -             & 15\%\tnote{4} & 40\%          & -             \\
Conflict-free Loads (up to, bits)       & 32            & 64            & 128           & 128           & 128           & 128           & 64            & 64            & 64            \\
Sh.Mem Max Bandwidth (GB/s)	        & 1324		& 1581		& 2060		& 3559		& 2564		& 6304		& 1856		& 3789		& 4849		\\
C.Mem. Max Bandwidth (GB/s)\tnote{5}    & 875		& 1511		& 1980 		& 3120		& 4186		& 11500		& 928		& 7578		& 9697		\\
%Shared/Const Memory Parallelism        & No            & No            & Yes           & Yes           & No            & No            & Yes           & Yes           & Yes           \\
%Combined Bandwidth (GB/s)\tnote{6}     & -             & -             & 3700          & 5800          & -             & -             & 690          & 6933          & 11547         \\

\hline
\rowhead{Instruction throughput}
\hline
Units executing FP-instructions		& ALU,SFU      & ALU		& ALU		& ALU		& ALU		& ALU		& ALU,SFU	& ALU		& ALU		\\
Units executing bit-shifts\tnote{6}	& ALU		& ALU		& SFU		& SFU		& ALU,SFU	& ALU,SFU	& SFU		& ALU		& ALU		\\
Units executing type-conversions\tnote{6}& ALU		& ALU		& SFU		& SFU		& SFU		& SFU		& SFU		& ALU		& ALU		\\

FP Performance (GFlops)\tnote{7}	& 994\tnote{8}	& 1581 		& 3090		& 5338		& 5128		& 12608	 	& 2320       	& 3789		& 4849          \\
Bit-shift Performance (G-ops)		& 331		& 395		& 258		& 444		& 1282		& 3152		& 232		& 1894		& 2424		\\
Type-mangling performance (G-ops)\tnote{9} & 331	& 395		& 258		& 444		& 641		& 1576		& 232		& 1894\tnote{10} & 2424\tnote{10}		\\

\hline
\rowhead{Performance of Texture Engine}
\hline
Texture Engine (GT/s)			& 51 		& 49 		& 129 		& 222 		& 160 		& 394 		& 58 		& 118		& 152 		\\
TE, 64-bit Data, Bi-linear (GT/s)\tnote{6} & 25         & 49            & 123           & 204           & 156           & 398           & 26            & 55            & 113           \\
TE, 64-bit Data, Nearest (GT/s)\tnote{6}   & 25         & 50            & 132           & 212           & 156           & 400           & 52            & 103           & 131           \\
TE, 128-bit Data, Nearest (GT/s)\tnote{6}  & 12		& 25		& 70		& 114		& 79		& 200		& 49		& 116 		& 147		\\

\hline
\rowhead{Performance Ratios}
\hline
Constant to Shared Memories             & 1             & 1             & 1             & 1             & 2             & 2             & 0.5           & 2             & 2             \\

C.Mem to Texture (Words/Texels)	        & 6.5           & 8             & 4             & 4             & 8             & 8             & 4             & 16            & 16            \\
Sh.Mem to Texture (Words/Texels)        & 6.5		& 8		& 4		& 4		& 4		& 4		& 8		& 8		& 8		\\
Type-conv to Texture (Ops/Texels)       & 6.5           & 8             & 2             & 2             & 4             & 4             & 4             & 16            & 16            \\

GFlops to Texture (Ops/Texels)	        & 19.4		& 32		& 24		& 24		& 32		& 32		& 40		& 32		& 32		\\
GFlops to Sh.Mem (Ops/Words)            & 3             & 4             & 6             & 6             & 8             & 8             & 5             & 4             & 4             \\ 
GFlops to Type-conversion 		& 3		& 4		& 12		& 12		& 8		& 8		& 10		& 2		& 2		\\

\hline

\end{tabular}
\begin{tablenotes}
\item The presented numbers are either taken from the referenced programming guides and specifications or computed from the other presented values. All exceptions, which are obtained using micro-benchmarking,
are indicated with footnotes.
\item[1] The characteristics of a single GPU core are given
\item[2] On GT200 the texture units are not included in the SM, but are part of Texture Clusters, each of which includes several SMs
\item[3] The GPUBoost technology adjusts the clock according to load and temperature. In this row we specify the approximate clock rate during the benchmarks
\item[4] Using 64-bit loads is only faster if two shared memory operations can't be combined into a single VLIW instruction
\item[5] On the NVIDIA platform the bandwidth of constant memory is obtained with benchmarking
\item[6] Measured using micro-benchmarking
\item[7] MAD/FMA are counted as two operations
\item[8] GT200 is capable of launching 4 floating-point multiplications per SFU
\item[9] Rounding operations and conversions between 32-bit integer and floating-point types
\item[10] On GCN architectures, we have measured a 4-times higher type-mangling performance compared to the values listed in the AMD specification. The measured values are presented in the table.

\end{tablenotes}
\end{threeparttable}
%}
\end{table*}
%\vspace{4mm}

% For our algorithm, we are particularly interested in the performance of rounding and type-conversion operations, bit-shifts, and standard floating-point arithmetics.
% Both GT200 and Fermi ALUs are clocked at twice the SM rate (including the scheduler). That makes 2 SM clocks on GT200 to execute a full warp (but only 1 clock on Fermi). This is used to dual-issue ALU and SFU.
% Furthermore, the SFU on GT200 is able to execute 4 fp MUL operations. Using dual-issue, both ALU and SFU may be used to perform fp-computations. This is not the case on Fermi: its SFU is not capable of fp MUL.
% On Fermi, ALUs are used for type-conversions and bit-shifts. Starting from Kepler, these operations are moved to the SFU.
% AMD VLIW5 restricts warps to 32 per SM. However, a total limit of 496 per complete GPU limits it to 24.8 per SM.
% On GCN2, ulong should be used to access indexed constants. It significantly improves the performance


%On AMD we can reduce occupancy by using shared memory, but not increase it.
%L1 cache on AMD with const __restrict for read-only buffers (L1 bandwidth on VLIW is 4x 16 bytes per clock)
%