/*
 * Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users. This source code is a "commercial item" as
 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
 * "commercial computer software" and "commercial computer software
 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 *
 * Any use of this source code in individual and commercial software must
 * include, in the user documentation and internal comments to the code,
 * the above Disclaimer and U.S. Government End Users Notice.
 */

/* Matrix multiplication: C = A * B.
 * Device code.
 */
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_

#include "matrixMul.h"

// When CHECK_BANK_CONFLICTS is non-zero, shared-memory accesses are routed
// through the cutil CUT_BANK_CHECKER instrumentation; otherwise AS/BS index
// the shared arrays directly. The #else/#endif are required — without them
// the direct-index definitions would unconditionally redefine the checked
// ones, silently disabling the bank-conflict instrumentation.
#define CHECK_BANK_CONFLICTS 0
#if CHECK_BANK_CONFLICTS
#define AS(i, j) CUT_BANK_CHECKER(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
#define BS(i, j) CUT_BANK_CHECKER(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
#else
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]
#endif
////////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! wA is A's width and wB is B's width
////////////////////////////////////////////////////////////////////////////////
// 1D float texture reference; the host is expected to bind it over the linear
// device buffer holding the input matrices. texA/texB are element offsets of
// A and B within that buffer.
texture<float, 1, cudaReadModeElementType> tex;

//! Texture-fetch smoke-test kernel: each thread copies one element into C,
//! reading through the texture from A or B depending on the parity of its
//! x thread index. Expected launch: 2D grid of BLOCK_SIZE x BLOCK_SIZE blocks
//! covering C (wB columns). wA is unused here; kept for signature parity with
//! the other kernels in this file.
__global__ void
testTex( float* C, int texA, int texB, int wA, int wB)
{
    // Block and thread indices within the 2D launch
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Linear index of this block's top-left element of C
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;

    // Odd tx lanes read from A's region, even lanes from B's region
    C[c + wB * ty + tx] = tex1Dfetch(tex, ((tx%2)?texA:texB) + c + wB * ty + tx);
}
//! Naive (untiled) matrix multiplication C = A * B reading A and B through
//! the 1D texture `tex`. texA/texB are the element offsets of A and B inside
//! the texture-bound buffer. One thread computes one element of C; expected
//! launch: 2D grid of BLOCK_SIZE x BLOCK_SIZE blocks covering C.
__global__ void
matrixTexSimpleMul( float* C, int texA, int texB, int wA, int wB)
{
    // Block and thread indices within the 2D launch
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Texture index of the first element of this thread's row of A,
    // and of this thread's column of B
    int a = texA + wA * BLOCK_SIZE * by + wA * ty;
    int b = texB + BLOCK_SIZE * bx + tx;

    // NOTE(review): reconstructed — the row of A holds wA elements, so the
    // dot product ends after wA fetches; confirm against the original source.
    int aEnd = a + wA;

    // Dot product: walk the row of A (unit stride) and the column of B
    // (stride wB) simultaneously
    for (; a < aEnd; ++a, b+= wB )
        Csub += tex1Dfetch(tex, a) * tex1Dfetch(tex, b);

    // Write this thread's element of C
    C[wB * BLOCK_SIZE * by + wB * ty + BLOCK_SIZE * bx + tx] = Csub;
}
//! Naive (untiled) matrix multiplication C = A * B reading A and B directly
//! from global memory. One thread computes one element of C; expected launch:
//! 2D grid of BLOCK_SIZE x BLOCK_SIZE blocks covering C (wB columns).
__global__ void
matrixSimpleMul( float* C, float* A, float* B, int wA, int wB)
{
    // Block and thread indices within the 2D launch
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Index of the first element of this thread's row of A,
    // and of this thread's column of B
    int a = wA * BLOCK_SIZE * by + wA * ty;
    int b = BLOCK_SIZE * bx + tx;

    // NOTE(review): reconstructed — the row of A holds wA elements, so the
    // dot product ends after wA loads; confirm against the original source.
    int aEnd = a + wA;

    // Dot product: walk the row of A (unit stride) and the column of B
    // (stride wB) simultaneously.
    // NOTE(review): accumulation line reconstructed; the extraction dropped
    // the loop body.
    for (; a < aEnd; ++a, b+= wB )
        Csub += A[a] * B[b];

    // Write this thread's element of C
    C[wB * BLOCK_SIZE * by + wB * ty + BLOCK_SIZE * bx + tx] = Csub;
}
//! Tiled matrix multiplication C = A * B reading A and B through the 1D
//! texture `tex` (texA/texB are element offsets of A and B in the bound
//! buffer). Each BLOCK_SIZE x BLOCK_SIZE thread block computes one
//! BLOCK_SIZE x BLOCK_SIZE tile of C, staging tiles of A and B in shared
//! memory. Requires wA and wB to be multiples of BLOCK_SIZE (no tail guards).
__global__ void
matrixTexMul( float* C, int texA, int texB, int wA, int wB)
{
    // Block and thread indices within the 2D launch
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd   = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep  = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep  = BLOCK_SIZE * wB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) {

        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = tex1Dfetch(tex, texA + a + wA * ty + tx);
        BS(ty, tx) = tex1Dfetch(tex, texB + b + wB * ty + tx);

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += AS(ty, k) * BS(k, tx);

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}
//! Tiled matrix multiplication C = A * B reading A and B directly from
//! global memory. Each BLOCK_SIZE x BLOCK_SIZE thread block computes one
//! BLOCK_SIZE x BLOCK_SIZE tile of C, staging tiles of A and B in shared
//! memory. Requires wA and wB to be multiples of BLOCK_SIZE (no tail guards).
__global__ void
matrixMul( float* C, float* A, float* B, int wA, int wB)
{
    // Block and thread indices within the 2D launch
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd   = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep  = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep  = BLOCK_SIZE * wB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) {

        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + wA * ty + tx];
        BS(ty, tx) = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += AS(ty, k) * BS(k, tx);

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}
#endif // #ifndef _MATRIXMUL_KERNEL_H_