/*
 * Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users. This source code is a "commercial item" as
 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
 * "commercial computer software" and "commercial computer software
 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 *
 * Any use of this source code in individual and commercial software must
 * include, in the user documentation and internal comments to the code,
 * the above Disclaimer and U.S. Government End Users Notice.
 */
/* Matrix multiplication: C = A * B.
 *
 * This sample implements matrix multiplication and is exactly the same as
 * Chapter 7 of the programming guide.
 * It has been written for clarity of exposition to illustrate various CUDA
 * programming principles, not with the goal of providing the most
 * performant generic kernel for matrix multiplication.
 *
 * CUBLAS provides high-performance matrix multiplication.
 */
// System includes (the original include block was lost in extraction;
// malloc/free, printf and srand below require these headers).
#include <stdlib.h>
#include <stdio.h>
#include <math.h>

// Kernel definitions — presumably also supplies BLOCK_SIZE, the matrix
// dimension macros (WA/HA/WB/HB/WC/HC) and the texture `tex` used below.
#include <matrixMul_kernel.cu>
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char** argv);
void randomInit(float*, int, int, int);
void printDiff(float*, float*, int, int);

// CPU reference implementation (defined elsewhere in the project).
// Called below as computeGold(reference, h_A, h_B, HA, WA, WB).
void computeGold(float*, const float*, const float*, unsigned int, unsigned int, unsigned int);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char** argv)
{
    // Run the benchmark/correctness test; NOTE(review): this call was lost
    // in the garbled extraction and has been restored — it is the only
    // plausible body given the forward declaration above.
    runTest(argc, argv);

    // CUT_EXIT(argc, argv);

    return 0;
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA.
//! Times four matrix-multiplication kernel variants (simple texture, simple
//! global-memory, tiled texture, tiled global-memory) over `iters` launches
//! each, then validates the last result against a CPU reference.
//! NOTE(review): reconstructed from a garbled extraction — the declarations
//! of d_A/d_B/d_C/d_T/iters/i, the opening/closing braces, the srand call,
//! the final device-to-host copy and the host-memory frees were missing and
//! have been restored; verify against the original sample.
////////////////////////////////////////////////////////////////////////////////
void
runTest(int argc, char** argv)
{
    CUT_DEVICE_INIT(argc, argv);

    // set seed for rand()
    srand(2006);

    // Number of timing iterations per kernel.
    // NOTE(review): the original count was lost in extraction — confirm.
    int iters = 100;
    int i;

    // allocate host memory for matrices A and B
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A = (float*) malloc(mem_size_A);
    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B = (float*) malloc(mem_size_B);

    // initialize host memory
    randomInit(h_A, size_A, WA, HA);
    randomInit(h_B, size_B, WB, HB);

    // allocate device memory
    float* d_A;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_A, mem_size_A));
    float* d_B;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_B, mem_size_B));

    // copy host memory to device
    CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice) );
    CUDA_SAFE_CALL(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice) );

    // One linear buffer holding A immediately followed by B, bound to the
    // texture so the tex* kernels can fetch A at offset 0 and B at size_A.
    float* d_T;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_T, mem_size_A + mem_size_B));
    CUDA_SAFE_CALL(cudaBindTexture(0, tex, d_T, mem_size_A + mem_size_B));
    // CUDA_SAFE_CALL(cudaMemset(d_T, 0, mem_size_A + mem_size_B) );
    CUDA_SAFE_CALL(cudaMemcpy(d_T , h_A, mem_size_A, cudaMemcpyHostToDevice) );
    CUDA_SAFE_CALL(cudaMemcpy(d_T + size_A, h_B, mem_size_B, cudaMemcpyHostToDevice) );

    // allocate device memory for result
    unsigned int size_C = WC * HC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* d_C;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_C, mem_size_C));

    // allocate host memory for the result
    float* h_C = (float*) malloc(mem_size_C);

    // create and start timer
    unsigned int timer = 0;
    cutCreateTimer(&timer);

    // setup execution parameters
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(WC / threads.x, HC / threads.y);

    // Debug aid: run the texture self-test kernel and dump the top-left
    // 10x10 of its output (indexed with WA as the row stride, as in the
    // original).
    testTex<<< grid, threads >>>(d_C, 0, size_A, WA, WB);
    CUDA_SAFE_CALL(cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost) );
    for (int k=0;k<10;k++) {
        for (int l=0;l<10;l++)
            printf("%f ", h_C[k*WA+l]);
    }

    // execute the kernel: simple texture variant (warm-up launch, then timed)
    matrixTexSimpleMul<<< grid, threads >>>(d_C, 0, size_A, WA, WB);
    cudaThreadSynchronize();

    cutResetTimer(timer);
    cutStartTimer(timer);
    for (i=0;i<iters;++i) {
        matrixTexSimpleMul<<< grid, threads >>>(d_C, 0, size_A, WA, WB);
    }
    cudaThreadSynchronize();

    printf("Processing time (simple,tex): %f (ms) \n", cutGetTimerValue(timer)/iters);

    // copy result from device to host
    CUDA_SAFE_CALL(cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost) );

    // execute the kernel: simple global-memory variant
    matrixSimpleMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);
    cudaThreadSynchronize();

    cutResetTimer(timer);
    cutStartTimer(timer);
    for (i=0;i<iters;++i) {
        matrixSimpleMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);
    }
    cudaThreadSynchronize();

    printf("Processing time (Simple): %f (ms) \n", cutGetTimerValue(timer) / iters);

    // execute the kernel: tiled texture variant
    matrixTexMul<<< grid, threads >>>(d_C, 0, size_A, WA, WB);
    cudaThreadSynchronize();

    cutResetTimer(timer);
    cutStartTimer(timer);
    for (i=0;i<iters;++i) {
        matrixTexMul<<< grid, threads >>>(d_C, 0, size_A, WA, WB);
    }
    cudaThreadSynchronize();

    printf("Processing time (texture): %f (ms) \n", cutGetTimerValue(timer)/iters);

    // execute the kernel: tiled shared-memory variant
    matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);
    cudaThreadSynchronize();

    cutResetTimer(timer);
    cutStartTimer(timer);
    for (i=0;i<iters;++i) {
        matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);
    }
    cudaThreadSynchronize();

    printf("Processing time (Device): %f (ms) \n", cutGetTimerValue(timer) / iters);

    // check if kernel execution generated and error
    CUT_CHECK_ERROR("Kernel execution failed");

    // Copy the final result back for the correctness check.
    // NOTE(review): this D2H copy appears to have been dropped by the
    // extraction; without it the comparison below would check a stale h_C.
    CUDA_SAFE_CALL(cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost) );

    // compute reference solution
    float* reference = (float*) malloc(mem_size_C);

    cutResetTimer(timer);
    cutStartTimer(timer);
    computeGold(reference, h_A, h_B, HA, WA, WB);
    printf("Processing time (CPU): %f (ms) \n", cutGetTimerValue(timer));

    // check result (relative L2 norm against the CPU reference)
    CUTBoolean res = cutCompareL2fe(reference, h_C, size_C, 1e-6f);
    printf("Test %s \n", (1 == res) ? "PASSED" : "FAILED");
    if (res!=1) printDiff(reference, h_C, WC, HC);

    // clean up memory (host frees restored — the garbled original leaked them)
    free(h_A);
    free(h_B);
    free(h_C);
    free(reference);
    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_C));
    CUDA_SAFE_CALL(cudaFree(d_T));
    CUT_SAFE_CALL(cutDeleteTimer(timer));
}
////////////////////////////////////////////////////////////////////////////////
// Allocates a matrix with random float entries.
// NOTE(review): despite the name, the active fill is a deterministic ramp
// (element (i,j) gets i + 0.0001*j) which makes results reproducible; the
// true random fill is kept below, commented out, as in the original.
//
// data  - destination buffer of at least w*h floats
// size  - element count (unused by the ramp fill; used by the commented-out
//         random fill)
// w, h  - matrix width and height; data is filled row-major with stride w
////////////////////////////////////////////////////////////////////////////////
void randomInit(float* data, int size, int w, int h)
{
    for (int i = 0; i < w; i++)
        for (int j = 0; j < h; j++)
            data[i + j*w] = i + 0.0001 * j;

    // Original random initialization, kept for reference:
    // for (int i = 0; i < size; ++i)
    //     data[i] = rand() / (float)RAND_MAX;
}
////////////////////////////////////////////////////////////////////////////////
// Prints every element where the CPU and GPU result matrices differ
// (exact float inequality), then the total mismatch count.
//
// data1  - CPU reference matrix (row-major, width*height floats)
// data2  - GPU result matrix (same layout)
// width  - matrix width (row stride)
// height - matrix height
////////////////////////////////////////////////////////////////////////////////
void printDiff(float *data1, float *data2, int width, int height)
{
    int i,j,k;
    int error_count=0;
    for (j=0; j<height; j++) {
        for (i=0; i<width; i++) {
            // Flat row-major index of element (i, j).
            k = j*width+i;
            if (data1[k] != data2[k]) {
                printf("diff(%d,%d) CPU=%4.4f, GPU=%4.4f \n", i,j, data1[k], data2[k]);
                error_count++;
            }
        }
    }
    // The extracted source read " nTotal Errors = %d n" — the backslashes of
    // the \n escapes were stripped in extraction; restored here.
    printf(" \nTotal Errors = %d \n", error_count);
}