8
// Element type used for all host and device matrix buffers.
typedef float data_type;
9
// Square matrix configuration; `size` is the total element count.
#define matrix_width 4096
10
#define matrix_height 4096
11
#define size (matrix_width * matrix_height)
15
// Kernel source is compiled into this translation unit; presumably defines
// vecAdd and transpose1/2/3 used below -- TODO confirm against test_kernel.cu.
#include <test_kernel.cu>
24
// NOTE(review): this fragment is the body of a function whose header is not
// in this excerpt; `timer`, `a`, `b`, `res`, `block_width`, `iters`, `i`, `j`
// are declared before it. The bare integers between statements are line
// numbers from the original listing that leaked into this copy.
cutCreateTimer(&timer);
26
data_type *d_a, *d_b, *d_res;
29
// Device buffers for the vecAdd sanity run (size = matrix_width*matrix_height).
CUDA_SAFE_CALL(cudaMalloc( (void**) &d_a, size*sizeof(data_type)));
30
CUDA_SAFE_CALL(cudaMalloc( (void**) &d_b, size*sizeof(data_type)));
31
CUDA_SAFE_CALL(cudaMalloc( (void**) &d_res, size*sizeof(data_type)));
33
CUDA_SAFE_CALL(cudaMemcpy(d_a, a, size*sizeof(data_type), cudaMemcpyHostToDevice));
34
CUDA_SAFE_CALL(cudaMemcpy(d_b, b, size*sizeof(data_type), cudaMemcpyHostToDevice));
36
// BUG(review): a single block of `size` (= 4096*4096) threads far exceeds the
// hardware limit on threads per block (at most 1024), so this launch cannot
// run; it should be <<<ceil(size/threads), threads>>> with an in-kernel bounds
// check. The failure is silent because the launch is never followed by
// cudaGetLastError().
vecAdd<<<1,size>>>(d_res, d_a, d_b);
38
CUDA_SAFE_CALL(cudaMemcpy(res, d_res, size*sizeof(data_type),cudaMemcpyDeviceToHost));
45
should be symmetrical, otherwise (if 16x8) after first step of
46
transpose we will have 16 threads to iterate over 8 elements),
47
using another dimension of threads will destroy memory access */
50
// Thread block is block_width x block_height; `block_width` is assigned on a
// line missing from this excerpt.
int block_height = 16;
51
int real_block_size = block_width * block_height;  // NOTE(review): unused in the visible code
53
// Round the matrix up to a whole number of blocks per dimension. The closing
// braces of the two if-statements fall on lines missing from this excerpt.
int wblocks = matrix_width / block_width;
54
int real_width = wblocks * block_width;
55
if (real_width < matrix_width) {
57
real_width += block_width;
60
int hblocks = matrix_height / block_height;
61
int real_height = hblocks * block_height;
62
if (real_height < matrix_height) {
64
real_height += block_height;
67
// Padded element count actually allocated on host and device below.
int real_size = real_width * real_height;
69
dim3 block_dim(block_width, block_height, 1);
70
dim3 grid_dim(real_width / block_width, real_height / block_height, 1);
72
// NOTE(review): `a` and `b` are reassigned without freeing the buffers they
// pointed to earlier -- host memory leak; likewise d_a/d_b are cudaMalloc'ed
// again below while still holding the vecAdd allocations (no cudaFree in view).
a = (float*) malloc(real_size*sizeof(data_type));
73
b = (float*) malloc(real_size*sizeof(data_type));
78
// Fill a[] so element (i,j) encodes its coordinates (i + 0.0001*j), making a
// transpose error visible in the printed output. Closing braces of both loops
// are on lines missing from this excerpt.
for (i = 0; i < real_width; i++) {
79
for (j = 0; j < real_height; j++) {
80
a[i + j*real_width] = i + 0.0001 * j;
86
CUDA_SAFE_CALL(cudaMalloc( (void**) &d_a, real_size*sizeof(data_type)));
87
CUDA_SAFE_CALL(cudaMalloc( (void**) &d_b, real_size*sizeof(data_type)));
89
CUDA_SAFE_CALL(cudaMemcpy(d_a, a, real_size*sizeof(data_type), cudaMemcpyHostToDevice));
92
// Warm-up launch of transpose3 with (1 + block_width) * block_height floats of
// dynamic shared memory -- presumably the +1 is row padding against shared
// memory bank conflicts; confirm against the kernel source.
transpose3<<<grid_dim, block_dim, (1 + block_width) * block_height*sizeof(data_type)>>>(d_b, d_a, real_width, real_height);
93
// NOTE(review): cudaThreadSynchronize() is the deprecated pre-CUDA-4.0
// spelling of cudaDeviceSynchronize(); its return value is also ignored here
// and at every later call site.
cudaThreadSynchronize();
97
// Timed transpose3 loop. NOTE(review): the cutResetTimer/cutStartTimer pair
// present before the transpose2 and transpose1 loops is not visible before
// this one -- confirm time3 is measured from a freshly started timer.
for (i=0;i<iters;++i) {
98
// NOTE(review): the dynamic shared-memory argument is commented out here, so
// the timed transpose3 uses a different launch configuration than its warm-up
// above -- confirm which configuration is intended to be measured.
transpose3<<<grid_dim, block_dim/*, (1 + block_width) * block_height*sizeof(data_type)*/>>>(d_b, d_a, real_width, real_height);
100
cudaThreadSynchronize();
102
float time3 = cutGetTimerValue(timer);
104
// Warm-up launch for transpose2, then the timed loop (same configuration).
transpose2<<<grid_dim, block_dim, (1 + block_width) * block_height*sizeof(data_type)>>>(d_b, d_a, real_width, real_height);
105
cudaThreadSynchronize();
107
cutResetTimer(timer);
108
cutStartTimer(timer);
109
for (i=0;i<iters;++i) {
110
transpose2<<<grid_dim, block_dim, (1 + block_width) * block_height*sizeof(data_type)>>>(d_b, d_a, real_width, real_height);
112
cudaThreadSynchronize();
114
float time2 = cutGetTimerValue(timer);
117
// Warm-up launch for transpose1, then the timed loop.
transpose1<<<grid_dim, block_dim, (1 + block_width) * block_height*sizeof(data_type)>>>(d_b, d_a, real_width, real_height);
118
cudaThreadSynchronize();
120
cutResetTimer(timer);
121
cutStartTimer(timer);
122
for (i=0;i<iters;++i) {
123
transpose1<<<grid_dim, block_dim, (1 + block_width) * block_height*sizeof(data_type)>>>(d_b, d_a, real_width, real_height);
125
cudaThreadSynchronize();
127
float time1 = cutGetTimerValue(timer);
133
// NOTE(review): copies only `size` elements although d_b holds the padded
// real_size matrix laid out with a real_width row stride; when padding
// occurred (real_width != matrix_width) the host rows read below are skewed.
// Should probably copy real_size elements -- confirm against the full file.
CUDA_SAFE_CALL(cudaMemcpy(b, d_b, size*sizeof(data_type),cudaMemcpyDeviceToHost));
136
printf("T1 transpose average time: %0.3f ms\n", time1 / iters);
137
printf("T2 transpose average time: %0.3f ms\n", time2 / iters);
138
printf("T3 transpose average time: %0.3f ms\n", time3 / iters);
140
// int runs = blocks / 65535;
145
// Spot-check: print the first 10 transposed values (closing brace of this
// loop is on a line missing from this excerpt).
for (i = 0; i < 10; i++) {
146
printf("%lf ", b[i]);
156
// Entry point: enumerates CUDA devices and prints their properties in the
// style of the SDK deviceQuery sample.
// NOTE(review): implicit-int return type (pre-C99 style); the opening brace
// and the declarations of `deviceCount` and `dev` are on lines missing from
// this excerpt, and no closing brace or return statement is visible either.
main( int argc, char** argv)
160
CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));
161
if (deviceCount == 0)
162
printf("There is no device supporting CUDA\n");
164
for (dev = 0; dev < deviceCount; ++dev) {
165
cudaDeviceProp deviceProp;
166
CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, dev));
168
// 9999.9999 is the sentinel compute capability the runtime reports when no
// real CUDA device is present (emulation-only setups).
if (deviceProp.major == 9999 && deviceProp.minor == 9999)
169
printf("There is no device supporting CUDA.\n");
170
else if (deviceCount == 1)
171
printf("There is 1 device supporting CUDA\n");
173
// NOTE(review): the `else` that should guard this printf appears to be on a
// line missing from this excerpt -- as written the message would print
// unconditionally; confirm against the original file.
printf("There are %d devices supporting CUDA\n", deviceCount);
175
printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
176
// NOTE(review): the argument lines (deviceProp.major / deviceProp.minor) for
// the next two printf calls are missing from this excerpt.
printf("  Major revision number:                         %d\n",
178
printf("  Minor revision number:                         %d\n",
180
// NOTE(review): totalGlobalMem and the other size_t-typed fields below are
// printed with %u, which truncates on 64-bit builds -- %zu (or an explicit
// cast) would be correct.
printf("  Total amount of global memory:                 %u bytes\n",
181
deviceProp.totalGlobalMem);
182
// Fields below require the CUDA 2.0+ runtime headers. NOTE(review): the
// matching #endif for this #if is not visible in this excerpt.
#if CUDART_VERSION >= 2000
183
printf("  Number of multiprocessors:                     %d\n",
184
deviceProp.multiProcessorCount);
185
// NOTE(review): "8 cores per SM" is a G80/compute-1.x assumption; it is wrong
// for later architectures.
printf("  Number of cores:                               %d\n",
186
8 * deviceProp.multiProcessorCount);
188
printf("  Total amount of constant memory:               %u bytes\n",
189
deviceProp.totalConstMem);
190
printf("  Total amount of shared memory per block:       %u bytes\n",
191
deviceProp.sharedMemPerBlock);
192
printf("  Total number of registers available per block: %d\n",
193
deviceProp.regsPerBlock);
194
printf("  Warp size:                                     %d\n",
195
deviceProp.warpSize);
196
printf("  Maximum number of threads per block:           %d\n",
197
deviceProp.maxThreadsPerBlock);
198
printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
199
deviceProp.maxThreadsDim[0],
200
deviceProp.maxThreadsDim[1],
201
deviceProp.maxThreadsDim[2]);
202
printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
203
deviceProp.maxGridSize[0],
204
deviceProp.maxGridSize[1],
205
deviceProp.maxGridSize[2]);
206
printf("  Maximum memory pitch:                          %u bytes\n",
207
deviceProp.memPitch);
208
printf("  Texture alignment:                             %u bytes\n",
209
deviceProp.textureAlignment);
210
// clockRate is reported in kHz, hence the 1e-6 factor to reach GHz.
printf("  Clock rate:                                    %.2f GHz\n",
211
deviceProp.clockRate * 1e-6f);
212
// NOTE(review): the matching #endif for this #if is also not visible here.
#if CUDART_VERSION >= 2000
213
printf("  Concurrent copy and execution:                 %s\n",
214
deviceProp.deviceOverlap ? "Yes" : "No");
220
// CUT_EXIT(argc, argv);