18
static cl_platform_id platform = 0;
19
static cl_device_id device = 0;
21
static cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
22
static cl_context ctx = 0;
23
static cl_command_queue queues[2] = {0};
24
static cl_command_queue queue = 0;
26
static cl_mem dev_res[2], dev_a[2], dev_b[2];
27
static cl_mem pinned_mem_res, pinned_mem_a, pinned_mem_b;
28
float *pinned_res, *pinned_a, *pinned_b;
35
int exercise_required_alignment = PPT * BLOCK_SIZE;
37
int exercise_init(const char *name, size_t size) {
47
err = clGetPlatformIDs(1, &platform, NULL);
48
if (err != CL_SUCCESS) {
49
printf( "clGetPlatformIDs() failed with %d\n", err );
53
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
54
if (err != CL_SUCCESS) {
55
printf( "clGetDeviceIDs() failed with %d\n", err );
59
err = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(param), param, NULL);
61
printf("Using device: %s\n", param);
65
props[1] = (cl_context_properties)platform;
66
ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
67
if (err != CL_SUCCESS) {
68
printf( "clCreateContext() failed with %d\n", err );
72
for (i = 0; i < 2; i++) {
73
queues[i] = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &err);
74
if (err != CL_SUCCESS) {
75
printf( "clCreateCommandQueue() failed with %d\n", err );
76
clReleaseContext(ctx);
80
dev_a[i] = clCreateBuffer(ctx, CL_MEM_READ_ONLY, size * size * sizeof(float), NULL, &err);
81
if (err != CL_SUCCESS) {
82
printf("clCreateBuffer() failed with %d\n", err);
86
dev_b[i] = clCreateBuffer(ctx, CL_MEM_READ_ONLY, size * size * sizeof(float), NULL, &err);
87
if (err != CL_SUCCESS) {
88
printf("clCreateBuffer() failed with %d\n", err);
92
dev_res[i] = clCreateBuffer(ctx, CL_MEM_READ_WRITE, size * size * sizeof(float), NULL, &err);
93
if (err != CL_SUCCESS) {
94
printf("clCreateBuffer() failed with %d\n", err);
100
char *res = calloc(size * size, sizeof(float));
102
for (i = 0; i < 2; i++) {
103
err = clEnqueueWriteBuffer(queue, dev_res[i], CL_TRUE, 0, size * size * sizeof(float), res, 0, NULL, NULL);
104
if (err != CL_SUCCESS) {
105
printf("clEnqueueWriteBuffer() failed with %d\n", err);
111
err = clFinish(queue);
112
if (err != CL_SUCCESS) {
113
printf("clFinish() failed with %d\n", err);
118
sprintf(param, "%s.cl", name);
119
f = fopen(param, "r");
121
printf("Can't open file with OpenCL kernels\n");
125
fseek(f, 0, SEEK_END);
127
fseek(f, 0, SEEK_SET);
129
source = (char*)malloc(len + 128);
131
printf("Can't allocate memory for OpenCL source\n");
135
sprintf(source, "#define BLOCK_SIZE %lu\n#define PPT %lu\n\n", BLOCK_SIZE, PPT, PPT);
136
if (fread(source + strlen(source), 1, len, f) != len) {
137
printf("Can't read OpenCL source\n");
142
len = strlen(source);
144
app = clCreateProgramWithSource(ctx, 1, (const char**)&source, &len, &err);
145
if (err != CL_SUCCESS) {
146
printf("clCreateProgramWithSource() failed with %d\n", err);
150
err = clBuildProgram(app, 1, &device, "", NULL, NULL);
151
if (err != CL_SUCCESS) {
152
printf("clBuildProgram() failed with %d\n", err);
158
kernel = clCreateKernel(app, "multiply", &err);
159
if (err != CL_SUCCESS) {
160
printf("clCreateKernel() failed with %d\n", err);
165
clSetKernelArg(kernel, 3, sizeof(unsigned long), &matrix_size);
166
err = clSetKernelArg(kernel, 4, 2 * SHMEM * BLOCK_SIZE * BLOCK_SIZE * sizeof(float), NULL);
171
int exercise_allocate(float **res, float **a, float **b, size_t size) {
172
pinned_mem_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size * size * sizeof(float), NULL, &err);
173
if (err != CL_SUCCESS) {
174
printf("clCreateBuffer() for pinned memory failed with %d\n", err);
178
pinned_a = (cl_float*)clEnqueueMapBuffer(queue, pinned_mem_a, CL_TRUE, CL_MAP_WRITE, 0, size * size * sizeof(float), 0, NULL, NULL, &err);
179
if (err != CL_SUCCESS) {
180
printf("clEnqueueMapBuffer() failed with %d\n", err);
184
pinned_mem_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size * size * sizeof(float), NULL, &err);
185
if (err != CL_SUCCESS) {
186
printf("clCreateBuffer() for pinned memory failed with %d\n", err);
190
pinned_b = (cl_float*)clEnqueueMapBuffer(queue, pinned_mem_b, CL_TRUE, CL_MAP_WRITE, 0, size * size * sizeof(float), 0, NULL, NULL, &err);
191
if (err != CL_SUCCESS) {
192
printf("clEnqueueMapBuffer() failed with %d\n", err);
196
pinned_mem_res = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size * size * sizeof(float), NULL, &err);
197
if (err != CL_SUCCESS) {
198
printf("clCreateBuffer() for pinned memory failed with %d\n", err);
202
pinned_res = (cl_float*)clEnqueueMapBuffer(queue, pinned_mem_res, CL_TRUE, CL_MAP_READ, 0, size * size * sizeof(float), 0, NULL, NULL, &err);
203
if (err != CL_SUCCESS) {
204
printf("clEnqueueMapBuffer() failed with %d\n", err);
216
void exercise_free() {
219
clReleaseKernel(kernel);
220
clReleaseProgram(app);
222
clEnqueueUnmapMemObject(queue, pinned_mem_a, pinned_a, 0, NULL, NULL);
223
clEnqueueUnmapMemObject(queue, pinned_mem_b, pinned_b, 0, NULL, NULL);
224
clEnqueueUnmapMemObject(queue, pinned_mem_res, pinned_res, 0, NULL, NULL);
226
clReleaseMemObject(pinned_mem_a);
227
clReleaseMemObject(pinned_mem_b);
228
clReleaseMemObject(pinned_mem_res);
230
clReleaseCommandQueue(queue);
231
clReleaseContext(ctx);
233
for (i = 0; i < 2; i++) {
234
clReleaseMemObject(dev_res[i]);
235
clReleaseMemObject(dev_b[i]);
236
clReleaseMemObject(dev_a[i]);
240
size_t exercise(float *res, float *a, float *b, size_t size, int iterations) {
243
cl_event events[iterations];
247
size_t local_size[] = {BLOCK_SIZE, BLOCK_SIZE};
248
size_t global_size[] = {size/PPT, size/PPT};
250
for (i = 0; i <= iterations; i++) {
251
if (i < iterations) {
254
err = clEnqueueWriteBuffer(queues[pos], dev_a[pos], CL_FALSE, 0, size * size * sizeof(float), a, 0, NULL, NULL);
255
if (err != CL_SUCCESS) {
256
printf("clEnqueueWriteBuffer() failed with %d\n", err);
260
err = clEnqueueWriteBuffer(queues[pos], dev_b[pos], CL_FALSE, 0, size * size * sizeof(float), b, 0, NULL, NULL);
261
if (err != CL_SUCCESS) {
262
printf("clEnqueueWriteBuffer() failed with %d\n", err);
271
clSetKernelArg(kernel, 0, sizeof(cl_mem), &dev_res[pos]);
272
clSetKernelArg(kernel, 1, sizeof(cl_mem), &dev_a[pos]);
273
clSetKernelArg(kernel, 2, sizeof(cl_mem), &dev_b[pos]);
275
err = clEnqueueNDRangeKernel(queues[pos], kernel, 2, 0, global_size, local_size, 0, NULL, &events[i-1]);
276
if (err != CL_SUCCESS) {
277
printf("clEnqueueNDRangeKernel() failed with %d\n", err);
281
err = clEnqueueReadBuffer(queues[pos], dev_res[pos], CL_FALSE, 0, size * size * sizeof(float), res, 0, NULL, NULL);
282
if (err != CL_SUCCESS) {
283
printf("clEnqueueReadBuffer() failed with %d\n", err);
288
err = clFinish(queues[0]);
289
if (err != CL_SUCCESS) {
290
printf("clFinish() failed with %d\n", err);
294
err = clFinish(queues[1]);
295
if (err != CL_SUCCESS) {
296
printf("clFinish() failed with %d\n", err);
301
for (i = 0; i < iterations; i++) {
302
err = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
303
if (err != CL_SUCCESS) {
304
printf("clGetEventProfilingInfo() failed with %d\n", err);
308
err = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
309
if (err != CL_SUCCESS) {
310
printf("clGetEventProfilingInfo() failed with %d\n", err);
314
runtime += end - start;