/opencl/tools

To get this branch, use:
bzr branch http://darksoft.org/webbzr/opencl/tools
1 by Suren A. Chilingaryan
Initial commit
1
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <pthread.h>
#include <sys/time.h>

#include <CL/cl.h>
7
8
/* Number of kernel launches enqueued inside the timed loop of bench(). */
#define ITERATIONS 500000
9
10
/*
 * Prints `reason` to stderr and terminates the process with exit status
 * `code`.
 *
 * Wrapped in do { } while (0) so that the expansion is a single statement:
 * `if (cond) fail(...); else ...` then parses correctly, which a bare
 * `{ ... }` body followed by `;` does not.
 */
#define fail(code, reason) do { \
    fprintf(stderr, "%s\n", (reason)); \
    exit(code); \
} while (0)
14
15
/* Barrier that bench() waits on right before it starts timing; main() initializes and destroys it around each group of runs. */
pthread_barrier_t bar;
16
17
typedef struct  {
18
    cl_uint num_devices;
19
    cl_device_id *devices;
20
} device_info_t;
21
22
/*
 * Creates an OpenCL program in `ctx` and builds it for `devices`.
 *
 * ctx          context the program is created in
 * num_devices  number of entries in `devices`
 * devices      devices to build for
 * name         path of the kernel source file, or NULL to use a built-in
 *              empty kernel named "test"
 *
 * The device names, a build-failure notice and per-device build logs are
 * printed to stdout.
 *
 * Returns the program object, or NULL if the source file could not be read
 * or the program could not be created.  A program whose build failed is
 * still returned; the failure is only reported on stdout.  The caller
 * releases the program with clReleaseProgram().
 */
static inline cl_program load_app(cl_context ctx, cl_uint num_devices, cl_device_id *devices, const char *name) {
    static const char default_source[] = "__kernel void test() {}";

    const char *source = default_source;
    /* The length is passed to clCreateProgramWithSource(), so it must be
       valid for the built-in source too. */
    size_t len = sizeof(default_source) - 1;
    char *file_buf = NULL;
    cl_program app;
    cl_uint i;

    if (name) {
        long file_size;
        FILE *f = fopen(name, "rb");
        if (!f) return NULL;

        if (fseek(f, 0, SEEK_END) != 0 || (file_size = ftell(f)) < 0 || fseek(f, 0, SEEK_SET) != 0) {
            fclose(f);
            return NULL;
        }
        len = (size_t)file_size;

        /* One extra byte for a terminator: a length of 0 tells OpenCL that
           the string is null-terminated, so an empty file stays well defined. */
        file_buf = malloc(len + 1);
        if (!file_buf) {
            fclose(f);
            return NULL;
        }

        if (fread(file_buf, 1, len, f) != len) {
            free(file_buf);
            fclose(f);
            return NULL;
        }
        fclose(f);

        file_buf[len] = '\0';
        source = file_buf;
    }

    printf("Compiling for ");
    for (i = 0; i < num_devices; i++) {
        char dev_name[256];
        cl_int err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL);
        /* fputs() rather than puts(): the names share a single output line. */
        if (err != CL_SUCCESS) fputs("Unknown ", stdout);
        else printf("%s ", dev_name);
    }
    printf("\n");

    app = clCreateProgramWithSource(ctx, 1, &source, &len, NULL);

    if (app) {
        char build_log[4096];

        //"-cl-nv-maxrregcount=48"
        //"-cl-nv-opt-level=<N>" (0 - no optimizations, 3 - default)
        //"-cl-nv-arch sm_XX" - selects the target CUDA Compute Level architecture to compile for (sm_10 for 1.0, sm_11 for 1.1, sm_12 for 1.2, sm_13 for 1.3 and sm_20 for 2.0 (Fermi))
        //"--cl-nv-cstd=CLX.X" - selects the target OpenCL C language version (CL1.0 or CL1.1)
        const char *build_flags = "-cl-nv-arch sm_20";
        cl_int err = clBuildProgram(app, num_devices, devices, build_flags, NULL, NULL);
        if (err != CL_SUCCESS) printf("Application build failed (%i)\n", err);

        for (i = 0; i < num_devices; i++) {
            cl_build_status build_status = CL_BUILD_NONE;
            size_t size = 0;

            /* Poll until the build has left the in-progress state; stop as
               soon as the query itself fails instead of spinning on a stale
               status value. */
            do {
                err = clGetProgramBuildInfo(app, devices[i], CL_PROGRAM_BUILD_STATUS, sizeof(build_status), &build_status, NULL);
            } while (err == CL_SUCCESS && build_status == CL_BUILD_IN_PROGRESS);

            err = clGetProgramBuildInfo(app, devices[i], CL_PROGRAM_BUILD_LOG, sizeof(build_log) - 1, build_log, &size);
            if (err != CL_SUCCESS) continue;

            if (size >= sizeof(build_log)) size = sizeof(build_log) - 1;
            build_log[size] = '\0';

            if (build_status == CL_BUILD_ERROR) {
                printf("Build failed for device %u:\n======================\n%s\n--------------------\n\n", i, build_log);
            } else if (build_status != CL_BUILD_SUCCESS) {
                printf("Build failed for device %u\n", i);
            }
        }
    } else {
        printf("Program creation failed\n");
    }

    free(file_buf);
    return app;
}
101
102
/*
 * Times ITERATIONS launches of `kern` on `device` and prints the average
 * time per launch, in whole microseconds, to stdout.
 *
 * ctx     context that `device` and `kern` belong to
 * device  device the command queue is created for
 * kern    kernel to launch over a 1024x1024 global range
 *
 * The function waits on the global barrier `bar` before timing starts, so
 * `bar` must have been initialized for the number of concurrent callers.
 * Any OpenCL failure terminates the process through fail().
 */
void bench(cl_context ctx, cl_device_id device, cl_kernel kern) {
    cl_int err;
    struct timeval tv1, tv2;
    size_t i, iterations = ITERATIONS;
    size_t dims[2] = {1024, 1024};
    long long elapsed_us;
    unsigned long us;

    cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        fail(-1, "clCreateCommandQueue() failed");
    }

    /* One untimed launch, completed before the barrier
       (presumably to keep one-time setup cost out of the measurement). */
    err = clEnqueueNDRangeKernel(queue, kern, 2, NULL, dims, NULL, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        fail(-1, "clEnqueueNDRangeKernel() failed");
    }
    clFinish(queue);

    // synchronizing
    pthread_barrier_wait(&bar);

    gettimeofday(&tv1, NULL);
    for (i = 0; i < iterations; i++) {
        err = clEnqueueNDRangeKernel(queue, kern, 2, NULL, dims, NULL, 0, NULL, NULL);
        if (err != CL_SUCCESS) break;
    }
    clFinish(queue);
    gettimeofday(&tv2, NULL);

    if (err != CL_SUCCESS) {
        fail(-1, "clEnqueueNDRangeKernel() failed");
    }

    /* Signed arithmetic: the tv_usec difference alone may be negative. */
    elapsed_us = (long long)(tv2.tv_sec - tv1.tv_sec) * 1000000LL + (long long)(tv2.tv_usec - tv1.tv_usec);
    if (elapsed_us < 0) elapsed_us = 0;
    us = (unsigned long)(elapsed_us / (long long)iterations);

    clReleaseCommandQueue(queue);

    printf("  Execution time: %lu us\n", us);
}
137
138
struct thread_info {
139
    cl_context ctx;
140
    cl_device_id device;
141
    cl_kernel kern;
142
};
143
144
void *bench_thread(void *vargs) {
145
    struct thread_info *args = (struct thread_info*)vargs;
146
147
    bench(args->ctx, args->device, args->kern);
148
}
149
150
151
int main(int argc, char *argv[]) {
152
    int devid = -1;
153
    char *source = NULL;
154
    
155
    cl_int err;
156
    cl_device_id devices[16];
157
    pthread_t thr[16];
158
    struct thread_info args[16];
159
160
    cl_context ctx;
161
    cl_context ctxs[16];
162
163
    cl_program app;
164
    cl_program apps[16];
165
166
    cl_kernel kern;
167
    cl_kernel kerns[16];
168
169
170
    char fname[256];    
171
    unsigned char *binary[16];
172
    size_t binary_size[16];
173
    size_t real_size;
174
175
176
    FILE *f;
177
    
178
/*
179
    if (argc < 2) {
180
	fprintf(stderr, "Usage: %s [source] [device]\n", argv[0]);
181
	exit(0);	
182
    }
183
    
184
185
    if (argc > 2) {
186
	devid = atoi(argv[2]);	
187
    }
188
*/
189
190
    cl_uint max = 0;
191
    cl_platform_id selected_platform;
192
    cl_uint i, j, num_platforms, num_devices;
193
    cl_platform_id platforms[4];
194
195
    clGetPlatformIDs(4, platforms, &num_platforms);
196
197
    for (i = 0; i < num_platforms; i++) {
198
	err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
199
	if ((err == CL_SUCCESS)&&(num_devices > max)) {
200
	    max = num_devices;
201
	    selected_platform = platforms[i];
202
	}
203
    }
204
    
205
    clGetDeviceIDs(selected_platform, CL_DEVICE_TYPE_GPU, 16, devices, &num_devices);
206
    
207
    if (devid >= (int)num_devices) fail(-1, "Invalid device number is specified");
208
209
210
//    cl_context ctx = clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);
211
//    if (!ctx) fail(-1, "Failed to create OpenCL context");
212
213
  
214
//    err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, 16 * sizeof(cl_device_id), &devices, &num_devices);
215
//    if (err != CL_SUCCESS) fail(-1, "clGetContextInfo call is failed");
216
217
//    num_devices /= sizeof(cl_device_id);
218
219
    
220
    for (i = 0; i < num_devices; i++) {
221
	ctxs[i] = clCreateContext(0, 1, &devices[i], NULL, NULL, &err);
222
        if (!ctxs[i]) fail(-1, "Failed to create OpenCL context");
223
	
224
	apps[i] = load_app(ctxs[i], 1, &devices[i], source);
225
	if (!apps[i]) fail(-1, "Compilation failed");
226
    
227
	kerns[i] = clCreateKernel(apps[i], "test", &err);
228
	if (err != CL_SUCCESS) fail(-1, "Error creating kernel");
229
    }
230
231
    pthread_barrier_init(&bar, NULL, num_devices);
232
233
    printf("Running in individual context:\n");
234
    for (i = 0; i < num_devices; i++) {
235
	args[i].ctx = ctxs[i];
236
	args[i].device = devices[i];
237
	args[i].kern = kerns[i];
238
//	pthread_create(&thr[i], NULL, bench_thread, &args[i]);
239
    }
240
241
    for (i = 0; i < num_devices; i++) {
242
	void *res;
243
//	pthread_join(thr[i], &res);
244
        clReleaseKernel(kerns[i]);
245
	clReleaseProgram(apps[i]);
246
	clReleaseContext(ctxs[i]);
247
    }
248
    printf("\n\n");
249
250
    pthread_barrier_destroy(&bar);
251
252
    for (j = /*2*/num_devices; j <= num_devices; j++) {
253
	pthread_barrier_init(&bar, NULL, j);
254
	
255
	ctx = clCreateContext(0, j, devices, NULL, NULL, &err);
256
	if (!ctx) fail(-1, "Failed to create OpenCL context");
257
258
	app = load_app(ctx, j, devices, source);
259
	if (!app) fail(-1, "Compilation failed");
260
261
	kern = clCreateKernel(app, "test", &err);
262
	if (err != CL_SUCCESS) fail(-1, "Error creating kernel");
263
264
265
	printf("Running parallel for %i devices:\n", j);
266
	for (i = 0; i < j; i++) {
267
	    args[i].ctx = ctx;
268
	    args[i].device = devices[i];
269
	    args[i].kern = kern;
270
	    pthread_create(&thr[i], NULL, bench_thread, &args[i]);
271
	}
272
273
	for (i = 0; i < j; i++) {
274
	    void *res;
275
	    pthread_join(thr[i], &res);
276
	}
277
278
	printf("\n\n");
279
280
	pthread_barrier_destroy(&bar);
281
    
282
	clReleaseKernel(kern);
283
	clReleaseProgram(app);
284
	clReleaseContext(ctx);
285
    }
286
    
287
    pthread_barrier_init(&bar, NULL, 1);
288
289
    ctx = clCreateContext(0, num_devices, devices, NULL, NULL, &err);
290
    if (!ctx) fail(-1, "Failed to create OpenCL context");
291
292
    app = load_app(ctx, num_devices, devices, source);
293
    if (!app) fail(-1, "Compilation failed");
294
295
    kern = clCreateKernel(app, "test", &err);
296
    if (err != CL_SUCCESS) fail(-1, "Error creating kernel");
297
298
    printf("Running sequential (multi-devices context):\n");
299
    for (i = 0; i < num_devices; i++) {
300
	bench(ctx, devices[i], kern);
301
    }
302
303
    clReleaseKernel(kern);
304
    clReleaseProgram(app);
305
    clReleaseContext(ctx);
306
    
307
    pthread_barrier_destroy(&bar);
308
}