From 04ef44def02c76029dc91d1eb17d1532e2714a5c Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Sun, 19 Jun 2016 05:11:21 +0200 Subject: Streaming --- kernel.cl | 5 +++ test.c | 112 ++++++++++++++++++++++++++++++++------------------------------ 2 files changed, 63 insertions(+), 54 deletions(-) diff --git a/kernel.cl b/kernel.cl index 26a0009..258ea5f 100644 --- a/kernel.cl +++ b/kernel.cl @@ -17,3 +17,8 @@ __kernel void measure(uint iter, __global uint *input, __global uint *output, __ output[iter + 1] = RD32(REG_PERF_COUNTER); WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); } + +__kernel void nil() +{ + ; +} diff --git a/test.c b/test.c index ae92c57..3327102 100644 --- a/test.c +++ b/test.c @@ -12,12 +12,15 @@ #define KERNEL_CONTROL //#define OPENCL_TIMINGS +#define CPU_WAIT +//#define CPU_MARKER #define BAR PCILIB_BAR0 #define TLP_SIZE 64 #define PAGE_SIZE 4096 #define NUM_PAGES 16 +#define ITERATIONS 100 #define KMEM_DEFAULT_FLAGS (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE) @@ -184,6 +187,8 @@ int main(void) CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 3, sizeof (cl_mem), &bar_cl)); CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 4, sizeof (ulong), &bus_address.surface_bus_address)); + cl_kernel nil_kernel = clCreateKernel (program, "nil", &err); + pcilib_kmem_handle_t *desc_kmem = pcilib_alloc_kernel_memory (pci, PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS); uintptr_t desc_bus = pcilib_kmem_get_block_ba (pci, desc_kmem, 0); volatile void *desc = (uint32_t *) pcilib_kmem_get_block_ua (pci, desc_kmem, 0); @@ -227,44 +232,6 @@ int main(void) struct timespec tss, tse, tsk; - - for (i = 0; i < 100; i++) { - WR32_sleep (REG_DMA, 0); - WR32_sleep (REG_PERF_COUNTER, 0); - WR64_sleep (REG_DESCRIPTOR_ADDRESS, bus_address.surface_bus_address); - - *(volatile uint32_t*)marker = 0; - *hwaddr = 0; - - clock_gettime(CLOCK_REALTIME, &tss); - WR32 (REG_DMA, 1); - - if (i < 50) { - while ((*hwaddr) == 0) { - } - } else { - while ((*(volatile uint32_t*)marker) < 1) { - } - } - clock_gettime(CLOCK_REALTIME, &tse); - - CL_CHECK_STATUS (clEnqueueNDRangeKernel (command_queue, process_kernel, 1, NULL, &work_size, NULL, 0, NULL, &event)); - CL_CHECK_STATUS (clWaitForEvents (1, &event)); - clock_gettime(CLOCK_REALTIME, &tsk); - - double lath = 4. * RD32 (0x20) / 1000; - - double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.; - double latk = (tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.; - - printf(" iteration %u, hw: %6.3lf us, sw: %6.3lf us, +krn: %6.3lf us, maker: %u\n", i, lath, lat, latk, *(volatile uint32_t*)marker); - - CL_CHECK_STATUS (clReleaseEvent (event)); - } - - usleep(100000); - - #ifdef KERNEL_CONTROL cl_kernel kernel = measure_kernel; #else @@ -272,13 +239,14 @@ int main(void) #endif /* KERNEL_CONTROL */ - for (i = 0; i < 10; i++) { + for (i = 0; i < ITERATIONS; i++) { cl_event wevent[NUM_PAGES], kevent[NUM_PAGES]; printf("Iteration %i\n", i); WR32_sleep (REG_DMA, 0); WR32_sleep (REG_PERF_COUNTER, 0); *(volatile uint32_t*)marker = 0; + *hwaddr = 0; clock_gettime(CLOCK_REALTIME, &tss); // we rather need to trigger it every few milliseconds and see what happens. @@ -287,11 +255,13 @@ int main(void) #ifndef KERNEL_CONTROL WR64_sleep (REG_DESCRIPTOR_ADDRESS, bus_address.surface_bus_address + j * PAGE_SIZE); #endif -// if (j) { -// CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, j + 1, 1, &wevent[j-1], &wevent[j])); -// } else { - CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, j + 1, 0, NULL, &wevent[j])); -// } + +#ifdef CPU_WAIT + wevent[j] = clCreateUserEvent(context, &err); CL_CHECK_STATUS(err); +#else + CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, j + 1, 0, NULL, &wevent[j])); +#endif + CL_CHECK_STATUS (clSetKernelArg (kernel, 0, sizeof (uint), &j)); CL_CHECK_STATUS (clEnqueueNDRangeKernel (command_queue, kernel, 1, NULL, &work_size, NULL, 1, &wevent[j], &kevent[j])); CL_CHECK_STATUS (clFlush(command_queue)); @@ -299,7 +269,9 @@ int main(void) #ifdef KERNEL_CONTROL // we write one extra in the end. +# ifndef CPU_WAIT CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, NUM_PAGES + 1, 0, NULL, NULL)); +# endif CL_CHECK_STATUS (clFlush(command_queue)); #endif clock_gettime(CLOCK_REALTIME, &tse); @@ -309,27 +281,53 @@ int main(void) clock_gettime(CLOCK_REALTIME, &tss); double lat_flush = (tss.tv_sec - tse.tv_sec)*1000000 + 1. * (tss.tv_nsec - tse.tv_nsec) / 1000.; + #ifdef KERNEL_CONTROL WR64 (REG_DESCRIPTOR_ADDRESS, bus_address.surface_bus_address); #endif + + clock_gettime(CLOCK_REALTIME, &tss); WR32 (REG_DMA, 1); - int cur; + printf(" Markers (us):"); + double lastlat = 0; +#ifdef CPU_WAIT + for (j = 0; j < NUM_PAGES; j++) { +# ifdef CPU_MARKER + while ((*hwaddr) == 0) { + } + *hwaddr = 0; +# else + while ((*(volatile uint32_t*)marker) < (j + 1)) { + } +# endif + clock_gettime(CLOCK_REALTIME, &tse); + CL_CHECK_STATUS (clSetUserEventStatus(wevent[j], CL_COMPLETE)); + double latm = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.; + printf(" %8.3lf", latm - lastlat); + lastlat = latm; + } +#else + int cur, curupd; for (cur = 0; (*(volatile uint32_t*)marker) < NUM_PAGES;) { if (cur != (*(volatile uint32_t*)marker)) { clock_gettime(CLOCK_REALTIME, &tse); - cur = (*(volatile uint32_t*)marker); + curupd = (*(volatile uint32_t*)marker); double latm = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.; - printf(" Marker %u after %6.3lf us\n", cur, latm); + for (cur++; cur < curupd; cur++) printf(" -"); + printf(" %8.3lf", latm - lastlat); + cur = curupd; + lastlat = latm; } - } +#endif + printf("\n"); CL_CHECK_STATUS (clWaitForEvents (1, &kevent[NUM_PAGES - 1])); CL_CHECK_STATUS (clFinish(command_queue)); clock_gettime(CLOCK_REALTIME, &tse); - double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.; + double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.; lat /= NUM_PAGES; printf(" Markers: 0x%lx %u\n", *hwaddr, *(volatile uint32_t*)marker); @@ -350,18 +348,23 @@ int main(void) cl_ulong end, endk, endw, startw, startk; CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j - 1], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &endk, NULL)); - CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &startw, NULL)); - CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &endw, NULL)); CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &startk, NULL)); CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL)); +#ifdef CPU_WAIT + startw = endk; + endw = startk; +#else + CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &startw, NULL)); + CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &endw, NULL)); +#endif printf("k-%.3lf-w-%.3lf-w-%.3lf-k-%.3lf ", 1. * (startw - endk) / 1000, 1. * (endw - startw) / 1000, 1. * (startk - endw) / 1000, 1. * (end - startk) / 1000); } printf("\n"); - double lath = 4. * RD32 (0x20) / 1000; - printf(" fpga: %6.3lf us, software: %6.3lf us, sched: %6.3lf us, flush: %6.3lf us\n", lath, lat, lat_sched, lat_flush); + double lath = 4. * RD32 (0x20) / 1000; lath /= NUM_PAGES; + printf(" fpga: %8.3lf us, software: %8.3lf us, sched: %8.3lf us, flush: %8.3lf us\n", lath, lat, lat_sched, lat_flush); /* CL_CHECK_STATUS(clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, &res_size)); @@ -383,7 +386,7 @@ int main(void) #ifdef KERNEL_CONTROL uint data[1024]; CL_CHECK_STATUS (clEnqueueReadBuffer (command_queue, output, CL_TRUE, 0, 4096, data, 0, NULL, NULL)); - printf("\nLatencies: "); + printf(" FPGA Latencies: "); for (j = 0; j < NUM_PAGES + 1; j++) { /* if (j) printf("%6.3lf ", 4. * (data[j] - data[j - 1]) / 1000); @@ -411,6 +414,7 @@ int main(void) pcilib_close(pci); + clReleaseKernel (nil_kernel); clReleaseKernel (process_kernel); clReleaseKernel (measure_kernel); clReleaseProgram (program); -- cgit v1.2.1