#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <time.h>

#include <pcilib.h>

#include "CL/cl.h"
#include "CL/cl_ext.h"

#define KERNEL_CONTROL
//#define OPENCL_TIMINGS
#define CPU_WAIT
//#define CPU_MARKER

#define BAR PCILIB_BAR0
#define TLP_SIZE 64
#define PAGE_SIZE 4096
#define NUM_PAGES 16
#define ITERATIONS 100

#define KMEM_DEFAULT_FLAGS (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE)
#define KMEM_USE_RING PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
#define KMEM_USE_DEFAULT PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)

/* FPGA DMA engine registers (byte offsets within BAR0) */
#define REG_RESET_DMA 0x00
#define REG_DMA 0x04
#define REG_NUM_PACKETS_PER_DESCRIPTOR 0x10
#define REG_PERF_COUNTER 0x20
//#define REG_PERF_COUNTER 0x28
#define REG_PACKET_LENGTH 0x0C
#define REG_DESCRIPTOR_ADDRESS 0x50
#define REG_UPDATE_ADDRESS 0x58
#define REG_UPDATE_THRESHOLD 0x60
#define REG_UPDATE_COUNTER 0x70
#define REG_COUNTER 0x9000

/* raw register access through the mapped BAR */
#define WR32(addr, value) *(uint32_t *) (bar + (addr)) = (value);
#define RD32(addr) (*(uint32_t *) (bar + (addr)))
#define WR32_sleep(addr, value) *(uint32_t *) (bar + (addr)) = (value); usleep (100);
#define WR64(addr, value) *(uint64_t *) (bar + (addr)) = (value);
#define RD64(addr) (*(uint64_t *) (bar + (addr)))
#define WR64_sleep(addr, value) *(uint64_t *) (bar + (addr)) = (value); usleep (100);

#define DATA_SIZE (NUM_PAGES * PAGE_SIZE)

#define CL_CHECK_STATUS(error) { \
    if ((error) != CL_SUCCESS) fprintf (stderr, "OpenCL error <%s:%i>: %i\n", __FILE__, __LINE__, (error)); }

static void check_data(cl_command_queue queue, cl_mem mem, size_t size)
{
    uint32_t *data;

    data = malloc (size);
    memset (data, 0, size);
    CL_CHECK_STATUS (clEnqueueReadBuffer (queue, mem, CL_TRUE, 0, size, data, 0, NULL, NULL));
    printf("%x\n", data[0]);
    free (data);
}

int main(void)
{
    int i;
    cl_uint j = 0;
    cl_context context;
    cl_command_queue command_queue;
    cl_int err;
    cl_uint num_of_platforms = 0;
    cl_platform_id platform_id;
    cl_device_id device_id;
    cl_uint num_of_devices = 0;
    cl_mem input, output; //, host;
    cl_bus_address_amd bus_address;
    cl_event event, event1, event2;
    cl_int status;
    cl_command_type type;
    size_t res_size;

    clEnqueueMakeBuffersResidentAMD_fn clEnqueueMakeBuffersResidentAMD;
    clEnqueueWaitSignalAMD_fn clEnqueueWaitSignalAMD;

    CL_CHECK_STATUS(clGetPlatformIDs(1, &platform_id, &num_of_platforms));
    clEnqueueMakeBuffersResidentAMD = (clEnqueueMakeBuffersResidentAMD_fn) clGetExtensionFunctionAddressForPlatform(platform_id, "clEnqueueMakeBuffersResidentAMD");
    clEnqueueWaitSignalAMD = (clEnqueueWaitSignalAMD_fn) clGetExtensionFunctionAddressForPlatform (platform_id, "clEnqueueWaitSignalAMD");
    CL_CHECK_STATUS(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_of_devices));

    cl_context_properties properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform_id, 0 };
    context = clCreateContext(properties, 1, &device_id, NULL, NULL, &err);
    CL_CHECK_STATUS(err);

    cl_queue_properties props[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE /*|CL_QUEUE_ON_DEVICE|CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|CL_QUEUE_ON_DEVICE_DEFAULT, CL_QUEUE_SIZE, CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE*/, 0 };
    command_queue = clCreateCommandQueueWithProperties(context, device_id, props, &err);
    CL_CHECK_STATUS(err);

    input = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_BUS_ADDRESSABLE_AMD, DATA_SIZE, NULL, &err);
    CL_CHECK_STATUS(err);
    output = clCreateBuffer(context, CL_MEM_READ_WRITE, 4096, NULL, &err);
    CL_CHECK_STATUS(err);

    /* pin the DMA target buffer and obtain its bus address (AMD DirectGMA) */
    memset(&bus_address, 0, sizeof(cl_bus_address_amd));
//    CL_CHECK_STATUS(clEnqueueMakeBuffersResidentAMD(command_queue, 1, &input, CL_TRUE, &bus_address, 0, 0, NULL));
    CL_CHECK_STATUS(clEnqueueMakeBuffersResidentAMD(command_queue, 1, &input, CL_FALSE, &bus_address, 0, 0, &event));
    CL_CHECK_STATUS (clWaitForEvents (1, &event));
    CL_CHECK_STATUS (clReleaseEvent (event));
    printf("bus address : surface : 0x%lx, marker : 0x%lx\n", bus_address.surface_bus_address, bus_address.marker_bus_address);

    /* open the FPGA device and map its register BAR */
    pcilib_t *pci = pcilib_open("/dev/fpga0", PCILIB_MODEL_DETECT);
    if (!pci) {
        printf("pcilib_open failed\n");
        exit(1);
    }

    volatile void *bar = pcilib_resolve_bar_address(pci, BAR, 0);
    if (!bar) {
        pcilib_close(pci);
        printf("map bar\n");
        exit(1);
    }

    const pcilib_bar_info_t *bar_info = pcilib_get_bar_info(pci, BAR);
    if (!bar_info) {
        pcilib_close(pci);
        printf("get bar info\n");
        exit(1);
    }

    /* expose the FPGA BAR to the GPU as an external physical buffer */
    cl_bus_address_amd amd_addr = { .surface_bus_address = bar_info->phys_addr, .marker_bus_address = bar_info->phys_addr };
    cl_mem bar_cl = clCreateBuffer (context, CL_MEM_EXTERNAL_PHYSICAL_AMD, bar_info->size, &amd_addr, &err);
    if (err) {
        pcilib_close(pci);
        printf("Error (%i) mapping BAR to GPU\n", err);
        exit(1);
    }

    /* load and build the OpenCL kernels */
    FILE *f = fopen("kernel.cl", "rb");
    fseek(f, 0, SEEK_END);
    long fsize = ftell(f);
    fseek(f, 0, SEEK_SET); // same as rewind(f);

    char *cl_string = malloc(fsize + 1);
    fread(cl_string, fsize, 1, f);
    cl_string[fsize] = 0;
    fclose(f);

    cl_program program = clCreateProgramWithSource (context, 1, (const char **) &cl_string, NULL, &err);
    CL_CHECK_STATUS (err);
    CL_CHECK_STATUS(clBuildProgram (program, 1, &device_id, "-cl-std=CL2.0 -D CL_VERSION_2_0", NULL, NULL));

    size_t work_size = 1;

    cl_kernel process_kernel = clCreateKernel (program, "process", &err);
    CL_CHECK_STATUS (err);
    CL_CHECK_STATUS (clSetKernelArg (process_kernel, 0, sizeof (uint), &j));
    CL_CHECK_STATUS (clSetKernelArg (process_kernel, 1, sizeof (cl_mem), &input));
    CL_CHECK_STATUS (clSetKernelArg (process_kernel, 2, sizeof (cl_mem), &output));
    CL_CHECK_STATUS (clSetKernelArg (process_kernel, 3, sizeof (cl_mem), &bar_cl));
    CL_CHECK_STATUS (clSetKernelArg (process_kernel, 4, sizeof (ulong), &bus_address.surface_bus_address));

    cl_kernel measure_kernel = clCreateKernel (program, "measure", &err);
    CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 0, sizeof (uint), &j));
    CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 1, sizeof (cl_mem), &input));
    CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 2, sizeof (cl_mem), &output));
    CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 3, sizeof (cl_mem), &bar_cl));
    CL_CHECK_STATUS (clSetKernelArg (measure_kernel, 4, sizeof (ulong), &bus_address.surface_bus_address));

    cl_kernel nil_kernel = clCreateKernel (program, "nil", &err);

    /* descriptor ring and bounce buffer in DMA-able kernel memory */
    pcilib_kmem_handle_t *desc_kmem = pcilib_alloc_kernel_memory (pci, PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
    uintptr_t desc_bus = pcilib_kmem_get_block_ba (pci, desc_kmem, 0);
    volatile void *desc = (uint32_t *) pcilib_kmem_get_block_ua (pci, desc_kmem, 0);
    memset ((uint32_t *) desc, 0, 5 * sizeof (uint32_t));
    volatile uint64_t *hwaddr = (uint64_t *) (desc + 2 * sizeof(uint32_t));

    pcilib_kmem_handle_t *kbuf_kmem = pcilib_alloc_kernel_memory(pci, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, 4096, 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS);
    uintptr_t kbuf_bus = pcilib_kmem_get_block_ba (pci, kbuf_kmem, 0);
    volatile uint32_t *kbuf = (uint32_t *) pcilib_kmem_get_block_ua (pci, kbuf_kmem, 0);
    memset ((uint32_t *) kbuf, 0, 4096);

    /* map the DirectGMA marker and surface so the CPU can poll/poke them */
    void *marker = pcilib_map_area(pci, bus_address.marker_bus_address, 4096);
    if (!marker) {
        printf("pcilib_map_area failed\n");
        exit(1);
    }
    *(uint32_t *) marker = 0;
    void *gpubuf = pcilib_map_area(pci, bus_address.surface_bus_address, 4096);
    *(uint32_t *) gpubuf = 0x1;

    check_data(command_queue, input, 4);

    /* reset and configure the FPGA DMA engine */
    WR32 (REG_RESET_DMA, 1);
    usleep (100000);
    WR32 (REG_RESET_DMA, 0);
    usleep (100000);

    WR32_sleep (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE));
    WR32_sleep (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
    WR32_sleep (REG_UPDATE_THRESHOLD, 1);
    WR64_sleep (REG_UPDATE_ADDRESS, desc_bus);
    WR64_sleep (REG_UPDATE_COUNTER, bus_address.marker_bus_address);
//    WR32_sleep (REG_PERF_COUNTER, 0);
//    WR32 (REG_DMA, 1);
    WR32_sleep (0x9048, 0x232);
    WR32_sleep (REG_COUNTER, 1);

    usleep(1000000);

    struct timespec tss, tse, tsk;

#ifdef KERNEL_CONTROL
    cl_kernel kernel = measure_kernel;
#else
    cl_kernel kernel = process_kernel;
#endif /* KERNEL_CONTROL */

    for (i = 0; i < ITERATIONS; i++) {
        cl_event wevent[NUM_PAGES], kevent[NUM_PAGES];

        printf("Iteration %i\n", i);

        WR32_sleep (REG_DMA, 0);
        WR32_sleep (REG_PERF_COUNTER, 0);

        *(volatile uint32_t *) marker = 0;
        *hwaddr = 0;

        clock_gettime(CLOCK_REALTIME, &tss);

        // we rather need to trigger it every few milliseconds and see what happens.
//        CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, 0, 0, NULL, &event));

        /* queue one wait plus one kernel launch per page */
        for (j = 0; j < NUM_PAGES; j++) {
#ifndef KERNEL_CONTROL
            WR64_sleep (REG_DESCRIPTOR_ADDRESS, bus_address.surface_bus_address + j * PAGE_SIZE);
#endif

#ifdef CPU_WAIT
            wevent[j] = clCreateUserEvent(context, &err);
            CL_CHECK_STATUS(err);
#else
            CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, j + 1, 0, NULL, &wevent[j]));
#endif
            CL_CHECK_STATUS (clSetKernelArg (kernel, 0, sizeof (uint), &j));
            CL_CHECK_STATUS (clEnqueueNDRangeKernel (command_queue, kernel, 1, NULL, &work_size, NULL, 1, &wevent[j], &kevent[j]));
            CL_CHECK_STATUS (clFlush(command_queue));
        }

#ifdef KERNEL_CONTROL
        // we write one extra in the end.
# ifndef CPU_WAIT
        CL_CHECK_STATUS (clEnqueueWaitSignalAMD (command_queue, input, NUM_PAGES + 1, 0, NULL, NULL));
# endif
        CL_CHECK_STATUS (clFlush(command_queue));
#endif

        clock_gettime(CLOCK_REALTIME, &tse);
        double lat_sched = (tse.tv_sec - tss.tv_sec) * 1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.;

//        usleep(10000);
        clock_gettime(CLOCK_REALTIME, &tss);
        double lat_flush = (tss.tv_sec - tse.tv_sec) * 1000000 + 1. * (tss.tv_nsec - tse.tv_nsec) / 1000.;

#ifdef KERNEL_CONTROL
        WR64 (REG_DESCRIPTOR_ADDRESS, bus_address.surface_bus_address);
#endif

        /* start the DMA and time the arrival of each page */
        clock_gettime(CLOCK_REALTIME, &tss);
        WR32 (REG_DMA, 1);

        printf(" Markers (us):");
        double lastlat = 0;
#ifdef CPU_WAIT
        for (j = 0; j < NUM_PAGES; j++) {
# ifdef CPU_MARKER
            while ((*hwaddr) == 0) { }
            *hwaddr = 0;
# else
            while ((*(volatile uint32_t *) marker) < (j + 1)) { }
# endif
            clock_gettime(CLOCK_REALTIME, &tse);
            CL_CHECK_STATUS (clSetUserEventStatus(wevent[j], CL_COMPLETE));
            double latm = (tse.tv_sec - tss.tv_sec) * 1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.;
            printf(" %8.3lf", latm - lastlat);
            lastlat = latm;
        }
#else
        int cur, curupd;
        for (cur = 0; (*(volatile uint32_t *) marker) < NUM_PAGES;) {
            if (cur != (*(volatile uint32_t *) marker)) {
                clock_gettime(CLOCK_REALTIME, &tse);
                curupd = (*(volatile uint32_t *) marker);
                double latm = (tse.tv_sec - tss.tv_sec) * 1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.;
                for (cur++; cur < curupd; cur++)
                    printf(" -");
                printf(" %8.3lf", latm - lastlat);
                cur = curupd;
                lastlat = latm;
            }
        }
#endif
        printf("\n");

        CL_CHECK_STATUS (clWaitForEvents (1, &kevent[NUM_PAGES - 1]));
        CL_CHECK_STATUS (clFinish(command_queue));

        clock_gettime(CLOCK_REALTIME, &tse);
        double lat = (tse.tv_sec - tss.tv_sec) * 1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.;
        lat /= NUM_PAGES;

        printf(" Markers: 0x%lx %u\n", *hwaddr, *(volatile uint32_t *) marker);

        /* per-page breakdown from the OpenCL profiling counters */
        printf(" GPU latencies: ");
        for (j = 1; j < NUM_PAGES; j++) {
/*
            cl_ulong start, submit, end;
            CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_SUBMIT, sizeof (cl_ulong), &submit, NULL));
            CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL));
            CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL));
            printf(" Page %i start-end: %6.3lf us, submit-end: %6.3lf us\n", j, 1. * (end - start) / 1000, 1. * (start - submit) / 1000);

            CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_SUBMIT, sizeof (cl_ulong), &submit, NULL));
            CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL));
            CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL));
            printf(" Kernel %i start-end: %6.3lf us, submit-end: %6.3lf us\n", j, 1. * (end - start) / 1000, 1. * (start - submit) / 1000);
*/
            cl_ulong end, endk, endw, startw, startk;
            CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j - 1], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &endk, NULL));
            CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &startk, NULL));
            CL_CHECK_STATUS (clGetEventProfilingInfo (kevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL));
#ifdef CPU_WAIT
            startw = endk;
            endw = startk;
#else
            CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &startw, NULL));
            CL_CHECK_STATUS (clGetEventProfilingInfo (wevent[j], CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &endw, NULL));
#endif
            printf("k-%.3lf-w-%.3lf-w-%.3lf-k-%.3lf ", 1. * (startw - endk) / 1000, 1. * (endw - startw) / 1000, 1. * (startk - endw) / 1000, 1. * (end - startk) / 1000);
        }
        printf("\n");

        /* hardware-side latency from the FPGA performance counter (4 ns ticks -> us) */
        double lath = 4. * RD32 (REG_PERF_COUNTER) / 1000;
        lath /= NUM_PAGES;

        printf(" fpga: %8.3lf us, software: %8.3lf us, sched: %8.3lf us, flush: %8.3lf us\n", lath, lat, lat_sched, lat_flush);

/*
        CL_CHECK_STATUS(clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, &res_size));
        printf(" Event return: %i (CL_COMPLETE: %i)\n", status, CL_COMPLETE);
        CL_CHECK_STATUS(clGetEventInfo(event, CL_EVENT_COMMAND_TYPE, sizeof(type), &type, &res_size));
        printf(" Event type: 0x%x (CL_COMMAND_WAIT_SIGNAL_AMD: 0x%x)\n", type, CL_COMMAND_WAIT_SIGNAL_AMD);
*/

        for (j = 0; j < NUM_PAGES; j++) {
//            CL_CHECK_STATUS (clReleaseEvent (wevent[j]));
            CL_CHECK_STATUS (clReleaseEvent (kevent[j]));
        }
//        CL_CHECK_STATUS (clReleaseEvent (event));

#ifdef KERNEL_CONTROL
        uint data[1024];
        CL_CHECK_STATUS (clEnqueueReadBuffer (command_queue, output, CL_TRUE, 0, 4096, data, 0, NULL, NULL));
        printf(" FPGA Latencies: ");
        for (j = 0; j < NUM_PAGES + 1; j++) {
/*
            if (j) printf("%6.3lf ", 4. * (data[j] - data[j - 1]) / 1000);
            else printf("%6.3lf ", 4. * data[j] / 1000);
*/
            printf("%u ", data[j]);
        }
        printf("\n");
#endif
    }

    WR32 (REG_COUNTER, 0);
    WR32 (REG_DMA, 0);
    usleep(10000);

    WR32 (REG_RESET_DMA, 1);
    usleep (100000);
    WR32 (REG_RESET_DMA, 0);
    usleep (100000);

    pcilib_unmap_area(pci, marker, 4096);
    pcilib_unmap_area(pci, gpubuf, 4096);
    pcilib_free_kernel_memory(pci, kbuf_kmem, KMEM_DEFAULT_FLAGS);
    pcilib_free_kernel_memory(pci, desc_kmem, KMEM_DEFAULT_FLAGS);
    clReleaseMemObject(bar_cl);
    pcilib_close(pci);

    clReleaseKernel (nil_kernel);
    clReleaseKernel (process_kernel);
    clReleaseKernel (measure_kernel);
    clReleaseProgram (program);
    clReleaseMemObject(output);
    clReleaseMemObject(input);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);
}
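/*
 * For reference only: kernel.cl is shipped separately and is not part of this
 * file. A minimal sketch of what it might contain, matching the argument lists
 * set up above (page index, DMA target buffer, result buffer, GPU-mapped FPGA
 * BAR, surface bus address), is shown below. The register offsets (0x20 perf
 * counter, 0x50 descriptor address) and the exact bookkeeping are assumptions
 * inferred from the host code, not the actual kernels.
 *
 *   __kernel void nil (void)
 *   {
 *   }
 *
 *   __kernel void process (uint page, __global uint *input, __global uint *output,
 *                          __global volatile uint *bar, ulong surface_address)
 *   {
 *       // consume the freshly transferred page so the DMA data is actually read
 *       output[page] = input[page * (4096 / sizeof(uint))];
 *   }
 *
 *   __kernel void measure (uint page, __global uint *input, __global uint *output,
 *                          __global volatile uint *bar, ulong surface_address)
 *   {
 *       // sample the FPGA performance counter through the GPU-mapped BAR
 *       output[page] = bar[0x20 >> 2];
 *       // and point the DMA engine at the next page of the GPU buffer
 *       *(__global volatile ulong *)(bar + (0x50 >> 2)) = surface_address + (page + 1) * 4096;
 *   }
 */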