#include #include #include #include #include #include #include #include #include #include #include #include #include "gpudirect.h" #include "config.h" #include "ipedma.h" #include "kernels.h" #define DEVICE "/dev/fpga0" #define BAR PCILIB_BAR0 #define KMEM_DEFAULT_FLAGS (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE) #define KMEM_USE_RING PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1) #define KMEM_USE_DEFAULT PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2) #define gdrAssert(ans) { gdrError((ans), __FILE__, __LINE__); } inline int gdrError(int code, const char *file, int line) { if (code != 0) { fprintf(stderr,"GDRassert: %i %s %d\n", code, file, line); return code; } else { return 0; } } #define initAssert(ans) { initError((ans), __FILE__, __LINE__); } inline int initError(CUresult code, const char *file, int line) { if (code != CUDA_SUCCESS) { const char *error = NULL; cuGetErrorString (code, &error); fprintf(stderr,"GPUassert: %s (Code: %i) %s %d\n", error, code, file, line); return code; } else { return 0; } } #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline int gpuAssert(cudaError_t code, const char *file, int line) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s (Code: %i) %s %d\n", cudaGetErrorString(code), code, file, line); return code; } else { return 0; } } int main(int argc, char *argv[]) { int err; //CUDA initialization initAssert (cuInit(0)); int num_gpus; initAssert (cuDeviceGetCount (&num_gpus)); printf ("Found %i GPUs on the system\n", num_gpus); CUdevice gpu; //will be used to find the correct GPU for (num_gpus--; num_gpus >= 0; num_gpus--) { CUdevice current_gpu; initAssert (cuDeviceGet (¤t_gpu, num_gpus)); char gpu_name[30] = {0}; initAssert (cuDeviceGetName (gpu_name, 30, current_gpu)); printf(" GPU %i: %s\n", num_gpus, gpu_name); if (strncmp (gpu_name, "Tesla K40", 9) == 0) { printf (" Found a Tesla GPU! I'll use that one.\n"); gpu = current_gpu; break; } } //The CU_CTX_MAP_HOST is what we are interested in! CUcontext context; initAssert (cuCtxCreate (&context, CU_CTX_MAP_HOST | CU_CTX_SCHED_AUTO, gpu)); initAssert (cuCtxSetCurrent (context)); //NOTE: API Version 3010 is problematic //(see https://www.cs.cmu.edu/afs/cs/academic/class/15668-s11/www/cuda-doc/html/group__CUDART__DRIVER.html) unsigned int api_version; initAssert (cuCtxGetApiVersion (context, &api_version)); printf (" CUDA API Version: %u\n", api_version); //printf ("CUDA init done\n\n"); CUdevprop gpu_props; initAssert(cuDeviceGetProperties(&gpu_props, gpu)); printf (" GPU Clock %lu KHz\n", gpu_props.clockRate); cudaStream_t stream; cudaStreamCreate(&stream); cudaEvent_t events[GPU_ITERS]; for (int i = 0; i < GPU_ITERS; i++) cudaEventCreate(&events[i]); CUdeviceptr d_A, d_D, d_RES; initAssert(cuMemAlloc(&d_D, GPU_PAGE)); // Should be multiple of GPU page, or mapping of next allocation will segfault the gdrcopy module initAssert(cuMemAlloc(&d_A, PAGE_SIZE)); initAssert(cuMemAlloc(&d_RES, GPU_ITERS * sizeof(uint64_t))); unsigned int flag = 1; initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_D)); initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_A)); gdr_mh_t A_mh, D_mh; gdr_info_t A_info, D_info; void *A_bar_ptr = NULL; void *D_bar_ptr = NULL; gdr_t g = gdr_open(); gdrAssert(g == NULL); gdrAssert(gdr_pin_buffer(g, d_D, GPU_PAGE, 0, 0, &D_mh)); gdrAssert(gdr_map(g, D_mh, &D_bar_ptr, GPU_PAGE)); gdrAssert(gdr_get_info(g, D_mh, &D_info)); gdrAssert(gdr_pin_buffer(g, d_A, PAGE_SIZE, 0, 0, &A_mh)); gdrAssert(gdr_map(g, A_mh, &A_bar_ptr, PAGE_SIZE)); gdrAssert(gdr_get_info(g, A_mh, &A_info)); int D_bar_off = D_info.va - d_D; volatile uint32_t *D = (uint32_t *)((char *)D_bar_ptr + D_bar_off); int A_bar_off = A_info.va - d_A; volatile uint32_t *A = (uint32_t *)((char *)A_bar_ptr + A_bar_off); printf("\nDevicePtr: %lx, GDR ptr: %p, Bus ptr: %lx, (Bar: %p, Offset: %i), VA: 0x%lx, Size: %lu\n", d_A, A, A_info.bus_addr, A_bar_ptr, A_bar_off, A_info.va, A_info.mapped_size); pcilib_t *pci; volatile void *bar; const pcilib_bar_info_t *bar_info; pci = pcilib_open(DEVICE, PCILIB_MODEL_DETECT); if (!pci) { printf("pcilib_open\n"); exit(1); } bar = pcilib_resolve_bar_address(pci, BAR, 0); if (!bar) { pcilib_close(pci); printf("map bar\n"); exit(1); } //printf("BAR mapped to: %p\n", bar); CUdeviceptr dBAR; // initAssert (cuMemHostRegister ((void*)((((uintptr_t)bar)/65536)*65536), 65536, CU_MEMHOSTREGISTER_DEVICEMAP)); initAssert (cuMemHostRegister ((void*)bar, 4096, CU_MEMHOSTREGISTER_IOMEMORY)); initAssert (cuMemHostGetDevicePointer(&dBAR, (void*)bar, 0)); // no effect //initAssert (cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, dBAR)); bar_info = pcilib_get_bar_info(pci, BAR); printf("Bar: %p (Phys: 0x%lx, Size: 0x%x)\n", bar_info[BAR].virt_addr, bar_info[BAR].phys_addr, bar_info[BAR].size); pcilib_kmem_handle_t *kdesc_kmem = pcilib_alloc_kernel_memory (pci, PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS); uintptr_t kdesc_bus = pcilib_kmem_get_block_ba (pci, kdesc_kmem, 0); volatile void *kdesc = (uint32_t *) pcilib_kmem_get_block_ua (pci, kdesc_kmem, 0); pcilib_kmem_handle_t *kbuf_kmem = pcilib_alloc_kernel_memory(pci, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, ((PAGE_SIZE%4096)?(4096 * (1 + PAGE_SIZE/4096)):PAGE_SIZE), 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS); uintptr_t kbuf_bus = pcilib_kmem_get_block_ba (pci, kbuf_kmem, 0); volatile uint32_t *kbuf = (uint32_t *) pcilib_kmem_get_block_ua (pci, kbuf_kmem, 0); memset ((uint32_t *)kbuf, 0, PAGE_SIZE); #ifdef GPU_DESC volatile void *desc = D; uintptr_t desc_bus = D_info.bus_addr; #else volatile void *desc = kdesc; uintptr_t desc_bus = kdesc_bus; #endif printf("\nSize: %lu bytes (%lu %lu-byte descriptors with packet length set to %lu), GPU itertions: %lu, Iterations: %lu\n", SIZE, NUM_PAGES, PAGE_SIZE, TLP_SIZE, GPU_ITERS, ITERS); memset ((uint32_t *)desc, 0, 5 * sizeof (uint32_t)); volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + DESCRIPTOR_OFFSET + 2 * sizeof(uint32_t)); WR32 (REG_RESET_DMA, 1); usleep (100000); WR32 (REG_RESET_DMA, 0); usleep (100000); WR32_sleep (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE)); WR32_sleep (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE); WR32_sleep (REG_UPDATE_THRESHOLD, 1); WR64_sleep (REG_UPDATE_COUNTER, D_info.bus_addr); WR64_sleep (REG_UPDATE_ADDRESS, desc_bus + DESCRIPTOR_OFFSET); WR32_sleep (REG_DMA, 1); WR32_sleep (REG_INTERCONNECT, 0x232); WR32_sleep (REG_COUNTER, 1); usleep(100000); #ifdef VERBOSE struct timespec tss, tsk; # ifndef GPU_DESC struct timespec tse; # endif #else struct timeval tvs, tve; #endif /* VERBOSE */ for (int i = 0; i < ITERS; i++) { clock_gettime(CLOCK_REALTIME, &tss); #ifdef GPU_DESC ipedma<<<1, 1>>>((void*)dBAR, A_info.bus_addr, (uint32_t*)d_D, (uint64_t*)(d_D + DESCRIPTOR_OFFSET), (uint32_t*)d_A); #else #ifdef CUDA8 cudaDeviceSynchronize(); *(uint32_t*)D = 0; WR32 (REG_DMA, 0); WR32 (REG_PERF_COUNTER, 0); /* for (int j = 0; j < GPU_ITERS; j++) { WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr); } WR32 (REG_DMA, 1); usleep(10000); */ WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr); for (int j = 0; j < GPU_ITERS; j++) { initAssert(cuStreamWaitValue32(stream, d_D, j + 1, CU_STREAM_WAIT_VALUE_GEQ|CU_STREAM_WAIT_VALUE_FLUSH)); measure<<<1, 1, 0, stream>>>(j, (void*)dBAR, A_info.bus_addr, (uint64_t*)d_RES, (uint32_t*)d_A); cudaEventRecord(events[j], stream); } // printf("D = %u\n", *(uint32_t*)D); // Wait until all is pushed down. usleep(1000); clock_gettime(CLOCK_REALTIME, &tss); WR32 (REG_DMA, 1); memcpy(&tse, &tss, sizeof(struct timeval)); # else // WR64 (REG_DESCRIPTOR_ADDRESS, kbuf_bus); WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr); do { } while (*hwaddr == 0); clock_gettime(CLOCK_REALTIME, &tse); null<<<1, 1>>>((uint32_t*)d_A); # endif #endif err = cudaDeviceSynchronize(); if (err) printf("Oopps, synchronization error %i", err); clock_gettime(CLOCK_REALTIME, &tsk); *hwaddr = 0; #ifdef VERBOSE initAssert(cuMemcpyDtoH((void*)kbuf, d_A, PAGE_SIZE)); # ifdef USE_HW_CONTER double lath = 4. * RD32 (0x20) / 1000; # else double lath = 0; # endif double disp = 0, min = 1E+10, max = 0; long num = 0; # ifdef GPU_DESC double lat = 1000. * kbuf[0] / gpu_props.clockRate; double latk = 1000. * kbuf[1] / gpu_props.clockRate; double latc = ((tsk.tv_sec - tss.tv_sec)*1000000. + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS; # else double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.; double latk = (tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.; double latc = 0; #ifdef CUDA8 uint64_t res[GPU_ITERS]; cudaMemcpy(res, (void*)d_RES, GPU_ITERS * sizeof(uint64_t), cudaMemcpyDeviceToHost); printf("Iterations (us):"); for (int j = 1; j < GPU_ITERS; j++) { float ms; cudaEventElapsedTime(&ms, events[j - 1], events[j]); double lati = ms * 1000.; // double lati = 4. * (res[j] - res[j - 1]) / 1000; // double lati = 1000. * (res[j] - res[j - 1]) / gpu_props.clockRate; lat += lati; if (j > 1) disp += pow(lat - lati * j, 2) / (j * (j - 1)); if (lati > max) max = lati; if (lati < min) min = lati; if (lati > 11) num++; printf(" % 6.3lf", lati); } printf("\n"); lat /= GPU_ITERS; latk /= GPU_ITERS; latc /= GPU_ITERS; lath /= GPU_ITERS; disp = sqrt(disp / (GPU_ITERS - 1)); #endif # endif printf("hw: % 8.3lf us, sw: % 8.3lf us (% 8.3lf - % 8.3lf / % 8.3lf / % 3lu), +krn: % 8.3lf us, total: % 8.3lf us: %x %x %x %x\n", lath, lat, min, max, disp, num, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]); #else if (!i) gettimeofday(&tvs, NULL); #endif /* VERBOSE */ } #ifndef VERBOSE gettimeofday(&tve, NULL); size_t avglat = (tve.tv_sec - tvs.tv_sec)*1000000 + (tve.tv_usec - tvs.tv_usec); printf("Latency: %.3lf us (average for %i iterations)\n", 1. * avglat / ITERS, ITERS); #endif /* VERBOSE */ usleep(1000000); WR32 (REG_COUNTER, 0); WR32 (REG_DMA, 0); WR32 (REG_RESET_DMA, 1); usleep (100000); WR32 (REG_RESET_DMA, 0); usleep (100000); pcilib_free_kernel_memory(pci, kbuf_kmem, KMEM_DEFAULT_FLAGS); pcilib_free_kernel_memory(pci, kdesc_kmem, KMEM_DEFAULT_FLAGS); pcilib_close(pci); printf("\nPCI closed\n"); gdr_unmap(g, A_mh, A_bar_ptr, PAGE_SIZE); gdr_unpin_buffer(g, A_mh); gdr_unmap(g, D_mh, D_bar_ptr, GPU_PAGE); gdr_unpin_buffer(g, D_mh); gdr_close(g); cuMemFree(d_RES); cuMemFree(d_A); cuMemFree(d_D); for (int i = 0; i < GPU_ITERS; i++) cudaEventDestroy(events[i]); cudaStreamDestroy(stream); printf("GDR closed\n"); }