From 8a59e1d17a83e4744071dfa790db974c296c206e Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Fri, 17 Jun 2016 21:14:09 +0200 Subject: Use undocumented event-based API --- config.h | 3 +- gdr_test.cu | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++------- gpudirect.h | 29 +++++++++++++++++ kernels.cu | 18 +++++++++++ kernels.h | 3 ++ 5 files changed, 147 insertions(+), 12 deletions(-) create mode 100644 gpudirect.h diff --git a/config.h b/config.h index a0d29f3..a461638 100644 --- a/config.h +++ b/config.h @@ -1,5 +1,6 @@ #define VERBOSE -#define GPU_DESC +//#define GPU_DESC +#define CUDA8 #define USE_HW_CONTER diff --git a/gdr_test.cu b/gdr_test.cu index d0118d5..42f7cc5 100644 --- a/gdr_test.cu +++ b/gdr_test.cu @@ -14,6 +14,7 @@ #include #include +#include "gpudirect.h" #include "config.h" #include "ipedma.h" #include "kernels.h" @@ -119,9 +120,17 @@ int main(int argc, char *argv[]) { initAssert(cuDeviceGetProperties(&gpu_props, gpu)); printf (" GPU Clock %lu KHz\n", gpu_props.clockRate); - CUdeviceptr d_A, d_D; + cudaStream_t stream; + cudaStreamCreate(&stream); + + cudaEvent_t events[GPU_ITERS]; + for (int i = 0; i < GPU_ITERS; i++) + cudaEventCreate(&events[i]); + + CUdeviceptr d_A, d_D, d_RES; initAssert(cuMemAlloc(&d_D, GPU_PAGE)); // Should be multiple of GPU page, or mapping of next allocation will segfault the gdrcopy module initAssert(cuMemAlloc(&d_A, PAGE_SIZE)); + initAssert(cuMemAlloc(&d_RES, GPU_ITERS * sizeof(uint64_t))); unsigned int flag = 1; initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_D)); initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_A)); @@ -199,7 +208,7 @@ int main(int argc, char *argv[]) { printf("\nSize: %lu bytes (%lu %lu-byte descriptors with packet length set to %lu), GPU itertions: %lu, Iterations: %lu\n", SIZE, NUM_PAGES, PAGE_SIZE, TLP_SIZE, GPU_ITERS, ITERS); memset ((uint32_t *)desc, 0, 5 * sizeof (uint32_t)); - volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + 2 * sizeof(uint32_t)); + volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + DESCRIPTOR_OFFSET + 2 * sizeof(uint32_t)); WR32 (REG_RESET_DMA, 1); usleep (100000); @@ -209,7 +218,7 @@ int main(int argc, char *argv[]) { WR32_sleep (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE)); WR32_sleep (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE); WR32_sleep (REG_UPDATE_THRESHOLD, 1); - WR64_sleep (REG_UPDATE_COUNTER, desc_bus); + WR64_sleep (REG_UPDATE_COUNTER, D_info.bus_addr); WR64_sleep (REG_UPDATE_ADDRESS, desc_bus + DESCRIPTOR_OFFSET); WR32_sleep (REG_DMA, 1); WR32_sleep (REG_INTERCONNECT, 0x232); @@ -232,14 +241,48 @@ int main(int argc, char *argv[]) { #ifdef GPU_DESC ipedma<<<1, 1>>>((void*)dBAR, A_info.bus_addr, (uint32_t*)d_D, (uint64_t*)(d_D + DESCRIPTOR_OFFSET), (uint32_t*)d_A); #else +#ifdef CUDA8 + cudaDeviceSynchronize(); + + *(uint32_t*)D = 0; + WR32 (REG_DMA, 0); + WR32 (REG_PERF_COUNTER, 0); + +/* + for (int j = 0; j < GPU_ITERS; j++) { + WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr); + } + WR32 (REG_DMA, 1); + usleep(10000); +*/ + WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr); + + for (int j = 0; j < GPU_ITERS; j++) { + initAssert(cuStreamWaitValue32(stream, d_D, j + 1, CU_STREAM_WAIT_VALUE_GEQ|CU_STREAM_WAIT_VALUE_FLUSH)); + measure<<<1, 1, 0, stream>>>(j, (void*)dBAR, A_info.bus_addr, (uint64_t*)d_RES, (uint32_t*)d_A); + cudaEventRecord(events[j], stream); + } +// printf("D = %u\n", *(uint32_t*)D); + + // Wait until all is pushed down. + usleep(1000); + + clock_gettime(CLOCK_REALTIME, &tss); + WR32 (REG_DMA, 1); + memcpy(&tse, &tss, sizeof(struct timeval)); + +# else // WR64 (REG_DESCRIPTOR_ADDRESS, kbuf_bus); + WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr); + do { } while (*hwaddr == 0); clock_gettime(CLOCK_REALTIME, &tse); null<<<1, 1>>>((uint32_t*)d_A); +# endif #endif err = cudaDeviceSynchronize(); if (err) printf("Oopps, synchronization error %i", err); @@ -251,23 +294,57 @@ int main(int argc, char *argv[]) { #ifdef VERBOSE initAssert(cuMemcpyDtoH((void*)kbuf, d_A, PAGE_SIZE)); -# ifdef GPU_DESC - double lat = 1000. * kbuf[0] / gpu_props.clockRate; - double latk = 1000. * kbuf[1] / gpu_props.clockRate; - double latc = ((tsk.tv_sec - tss.tv_sec)*1000000. + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS; # ifdef USE_HW_CONTER double lath = 4. * RD32 (0x20) / 1000; # else double lath = 0; # endif -#else + + double disp = 0, min = 1E+10, max = 0; + long num = 0; +# ifdef GPU_DESC + double lat = 1000. * kbuf[0] / gpu_props.clockRate; + double latk = 1000. * kbuf[1] / gpu_props.clockRate; + double latc = ((tsk.tv_sec - tss.tv_sec)*1000000. + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS; +# else double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.; double latk = (tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.; double latc = 0; - double lath = 0; -#endif +#ifdef CUDA8 + uint64_t res[GPU_ITERS]; + cudaMemcpy(res, (void*)d_RES, GPU_ITERS * sizeof(uint64_t), cudaMemcpyDeviceToHost); + printf("Iterations (us):"); + + for (int j = 1; j < GPU_ITERS; j++) { + float ms; + cudaEventElapsedTime(&ms, events[j - 1], events[j]); + double lati = ms * 1000.; +// double lati = 4. * (res[j] - res[j - 1]) / 1000; +// double lati = 1000. * (res[j] - res[j - 1]) / gpu_props.clockRate; + + lat += lati; + if (j > 1) disp += pow(lat - lati * j, 2) / (j * (j - 1)); + + if (lati > max) max = lati; + if (lati < min) min = lati; + if (lati > 11) num++; + + printf(" % 6.3lf", lati); + } + printf("\n"); + + + lat /= GPU_ITERS; + latk /= GPU_ITERS; + latc /= GPU_ITERS; + lath /= GPU_ITERS; + + disp = sqrt(disp / (GPU_ITERS - 1)); + +#endif +# endif - printf("hw: % 6.3lf us, sw: % 6.3lf us, +krn: % 6.3lf us, total: % 7.3lf us: %x %x %x %x\n", lath, lat, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]); + printf("hw: % 8.3lf us, sw: % 8.3lf us (% 8.3lf - % 8.3lf / % 8.3lf / % 3lu), +krn: % 8.3lf us, total: % 8.3lf us: %x %x %x %x\n", lath, lat, min, max, disp, num, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]); #else if (!i) gettimeofday(&tvs, NULL); #endif /* VERBOSE */ @@ -306,7 +383,14 @@ int main(int argc, char *argv[]) { gdr_unpin_buffer(g, D_mh); gdr_close(g); + cuMemFree(d_RES); cuMemFree(d_A); cuMemFree(d_D); + + for (int i = 0; i < GPU_ITERS; i++) + cudaEventDestroy(events[i]); + + cudaStreamDestroy(stream); + printf("GDR closed\n"); } diff --git a/gpudirect.h b/gpudirect.h new file mode 100644 index 0000000..9449ff9 --- /dev/null +++ b/gpudirect.h @@ -0,0 +1,29 @@ +enum CU_STREAM_WAIT_FLAGS { + CU_STREAM_WAIT_VALUE_GEQ = 0x0, + CU_STREAM_WAIT_VALUE_EQ = 0x1, + CU_STREAM_WAIT_VALUE_AND = 0x2, + CU_STREAM_WAIT_VALUE_FLUSH = 1<<30 +}; + +enum CU_STREAM_WRITE_FLAGS { + CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1 +}; + +enum CU_STREAM_MEM_OP_FLAGS { + CU_STREAM_MEM_OP_WAIT_VALUE_32 = 1, + CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2, + CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 +}; + +# ifdef __cplusplus +extern "C" { +# endif + +CUresult cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, uint32_t value, unsigned int flags); +CUresult cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, uint32_t value, unsigned int flags); +//CUresult cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); + + +# ifdef __cplusplus +} +# endif diff --git a/kernels.cu b/kernels.cu index f4ea114..7e7e689 100644 --- a/kernels.cu +++ b/kernels.cu @@ -7,6 +7,24 @@ __global__ void null(uint32_t *data) { } +__global__ void feedback(volatile void *bar, uint32_t *data) { + WR32 (REG_PERF_COUNTER, 1); +} + +__global__ void measure(int iter, volatile void *bar, uint64_t bus_addr, uint64_t *res, uint32_t *data) { + // Clocks are incorrect as not running while waiting. + //res[iter] = clock64(); + // Thats does not work either (no RD support in kernels?) + //res[iter] = RD32(REG_PERF_COUNTER); + + WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); + + if ((iter + 1) == GPU_ITERS) { + WR32 (REG_PERF_COUNTER, 1); + } +} + + __device__ void ksleep(uint32_t clocks) { clock_t start = clock64(), now; diff --git a/kernels.h b/kernels.h index 9158401..12eaa5e 100644 --- a/kernels.h +++ b/kernels.h @@ -1,4 +1,7 @@ __global__ void null(uint32_t *data); +__global__ void feedback(volatile void *bar, uint32_t *data); +__global__ void measure(int iter, volatile void *bar, uint64_t bus_addr, uint64_t *res, uint32_t *data); + __global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint32_t *counter, volatile uint64_t *desc, uint32_t *data); -- cgit v1.2.1