From ca1b1cea796bcfaeb86f201cf35065a606921cc1 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 24 May 2016 00:04:46 +0200 Subject: Read hw counter --- config.h | 6 +++++- gdr_test.cu | 11 ++++++++++- ipedma.h | 15 ++++++++------- kernels.cu | 16 ++++++++++++++++ 4 files changed, 39 insertions(+), 9 deletions(-) diff --git a/config.h b/config.h index acefc3b..7d95924 100644 --- a/config.h +++ b/config.h @@ -1,9 +1,13 @@ #define ITERS 100 -#define GPU_ITERS 1000 +#define GPU_ITERS 100 + #define TLP_SIZE 64 #define GPU_PAGE 65536 + #define PAGE_SIZE 4096 #define VERBOSE #define GPU_DESC + +#define USE_HW_CONTER diff --git a/gdr_test.cu b/gdr_test.cu index 13af482..cc624fc 100644 --- a/gdr_test.cu +++ b/gdr_test.cu @@ -172,6 +172,8 @@ int main(int argc, char *argv[]) { // initAssert (cuMemHostRegister ((void*)((((uintptr_t)bar)/65536)*65536), 65536, CU_MEMHOSTREGISTER_DEVICEMAP)); initAssert (cuMemHostRegister ((void*)bar, 4096, CU_MEMHOSTREGISTER_IOMEMORY)); initAssert (cuMemHostGetDevicePointer(&dBAR, (void*)bar, 0)); + // no effect + //initAssert (cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, dBAR)); bar_info = pcilib_get_bar_info(pci, BAR); printf("%p (Phys: 0x%lx, Size: 0x%x)\n", bar_info[BAR].virt_addr, bar_info[BAR].phys_addr, bar_info[BAR].size); @@ -208,6 +210,7 @@ int main(int argc, char *argv[]) { WR32 (REG_UPDATE_THRESHOLD, 0); WR64 (REG_UPDATE_ADDRESS, desc_bus); WR32 (REG_DMA, 1); + WR32 (REG_INTERCONNECT, 0x232); WR32 (REG_COUNTER, 1); #ifdef VERBOSE @@ -244,13 +247,19 @@ int main(int argc, char *argv[]) { double lat = 1000. * kbuf[0] / gpu_props.clockRate; double latk = 1000. * kbuf[1] / gpu_props.clockRate; double latc = ((tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS; +# ifdef USE_HW_CONTER + double lath = 4. * RD32 (0x20) / 1000; +# else + double lath = 0; +# endif #else double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.; double latk = (tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.; double latc = 0; + double lath = 0; #endif - printf("Latency: %.3lf us / %.3lf us (%.3lf us) %x %x %x %x\n", lat, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]); + printf("hw: % 6.3lf us, sw: % 6.3lf us, +krn: % 6.3lf us, total: % 7.3lf us: %x %x %x %x\n", lath, lat, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]); #else if (!i) gettimeofday(&tvs, NULL); #endif /* VERBOSE */ diff --git a/ipedma.h b/ipedma.h index 284b058..299499c 100644 --- a/ipedma.h +++ b/ipedma.h @@ -1,20 +1,21 @@ #define REG_RESET_DMA 0x00 #define REG_DMA 0x04 #define REG_NUM_PACKETS_PER_DESCRIPTOR 0x10 -#define REG_PERF_COUNTER 0x28 +#define REG_PERF_COUNTER 0x20 +//#define REG_PERF_COUNTER 0x28 #define REG_PACKET_LENGTH 0x0C #define REG_DESCRIPTOR_ADDRESS 0x50 #define REG_UPDATE_ADDRESS 0x58 #define REG_UPDATE_THRESHOLD 0x60 - +#define REG_INTERCONNECT 0x9048 #define REG_COUNTER 0x9000 -#define WR32(addr, value) *(uint32_t *) (((char*)(bar)) + (addr)) = (value); -#define RD32(addr) (*(uint32_t *) (((char*)(bar)) + (addr))) -#define WR32_sleep(addr, value) *(uint32_t *) (((char*)(bar)) + (addr)) = (value); usleep (100); +#define WR32(addr, value) *(volatile uint32_t *) (((char*)(bar)) + (addr)) = (value); +#define RD32(addr) (*(volatile uint32_t *) (((char*)(bar)) + (addr))) +#define WR32_sleep(addr, value) *(volatile uint32_t *) (((char*)(bar)) + (addr)) = (value); usleep (100); -#define WR64(addr, value) *(uint64_t *) (((char*)(bar)) + (addr)) = (value); -#define RD64(addr) (*(uint64_t *) (((char*)(bar)) + (addr))) +#define WR64(addr, value) *(volatile uint64_t *) (((char*)(bar)) + (addr)) = (value); +#define RD64(addr) (*(volatile uint64_t *) (((char*)(bar)) + (addr))) #define WR64_sleep(addr, value) *(uint64_t *) (((char*)(bar)) + (addr)) = (value); usleep (100); diff --git a/kernels.cu b/kernels.cu index 341bb59..3b5e1d7 100644 --- a/kernels.cu +++ b/kernels.cu @@ -17,8 +17,20 @@ __global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t desc[1] = 0; + // Reset counter +#ifdef USE_HW_CONTER + WR32 (REG_DMA, 0); + WR32 (REG_PERF_COUNTER, 0); + + WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); + + t1 = clock64(); + WR32 (REG_DMA, 1); +#else t1 = clock64(); WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); +#endif + do { if (++wait > 0x10000) break; @@ -26,6 +38,10 @@ __global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t t2 = clock64(); +#ifdef USE_HW_CONTER + WR32 (REG_PERF_COUNTER, 1); +#endif + null<<<1,1>>>(data); cudaDeviceSynchronize(); t3 = clock64(); -- cgit v1.2.1