summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--config.h3
-rw-r--r--gdr_test.cu106
-rw-r--r--gpudirect.h29
-rw-r--r--kernels.cu18
-rw-r--r--kernels.h3
5 files changed, 147 insertions, 12 deletions
diff --git a/config.h b/config.h
index a0d29f3..a461638 100644
--- a/config.h
+++ b/config.h
@@ -1,5 +1,6 @@
#define VERBOSE
-#define GPU_DESC
+//#define GPU_DESC
+#define CUDA8
#define USE_HW_CONTER
diff --git a/gdr_test.cu b/gdr_test.cu
index d0118d5..42f7cc5 100644
--- a/gdr_test.cu
+++ b/gdr_test.cu
@@ -14,6 +14,7 @@
#include <pcilib/bar.h>
#include <pcilib/kmem.h>
+#include "gpudirect.h"
#include "config.h"
#include "ipedma.h"
#include "kernels.h"
@@ -119,9 +120,17 @@ int main(int argc, char *argv[]) {
initAssert(cuDeviceGetProperties(&gpu_props, gpu));
printf (" GPU Clock %lu KHz\n", gpu_props.clockRate);
- CUdeviceptr d_A, d_D;
+ cudaStream_t stream;
+ cudaStreamCreate(&stream);
+
+ cudaEvent_t events[GPU_ITERS];
+ for (int i = 0; i < GPU_ITERS; i++)
+ cudaEventCreate(&events[i]);
+
+ CUdeviceptr d_A, d_D, d_RES;
initAssert(cuMemAlloc(&d_D, GPU_PAGE)); // Should be multiple of GPU page, or mapping of next allocation will segfault the gdrcopy module
initAssert(cuMemAlloc(&d_A, PAGE_SIZE));
+ initAssert(cuMemAlloc(&d_RES, GPU_ITERS * sizeof(uint64_t)));
unsigned int flag = 1;
initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_D));
initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_A));
@@ -199,7 +208,7 @@ int main(int argc, char *argv[]) {
printf("\nSize: %lu bytes (%lu %lu-byte descriptors with packet length set to %lu), GPU itertions: %lu, Iterations: %lu\n", SIZE, NUM_PAGES, PAGE_SIZE, TLP_SIZE, GPU_ITERS, ITERS);
memset ((uint32_t *)desc, 0, 5 * sizeof (uint32_t));
- volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + 2 * sizeof(uint32_t));
+ volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + DESCRIPTOR_OFFSET + 2 * sizeof(uint32_t));
WR32 (REG_RESET_DMA, 1);
usleep (100000);
@@ -209,7 +218,7 @@ int main(int argc, char *argv[]) {
WR32_sleep (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE));
WR32_sleep (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
WR32_sleep (REG_UPDATE_THRESHOLD, 1);
- WR64_sleep (REG_UPDATE_COUNTER, desc_bus);
+ WR64_sleep (REG_UPDATE_COUNTER, D_info.bus_addr);
WR64_sleep (REG_UPDATE_ADDRESS, desc_bus + DESCRIPTOR_OFFSET);
WR32_sleep (REG_DMA, 1);
WR32_sleep (REG_INTERCONNECT, 0x232);
@@ -232,14 +241,48 @@ int main(int argc, char *argv[]) {
#ifdef GPU_DESC
ipedma<<<1, 1>>>((void*)dBAR, A_info.bus_addr, (uint32_t*)d_D, (uint64_t*)(d_D + DESCRIPTOR_OFFSET), (uint32_t*)d_A);
#else
+#ifdef CUDA8
+ cudaDeviceSynchronize();
+
+ *(uint32_t*)D = 0;
+ WR32 (REG_DMA, 0);
+ WR32 (REG_PERF_COUNTER, 0);
+
+/*
+ for (int j = 0; j < GPU_ITERS; j++) {
+ WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr);
+ }
+ WR32 (REG_DMA, 1);
+ usleep(10000);
+*/
+
WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr);
+
+ for (int j = 0; j < GPU_ITERS; j++) {
+ initAssert(cuStreamWaitValue32(stream, d_D, j + 1, CU_STREAM_WAIT_VALUE_GEQ|CU_STREAM_WAIT_VALUE_FLUSH));
+ measure<<<1, 1, 0, stream>>>(j, (void*)dBAR, A_info.bus_addr, (uint64_t*)d_RES, (uint32_t*)d_A);
+ cudaEventRecord(events[j], stream);
+ }
+// printf("D = %u\n", *(uint32_t*)D);
+
+ // Wait until all is pushed down.
+ usleep(1000);
+
+ clock_gettime(CLOCK_REALTIME, &tss);
+ WR32 (REG_DMA, 1);
+ memcpy(&tse, &tss, sizeof(struct timeval));
+
+# else
// WR64 (REG_DESCRIPTOR_ADDRESS, kbuf_bus);
+ WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr);
+
do {
} while (*hwaddr == 0);
clock_gettime(CLOCK_REALTIME, &tse);
null<<<1, 1>>>((uint32_t*)d_A);
+# endif
#endif
err = cudaDeviceSynchronize();
if (err) printf("Oopps, synchronization error %i", err);
@@ -251,23 +294,57 @@ int main(int argc, char *argv[]) {
#ifdef VERBOSE
initAssert(cuMemcpyDtoH((void*)kbuf, d_A, PAGE_SIZE));
-# ifdef GPU_DESC
- double lat = 1000. * kbuf[0] / gpu_props.clockRate;
- double latk = 1000. * kbuf[1] / gpu_props.clockRate;
- double latc = ((tsk.tv_sec - tss.tv_sec)*1000000. + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS;
# ifdef USE_HW_CONTER
double lath = 4. * RD32 (0x20) / 1000;
# else
double lath = 0;
# endif
-#else
+
+ double disp = 0, min = 1E+10, max = 0;
+ long num = 0;
+# ifdef GPU_DESC
+ double lat = 1000. * kbuf[0] / gpu_props.clockRate;
+ double latk = 1000. * kbuf[1] / gpu_props.clockRate;
+ double latc = ((tsk.tv_sec - tss.tv_sec)*1000000. + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS;
+# else
double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.;
double latk = (tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.;
double latc = 0;
- double lath = 0;
-#endif
+#ifdef CUDA8
+ uint64_t res[GPU_ITERS];
+ cudaMemcpy(res, (void*)d_RES, GPU_ITERS * sizeof(uint64_t), cudaMemcpyDeviceToHost);
+ printf("Iterations (us):");
+
+ for (int j = 1; j < GPU_ITERS; j++) {
+ float ms;
+ cudaEventElapsedTime(&ms, events[j - 1], events[j]);
+ double lati = ms * 1000.;
+// double lati = 4. * (res[j] - res[j - 1]) / 1000;
+// double lati = 1000. * (res[j] - res[j - 1]) / gpu_props.clockRate;
+
+ lat += lati;
+ if (j > 1) disp += pow(lat - lati * j, 2) / (j * (j - 1));
+
+ if (lati > max) max = lati;
+ if (lati < min) min = lati;
+ if (lati > 11) num++;
+
+ printf(" % 6.3lf", lati);
+ }
+ printf("\n");
+
+
+ lat /= GPU_ITERS;
+ latk /= GPU_ITERS;
+ latc /= GPU_ITERS;
+ lath /= GPU_ITERS;
+
+ disp = sqrt(disp / (GPU_ITERS - 1));
+
+#endif
+# endif
- printf("hw: % 6.3lf us, sw: % 6.3lf us, +krn: % 6.3lf us, total: % 7.3lf us: %x %x %x %x\n", lath, lat, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]);
+ printf("hw: % 8.3lf us, sw: % 8.3lf us (% 8.3lf - % 8.3lf / % 8.3lf / % 3lu), +krn: % 8.3lf us, total: % 8.3lf us: %x %x %x %x\n", lath, lat, min, max, disp, num, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]);
#else
if (!i) gettimeofday(&tvs, NULL);
#endif /* VERBOSE */
@@ -306,7 +383,14 @@ int main(int argc, char *argv[]) {
gdr_unpin_buffer(g, D_mh);
gdr_close(g);
+ cuMemFree(d_RES);
cuMemFree(d_A);
cuMemFree(d_D);
+
+ for (int i = 0; i < GPU_ITERS; i++)
+ cudaEventDestroy(events[i]);
+
+ cudaStreamDestroy(stream);
+
printf("GDR closed\n");
}
diff --git a/gpudirect.h b/gpudirect.h
new file mode 100644
index 0000000..9449ff9
--- /dev/null
+++ b/gpudirect.h
@@ -0,0 +1,29 @@
+enum CU_STREAM_WAIT_FLAGS {
+ CU_STREAM_WAIT_VALUE_GEQ = 0x0,
+ CU_STREAM_WAIT_VALUE_EQ = 0x1,
+ CU_STREAM_WAIT_VALUE_AND = 0x2,
+ CU_STREAM_WAIT_VALUE_FLUSH = 1<<30
+};
+
+enum CU_STREAM_WRITE_FLAGS {
+ CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1
+};
+
+enum CU_STREAM_MEM_OP_FLAGS {
+ CU_STREAM_MEM_OP_WAIT_VALUE_32 = 1,
+ CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2,
+ CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3
+};
+
+# ifdef __cplusplus
+extern "C" {
+# endif
+
+CUresult cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, uint32_t value, unsigned int flags);
+CUresult cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, uint32_t value, unsigned int flags);
+//CUresult cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+
+
+# ifdef __cplusplus
+}
+# endif
diff --git a/kernels.cu b/kernels.cu
index f4ea114..7e7e689 100644
--- a/kernels.cu
+++ b/kernels.cu
@@ -7,6 +7,24 @@
__global__ void null(uint32_t *data) {
}
+__global__ void feedback(volatile void *bar, uint32_t *data) {
+ WR32 (REG_PERF_COUNTER, 1);
+}
+
+__global__ void measure(int iter, volatile void *bar, uint64_t bus_addr, uint64_t *res, uint32_t *data) {
+ // Clocks are incorrect as not running while waiting.
+ //res[iter] = clock64();
+ // Thats does not work either (no RD support in kernels?)
+ //res[iter] = RD32(REG_PERF_COUNTER);
+
+ WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr);
+
+ if ((iter + 1) == GPU_ITERS) {
+ WR32 (REG_PERF_COUNTER, 1);
+ }
+}
+
+
__device__ void ksleep(uint32_t clocks) {
clock_t start = clock64(), now;
diff --git a/kernels.h b/kernels.h
index 9158401..12eaa5e 100644
--- a/kernels.h
+++ b/kernels.h
@@ -1,4 +1,7 @@
__global__ void null(uint32_t *data);
+__global__ void feedback(volatile void *bar, uint32_t *data);
+__global__ void measure(int iter, volatile void *bar, uint64_t bus_addr, uint64_t *res, uint32_t *data);
+
__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint32_t *counter, volatile uint64_t *desc, uint32_t *data);