summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorroot <root@ipepdvdev1.ipe.kit.edu>2016-05-25 06:35:59 +0200
committerroot <root@ipepdvdev1.ipe.kit.edu>2016-05-25 06:35:59 +0200
commita6f3e96f2cafc183ab29e53007a86bb968d654b8 (patch)
tree5248f06934289d65e366507890ebac3c7a4816ab
parentca1b1cea796bcfaeb86f201cf35065a606921cc1 (diff)
downloadgdrtest-a6f3e96f2cafc183ab29e53007a86bb968d654b8.tar.gz
gdrtest-a6f3e96f2cafc183ab29e53007a86bb968d654b8.tar.bz2
gdrtest-a6f3e96f2cafc183ab29e53007a86bb968d654b8.tar.xz
gdrtest-a6f3e96f2cafc183ab29e53007a86bb968d654b8.zip
Support bigger writes
-rw-r--r--CMakeLists.txt8
-rw-r--r--config.h37
-rw-r--r--gdr_test.cu48
-rw-r--r--ipedma.h4
-rw-r--r--kernels.cu52
-rw-r--r--kernels.h2
-rwxr-xr-xtest.sh12
7 files changed, 119 insertions, 44 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f3369d9..1290255 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,10 +6,16 @@ set(IPECAMERA_ABI_VERSION "0")
cmake_minimum_required(VERSION 2.6)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
-add_definitions("-fPIC --std=gnu99 -Wall -O2 -gdwarf-2 -g3 -fno-omit-frame-pointer")
+add_definitions("-fPIC --std=gnu99 -O2 -gdwarf-2 -g3 -fno-omit-frame-pointer")
find_package(CUDA REQUIRED)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_35,code=sm_35;-rdc=true)
+
+if (DEFINED SIZE)
+message(" * Setting size to ${SIZE}")
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-Xcompiler;-DSIZE=${SIZE})
+endif ()
+
set(CUDA_SEPARABLE_COMPILATION ON)
diff --git a/config.h b/config.h
index 7d95924..a0d29f3 100644
--- a/config.h
+++ b/config.h
@@ -1,13 +1,36 @@
-#define ITERS 100
-#define GPU_ITERS 100
+#define VERBOSE
+#define GPU_DESC
+#define USE_HW_CONTER
-#define TLP_SIZE 64
#define GPU_PAGE 65536
-#define PAGE_SIZE 4096
+#define MIN(a, b) (((a) > (b))?(b):(a))
+#define MAX(a, b) (((a) < (b))?(b):(a))
-#define VERBOSE
-#define GPU_DESC
+#ifdef SIZE
+# if SIZE >= 65536
+# define TLP_SIZE 64
+# define PAGE_SIZE 65536
+# define NUM_PAGES (SIZE / GPU_PAGE)
+# elif SIZE >= 256
+# define TLP_SIZE 64
+# define PAGE_SIZE SIZE
+# define NUM_PAGES 1
+# else
+# define TLP_SIZE (SIZE / 4)
+# define PAGE_SIZE SIZE
+# define NUM_PAGES 1
+# endif
-#define USE_HW_CONTER
+# define GPU_ITERS MIN(100, MAX(4, 4l * 1024 * 1024 * 1024 / SIZE))
+# define ITERS MIN(100, MAX(10, 4l * 1024 * 1024 * 1024 / SIZE))
+#else
+# define ITERS 100
+# define GPU_ITERS 100
+
+# define TLP_SIZE 64
+# define PAGE_SIZE 4096
+# define NUM_PAGES 4
+# define SIZE (NUM_PAGES * PAGE_SIZE)
+#endif
diff --git a/gdr_test.cu b/gdr_test.cu
index cc624fc..d0118d5 100644
--- a/gdr_test.cu
+++ b/gdr_test.cu
@@ -93,11 +93,11 @@ int main(int argc, char *argv[]) {
char gpu_name[30] = {0};
initAssert (cuDeviceGetName (gpu_name, 30, current_gpu));
- printf("GPU %i: %s\n", num_gpus, gpu_name);
+ printf(" GPU %i: %s\n", num_gpus, gpu_name);
if (strncmp (gpu_name, "Tesla K40", 9) == 0) {
- printf ("Found a Tesla GPU! I'll use that one.\n");
+ printf (" Found a Tesla GPU! I'll use that one.\n");
gpu = current_gpu;
break;
}
@@ -112,12 +112,12 @@ int main(int argc, char *argv[]) {
//(see https://www.cs.cmu.edu/afs/cs/academic/class/15668-s11/www/cuda-doc/html/group__CUDART__DRIVER.html)
unsigned int api_version;
initAssert (cuCtxGetApiVersion (context, &api_version));
- printf ("CUDA API Version: %u\n", api_version);
- printf ("CUDA init done\n\n");
+ printf (" CUDA API Version: %u\n", api_version);
+ //printf ("CUDA init done\n\n");
CUdevprop gpu_props;
initAssert(cuDeviceGetProperties(&gpu_props, gpu));
- printf ("Clock %lu KHz\n", gpu_props.clockRate);
+ printf (" GPU Clock %lu KHz\n", gpu_props.clockRate);
CUdeviceptr d_A, d_D;
initAssert(cuMemAlloc(&d_D, GPU_PAGE)); // Should be multiple of GPU page, or mapping of next allocation will segfault the gdrcopy module
@@ -149,7 +149,7 @@ int main(int argc, char *argv[]) {
int A_bar_off = A_info.va - d_A;
volatile uint32_t *A = (uint32_t *)((char *)A_bar_ptr + A_bar_off);
- printf("DevicePtr: %lx, GDR ptr: %p, Bus ptr: %lx, (Bar: %p, Offset: %i), VA: 0x%lx, Size: %lu, Page: %lu\n", d_A, A, A_info.bus_addr, A_bar_ptr, A_bar_off, A_info.va, A_info.mapped_size, A_info.page_size);
+ printf("\nDevicePtr: %lx, GDR ptr: %p, Bus ptr: %lx, (Bar: %p, Offset: %i), VA: 0x%lx, Size: %lu\n", d_A, A, A_info.bus_addr, A_bar_ptr, A_bar_off, A_info.va, A_info.mapped_size);
pcilib_t *pci;
volatile void *bar;
@@ -166,7 +166,7 @@ int main(int argc, char *argv[]) {
printf("map bar\n");
exit(1);
}
- printf("BAR mapped to: %p\n", bar);
+ //printf("BAR mapped to: %p\n", bar);
CUdeviceptr dBAR;
// initAssert (cuMemHostRegister ((void*)((((uintptr_t)bar)/65536)*65536), 65536, CU_MEMHOSTREGISTER_DEVICEMAP));
@@ -176,7 +176,7 @@ int main(int argc, char *argv[]) {
//initAssert (cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, dBAR));
bar_info = pcilib_get_bar_info(pci, BAR);
- printf("%p (Phys: 0x%lx, Size: 0x%x)\n", bar_info[BAR].virt_addr, bar_info[BAR].phys_addr, bar_info[BAR].size);
+ printf("Bar: %p (Phys: 0x%lx, Size: 0x%x)\n", bar_info[BAR].virt_addr, bar_info[BAR].phys_addr, bar_info[BAR].size);
pcilib_kmem_handle_t *kdesc_kmem = pcilib_alloc_kernel_memory (pci, PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
uintptr_t kdesc_bus = pcilib_kmem_get_block_ba (pci, kdesc_kmem, 0);
@@ -196,6 +196,7 @@ int main(int argc, char *argv[]) {
uintptr_t desc_bus = kdesc_bus;
#endif
+ printf("\nSize: %lu bytes (%lu %lu-byte descriptors with packet length set to %lu), GPU itertions: %lu, Iterations: %lu\n", SIZE, NUM_PAGES, PAGE_SIZE, TLP_SIZE, GPU_ITERS, ITERS);
memset ((uint32_t *)desc, 0, 5 * sizeof (uint32_t));
volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + 2 * sizeof(uint32_t));
@@ -205,16 +206,22 @@ int main(int argc, char *argv[]) {
WR32 (REG_RESET_DMA, 0);
usleep (100000);
- WR32 (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE));
- WR32 (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
- WR32 (REG_UPDATE_THRESHOLD, 0);
- WR64 (REG_UPDATE_ADDRESS, desc_bus);
- WR32 (REG_DMA, 1);
- WR32 (REG_INTERCONNECT, 0x232);
- WR32 (REG_COUNTER, 1);
+ WR32_sleep (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE));
+ WR32_sleep (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
+ WR32_sleep (REG_UPDATE_THRESHOLD, 1);
+ WR64_sleep (REG_UPDATE_COUNTER, desc_bus);
+ WR64_sleep (REG_UPDATE_ADDRESS, desc_bus + DESCRIPTOR_OFFSET);
+ WR32_sleep (REG_DMA, 1);
+ WR32_sleep (REG_INTERCONNECT, 0x232);
+ WR32_sleep (REG_COUNTER, 1);
+
+ usleep(100000);
#ifdef VERBOSE
- struct timespec tss, tse, tsk;
+ struct timespec tss, tsk;
+# ifndef GPU_DESC
+ struct timespec tse;
+# endif
#else
struct timeval tvs, tve;
#endif /* VERBOSE */
@@ -223,7 +230,7 @@ int main(int argc, char *argv[]) {
clock_gettime(CLOCK_REALTIME, &tss);
#ifdef GPU_DESC
- ipedma<<<1, 1>>>((void*)dBAR, A_info.bus_addr, (uint64_t*)d_D, (uint32_t*)d_A);
+ ipedma<<<1, 1>>>((void*)dBAR, A_info.bus_addr, (uint32_t*)d_D, (uint64_t*)(d_D + DESCRIPTOR_OFFSET), (uint32_t*)d_A);
#else
WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr);
// WR64 (REG_DESCRIPTOR_ADDRESS, kbuf_bus);
@@ -234,7 +241,8 @@ int main(int argc, char *argv[]) {
null<<<1, 1>>>((uint32_t*)d_A);
#endif
- cudaDeviceSynchronize();
+ err = cudaDeviceSynchronize();
+ if (err) printf("Oopps, synchronization error %i", err);
clock_gettime(CLOCK_REALTIME, &tsk);
@@ -246,7 +254,7 @@ int main(int argc, char *argv[]) {
# ifdef GPU_DESC
double lat = 1000. * kbuf[0] / gpu_props.clockRate;
double latk = 1000. * kbuf[1] / gpu_props.clockRate;
- double latc = ((tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS;
+ double latc = ((tsk.tv_sec - tss.tv_sec)*1000000. + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS;
# ifdef USE_HW_CONTER
double lath = 4. * RD32 (0x20) / 1000;
# else
@@ -288,7 +296,7 @@ int main(int argc, char *argv[]) {
pcilib_free_kernel_memory(pci, kdesc_kmem, KMEM_DEFAULT_FLAGS);
pcilib_close(pci);
- printf("PCI closed\n");
+ printf("\nPCI closed\n");
gdr_unmap(g, A_mh, A_bar_ptr, PAGE_SIZE);
diff --git a/ipedma.h b/ipedma.h
index 299499c..6daef4b 100644
--- a/ipedma.h
+++ b/ipedma.h
@@ -7,10 +7,14 @@
#define REG_DESCRIPTOR_ADDRESS 0x50
#define REG_UPDATE_ADDRESS 0x58
#define REG_UPDATE_THRESHOLD 0x60
+#define REG_UPDATE_COUNTER 0x70
#define REG_INTERCONNECT 0x9048
#define REG_COUNTER 0x9000
+
+#define DESCRIPTOR_OFFSET 256
+
#define WR32(addr, value) *(volatile uint32_t *) (((char*)(bar)) + (addr)) = (value);
#define RD32(addr) (*(volatile uint32_t *) (((char*)(bar)) + (addr)))
#define WR32_sleep(addr, value) *(volatile uint32_t *) (((char*)(bar)) + (addr)) = (value); usleep (100);
diff --git a/kernels.cu b/kernels.cu
index 3b5e1d7..f4ea114 100644
--- a/kernels.cu
+++ b/kernels.cu
@@ -7,34 +7,53 @@
__global__ void null(uint32_t *data) {
}
+__device__ void ksleep(uint32_t clocks) {
+ clock_t start = clock64(), now;
-__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t *desc, uint32_t *data) {
- int i;
+ do {
+ now = clock64();
+ } while ((start < now)&&((now - start) < clocks));
+}
+
+
+__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint32_t *counter, volatile uint64_t *desc, uint32_t *data) {
+ int i, j;
clock_t sum = 0, sumk = 0, t1, t2, t3;
for (i = 0; i < GPU_ITERS; i++) {
long wait = 0;
- desc[1] = 0;
+ // It would not work as we don't know in which order threads/blocks are executed. We also can't push 0 in all threads
+ // as we unsure if it will overwrite. In non-iterative use case we do not need to push zero and it could work.
+ // Single thread of block should poll and, then, we synchronize. Limiting amount of blocks will be good...
+ if ((threadIdx.x == 0)&&(blockIdx.x == 0)) {
+ // Reset counter
+ //desc[1] = 0;
+ *counter = 0;
- // Reset counter
#ifdef USE_HW_CONTER
- WR32 (REG_DMA, 0);
- WR32 (REG_PERF_COUNTER, 0);
+ WR32 (REG_DMA, 0);
+ WR32 (REG_PERF_COUNTER, 0);
- WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr);
+ for (j = 0; j < NUM_PAGES; j++) {
+ WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr);
+// ksleep(10000);
+ }
- t1 = clock64();
- WR32 (REG_DMA, 1);
+ t1 = clock64();
+ WR32 (REG_DMA, 1);
#else
- t1 = clock64();
- WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr);
+ t1 = clock64();
+ for (j = 0; j < NUM_PAGES; j++) {
+ WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr);
+// ksleep(10000);
+ }
#endif
-
+ }
do {
if (++wait > 0x10000) break;
- } while (desc[1] == 0);
+ } while (((*counter) < (NUM_PAGES))/*||(desc[1] == 0)*/);
t2 = clock64();
@@ -50,8 +69,11 @@ __global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t
sumk += t3 - t1;
}
- data[0] = sum / GPU_ITERS;
- data[1] = sumk / GPU_ITERS;
+ if ((threadIdx.x == 0)&&(blockIdx.x == 0)) {
+ data[0] = sum / GPU_ITERS;
+ data[1] = sumk / GPU_ITERS;
+ data[2] = *counter;
+ }
}
diff --git a/kernels.h b/kernels.h
index 74a0b44..9158401 100644
--- a/kernels.h
+++ b/kernels.h
@@ -1,5 +1,5 @@
__global__ void null(uint32_t *data);
-__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t *desc, uint32_t *data);
+__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint32_t *counter, volatile uint64_t *desc, uint32_t *data);
diff --git a/test.sh b/test.sh
new file mode 100755
index 0000000..50f2e8c
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,12 @@
+#! /bin/bash
+
+rm results.txt
+date > results.txt
+
+for pow in `seq 7 30`; do
+ size=`echo "2^$pow" | bc`
+ echo $size
+ cmake -DSIZE=$size
+ make
+ ./gdr_test >> results.txt
+done