summaryrefslogtreecommitdiffstats
path: root/kernels.cu
diff options
context:
space:
mode:
Diffstat (limited to 'kernels.cu')
-rw-r--r--kernels.cu52
1 files changed, 37 insertions, 15 deletions
diff --git a/kernels.cu b/kernels.cu
index 3b5e1d7..f4ea114 100644
--- a/kernels.cu
+++ b/kernels.cu
@@ -7,34 +7,53 @@
__global__ void null(uint32_t *data) {
}
+__device__ void ksleep(uint32_t clocks) {
+ clock_t start = clock64(), now;
-__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t *desc, uint32_t *data) {
- int i;
+ do {
+ now = clock64();
+ } while ((start < now)&&((now - start) < clocks));
+}
+
+
+__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint32_t *counter, volatile uint64_t *desc, uint32_t *data) {
+ int i, j;
clock_t sum = 0, sumk = 0, t1, t2, t3;
for (i = 0; i < GPU_ITERS; i++) {
long wait = 0;
- desc[1] = 0;
+ // It would not work as we don't know in which order threads/blocks are executed. We also can't push 0 in all threads
+ // as we unsure if it will overwrite. In non-iterative use case we do not need to push zero and it could work.
+ // Single thread of block should poll and, then, we synchronize. Limiting amount of blocks will be good...
+ if ((threadIdx.x == 0)&&(blockIdx.x == 0)) {
+ // Reset counter
+ //desc[1] = 0;
+ *counter = 0;
- // Reset counter
#ifdef USE_HW_CONTER
- WR32 (REG_DMA, 0);
- WR32 (REG_PERF_COUNTER, 0);
+ WR32 (REG_DMA, 0);
+ WR32 (REG_PERF_COUNTER, 0);
- WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr);
+ for (j = 0; j < NUM_PAGES; j++) {
+ WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr);
+// ksleep(10000);
+ }
- t1 = clock64();
- WR32 (REG_DMA, 1);
+ t1 = clock64();
+ WR32 (REG_DMA, 1);
#else
- t1 = clock64();
- WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr);
+ t1 = clock64();
+ for (j = 0; j < NUM_PAGES; j++) {
+ WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr);
+// ksleep(10000);
+ }
#endif
-
+ }
do {
if (++wait > 0x10000) break;
- } while (desc[1] == 0);
+ } while (((*counter) < (NUM_PAGES))/*||(desc[1] == 0)*/);
t2 = clock64();
@@ -50,8 +69,11 @@ __global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t
sumk += t3 - t1;
}
- data[0] = sum / GPU_ITERS;
- data[1] = sumk / GPU_ITERS;
+ if ((threadIdx.x == 0)&&(blockIdx.x == 0)) {
+ data[0] = sum / GPU_ITERS;
+ data[1] = sumk / GPU_ITERS;
+ data[2] = *counter;
+ }
}