#include #include #include "config.h" #include "ipedma.h" __global__ void null(uint32_t *data) { } __global__ void feedback(volatile void *bar, uint32_t *data) { WR32 (REG_PERF_COUNTER, 1); } __global__ void measure(int iter, volatile void *bar, uint64_t bus_addr, uint64_t *res, uint32_t *data) { // Clocks are incorrect as not running while waiting. //res[iter] = clock64(); // Thats does not work either (no RD support in kernels?) //res[iter] = RD32(REG_PERF_COUNTER); WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); if ((iter + 1) == GPU_ITERS) { WR32 (REG_PERF_COUNTER, 1); } } __device__ void ksleep(uint32_t clocks) { clock_t start = clock64(), now; do { now = clock64(); } while ((start < now)&&((now - start) < clocks)); } __global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint32_t *counter, volatile uint64_t *desc, uint32_t *data) { int i, j; clock_t sum = 0, sumk = 0, t1, t2, t3; for (i = 0; i < GPU_ITERS; i++) { long wait = 0; // It would not work as we don't know in which order threads/blocks are executed. We also can't push 0 in all threads // as we unsure if it will overwrite. In non-iterative use case we do not need to push zero and it could work. // Single thread of block should poll and, then, we synchronize. Limiting amount of blocks will be good... if ((threadIdx.x == 0)&&(blockIdx.x == 0)) { // Reset counter //desc[1] = 0; *counter = 0; #ifdef USE_HW_CONTER WR32 (REG_DMA, 0); WR32 (REG_PERF_COUNTER, 0); for (j = 0; j < NUM_PAGES; j++) { WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); // ksleep(10000); } t1 = clock64(); WR32 (REG_DMA, 1); #else t1 = clock64(); for (j = 0; j < NUM_PAGES; j++) { WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr); // ksleep(10000); } #endif } do { if (++wait > 0x10000) break; } while (((*counter) < (NUM_PAGES))/*||(desc[1] == 0)*/); t2 = clock64(); #ifdef USE_HW_CONTER WR32 (REG_PERF_COUNTER, 1); #endif null<<<1,1>>>(data); cudaDeviceSynchronize(); t3 = clock64(); sum += t2 - t1; sumk += t3 - t1; } if ((threadIdx.x == 0)&&(blockIdx.x == 0)) { data[0] = sum / GPU_ITERS; data[1] = sumk / GPU_ITERS; data[2] = *counter; } } /* __global__ void do_leet (int *rin) { *rin = 0x1337; } */