#include <cuda.h>
#include <stdint.h>

#include "config.h"
#include "ipedma.h"

__global__ void null(uint32_t *data) {
}

__global__ void feedback(volatile void *bar, uint32_t *data) {
    WR32 (REG_PERF_COUNTER, 1);
}

__global__ void measure(int iter, volatile void *bar, uint64_t bus_addr, uint64_t *res, uint32_t *data) {
	// Clocks are incorrect as not running while waiting.
    //res[iter] = clock64();
	// Thats does not work either (no RD support in kernels?)
    //res[iter] = RD32(REG_PERF_COUNTER);

    WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr);

    if ((iter + 1) == GPU_ITERS) {
	WR32 (REG_PERF_COUNTER, 1);
    }
}


__device__ void ksleep(uint32_t clocks) {
    clock_t start = clock64(), now;

    do {
	now = clock64();
    } while ((start < now)&&((now - start) < clocks));
}


__global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint32_t *counter, volatile uint64_t *desc, uint32_t *data) {
    int i, j;
    clock_t sum = 0, sumk = 0, t1, t2, t3;

    for (i = 0; i < GPU_ITERS; i++) {
	long wait = 0;

	    // It would not work as we don't know in which order threads/blocks are executed. We also can't push 0 in all threads
	    // as we unsure if it will overwrite. In non-iterative use case we do not need to push zero and it could work.
	    // Single thread of block should poll and, then, we synchronize. Limiting amount of blocks will be good...
	if ((threadIdx.x == 0)&&(blockIdx.x == 0)) {
		// Reset counter
	    //desc[1] = 0;
	    *counter = 0;

#ifdef USE_HW_CONTER
	    WR32 (REG_DMA, 0);
	    WR32 (REG_PERF_COUNTER, 0);

	    for (j = 0; j <  NUM_PAGES; j++) {
		WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr);
//	    	ksleep(10000);
	    }

    	    t1 = clock64();
	    WR32 (REG_DMA, 1);
#else
    	    t1 = clock64();
    	    for (j = 0; j <  NUM_PAGES; j++) {
		WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr);
//	    	ksleep(10000);
	    }
#endif
	}

        do {
	    if (++wait > 0x10000) break;
	} while (((*counter) < (NUM_PAGES))/*||(desc[1] == 0)*/);

	t2 = clock64();

#ifdef USE_HW_CONTER
	WR32 (REG_PERF_COUNTER, 1);
#endif

	null<<<1,1>>>(data);
	cudaDeviceSynchronize();
	t3 = clock64();

	sum += t2 - t1;
	sumk += t3 - t1;
    }

    if ((threadIdx.x == 0)&&(blockIdx.x == 0)) {
	data[0] = sum / GPU_ITERS;
	data[1] = sumk / GPU_ITERS;
	data[2] = *counter;
    }
}


/*
__global__ void do_leet (int *rin) {
    *rin = 0x1337;
}
*/