#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdarg.h>
#include <time.h>
#include <sched.h>
#include <sys/time.h>

#include <cuda.h>
#include <gdrapi.h>


#include <pcilib.h>
#include <pcilib/bar.h>
#include <pcilib/kmem.h>

#include "gpudirect.h"
#include "config.h"
#include "ipedma.h"
#include "kernels.h"

#define DEVICE "/dev/fpga0"

#define BAR PCILIB_BAR0


#define KMEM_DEFAULT_FLAGS      (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE)

#define KMEM_USE_RING           PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
#define KMEM_USE_DEFAULT        PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)

#define gdrAssert(ans) { gdrError((ans), __FILE__, __LINE__); }
inline int gdrError(int code, const char *file, int line)
{
    if (code != 0) 
    {
        fprintf(stderr,"GDRassert: %i %s %d\n",
                code, file, line);
        return code;
    } else {
        return 0;
    }
}


#define initAssert(ans) { initError((ans), __FILE__, __LINE__); }
inline int initError(CUresult code, const char *file, int line)
{
    if (code != CUDA_SUCCESS) 
    {
        const char *error = NULL;
        cuGetErrorString (code, &error);
        fprintf(stderr,"GPUassert: %s (Code: %i) %s %d\n",
                error, code, file, line);
        return code;
    } else {
        return 0;
    }
}


#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline int gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s (Code: %i) %s %d\n",
                cudaGetErrorString(code), code, file, line);
        return code;
    } else {
        return 0;
    }
}


int main(int argc, char *argv[]) {
    int err;

        //CUDA initialization
    initAssert (cuInit(0));

    int num_gpus;
    initAssert (cuDeviceGetCount (&num_gpus));
    printf ("Found %i GPUs on the system\n", num_gpus);


    CUdevice gpu; //will be used to find the correct GPU
    for (num_gpus--; num_gpus >= 0; num_gpus--) {

        CUdevice current_gpu;
        initAssert (cuDeviceGet (&current_gpu, num_gpus));

        char gpu_name[30] = {0};
        initAssert (cuDeviceGetName (gpu_name, 30, current_gpu));

        printf(" GPU %i: %s\n", num_gpus, gpu_name);


        if (strncmp (gpu_name, "Tesla K40", 9) == 0) {
            printf ("  Found a Tesla GPU! I'll use that one.\n");
            gpu = current_gpu;
            break;
        }
    }

        //The CU_CTX_MAP_HOST is what we are interested in! 
    CUcontext context;
    initAssert (cuCtxCreate (&context, CU_CTX_MAP_HOST | CU_CTX_SCHED_AUTO, gpu));
    initAssert (cuCtxSetCurrent (context));

        //NOTE: API Version 3010 is problematic
        //(see https://www.cs.cmu.edu/afs/cs/academic/class/15668-s11/www/cuda-doc/html/group__CUDART__DRIVER.html)
    unsigned int api_version;
    initAssert (cuCtxGetApiVersion (context, &api_version));
    printf (" CUDA API Version: %u\n", api_version);
    //printf ("CUDA init done\n\n");

    CUdevprop gpu_props;
    initAssert(cuDeviceGetProperties(&gpu_props, gpu));
    printf (" GPU Clock %lu KHz\n", gpu_props.clockRate);

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    
    cudaEvent_t events[GPU_ITERS];
    for (int i = 0; i < GPU_ITERS; i++)
	cudaEventCreate(&events[i]);

    CUdeviceptr d_A, d_D, d_RES;
    initAssert(cuMemAlloc(&d_D, GPU_PAGE));	// Should be multiple of GPU page, or mapping of next allocation will segfault the gdrcopy module
    initAssert(cuMemAlloc(&d_A, PAGE_SIZE));
    initAssert(cuMemAlloc(&d_RES, GPU_ITERS * sizeof(uint64_t)));
    unsigned int flag = 1;
    initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_D));
    initAssert(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_A));

    gdr_mh_t A_mh, D_mh;
    gdr_info_t A_info, D_info;
    void *A_bar_ptr = NULL; 
    void *D_bar_ptr = NULL;


    gdr_t g = gdr_open();
    gdrAssert(g == NULL);

    gdrAssert(gdr_pin_buffer(g, d_D, GPU_PAGE, 0, 0, &D_mh));
    gdrAssert(gdr_map(g, D_mh, &D_bar_ptr, GPU_PAGE));
    gdrAssert(gdr_get_info(g, D_mh, &D_info));

    gdrAssert(gdr_pin_buffer(g, d_A, PAGE_SIZE, 0, 0, &A_mh));
    gdrAssert(gdr_map(g, A_mh, &A_bar_ptr, PAGE_SIZE));
    gdrAssert(gdr_get_info(g, A_mh, &A_info));

    int D_bar_off = D_info.va - d_D;
    volatile uint32_t *D = (uint32_t *)((char *)D_bar_ptr + D_bar_off);

    int A_bar_off = A_info.va - d_A;
    volatile uint32_t *A = (uint32_t *)((char *)A_bar_ptr + A_bar_off);

    printf("\nDevicePtr: %lx, GDR ptr: %p, Bus ptr: %lx, (Bar: %p, Offset: %i), VA: 0x%lx, Size: %lu\n", d_A, A, A_info.bus_addr, A_bar_ptr, A_bar_off, A_info.va, A_info.mapped_size);

    pcilib_t *pci;
    volatile void *bar;
    const pcilib_bar_info_t *bar_info;

    pci = pcilib_open(DEVICE, PCILIB_MODEL_DETECT);
    if (!pci) {
        printf("pcilib_open\n");
        exit(1);
    }
    bar = pcilib_resolve_bar_address(pci, BAR, 0);
    if (!bar) {
        pcilib_close(pci);
        printf("map bar\n");
        exit(1);
    }
    //printf("BAR mapped to: %p\n", bar);

    CUdeviceptr dBAR;
//    initAssert (cuMemHostRegister ((void*)((((uintptr_t)bar)/65536)*65536), 65536, CU_MEMHOSTREGISTER_DEVICEMAP));
    initAssert (cuMemHostRegister ((void*)bar, 4096, CU_MEMHOSTREGISTER_IOMEMORY));
    initAssert (cuMemHostGetDevicePointer(&dBAR, (void*)bar, 0));
	// no effect
    //initAssert (cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, dBAR));

    bar_info = pcilib_get_bar_info(pci, BAR);
    printf("Bar: %p (Phys: 0x%lx, Size: 0x%x)\n", bar_info[BAR].virt_addr, bar_info[BAR].phys_addr, bar_info[BAR].size);

    pcilib_kmem_handle_t *kdesc_kmem = pcilib_alloc_kernel_memory (pci, PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
    uintptr_t kdesc_bus = pcilib_kmem_get_block_ba (pci, kdesc_kmem, 0);
    volatile void *kdesc = (uint32_t *) pcilib_kmem_get_block_ua (pci, kdesc_kmem, 0);


    pcilib_kmem_handle_t *kbuf_kmem = pcilib_alloc_kernel_memory(pci, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, ((PAGE_SIZE%4096)?(4096 * (1 + PAGE_SIZE/4096)):PAGE_SIZE), 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS);
    uintptr_t kbuf_bus = pcilib_kmem_get_block_ba (pci, kbuf_kmem, 0);
    volatile uint32_t *kbuf = (uint32_t *) pcilib_kmem_get_block_ua (pci, kbuf_kmem, 0);
    memset ((uint32_t *)kbuf, 0, PAGE_SIZE);

#ifdef GPU_DESC
    volatile void *desc = D;
    uintptr_t desc_bus = D_info.bus_addr;
#else
    volatile void *desc = kdesc;
    uintptr_t desc_bus = kdesc_bus;
#endif

    printf("\nSize: %lu bytes (%lu %lu-byte descriptors with packet length set to %lu), GPU itertions: %lu, Iterations: %lu\n", SIZE, NUM_PAGES, PAGE_SIZE, TLP_SIZE, GPU_ITERS, ITERS);

    memset ((uint32_t *)desc, 0, 5 * sizeof (uint32_t));
    volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + DESCRIPTOR_OFFSET + 2 * sizeof(uint32_t));

    WR32 (REG_RESET_DMA, 1);
    usleep (100000);
    WR32 (REG_RESET_DMA, 0);
    usleep (100000);

    WR32_sleep (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE));
    WR32_sleep (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
    WR32_sleep (REG_UPDATE_THRESHOLD, 1);
    WR64_sleep (REG_UPDATE_COUNTER, D_info.bus_addr);
    WR64_sleep (REG_UPDATE_ADDRESS, desc_bus + DESCRIPTOR_OFFSET);
    WR32_sleep (REG_DMA, 1);
    WR32_sleep (REG_INTERCONNECT, 0x232);
    WR32_sleep (REG_COUNTER, 1);
    
    usleep(100000);

#ifdef VERBOSE
    struct timespec tss, tsk;
# ifndef GPU_DESC
    struct timespec tse;
# endif
#else
    struct timeval tvs, tve;
#endif /* VERBOSE */

    for (int i = 0; i < ITERS; i++) {
	clock_gettime(CLOCK_REALTIME, &tss);

#ifdef GPU_DESC
	ipedma<<<1, 1>>>((void*)dBAR, A_info.bus_addr, (uint32_t*)d_D, (uint64_t*)(d_D + DESCRIPTOR_OFFSET), (uint32_t*)d_A);
#else
#ifdef CUDA8
	cudaDeviceSynchronize();
	
	*(uint32_t*)D = 0;
	WR32 (REG_DMA, 0);
	WR32 (REG_PERF_COUNTER, 0);

/*
	for (int j = 0; j < GPU_ITERS; j++) {
	    WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr);
	}
	WR32 (REG_DMA, 1);
	usleep(10000);
*/

	WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr);

	for (int j = 0; j < GPU_ITERS; j++) {
	    initAssert(cuStreamWaitValue32(stream, d_D, j + 1, CU_STREAM_WAIT_VALUE_GEQ|CU_STREAM_WAIT_VALUE_FLUSH));
	    measure<<<1, 1, 0, stream>>>(j, (void*)dBAR, A_info.bus_addr, (uint64_t*)d_RES,  (uint32_t*)d_A);
	    cudaEventRecord(events[j], stream);
	}
//	printf("D = %u\n", *(uint32_t*)D);

	    // Wait until all is pushed down.
	usleep(1000);

	clock_gettime(CLOCK_REALTIME, &tss);
	WR32 (REG_DMA, 1);
	memcpy(&tse, &tss, sizeof(struct timeval));

# else
//    WR64 (REG_DESCRIPTOR_ADDRESS, kbuf_bus);
	WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr);


	do {
	} while (*hwaddr == 0);
	clock_gettime(CLOCK_REALTIME, &tse);

	null<<<1, 1>>>((uint32_t*)d_A);
# endif
#endif
	err = cudaDeviceSynchronize();
	if (err) printf("Oopps, synchronization error %i", err);

	clock_gettime(CLOCK_REALTIME, &tsk);

	*hwaddr = 0;
	
#ifdef VERBOSE
        initAssert(cuMemcpyDtoH((void*)kbuf, d_A, PAGE_SIZE));

# ifdef USE_HW_CONTER
	double lath = 4. * RD32 (0x20) / 1000;
# else
	double lath = 0;
# endif

	double disp = 0, min = 1E+10, max = 0;
	long num = 0;
# ifdef GPU_DESC
	double lat = 1000. * kbuf[0] / gpu_props.clockRate;
	double latk = 1000. * kbuf[1] / gpu_props.clockRate;
	double latc = ((tsk.tv_sec - tss.tv_sec)*1000000. + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS;
# else
	double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.;
	double latk = (tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.;
	double latc = 0;
#ifdef CUDA8
	uint64_t res[GPU_ITERS];
	cudaMemcpy(res, (void*)d_RES, GPU_ITERS * sizeof(uint64_t),  cudaMemcpyDeviceToHost);
	printf("Iterations (us):");

	for (int j = 1; j < GPU_ITERS; j++) {
	    float ms;
	    cudaEventElapsedTime(&ms, events[j - 1], events[j]);
	    double lati = ms * 1000.;
//	    double lati = 4. * (res[j] - res[j - 1]) / 1000;
//	    double lati = 1000. * (res[j] - res[j - 1]) / gpu_props.clockRate; 

	    lat += lati;
	    if (j > 1) disp += pow(lat - lati * j, 2) / (j * (j - 1));

	    if (lati > max) max = lati;
	    if (lati < min) min = lati;
	    if (lati > 11) num++;

	    printf(" % 6.3lf", lati);
	}
	printf("\n");
	
	
	lat /= GPU_ITERS;
	latk /= GPU_ITERS;
	latc /= GPU_ITERS;
	lath /= GPU_ITERS;
	
	disp = sqrt(disp / (GPU_ITERS - 1));
	
#endif 
# endif

	printf("hw: % 8.3lf us, sw: % 8.3lf us (% 8.3lf - % 8.3lf / % 8.3lf / % 3lu), +krn: % 8.3lf us, total: % 8.3lf us: %x %x %x %x\n", lath, lat, min, max, disp, num, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]);
#else
	if (!i)  gettimeofday(&tvs, NULL);
#endif /* VERBOSE */
	
    }

#ifndef VERBOSE
    gettimeofday(&tve, NULL);
    size_t avglat = (tve.tv_sec - tvs.tv_sec)*1000000 + (tve.tv_usec - tvs.tv_usec);
    printf("Latency: %.3lf us (average for %i iterations)\n", 1. * avglat / ITERS, ITERS);
#endif /* VERBOSE */

    usleep(1000000);


    WR32 (REG_COUNTER, 0);
    WR32 (REG_DMA, 0);

    WR32 (REG_RESET_DMA, 1);
    usleep (100000);
    WR32 (REG_RESET_DMA, 0);
    usleep (100000);

    pcilib_free_kernel_memory(pci, kbuf_kmem,  KMEM_DEFAULT_FLAGS);
    pcilib_free_kernel_memory(pci, kdesc_kmem,  KMEM_DEFAULT_FLAGS);

    pcilib_close(pci);
    printf("\nPCI closed\n");


    gdr_unmap(g, A_mh, A_bar_ptr, PAGE_SIZE);
    gdr_unpin_buffer(g, A_mh);

    gdr_unmap(g, D_mh, D_bar_ptr, GPU_PAGE);
    gdr_unpin_buffer(g, D_mh);

    gdr_close(g);
    cuMemFree(d_RES);
    cuMemFree(d_A);
    cuMemFree(d_D);

    for (int i = 0; i < GPU_ITERS; i++)
	cudaEventDestroy(events[i]);

    cudaStreamDestroy(stream);

    printf("GDR closed\n");
}