// https://shanetully.com/2014/12/translating-virtual-addresses-to-physcial-addresses-in-user-space/

//http://www.linuxforums.org/forum/programming-scripting/202000-dev-mem-access-c-program.html
//https://stackoverflow.com/questions/10733816/read-and-write-process-memory-through-dev-mem-text-segment-works-but-data-seg
//https://github.com/hackndev/tools/blob/master/devmem2.c
// https://cboard.cprogramming.com/c-programming/176305-reading-writing-dev-mem.html

//=============================================================================================
// To run this program, first reserve a portion of memory exclusively for the FPGA (in U-Boot, limit the memory available to Linux).
// Then change the physical addresses below to read from and write to that physical memory.
// 
// This program reads an image and then writes it to the physical memory.
// Then it activates the DMA controller in the FPGA to copy the image from one address to another.
// From Linux, the program then reads the image back from the second address and saves it.
//=============================================================================================

#define DEBUG 1

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>

#include "hwlib.h"
#include "socal/socal.h"
#include "socal/hps.h"
#include "socal/alt_gpio.h"

#include "hps_0.h"

//#include "sgdma.h"	// Scatter-Gather DMA
#include "dma.h"		// DMA
//=============================================================================================
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
//=============================================================================================
// settings for the lightweight HPS-to-FPGA bridge
// The ALT_STM_OFST starts at 0xfc000000 and the HW_REGS_SPAN of 0x04000000 occupies all the physical space until the end
// The lightweight HPS-to-FPGA bridge starts at 0xff200000 -> ALT_LWFPGASLVS_OFST
#define HW_REGS_BASE ( ALT_STM_OFST )
#define HW_REGS_SPAN ( 0x04000000 ) //64 MB with 32 bit adress space this is 256 MB
#define HW_REGS_MASK ( HW_REGS_SPAN - 1 )
//=============================================================================================
//setting for the HPS2FPGA AXI Bridge
#define ALT_AXI_FPGASLVS_OFST (0xC0000000) // axi_master
#define HW_FPGA_AXI_SPAN (0x40000000) // Bridge span 1GB
#define HW_FPGA_AXI_MASK ( HW_FPGA_AXI_SPAN - 1 )
//=============================================================================================
// Activate the FPGA-to-SDRAM bridge
//SDRAMC beginning address
// Manual CycloneV page 833-834
#define SDRAMC_REGS_BASE	0xFFC20000	// From hps.h file it is the ALT_SDR_OFST = 0xFFC20000
#define SDRAMC_REGS_SPAN	0x00100000
#define SDRAMC_REGS_MASK	( SDRAMC_REGS_SPAN - 1 )
//Offset of FPGA-to-SDRAMC ports reset register from the beginning of SDRAMC
#define FPGAPORTRST 		0x00005080

//Remove FPGA-to-SDRAMC ports from reset so FPGA can access SDRAM through them
//*((unsigned int *)(SDRAMC_REGS + FPGAPORTRST)) = 0xFFFF;
//=============================================================================================

void* create_buffer(size_t buf_size);
int open_memory(void);
unsigned char *get_sending_weight(uint16_t in_channel, uint16_t out_channel, uint16_t epoch, unsigned char *image_data);
unsigned char *get_sending_bias(uint16_t in_channel, uint16_t out_channel, uint16_t epoch, unsigned char *image_data);
float f16_to_f32(uint16_t __x);

/**
 * Translate a peripheral offset on the lightweight HPS-to-FPGA bridge into a
 * pointer inside an already mapped register window.
 *
 * @param virtual_base  Start of the mapped register window (main() maps
 *                      HW_REGS_SPAN bytes at HW_REGS_BASE for this purpose).
 * @param pio_addr      Offset of the peripheral relative to ALT_LWFPGASLVS_OFST.
 * @return Pointer to the 32-bit word at that peripheral's base address.
 */
uint32_t * get_pio(void *virtual_base, unsigned long pio_addr) {
    /* Byte offset of the peripheral inside the mapped window. */
    unsigned long window_offset =
        ( unsigned long )( ALT_LWFPGASLVS_OFST + pio_addr ) & ( unsigned long )( HW_REGS_MASK );

    /* Arithmetic on void* is a GNU extension, so step through a byte pointer. */
    return (uint32_t *)( (unsigned char *)virtual_base + window_offset );
}

/*
 * Run one blocking transfer on the DMA controller located at DMA_0_BASE.
 *
 * The controller is programmed to read from physical_addr1 and write to
 * address 0 (main() labels this step "SDRAM -> OCM"), then the function waits
 * in waitDMAFinish() until the transfer is reported complete.
 *
 * virtual_base    Start of the mapped register window that contains the
 *                 lightweight HPS-to-FPGA bridge.
 * physical_addr1  Source address written to the DMA read-address register.
 *
 * The exact order of register writes below is deliberate; do not reorder.
 */
void dma_h2f(void *virtual_base, unsigned long physical_addr1) {
    // Fixed transfer length handed to the DMA length register.
    // NOTE(review): main() announces (32*32+10)*2 = 2068 bytes to the FPGA for
    // layer 0, which is more than the 2048 transferred here - confirm whether
    // the last bytes of the tile are meant to be copied.
    int dma_h2f_size = 2048;

    // Create a pointer to the DMA controller base inside the mapped window
    void *h2p_lw_dma_addr0 = NULL;
    h2p_lw_dma_addr0 = virtual_base + ( ( unsigned long  )( ALT_LWFPGASLVS_OFST + DMA_0_BASE ) & ( unsigned long)( HW_REGS_MASK ) );

    // Clear the DMA control and status registers before programming
    clearDMAcontrol(h2p_lw_dma_addr0);
    _DMA_REG_STATUS(h2p_lw_dma_addr0) = 0;

    // Debug aid (disabled): dump the controller state before starting.
    // printf("\n");
    // printf("++++Before start DMA0++++\n");
    // debugPrintDMARegister(h2p_lw_dma_addr0);
    // printf("\n");
    // debugPrintDMAStatus(h2p_lw_dma_addr0);
    // printf("++++++++++++++++++++++++\n\n");

    // Optional software reset of the DMA (disabled).
    // Original author's note: if a reset is performed, a usleep is needed
    // between the two writes and another one after them. These usleeps MUST be
    // exactly here, otherwise it does not work; the reason is unknown.
    // (The note mentions 10000 microseconds, the snippet below uses 100000.)
    /*
    _DMA_REG_CONTROL(h2p_lw_dma_addr0) = _DMA_CTR_SOFTWARERESET;
    usleep(100000);
    _DMA_REG_CONTROL(h2p_lw_dma_addr0) = _DMA_CTR_SOFTWARERESET;
    usleep(100000);
    */

    // Original author's note: this usleep was first believed to be mandatory
    // here. Correction: it worked without this usleep.
    //usleep(2*100000);
    
    _DMA_REG_STATUS(h2p_lw_dma_addr0) = 0;
    _DMA_REG_READ_ADDR(h2p_lw_dma_addr0)  = physical_addr1;  // source: caller-supplied physical address
    _DMA_REG_WRITE_ADDR(h2p_lw_dma_addr0) = 0;  // destination: address 0 as seen by the DMA (presumably the on-chip memory - TODO confirm)
    _DMA_REG_LENGTH(h2p_lw_dma_addr0) = dma_h2f_size;			// transfer length; presumably in bytes given _DMA_CTR_BYTE below - TODO confirm

    // Start the transfer.
    // NOTE(review): flag names suggest byte-wide access, go, and stop-at-length - confirm against dma.h
    _DMA_REG_CONTROL(h2p_lw_dma_addr0) = _DMA_CTR_BYTE | _DMA_CTR_GO | _DMA_CTR_LEEN;
    


    // Block until the DMA reports completion
    waitDMAFinish(h2p_lw_dma_addr0);
    // Debug aid (disabled): dump the controller state after the transfer.
    // printf("++++DMA0 Status++++\n");
    // debugPrintDMARegister(h2p_lw_dma_addr0);
    // printf("\n");
    // debugPrintDMAStatus(h2p_lw_dma_addr0);
    // printf("++++++++++++++++++\n");
    // printf("SDRAM->OCM Finished\n");

    // Leave the controller idle: clear control and status again
    clearDMAcontrol(h2p_lw_dma_addr0);
    _DMA_REG_STATUS(h2p_lw_dma_addr0) = 0;
}

/*
 * Run one blocking transfer on the DMA controller located at DMA_1_BASE.
 *
 * The controller is programmed to read from address 0 and write to
 * physical_addr2 (main() labels this step "OCM -> SDRAM"), then the function
 * waits in waitDMAFinish() until the transfer is reported complete.
 *
 * virtual_base    Start of the mapped register window that contains the
 *                 lightweight HPS-to-FPGA bridge.
 * physical_addr2  Destination address written to the DMA write-address register.
 *
 * Unlike dma_h2f(), the control and status registers are not cleared again
 * after the transfer finishes.
 */
void dma_f2h(void *virtual_base, unsigned long physical_addr2) {
        // Create a pointer to the DMA controller base inside the mapped window
    void *h2p_lw_dma_addr1 = NULL;
    h2p_lw_dma_addr1 = virtual_base + ( ( unsigned long  )( ALT_LWFPGASLVS_OFST + DMA_1_BASE ) & ( unsigned long)( HW_REGS_MASK ) );

    // Clear the DMA control and status registers before programming
    clearDMAcontrol(h2p_lw_dma_addr1);
    _DMA_REG_STATUS(h2p_lw_dma_addr1) = 0;

    // Debug aid (disabled): dump the controller state before starting.
    // printf("++++Before start DMA1++++\n");
    // debugPrintDMARegister(h2p_lw_dma_addr1);
    // printf("\n");
    // debugPrintDMAStatus(h2p_lw_dma_addr1);
    // printf("++++++++++++++++++++++++\n\n");

    _DMA_REG_STATUS(h2p_lw_dma_addr1) = 0;
    _DMA_REG_READ_ADDR(h2p_lw_dma_addr1)  = 0;  				// source: address 0 as seen by the DMA (OCM according to the original note)
    _DMA_REG_WRITE_ADDR(h2p_lw_dma_addr1) = physical_addr2;  	// destination: caller-supplied physical address in SDRAM (DDR3)
    _DMA_REG_LENGTH(h2p_lw_dma_addr1) = 4000;					// fixed transfer length, in bytes according to the original note

    // Start the transfer.
    // NOTE(review): flag names suggest byte-wide access, go, and stop-at-length - confirm against dma.h
    _DMA_REG_CONTROL(h2p_lw_dma_addr1) = _DMA_CTR_BYTE | _DMA_CTR_GO | _DMA_CTR_LEEN;

    // Block until the DMA reports completion
    waitDMAFinish(h2p_lw_dma_addr1);
    // Debug aid (disabled): dump the controller state after the transfer.
    // printf("++++DMA1 Status++++\n");
    // debugPrintDMARegister(h2p_lw_dma_addr1);
    // printf("\n");
    // debugPrintDMAStatus(h2p_lw_dma_addr1);
    // printf("++++++++++++++++++\n");

}

void print_mem(void * mem, int offset, int len);

/*
 * In-place rectified linear unit.
 *
 * Every element of image_data[0 .. total-1] that compares below zero is
 * overwritten with zero; all other elements are left untouched.
 */
void relu(float *image_data, int total){
    int idx = 0;

    while (idx < total) {
        float *slot = &image_data[idx];
        idx++;

        if (*slot < 0) {
            *slot = 0;
        }
    }
}

/*
 * Convert one float to IEEE 754 half precision (binary16) bit pattern.
 *
 * Normal values are truncated (not rounded) to 10 mantissa bits. Values too
 * small for a normal half are flushed to signed zero, values too large become
 * signed infinity, and NaN stays NaN. Kept private to max_pooling() so this
 * block does not depend on any other part of the file.
 */
static uint16_t pool_float_to_half(float value)
{
    uint32_t bits;
    memcpy(&bits, &value, sizeof bits);  /* type-pun without breaking strict aliasing */

    const uint32_t sign = (bits >> 16) & 0x8000u;
    const uint32_t exponent = (bits >> 23) & 0xffu;   /* biased by 127 */
    const uint32_t mantissa = bits & 0x007fffffu;

    if (exponent == 0xffu) {
        /* Inf or NaN; force a mantissa bit so a NaN cannot collapse to Inf. */
        return (uint16_t)(sign | 0x7c00u | (mantissa ? 0x0200u : 0u));
    }
    if (exponent > 142u) {
        /* Unbiased exponent above +15: not representable, saturate to Inf. */
        return (uint16_t)(sign | 0x7c00u);
    }
    if (exponent < 113u) {
        /* Unbiased exponent below -14 (includes 0.0): flush to signed zero. */
        return (uint16_t)sign;
    }
    /* Re-bias the exponent from 127 to 15 and drop the low mantissa bits. */
    return (uint16_t)(sign | ((exponent - 112u) << 10) | (mantissa >> 13));
}

/*
 * 2x2 max pooling with stride 2 over a channel-major float feature map.
 *
 * image_data  channel planes of input_dim x input_dim floats, stored row by
 *             row, one plane after the other.
 * input_dim   edge length of each input plane; a trailing odd row/column is
 *             ignored.
 * channel     number of planes.
 *
 * Returns a newly malloc'ed array of channel * (input_dim/2) * (input_dim/2)
 * half-precision values laid out the same way (plane after plane, row by
 * row). The caller owns the buffer and must free() it. Returns NULL when the
 * arguments are unusable or the allocation fails.
 */
uint16_t *max_pooling(float *image_data, int input_dim, int channel)
{
    if (image_data == NULL || input_dim < 2 || channel <= 0) {
        return NULL;
    }

    const int out_dim = input_dim / 2;
    const size_t in_plane = (size_t)input_dim * (size_t)input_dim;
    const size_t out_plane = (size_t)out_dim * (size_t)out_dim;

    uint16_t *res_fp16 = malloc((size_t)channel * out_plane * sizeof *res_fp16);
    if (res_fp16 == NULL) {
        return NULL;
    }

    for (int k = 0; k < channel; k++) {
        const float *plane = image_data + (size_t)k * in_plane;
        uint16_t *out = res_fp16 + (size_t)k * out_plane;

        for (int row = 0; row < out_dim; row++) {
            for (int col = 0; col < out_dim; col++) {
                /* Top-left corner of the 2x2 window in the input plane. */
                const float *win = plane + (size_t)(2 * row) * input_dim + 2 * col;

                /* Running maximum: always yields a value, even with NaNs. */
                float best = win[0];
                if (win[1] > best) {
                    best = win[1];
                }
                if (win[input_dim] > best) {
                    best = win[input_dim];
                }
                if (win[input_dim + 1] > best) {
                    best = win[input_dim + 1];
                }

                /* Output rows are out_dim wide, not input_dim wide. */
                out[(size_t)row * out_dim + col] = pool_float_to_half(best);
            }
        }
    }
    return res_fp16;
}

/* Constants shared by main() and its private helpers. */
enum {
    CNN_INPUT_DIM      = 32,  /* input images are processed as 32x32 pixels */
    CNN_INPUT_CHANNELS = 3,   /* B, G, R planes */
    CNN_KERNEL_HALVES  = 9,   /* 3x3 kernel, one fp16 value per tap */
    CNN_HEADER_HALVES  = 10,  /* 9 weights + 1 bias precede each feature map tile */
    /* fp16 parameters consumed by layers 0..5: the sum of the first six
     * per-layer counts used by get_sending_weight()/get_sending_bias(). */
    CNN_PARAM_HALVES   = 1792 + 73856 + 295168 + 590080 + 1180160 + 2359808
};

/* Geometry of one convolution layer as it is streamed through the FPGA. */
struct cnn_layer {
    int in_channels;   /* number of input feature maps */
    int out_channels;  /* number of output feature maps */
    int fm_dim;        /* edge length of every feature map in this layer */
    int pool;          /* non-zero: follow the layer with 2x2 max pooling */
};

/* Everything needed to push one tile through the FPGA and read it back. */
struct fpga_link {
    void *virtual_base;                 /* mapped register window (lightweight bridge) */
    void *sdram_in;                     /* mapping of phys_in: data sent to the FPGA */
    void *sdram_out;                    /* mapping of phys_out: data coming back */
    unsigned long phys_in;
    unsigned long phys_out;
    /* PIO registers are volatile so the compiler can neither merge the
     * 0 -> 1 start pulse nor hoist the load out of the busy-wait loop. */
    volatile uint32_t *h2f_start;
    volatile uint32_t *f2h_finish;
    volatile uint32_t *h2f_read_length;
    volatile uint32_t *feature_map_dim;
};

/*
 * Convert one float to an IEEE 754 half-precision bit pattern.
 * Normal values are truncated to 10 mantissa bits; values too small for a
 * normal half (including 0.0) become signed zero, values too large become
 * signed infinity, NaN stays NaN. Private to main() so this block stands alone.
 */
static uint16_t host_float_to_half(float value)
{
    uint32_t bits;
    memcpy(&bits, &value, sizeof bits);

    const uint32_t sign = (bits >> 16) & 0x8000u;
    const uint32_t exponent = (bits >> 23) & 0xffu;
    const uint32_t mantissa = bits & 0x007fffffu;

    if (exponent == 0xffu) {
        return (uint16_t)(sign | 0x7c00u | (mantissa ? 0x0200u : 0u));
    }
    if (exponent > 142u) {
        return (uint16_t)(sign | 0x7c00u);
    }
    if (exponent < 113u) {
        return (uint16_t)sign;
    }
    return (uint16_t)(sign | ((exponent - 112u) << 10) | (mantissa >> 13));
}

/* Convert count floats into a freshly malloc'ed fp16 array (NULL on failure). */
static uint16_t *host_floats_to_halves(const float *src, size_t count)
{
    uint16_t *dst = malloc(count * sizeof *dst);
    if (dst == NULL) {
        return NULL;
    }
    for (size_t i = 0; i < count; i++) {
        dst[i] = host_float_to_half(src[i]);
    }
    return dst;
}

/*
 * Run one convolution layer on the FPGA.
 *
 * For every (input channel, output channel) pair the 3x3 kernel, the bias and
 * the input feature map are written to the outgoing SDRAM window, copied to
 * the FPGA by DMA, processed, copied back by DMA and accumulated into accum.
 *
 * link     bridge mappings and control registers.
 * layer    layer index; selects the parameter block inside weights.
 * cfg      channel counts and feature map size of this layer.
 * feat_in  cfg->in_channels planes of fm_dim*fm_dim fp16 values.
 * accum    receives cfg->out_channels planes of fm_dim*fm_dim floats; it is
 *          zeroed here before accumulation starts.
 * weights  byte-swapped weight/bias blob.
 */
static void run_conv_layer(const struct fpga_link *link, int layer,
                           const struct cnn_layer *cfg,
                           const uint16_t *feat_in, float *accum,
                           unsigned char *weights)
{
    const size_t fm_elems = (size_t)cfg->fm_dim * (size_t)cfg->fm_dim;
    const size_t fm_bytes = fm_elems * sizeof(uint16_t);
    unsigned char *tx = link->sdram_in;
    uint16_t tile[CNN_INPUT_DIM * CNN_INPUT_DIM];  /* largest tile is 32x32 */

    memset(accum, 0, (size_t)cfg->out_channels * fm_elems * sizeof *accum);

    for (int input_ch = 0; input_ch < cfg->in_channels; input_ch++) {
        for (int output_ch = 0; output_ch < cfg->out_channels; output_ch++) {
            // Fill SDRAM: 9 weights, 1 bias, then the input feature map
            unsigned char *base_weight = get_sending_weight(input_ch, output_ch, layer, weights);
            unsigned char *base_bias = get_sending_bias(input_ch, output_ch, layer, weights);
            memcpy(tx, base_weight, CNN_KERNEL_HALVES * sizeof(uint16_t));
            memcpy(tx + CNN_KERNEL_HALVES * sizeof(uint16_t), base_bias, sizeof(uint16_t));
            /* feat_in is a uint16_t pointer: index in elements, not bytes. */
            memcpy(tx + CNN_HEADER_HALVES * sizeof(uint16_t),
                   feat_in + fm_elems * (size_t)input_ch, fm_bytes);

            // SDRAM -> OCM
            dma_h2f(link->virtual_base, link->phys_in);

            // Start
            *link->feature_map_dim = (uint32_t)cfg->fm_dim;
            *link->h2f_read_length = (uint32_t)((fm_elems + CNN_HEADER_HALVES) * sizeof(uint16_t));
            *link->h2f_start = 0;
            *link->h2f_start = 1;

            // Wait for FPGA to finish
            while (!(*link->f2h_finish)) {
            }

            // OCM -> SDRAM
            dma_f2h(link->virtual_base, link->phys_out);

            // Accumulate this input channel's contribution to the output map
            memcpy(tile, link->sdram_out, fm_bytes);
            float *out_plane = accum + (size_t)output_ch * fm_elems;
            for (size_t i = 0; i < fm_elems; i++) {
                out_plane[i] += f16_to_f32(tile[i]);
            }
            printf("L%d [OK] input %d, output %d\n", layer, input_ch, output_ch);
        }
    }
}

/*
 * Load the weight blob and the input image, map the reserved SDRAM windows
 * and the FPGA bridges, then run the six convolution layers on the FPGA with
 * ReLU after every layer and max pooling after layers 0, 1, 3 and 5.
 *
 * argv[1] is the input image file. Returns 0 on success (and when no image
 * was given), 1 on any failure.
 */
int main(int argc, char *argv[]) {
    static const struct cnn_layer layers[] = {
        {   3,  64, 32, 1 },
        {  64, 128, 16, 1 },
        { 128, 256,  8, 0 },
        { 256, 256,  8, 1 },
        { 256, 512,  4, 0 },
        { 512, 512,  4, 1 },
    };
    const size_t layer_count = sizeof layers / sizeof layers[0];
    const unsigned long physical_addr1 = 0x20000000;
    const unsigned long physical_addr2 = 0x21000000;
    const size_t map_size = 131072;

    int rc = 1;
    int mem_fd1 = -1;
    int mem_fd2 = -1;
    int fd = -1;
    void *map_base1 = MAP_FAILED;
    void *map_base2 = MAP_FAILED;
    void *fpga_sdram_virtual_base = MAP_FAILED;
    void *virtual_base = MAP_FAILED;
    unsigned char *weight_bias_all = NULL;
    unsigned char *weight_bias_little_endian = NULL;
    unsigned char *input_pic = NULL;
    float *feat_map_out = NULL;
    uint16_t *feat_owned = NULL;   /* fp16 output of the previous layer */
    uint16_t input_fp16[CNN_INPUT_CHANNELS * CNN_INPUT_DIM * CNN_INPUT_DIM];
    const uint16_t *feat_in = input_fp16;
    struct fpga_link link;
    size_t blob_bytes = 0;
    size_t accum_elems = 0;
    int width = 0;
    int height = 0;
    int channels = 0;

    // Check inputs
    if (argc < 2){
        printf("Usage: <program> <image filename>\n");
        return 0;
    }

    printf("Running. To exit, press Ctrl+C.\n\n");

    // ---------- Load weight bias and input image ----------
    printf("Loading model and input image...\n");
    weight_bias_all = stbi_load("/root/linux/weight_bias.jpg", &width, &height, &channels, 1);
    if (weight_bias_all == NULL) {
        fprintf(stderr, "Failed to load /root/linux/weight_bias.jpg\n");
        goto cleanup;
    }
    blob_bytes = (size_t)width * (size_t)height;
    if (blob_bytes < (size_t)CNN_PARAM_HALVES * sizeof(uint16_t)) {
        fprintf(stderr, "Weight file too small: %zu bytes, need %zu\n",
                blob_bytes, (size_t)CNN_PARAM_HALVES * sizeof(uint16_t));
        goto cleanup;
    }
    weight_bias_little_endian = malloc(blob_bytes);
    if (weight_bias_little_endian == NULL) {
        fprintf(stderr, "Failed to allocate memory for weights\n");
        goto cleanup;
    }
    // Swap the two bytes of every fp16 value
    for (size_t idx = 0; idx + 1 < blob_bytes; idx += 2) {
        weight_bias_little_endian[idx] = weight_bias_all[idx + 1];
        weight_bias_little_endian[idx + 1] = weight_bias_all[idx];
    }
    if (blob_bytes & 1u) {
        // A trailing unpaired byte has no partner to swap with
        weight_bias_little_endian[blob_bytes - 1] = weight_bias_all[blob_bytes - 1];
    }
    // stb_image allocates with malloc here (STBI_MALLOC is not overridden above)
    free(weight_bias_all);
    weight_bias_all = NULL;
    printf("\nWeight&Bias loaded with fp16 %dx%d\n", width/2, height/2);

    input_pic = stbi_load(argv[1], &width, &height, &channels, 3);
    if (input_pic == NULL) {
        fprintf(stderr, "Failed to load input image %s\n", argv[1]);
        goto cleanup;
    }
    printf("InputPic width %d height %d channels %d\n", width, height, channels);
    if ((size_t)width * (size_t)height < (size_t)CNN_INPUT_DIM * CNN_INPUT_DIM) {
        fprintf(stderr, "Input image must contain at least %dx%d pixels\n",
                CNN_INPUT_DIM, CNN_INPUT_DIM);
        goto cleanup;
    }
    if (width != CNN_INPUT_DIM || height != CNN_INPUT_DIM) {
        fprintf(stderr, "Warning: image is not %dx%d, only its first %d pixels are used\n",
                CNN_INPUT_DIM, CNN_INPUT_DIM, CNN_INPUT_DIM * CNN_INPUT_DIM);
    }
    // Reshape input image: R11 G11 B11 R12 G12 B12 -> B11 B12 ... G11 G12 ... R11 R12
    {
        size_t out_idx = 0;
        for (int channel = 2; channel >= 0; channel--) {
            for (int nth_pixel = 0; nth_pixel < CNN_INPUT_DIM * CNN_INPUT_DIM; nth_pixel++) {
                float normalised = (float) input_pic[channel + 3 * nth_pixel] / 255.0f;
                input_fp16[out_idx++] = host_float_to_half(normalised);
            }
        }
    }
    free(input_pic);
    input_pic = NULL;
    printf("Model and input image loaded\n");
    // ------------------------------------------------------------

    // ----------------- Memory Maps -----------------
    // SDRAM->FPGA
    mem_fd1 = open_memory();
    map_base1 = mmap(0, map_size, PROT_READ|PROT_WRITE, MAP_SHARED, mem_fd1, physical_addr1);
    if(map_base1 == MAP_FAILED){
        printf("Can't mmap\n");
        goto cleanup;
    }
    memset(map_base1, 0, map_size);

    // FPGA->SDRAM
    mem_fd2 = open_memory();
    map_base2 = mmap(0, map_size, PROT_READ|PROT_WRITE, MAP_SHARED, mem_fd2, physical_addr2);
    if(map_base2 == MAP_FAILED){
        printf("Can't mmap\n");
        goto cleanup;
    }
    memset(map_base2, 0, map_size);
    // ---------------------------------------------------

    //=============================================================================================
    fd = open_memory();
    //=============================================================================================
    // Activate the FPGA-to-SDRAM bridge
    fpga_sdram_virtual_base = mmap( NULL, SDRAMC_REGS_SPAN, ( PROT_READ | PROT_WRITE ), MAP_SHARED, fd, SDRAMC_REGS_BASE );
    if( fpga_sdram_virtual_base == MAP_FAILED ) {
        printf( "ERROR: axi mmap() fpga_sdram_virtual_base failed...\n" );
        goto cleanup;
    }

    // See cyclone V manual page 833
    // Take the FPGA-to-SDRAM ports out of reset so the FPGA can reach SDRAM
    {
        unsigned long port_reset_offset =
            ( unsigned long )( 0x0 + FPGAPORTRST ) & ( unsigned long )( SDRAMC_REGS_MASK );
        volatile unsigned int *f2sdram =
            (volatile unsigned int *)( (unsigned char *)fpga_sdram_virtual_base + port_reset_offset );
        *f2sdram = 0xFFF;
    }

    // lightweight HPS-to-FPGA bridge
    virtual_base = mmap( NULL, HW_REGS_SPAN, ( PROT_READ | PROT_WRITE ), MAP_SHARED, fd, HW_REGS_BASE );
    if( virtual_base == MAP_FAILED ) {
        printf( "ERROR: mmap() virtual_base failed...\n" );
        goto cleanup;
    }

    // Obtain pointers to control signal PIOs
    link.virtual_base = virtual_base;
    link.sdram_in = map_base1;
    link.sdram_out = map_base2;
    link.phys_in = physical_addr1;
    link.phys_out = physical_addr2;
    link.h2f_start = get_pio(virtual_base, H2F_START_BASE);
    link.f2h_finish = get_pio(virtual_base, F2H_FINISH_BASE);
    link.h2f_read_length = get_pio(virtual_base, H2F_READ_LENGTH_BASE);
    link.feature_map_dim = get_pio(virtual_base, FEAT_MAP_DIM_BASE);

    printf("Start\n");

    // The accumulator must hold the largest layer output
    for (size_t l = 0; l < layer_count; l++) {
        size_t elems = (size_t)layers[l].out_channels * layers[l].fm_dim * layers[l].fm_dim;
        if (elems > accum_elems) {
            accum_elems = elems;
        }
    }
    feat_map_out = malloc(accum_elems * sizeof *feat_map_out);
    if (feat_map_out == NULL) {
        fprintf(stderr, "Failed to allocate memory for feature maps\n");
        goto cleanup;
    }

    for (size_t l = 0; l < layer_count; l++) {
        const struct cnn_layer *cfg = &layers[l];
        const size_t out_elems = (size_t)cfg->out_channels * cfg->fm_dim * cfg->fm_dim;
        uint16_t *next;

        run_conv_layer(&link, (int)l, cfg, feat_in, feat_map_out, weight_bias_little_endian);
        relu(feat_map_out, (int)out_elems);

        // Produce the fp16 input of the next layer
        if (cfg->pool) {
            next = max_pooling(feat_map_out, cfg->fm_dim, cfg->out_channels);
        } else {
            next = host_floats_to_halves(feat_map_out, out_elems);
        }
        if (next == NULL) {
            fprintf(stderr, "Failed to allocate memory after layer %d\n", (int)l);
            goto cleanup;
        }
        free(feat_owned);
        feat_owned = next;
        feat_in = next;
    }
    // feat_owned now holds the pooled output of the last layer (512 maps of 2x2);
    // nothing consumes it yet.

    rc = 0;

    //=============================================================================================
    // UNMAP, CLOSE AND FREE EVERYTHING
cleanup:
    free(feat_owned);
    free(feat_map_out);
    free(input_pic);
    free(weight_bias_little_endian);
    free(weight_bias_all);

    if (virtual_base != MAP_FAILED) {
        munmap(virtual_base, HW_REGS_SPAN);
    }
    if (fpga_sdram_virtual_base != MAP_FAILED) {
        munmap(fpga_sdram_virtual_base, SDRAMC_REGS_SPAN);
    }
    if (map_base1 != MAP_FAILED && munmap(map_base1, map_size) == -1){
        printf("Can't munmap\n");
        rc = 1;
    }
    if (map_base2 != MAP_FAILED && munmap(map_base2, map_size) == -1){
        printf("Can't munmap\n");
        rc = 1;
    }

    if (fd != -1) {
        close(fd);
    }
    if (mem_fd1 != -1) {
        close(mem_fd1);
    }
    if (mem_fd2 != -1) {
        close(mem_fd2);
    }
    //=================================================================================================================

    return rc;
}

/*
 * Hex-dump len bytes of mem, starting at byte index offset, to stdout.
 *
 * Each byte is printed as two hex digits followed by a space. A line break is
 * emitted after every byte whose index i satisfies (i - 9) % 10 == 0, and one
 * more line break closes the dump.
 */
void print_mem(void * mem, int offset, int len) {
    const unsigned char *bytes = (const unsigned char *)mem;
    int pos = offset;

    while (pos < offset + len) {
        printf("%02x ", bytes[pos]);
        if ((pos - 9) % 10 == 0) {
            printf("\n");
        }
        pos++;
    }
    printf("\n");
}

/*
 * Allocate buf_size bytes and lock them into RAM.
 *
 * buf_size  number of bytes to allocate.
 *
 * Returns the malloc'ed buffer; the caller owns it. The process is terminated
 * with exit(1) when either the allocation or mlock() fails, so the function
 * never returns NULL.
 */
void* create_buffer(size_t buf_size) {
    //size_t buf_size = strlen(size) + 1; // Add 1 for the final character

    // size_t must be printed with %zu; %d is undefined behaviour where the widths differ
    if(DEBUG) printf("size buffer = %zu\n", buf_size);

    // Allocate some memory to manipulate
    void *buffer = malloc(buf_size);
    //void *buffer = calloc(buf_size, sizeof(unsigned char));
    if(buffer == NULL) {
        fprintf(stderr, "Failed to allocate memory for buffer\n");
        exit(1);
    }

    // Lock the page in memory
    // Do this before writing data to the buffer so that any copy-on-write
    // mechanisms will give us our own page locked in memory
    if(mlock(buffer, buf_size) == -1) {
        fprintf(stderr, "Failed to lock page in memory: %s\n", strerror(errno));
        exit(1);
    }

    return buffer;
}

/*
 * Open /dev/mem for synchronous read/write access (requires root).
 *
 * Returns the file descriptor. When the device cannot be opened, the reason is
 * reported on stderr and the process exits with status 1.
 */
int open_memory(void) {
    const int open_flags = O_RDWR | O_SYNC;
    const int mem_fd = open("/dev/mem", open_flags);

    if (mem_fd != -1) {
        return mem_fd;
    }

    fprintf(stderr, "Error opening /dev/mem: %s\n", strerror(errno));
    exit(1);
}

/*
 * Locate the 3x3 kernel for one (input channel, output channel) pair inside
 * the parameter blob.
 *
 * in_channel   input channel index within the layer.
 * out_channel  output channel index within the layer.
 * epoch        index into the per-layer tables below (0..7).
 * image_data   start of the blob; every parameter occupies 2 bytes.
 *
 * Returns a pointer into image_data; nothing is copied.
 */
unsigned char *
get_sending_weight(uint16_t in_channel, uint16_t out_channel, uint16_t epoch, unsigned char *image_data) {
    /* Number of 2-byte parameters stored for each table entry. */
    static const int params_per_entry[] = {1792, 73856, 295168, 590080, 1180160, 2359808, 2359808, 2359808};
    /* Input channel count for each table entry. */
    static const int in_channels_per_entry[] = {3, 64, 128, 256, 256, 512, 512, 512};
    enum { KERNEL_BYTES = 3 * 3 * 2 };

    /* Parameters belonging to all entries before the requested one. */
    int skipped = 0;
    int idx = 0;
    while (idx < epoch) {
        skipped += params_per_entry[idx];
        idx++;
    }

    unsigned char *entry_start = image_data + skipped * 2;
    unsigned char *out_block = entry_start + out_channel * in_channels_per_entry[epoch] * KERNEL_BYTES;
    return out_block + in_channel * KERNEL_BYTES;
}

/*
 * Locate the bias of one output channel inside the parameter blob.
 *
 * The biases sit at the tail of their table entry's parameter block, one
 * 2-byte value per output channel.
 *
 * in_channel   not used by the lookup.
 * out_channel  output channel index within the layer.
 * epoch        index into the per-layer tables below (0..7).
 * image_data   start of the blob; every parameter occupies 2 bytes.
 *
 * Returns a pointer into image_data; nothing is copied.
 */
unsigned char *get_sending_bias(uint16_t in_channel, uint16_t out_channel, uint16_t epoch, unsigned char *image_data) {
    /* Number of 2-byte parameters stored for each table entry. */
    static const int params_per_entry[] = {1792, 73856, 295168, 590080, 1180160, 2359808, 2359808, 2359808};
    /* Number of biases stored for each table entry. */
    static const int biases_per_entry[] = {64, 128, 256, 256, 512, 512, 512, 512};

    (void)in_channel;

    /* Parameters belonging to all entries before the requested one. */
    int skipped = 0;
    int idx = 0;
    while (idx < epoch) {
        skipped += params_per_entry[idx];
        idx++;
    }

    unsigned char *entry_start = image_data + skipped * 2;
    unsigned char *entry_end = entry_start + params_per_entry[epoch] * 2;
    unsigned char *bias_start = entry_end - biases_per_entry[epoch] * 2;
    return bias_start + out_channel * 2;
}

/*
 * Convert an IEEE 754 half-precision (binary16) bit pattern to float.
 *
 * h  raw 16-bit pattern: 1 sign bit, 5 exponent bits (bias 15), 10 mantissa bits.
 *
 * Returns the float with the same value. Signed zero, subnormals, infinities
 * and NaNs are all handled; the conversion is exact because every half value
 * is representable as a float.
 */
float f16_to_f32(uint16_t h) {
    const uint32_t sign = ((uint32_t)h & 0x8000u) << 16;
    uint32_t exponent = ((uint32_t)h >> 10) & 0x1fu;
    uint32_t mantissa = (uint32_t)h & 0x03ffu;
    uint32_t bits;

    if (exponent == 0x1fu) {
        /* Infinity (mantissa == 0) or NaN (mantissa != 0). */
        bits = sign | 0x7f800000u | (mantissa << 13);
    } else if (exponent != 0u) {
        /* Normal number: re-bias the exponent from 15 to 127. */
        bits = sign | ((exponent + 112u) << 23) | (mantissa << 13);
    } else if (mantissa == 0u) {
        /* Signed zero. */
        bits = sign;
    } else {
        /* Subnormal half (mantissa * 2^-24): normalise so the leading 1
         * becomes the implicit bit of a normal float. */
        exponent = 113u;
        while ((mantissa & 0x0400u) == 0u) {
            mantissa <<= 1;
            exponent--;
        }
        mantissa &= 0x03ffu;
        bits = sign | (exponent << 23) | (mantissa << 13);
    }

    /* Reinterpret the bit pattern; returning the integer would convert its
     * numeric value to float instead. */
    float result;
    memcpy(&result, &bits, sizeof result);
    return result;
}
