/*
 * Avalon memory-mapped peripheral that generates VGA
 *
 * Stephen A. Edwards
 * Columbia University
 */

module vga_ball(input logic        clk,
	        input logic 	   reset,
		input logic [7:0]  writedata,
		output logic [7:0] readdata,
		input logic 	   write,
		input 		   chipselect,
		input logic [17:0]  address,

		output logic [7:0] VGA_R, VGA_G, VGA_B,
		output logic 	   VGA_CLK, VGA_HS, VGA_VS,
		                   VGA_BLANK_n,
		output logic 	   VGA_SYNC_n);

   /*
    * Bus-visible behaviour (as implemented below):
    *
    *   write address 0 : background_r <= writedata.
    *                     writedata[7] (when write_flag is clear) starts a
    *                     convolution pass over img_memory.
    *                     Bits [6:2] select the kernel, highest set bit wins:
    *                       [6] 9x9, [5] 11x11, [4] 15x15, [3] 19x19, [2] 25x25
    *                     (no bit set -> 9x9).
    *   write address 1 : background_g <= writedata.
    *                     Bits [6:3] select the octave (sub-sampling step),
    *                     highest set bit wins:
    *                       [6] step 1, [5] step 2, [4] step 4, [3] step 8
    *                     (no bit set -> step 1).
    *   write address 2 : background_b <= writedata.
    *   write address 3 : raises async_clr for one clock (drives the aclr port
    *                     of both memories and resets the convolution state).
    *   any write       : writedata is also presented to img_memory at
    *                     (address - 4), so image pixel i lives at bus
    *                     address i + 4.
    *   read            : returns 8'b1 while a convolution pass is running;
    *                     otherwise returns the current out_pixels read word
    *                     and advances the read pointer once per read burst.
    */

   parameter IMG_WIDTH  = 525;			// Reduced from 600x400: the FPGA runs out of resources at the larger size.
   parameter IMG_HEIGHT = 350;
   parameter IMG_SIZE   = IMG_WIDTH * IMG_HEIGHT;

   logic [10:0]	   hcount;			// hcount[10:1] is the pixel column
   logic [9:0]     vcount;			// pixel row
   logic [7:0] 	   background_r, background_g, background_b;

   logic [17:0]    array_address;		// img_memory address of the pixel being displayed
   logic           print_pixel;			// high while the beam is inside the image window

   // readmem, cache and out_pixel_cache are driven continuously (assign /
   // instance output port), so they must not carry a declaration initializer.
   logic [7:0] 	   readmem;			// img_memory data routed to the display
   logic [7:0]     cache;			// img_memory data routed to the convolution
   logic [7:0]     out_pixel_cache;		// out_pixels read data

   logic [17:0]    address_minus_4;

   logic [17:0]    address_in_cache = 0;	// read pointer into out_pixels
   logic           async_clr = 0;
   logic           read_flag = 0;		// set during a read burst so the pointer advances once
   logic           write_flag = 0;		// set during a write burst so a start request is taken once
   logic [17:0]    conv_input_pixel_addr_start = 18'd0;	// top-left input pixel of the current window

   logic [17:0]    kernel_size_sq;
   logic [17:0]    kernel_size;
   logic [17:0]    img_width_orig = 18'd525;
   logic [17:0]    img_height_orig = 18'd350;

   logic [17:0]    img_width;			// image size after octave sub-sampling
   logic [17:0]    img_height;

   logic [17:0]    conv_input_pixel_addr = 18'd0;	// input pixel currently being fetched
   logic [17:0]    conv_row_change_offset_kernel;	// address step from the end of one kernel row to the start of the next
   logic [17:0]    conv_row_change_offset_image;	// window-start step from the end of one output row to the start of the next

   logic [15:0] kernel[1412:0];			// all kernels stored back to back: 81+121+225+361+625 = 1413 taps

   logic [31:0] conv_out = 0;			// multiply-accumulate result for the current output pixel
   logic [7:0] pixel_out;			// conv_out rescaled to 8 bits

   logic [2:0]   count = 0;			// two-phase counter: a tap is accumulated whenever count == 1

   logic [17:0] idx_offset = 18'b0000_0000_0000_0000_00; // tap index within the selected kernel
   logic    conv_done = 0;			// high for the one cycle in which pixel_out is written

   logic [17:0] out_pixels_idx = 0;		// write pointer into out_pixels
   logic out_pixels_wren = 1'd0;		// high while a convolution pass is running

   logic [17:0] conv_out_width;			// output size, depends on image size and kernel
   logic [17:0] conv_out_height;
   logic [17:0] conv_out_end;			// number of output pixels
   logic [17:0] kernel_select_offset;		// index of the selected kernel's first tap in kernel[]
   logic [40:0] conv_out_scaled;
   logic [17:0] step_size;			// input pixel stride for the selected octave

   logic [17:0] mux_rdaddress;
   logic [7:0] img_q;
   logic [17:0] width_gap;
   logic [17:0] height_gap;

   // Kernel / octave configuration and derived output geometry.
   always_comb begin

     // Kernel selection. conv_out_scaled * 2^-shift maps the accumulator
     // back to an 8-bit pixel; multiplier and shift are per kernel.
     if (background_r[6]) begin // kernel 9x9
       kernel_size = 18'd9;
       kernel_size_sq = 18'd81;
       kernel_select_offset = 18'd0;
       conv_out_scaled = conv_out * 9'd314;
       pixel_out = conv_out_scaled[32:25];		// divide by 2^25
     end
     else if (background_r[5]) begin // kernel 11x11
       kernel_size = 18'd11;
       kernel_size_sq = 18'd121;
       kernel_select_offset = 18'd81;
       conv_out_scaled = conv_out * 9'd463;
       pixel_out = conv_out_scaled[32:25];		// divide by 2^25
     end
     else if (background_r[4]) begin // kernel 15x15
       kernel_size = 18'd15;
       kernel_size_sq = 18'd225;
       kernel_select_offset = 18'd202;
       conv_out_scaled = conv_out * 9'd377;
       pixel_out = conv_out_scaled[34:27];		// divide by 2^27
     end
     else if (background_r[3]) begin // kernel 19x19
       kernel_size = 18'd19;
       kernel_size_sq = 18'd361;			// 19*19; the next kernel starts at 427 + 361 = 788
       kernel_select_offset = 18'd427;
       conv_out_scaled = conv_out * 9'd347;
       pixel_out = conv_out_scaled[34:27];		// divide by 2^27
     end
     else if (background_r[2]) begin // kernel 25x25
       kernel_size = 18'd25;
       kernel_size_sq = 18'd625;
       kernel_select_offset = 18'd788;
       conv_out_scaled = conv_out * 9'd331;
       pixel_out = conv_out_scaled[35:28];		// divide by 2^28
     end
     else begin // default: kernel 9x9
       kernel_size = 18'd9;
       kernel_size_sq = 18'd81;
       kernel_select_offset = 18'd0;
       conv_out_scaled = conv_out * 9'd314;
       pixel_out = conv_out_scaled[32:25];		// divide by 2^25
     end

     // Octave selection. The image is sub-sampled in place by striding
     // through the full-resolution image with step_size.
     if (background_g[6]) begin // original resolution
       img_width = img_width_orig;
       img_height = img_height_orig;

       conv_row_change_offset_kernel = img_width_orig - kernel_size + 18'd1;
       conv_row_change_offset_image = kernel_size;
       step_size = 18'd1;
     end
     else if (background_g[5]) begin // divide by 2
       img_width = (img_width_orig>>1) + img_width_orig[0];
       img_height = (img_height_orig>>1) + img_height_orig[0];

       conv_row_change_offset_kernel = (img_width_orig - kernel_size + 1)<<1;
       conv_row_change_offset_image = (kernel_size<<1) + img_width_orig - (1-(img_width_orig-1)%2);
       step_size = 18'd2;
     end
     else if (background_g[4]) begin // divide by 4
       img_width = (img_width_orig>>2) + img_width_orig[1];
       img_height = (img_height_orig>>2) + img_height_orig[1];

       conv_row_change_offset_kernel = (img_width_orig - kernel_size + 1)<<2;
       conv_row_change_offset_image = (kernel_size<<2) + (img_width_orig*3)+1;
       step_size = 18'd4;
     end
     else if (background_g[3]) begin // divide by 8
       img_width = (img_width_orig>>3) + img_width_orig[2];
       img_height = (img_height_orig>>3) + img_height_orig[2];

       conv_row_change_offset_kernel = (img_width_orig - kernel_size + 1)<<3;
       conv_row_change_offset_image = (kernel_size<<3) + (img_width_orig*7)-3;
       step_size = 18'd8;
     end
     else begin // default: original resolution
       img_width = img_width_orig;
       img_height = img_height_orig;

       conv_row_change_offset_kernel = img_width_orig - kernel_size + 18'd1;
       conv_row_change_offset_image = kernel_size;
       step_size = 18'd1;
     end

     // Output geometry. Computed only after img_width / img_height have been
     // assigned above, so this block never reads a value from a previous
     // evaluation (always_comb is not sensitive to variables it writes).
     conv_out_width = img_width - kernel_size + 18'd1;	// e.g. 525 - 9 + 1 = 517
     conv_out_height = img_height - kernel_size + 18'd1;
     conv_out_end = conv_out_width*conv_out_height;

   end

   // Display colour selection.
   always_comb begin
     if (VGA_BLANK_n) begin
       if (print_pixel) begin
         if (readmem)
           {VGA_R, VGA_G, VGA_B} = {readmem, readmem, readmem};	// grey-scale image pixel
         else
           {VGA_R, VGA_G, VGA_B} = {8'd255, 8'd0, 8'd0};		// zero-valued pixel (or convolution busy) shows as red
       end
       else
         {VGA_R, VGA_G, VGA_B} = {background_r, background_g, background_b};
     end
     else
       {VGA_R, VGA_G, VGA_B} = {8'd128, 8'd128, 8'd128};
   end

   // The image is centred inside the 640x480 visible area.
   assign width_gap = (18'd640-img_width_orig)>>1;
   assign height_gap = (18'd480-img_height_orig)>>1;

   assign print_pixel   = ( (hcount[10:1] > width_gap+1) && (hcount[10:1] < (img_width_orig+width_gap)) && (vcount > height_gap) && (vcount < (img_height_orig+height_gap)) );

   assign array_address = (print_pixel ? ( ((img_width_orig) * (vcount-height_gap)) + (hcount[10:1]-width_gap) ) : 0);

   // Image pixels are written through the bus starting at address 4.
   assign address_minus_4 = address-18'd4;

   // The single img_memory read port is shared: the convolution engine owns
   // it while a pass is running, the display owns it otherwise.
   assign mux_rdaddress = out_pixels_wren ? conv_input_pixel_addr+18'd1 : array_address;

   assign readmem = ~out_pixels_wren ? img_q : 8'b0;

   assign cache = out_pixels_wren ? img_q : 8'b0;


    `include "hardcoded_kernels/kernel_all.sv"

   // Input image storage.
   mem img_memory(
       .aclr(async_clr),
       .clock(clk),
       .rdaddress(mux_rdaddress),           // address to read
       .wraddress(address_minus_4),         // address to write
       .q(img_q),                           // result from reading is stored here
       .wren(chipselect && write),          // write enable
       .data(writedata)                     // data to write
   );

   // Convolution result storage.
   mem out_pixels(
       .aclr(async_clr),
       .clock(clk),
       .rdaddress(address_in_cache),        // address to read
       .wraddress(out_pixels_idx),          // address to write
       .q(out_pixel_cache),                 // result from reading is stored here
       .wren(out_pixels_wren && conv_done), // write enable: one cycle per finished pixel
       .data(pixel_out)                     // data to write
   );

   vga_counters counters(.clk50(clk), .*);

   // Convolution state machine and bus interface. Statement order matters:
   // later non-blocking assignments to the same register override earlier ones.
   always_ff @(posedge clk) begin

     if (conv_done == 0 && out_pixels_wren == 1) begin
        // Accumulation phase: one kernel tap every second clock.
        count <= count+1;
        if (idx_offset == 0) begin
           conv_input_pixel_addr <= conv_input_pixel_addr_start;
        end
        if (count == 1) begin
           count <= 0;
           // kernel_select_offset picks the kernel, idx_offset the tap within it.
           conv_out <= conv_out + cache * kernel[kernel_select_offset+idx_offset];
           if (idx_offset == kernel_size_sq-18'b1) begin
              // Last tap: flag the pixel as complete.
              idx_offset <= 0;
              conv_done <= 1;
           end
           else begin
              idx_offset <= idx_offset+1;
              if ((idx_offset%kernel_size) == (kernel_size-1)) begin
                 // End of a kernel row: jump to the first pixel of the next image row.
                 conv_input_pixel_addr <= conv_input_pixel_addr + conv_row_change_offset_kernel;
              end
              else
                 conv_input_pixel_addr <= conv_input_pixel_addr + step_size;
           end
        end
     end
     else if (out_pixels_wren == 1) begin
       // Write-back phase (conv_done == 1): pixel_out is being stored in
       // out_pixels this cycle; move on to the next output pixel.
       if (out_pixels_idx == (conv_out_end-18'd1)) begin
          // Whole image processed.
          out_pixels_wren <= 1'd0;
          out_pixels_idx <= 0;
          conv_input_pixel_addr <= 0;
          conv_input_pixel_addr_start <= 0;
       end
       else if ( (out_pixels_idx%conv_out_width) == (conv_out_width-18'd1) ) begin
          // End of an output row.
          out_pixels_idx <= out_pixels_idx + 18'd1;
          conv_input_pixel_addr_start <= conv_input_pixel_addr_start + conv_row_change_offset_image;
       end
       else begin
          out_pixels_idx <= out_pixels_idx + 18'd1;
          conv_input_pixel_addr_start <= conv_input_pixel_addr_start + step_size;
       end

       conv_out <= 0;
       conv_done <= 0;
       count <= 0;
     end


     // async_clr is a one-clock pulse that also resets the engine pointers.
     if (async_clr == 1'b1) begin
         async_clr <= 1'b0;
         address_in_cache <= 18'b0;
         out_pixels_wren <= 1'd0;
         out_pixels_idx <= 0;
         conv_input_pixel_addr <= 0;
         conv_input_pixel_addr_start <= 0;
     end
     // Wrap the read pointer once it runs past the last output pixel.
     if (address_in_cache > (conv_out_end-18'd1))
       address_in_cache <= 18'b0;
     if (reset) begin
	background_r <= 8'h0;
	background_g <= 8'h0;
	background_b <= 8'h80;
        async_clr <= 1'b1;
        address_in_cache <= 18'b0;
        out_pixels_wren <= 1'd0;
        out_pixels_idx <= 0;
        conv_input_pixel_addr <= 0;
        conv_input_pixel_addr_start <= 0;
     end else if (chipselect && write) begin
       case (address)
	 3'h0 : begin
		background_r <= writedata[7:0];
                // Bit 7 starts a convolution pass; write_flag ensures the
                // request is taken only on the first cycle of a write burst.
                if (writedata[7] && ~write_flag) begin
                  out_pixels_wren <= 1'd1;
		  out_pixels_idx <= 0;
		  conv_input_pixel_addr <= 0;
		  conv_input_pixel_addr_start <= 0;
                  address_in_cache <= 18'b0;
                end
         end
	 3'h1 : background_g <= writedata[7:0];
	 3'h2 : background_b <= writedata[7:0];
	 3'h3 : async_clr <= 1'b1;
       endcase
       write_flag <= 1'b1;
     end else if (chipselect && ~write) begin
       if (out_pixels_wren)
         readdata <= 8'b1;			// convolution still running
       else begin
         readdata <= out_pixel_cache;
         // Advance the read pointer once per read burst.
         if (~read_flag)
           address_in_cache <= address_in_cache + 18'b1;
         read_flag <= 1'b1;
       end
     end
     else if (read_flag == 1'b1)
       read_flag <= 1'b0;
     else if (write_flag == 1'b1)
       write_flag <= 1'b0;

   end

endmodule

module vga_counters(
 input logic 	     clk50, reset,
 output logic [10:0] hcount,  // hcount[10:1] is pixel column
 output logic [9:0]  vcount,  // vcount[9:0] is pixel row
 output logic 	     VGA_CLK, VGA_HS, VGA_VS, VGA_BLANK_n, VGA_SYNC_n);

/*
 * Timing generator for 640 x 480 VGA driven from a 50 MHz clock.
 * hcount advances every clk50 cycle, so each pixel lasts two cycles
 * and hcount[10:1] is the pixel column.
 *
 * hcount:  0 ............ 1279 | 1280 ............ 1599 | 0 ...
 *          <---- HACTIVE ---->  <-- FP / SYNC / BP -->
 */

   // Horizontal timing, in clk50 cycles
   parameter HACTIVE      = 11'd 1280,
             HFRONT_PORCH = 11'd 32,
             HSYNC        = 11'd 192,
             HBACK_PORCH  = 11'd 96,
             HTOTAL       = HACTIVE + HFRONT_PORCH + HSYNC +
                            HBACK_PORCH; // 1600

   // Vertical timing, in lines
   parameter VACTIVE      = 10'd 480,
             VFRONT_PORCH = 10'd 10,
             VSYNC        = 10'd 2,
             VBACK_PORCH  = 10'd 33,
             VTOTAL       = VACTIVE + VFRONT_PORCH + VSYNC +
                            VBACK_PORCH; // 525

   logic line_done;        // hcount is on the last cycle of a line
   logic frame_done;       // vcount is on the last line of a frame
   logic hsync_window;     // hcount lies inside the horizontal sync pulse
   logic vsync_window;     // vcount lies inside the vertical sync pulse
   logic h_blank, v_blank; // outside the visible area in each direction

   assign line_done  = (hcount == HTOTAL - 1);
   assign frame_done = (vcount == VTOTAL - 1);

   // Column counter: wraps at the end of every line.
   always_ff @(posedge clk50 or posedge reset) begin
      if (reset)
        hcount <= 0;
      else if (line_done)
        hcount <= 0;
      else
        hcount <= hcount + 11'd1;
   end

   // Row counter: advances once per line, wraps at the end of the frame.
   always_ff @(posedge clk50 or posedge reset) begin
      if (reset)
        vcount <= 0;
      else if (line_done) begin
         if (frame_done)
           vcount <= 0;
         else
           vcount <= vcount + 10'd1;
      end
   end

   // Horizontal sync (active low) is decoded from the upper hcount bits:
   // asserted for 101 0000 0000 (0x500) through 101 1101 1111 (0x5DF).
   assign hsync_window = (hcount[10:8] == 3'b101) && (hcount[7:5] != 3'b111);
   assign VGA_HS       = ~hsync_window;

   // Vertical sync (active low): the two lines following the front porch.
   assign vsync_window = (vcount[9:1] == (VACTIVE + VFRONT_PORCH) / 2);
   assign VGA_VS       = ~vsync_window;

   assign VGA_SYNC_n = 1'b0; // For putting sync on the green signal; unused

   // Horizontal active: 0 to 1279     Vertical active: 0 to 479
   // 101 0000 0000  1280	       01 1110 0000  480
   // 110 0011 1111  1599	       10 0000 1100  524
   assign h_blank     = hcount[10] & (hcount[9] | hcount[8]);
   assign v_blank     = vcount[9] | (vcount[8:5] == 4'b1111);
   assign VGA_BLANK_n = ~(h_blank | v_blank);

   /* VGA_CLK is 25 MHz: hcount[0] toggles on every clk50 rising edge
    *             __    __    __
    * clk50    __|  |__|  |__|
    *
    *             _____       __
    * hcount[0]__|     |_____|
    */
   assign VGA_CLK = hcount[0]; // 25 MHz clock: rising edge sensitive

endmodule
