library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;


--------------Matrix Vector Multiplier------------------

--Block containing output accumulators, LFSRs needed for 
--matrix generation, block RAM and ROM.  Operation of block
--is controlled using a state machine.

---------------------------------------------------------

-- Top-level matrix-vector multiplier, exposed as a memory-mapped slave.
-- The bus side reads and writes an internal 32-bit RAM; writes to word
-- addresses 0..2 are additionally mirrored into control registers that the
-- internal state machine polls.
entity matrix_vector_mult is
  port
    (
      -- debug output: 5-bit code identifying the current controller state
      state_out : out std_logic_vector(4 downto 0);

      --avalon bus signals
      clk        : in  std_logic;  -- all registers update on the rising edge
      reset      : in  std_logic;  -- active high; forces the controller's next state to S0
      read       : in  std_logic;  -- read strobe, only honoured while chipselect = '1'
      write      : in  std_logic;  -- write strobe, only honoured while chipselect = '1'
      chipselect : in  std_logic;
      address    : in  std_logic_vector(13 downto 0);  -- indexes 32-bit words
      writedata  : in  std_logic_vector(31 downto 0);
      readdata   : out std_logic_vector(31 downto 0)   -- registered copy of the RAM output
    );

end matrix_vector_mult;

architecture rtl of matrix_vector_mult is
  -- ROM generated by wizard stores seeds for all the parallel LFSRs
  -- Interface only: the implementation lives outside this file.  A 14-bit
  -- address selects one 32-bit word that appears on q.
  -- NOTE(review): read latency relative to `clock` is not visible here --
  -- confirm against the generated ROM before relying on seed timing.
  component rom_test
    port
      (
        address : in  std_logic_vector (13 downto 0);
        clock   : in  std_logic;
        q       : out std_logic_vector (31 downto 0)
        );
  end component;

  -- Per-column processing element; P_NUM copies are instantiated in parallel.
  -- Interface only: the implementation lives outside this file.
  -- NOTE(review): presumably one LFSR plus one accumulator per instance (see
  -- the file header) -- confirm against the column_module source.
  component column_module
    port
      (
        clk        : in  std_logic;
        acc_active : in  std_logic;  -- asserted by the controller while input-vector words are streamed
        reset      : in  std_logic;  -- driven by reset_lfsr in this design
        reset_acc  : in  std_logic;  -- asserted by the controller in S0 and S7
        load       : in  std_logic;  -- asserted by the controller while seeds are being fetched
        seed       : in  std_logic_vector(31 downto 0);
        i_vec      : in  std_logic_vector(15 downto 0);  -- one 16-bit half of a RAM word
        o_reg      : out std_logic_vector(31 downto 0)   -- value written back to RAM in S4
        );
  end component;

  -- Single-port RAM shared between the bus interface and the controller.
  -- Interface only: the implementation lives outside this file.
  -- Default generics give 2**12 = 4096 words of 32 bits.
  component single_port_ram
    generic
      (
        DATA_WIDTH :     natural   := 32;  -- width of data and q, in bits
        ADDR_WIDTH :     natural   := 12   -- addr ranges over 0 .. 2**ADDR_WIDTH-1
        );
    port
      (
        clk        : in  std_logic;
        addr       : in  natural range 0 to 2**ADDR_WIDTH - 1;
        data       : in  std_logic_vector((DATA_WIDTH-1) downto 0);  -- write data
        we         : in  std_logic := '1';  -- write enable; defaults to '1' when left open
        q          : out std_logic_vector((DATA_WIDTH -1) downto 0)  -- read data
        );
  end component;

  -- Stand-alone LFSR with a 32-bit loadable seed and a single-bit output.
  -- Interface only: the implementation lives outside this file.
  -- NOTE(review): presumably shifts once per clock while enable = '1' and
  -- takes `seed` when load = '1' -- confirm against the lfsr source.
  component lfsr
    port
      (
        clk, reset, load, enable : in  std_logic;
        seed                     : in  std_logic_vector(31 downto 0);
        bit_out                  : out std_logic
        );
  end component;

  constant DATA_WIDTH     : natural := 32;   -- RAM word width in bits
  constant ADDR_WIDTH     : natural := 12;   -- RAM address width (4096 words)
  constant ROM_ADDR_WIDTH : natural := 14;   -- seed ROM address width (rom_test.address is 14 bits)
  constant P_NUM          : natural := 128;  -- number of column_module instances working in parallel

  -- one 32-bit accumulator output / one 32-bit seed per parallel column
  type o_register is array(0 to P_NUM-1) of std_logic_vector(31 downto 0);
  type seed_register is array(0 to P_NUM-1) of std_logic_vector(31 downto 0);

  --signals for column_module
  signal acc_active : std_logic                     := '0';
  signal reset_lfsr : std_logic                     := '0';
  signal reset_acc  : std_logic                     := '0';
  signal load       : std_logic                     := '0';
  signal i_vec      : std_logic_vector(15 downto 0) := (others => '0');
  signal o_reg      : o_register;
  signal seed       : seed_register;

  --signals for seed ROM
  signal seed_mem : std_logic_vector(31 downto 0);

  --signals for single_port_ram (registered by the arbitration process)
  signal addr : natural range 0 to (2**ADDR_WIDTH-1);
  signal data : std_logic_vector((DATA_WIDTH-1) downto 0);
  signal we   : std_logic := '1';
  signal q    : std_logic_vector((DATA_WIDTH-1) downto 0);

  --signals for writing reading from single_port_ram to controller
  -- addr_cntrl is used both as the RAM address requested by the controller
  -- and as the seed-ROM address (grp_cntr + seed_cntr, up to NUM_COLS-1), so
  -- it has to span the ROM's address space, not just the RAM's.
  signal addr_cntrl : natural range 0 to (2**ROM_ADDR_WIDTH-1);
  signal data_cntrl : std_logic_vector((DATA_WIDTH-1) downto 0);
  signal we_cntrl   : std_logic := '1';
  signal q_cntrl    : std_logic_vector((DATA_WIDTH-1) downto 0);

  --signal for memory arbitration: '1' = bus owns the RAM port, '0' = controller owns it
  signal ram_rq : std_logic := '1';

  -- bit coming out of LFSR used to generate single column of A matrix
  signal extra_lfsr_bit_out : std_logic;

  --signals for state machine
  constant NUM_ITER : integer := 16384/P_NUM;  --number of times we have to assign new seeds to cover entire matrix
  constant NUM_ROWS : integer := 4096;  --number of rows in matrix
  constant NUM_COLS : integer := 16384;  --number of columns in matrix

  -- RAM contents:
  -- status registers:     0x000 to 0x003 (0-3 [4 words])
  -- active set:           0x004 to 0x403 (4-1027 [1024 words])
  -- input vector:         0x404 to 0xC03 (1028-3075 [2048 words = 4096 half-words])
  -- output vector:        0xC04 to 0xC83 (3076-3203 [128==P_NUM words])
  -- new active set index: 0xC84 (3204)

  -- offsets of the above memory regions
  constant aset_offset   : integer := 4;
  constant ivec_offset   : integer := 1028;
  constant ovec_offset   : integer := 3076;
  constant newind_offset : integer := 3204;

  type states is (S0, S1, S2, S3a, S3b, S4, S5, S6, S7, S8, S9a, S9b, S9c);

  signal state, next_state         : states;
  -- grp_cntr is the index of the first column of the current group.  It is
  -- stepped by P_NUM and compared against NUM_COLS to detect completion, so
  -- its range must include NUM_COLS itself.
  signal grp_cntr, next_grp_cntr   : integer range 0 to NUM_COLS;
  signal ivec_cntr, next_ivec_cntr : integer range 0 to NUM_ROWS-1;  --number of input vector
  signal seed_cntr, next_seed_cntr : integer range 0 to P_NUM-1;  --number of seeds that must be loaded
  signal ovec_cntr, next_ovec_cntr : integer range 0 to P_NUM-1;  --number of output registers
  signal seed_transfer             : std_logic := '0';  -- '1' while ROM words are copied into seed()

  --special control registers

  -- old values- 32 bits
  constant GO    : std_logic_vector((DATA_WIDTH-1) downto 0) := X"FFFFFFFF";
  constant DONE  : std_logic_vector((DATA_WIDTH-1) downto 0) := X"00030000";
  constant FULL  : std_logic_vector((DATA_WIDTH-1) downto 0) := X"FF00FFFF";
  constant EMPTY : std_logic_vector((DATA_WIDTH-1) downto 0) := X"00000000";
  constant MODE0 : std_logic_vector((DATA_WIDTH-1) downto 0) := X"0000FFFF";
  constant MODE1 : std_logic_vector((DATA_WIDTH-1) downto 0) := X"FFFF0000";

  -- shadow copies of RAM words 0..2, updated on bus writes and polled by the controller
  type cntrl_register is array(0 to 2) of std_logic_vector((DATA_WIDTH-1) downto 0);

  signal cntrl_reg : cntrl_register;

  signal waitreq_reg1, waitrequest_int : std_logic;

begin

  -- Shared data RAM.  Its address, write-data and write-enable inputs are the
  -- registered outputs of the arbitration process; q is its read data.
  ram_inst0 : single_port_ram
    generic map (
      ADDR_WIDTH => ADDR_WIDTH,
      DATA_WIDTH => DATA_WIDTH
    )
    port map (
      clk  => clk,
      we   => we,
      addr => addr,
      data => data,
      q    => q
    );

  -- Seed ROM: one initial seed value for each of the 16384 columns, loaded
  -- from a file.  The controller's integer address addr_cntrl is converted
  -- to a vector as wide as the bus address port before it reaches the ROM.
  seed_rom_inst0 : rom_test
    port map (
      clock   => clk,
      q       => seed_mem,
      address => std_logic_vector(to_unsigned(addr_cntrl, address'length))
    );

  -- P_NUM identical column modules run in parallel.  All of them share the
  -- control strobes and the input-vector word; each one has its own seed
  -- entry and its own output register entry.
  col_gen : for col in 0 to P_NUM-1 generate
    col_map : column_module
      port map (
        clk        => clk,
        reset      => reset_lfsr,
        reset_acc  => reset_acc,
        load       => load,
        acc_active => acc_active,
        i_vec      => i_vec,
        seed       => seed(col),
        o_reg      => o_reg(col)
      );
  end generate col_gen;

-- Additional, always-enabled LFSR that produces the single matrix column
-- requested by the processor.  It is seeded directly from the seed ROM output
-- and shares the reset and load strobes of the column modules.
extra_lfsr0 : lfsr
  port map (
    clk     => clk,
    reset   => reset_lfsr,
    load    => load,
    enable  => '1',
    seed    => seed_mem,
    bit_out => extra_lfsr_bit_out
  );

  --Helpful for debugging
  -- Exposes the controller state on state_out as a fixed 5-bit code: the
  -- states are numbered 0..12 in declaration order (S0 = 0 ... S9c = 12);
  -- "11111" is the fallback code.
  with state select
    state_out <=
      "00000" when S0,
      "00001" when S1,
      "00010" when S2,
      "00011" when S3a,
      "00100" when S3b,
      "00101" when S4,
      "00110" when S5,
      "00111" when S6,
      "01000" when S7,
      "01001" when S8,
      "01010" when S9a,
      "01011" when S9b,
      "01100" when S9c,
      "11111" when others;

  --Memory control processes
  -- RAM port arbitration.  On each rising clock edge the RAM's address,
  -- write-data and write-enable registers are loaded from one of two sources:
  --   ram_rq = '1' : the bus (only while chipselect = '1')
  --   ram_rq = '0' : the controller (addr_cntrl / data_cntrl / we_cntrl)
  -- Bus writes to word addresses 0..2 are also mirrored into cntrl_reg so
  -- the controller can poll them without a RAM access.
  process(clk)
  begin
    if rising_edge(clk) then
      if ram_rq = '1' then
        if chipselect = '1' then
          if write = '1' then
            we   <= '1';
            data <= writedata;
            if (address = std_logic_vector(to_unsigned(0, address'length))
                or address = std_logic_vector(to_unsigned(1, address'length))
                or address = std_logic_vector(to_unsigned(2, address'length))) then
              cntrl_reg(to_integer(unsigned(address))) <= writedata;
            end if;
          elsif read = '1' then
            we       <= '0';
            readdata <= q;
          end if;
          -- The bus address is wider than the RAM address.  Use only the low
          -- ADDR_WIDTH bits so an access above the RAM size cannot push
          -- `addr` outside its declared range.
          addr <= to_integer(unsigned(address(ADDR_WIDTH-1 downto 0)));
        end if;
      else
        if we_cntrl = '1' then
          we   <= '1';
          data <= data_cntrl;
        else
          we      <= '0';
          q_cntrl <= q;
        end if;
        -- addr_cntrl doubles as the seed-ROM address and may therefore carry
        -- values above the RAM size; reduce it to the RAM address range.
        addr <= addr_cntrl mod 2**ADDR_WIDTH;
      end if;
    end if;

  end process;

  -- state machine controller
  -- Clocked half of the controller: registers the state and all counters from
  -- their next_* values, and captures seed-ROM output into the seed array.
  process(clk)
  begin

    if (rising_edge(clk)) then
      seed_cntr <= next_seed_cntr;
      ivec_cntr <= next_ivec_cntr;
      ovec_cntr <= next_ovec_cntr;
      grp_cntr  <= next_grp_cntr;
      state     <= next_state;
      -- Only capture a seed while the FSM requests a transfer.  This must be
      -- an equality test: "seed_transfer <= '1'" is the less-or-equal
      -- relation, which is true for both '0' and '1' and would overwrite
      -- seed(seed_cntr) on every clock.
      if seed_transfer = '1' then
        seed(seed_cntr) <= seed_mem;
      end if;
    end if;

  end process;

  -- Combinational half of the controller: derives the next state, the next
  -- counter values and every control output from the registered state.
  -- Every output receives a default before the case statement, so each branch
  -- only lists what differs from the default.
  --
  -- MODE0: S0 -> S1 -> S2 (fetch P_NUM seeds) -> S3a/S3b (stream the input
  --        vector, two 16-bit halves per RAM word) -> S4 (write P_NUM results)
  --        -> S5 (write FULL to word 2) -> S6 (wait for cntrl_reg(2) = EMPTY)
  --        -> S7 -> S8 -> S1 (next column group) or S0 (all columns done).
  -- MODE1: S0 -> S9a -> S9b -> S9c -> S0.
  --
  -- The sensitivity list names every signal the process reads.
  process(reset, state, seed_cntr, ivec_cntr, ovec_cntr, grp_cntr, cntrl_reg,
          o_reg, q_cntrl, data, extra_lfsr_bit_out)
  begin
    if reset = '1' then
      next_state     <= S0;
      next_seed_cntr <= 0;
      next_ivec_cntr <= 0;
      next_ovec_cntr <= 0;
      next_grp_cntr  <= 0;
      acc_active     <= '0';
      ram_rq         <= '1';
      we_cntrl       <= '0';
      load           <= '0';
      reset_lfsr     <= '1';
      reset_acc      <= '1';
      addr_cntrl     <= 0;
      seed_transfer  <= '0';
      i_vec          <= (others => '0');
      data_cntrl     <= (others => '0');
    else
      --Default values
      next_seed_cntr <= seed_cntr;
      next_ivec_cntr <= ivec_cntr;
      next_ovec_cntr <= ovec_cntr;
      next_grp_cntr  <= grp_cntr;
      acc_active     <= '0';
      ram_rq         <= '0';
      we_cntrl       <= '0';
      load           <= '0';
      reset_lfsr     <= '0';
      reset_acc      <= '0';
      next_state     <= state;
      addr_cntrl     <= 0;
      seed_transfer  <= '0';
      i_vec          <= (others => '0');
      data_cntrl     <= (others => '0');

      case state is
        when S0 =>
          -- Idle: the bus owns the RAM, datapath held in reset.
          ram_rq        <= '1';
          reset_acc     <= '1';
          reset_lfsr    <= '1';
          -- A run always starts from the first column group.  The group
          -- counter is cleared here and not in S1, because S1 is re-entered
          -- for every group and would otherwise undo the step made in S7.
          next_grp_cntr <= 0;
          if cntrl_reg(0) = GO then
            --add other modes here
            if cntrl_reg(1) = MODE0 then
              next_state <= S1;
            elsif cntrl_reg(1) = MODE1 then
              next_state <= S9a;
            else
              next_state <= S0;
            end if;
          end if;

        when S1 =>
          -- Start of one column group: clear the per-group counters only.
          next_seed_cntr <= 0;
          next_ivec_cntr <= 0;
          next_ovec_cntr <= 0;
          ram_rq         <= '0';
          next_state     <= S2;

        when S2 =>
          -- Fetch the P_NUM seeds of the current group from the seed ROM.
          seed_transfer <= '1';
          addr_cntrl    <= grp_cntr + seed_cntr;
          load          <= '1';
          if seed_cntr = (P_NUM-1) then
            -- wrap explicitly: seed_cntr + 1 would leave the declared range
            next_seed_cntr <= 0;
            next_state     <= S3a;
          else
            next_seed_cntr <= seed_cntr + 1;
            next_state     <= S2;
          end if;

        -- don't increment ivec_cntr in this state
        when S3a =>
          -- low half-word of the current input-vector RAM word
          acc_active     <= '1';
          we_cntrl       <= '0';
          addr_cntrl     <= ivec_offset + ivec_cntr;
          i_vec          <= q_cntrl(15 downto 0);
          next_ivec_cntr <= ivec_cntr;
          next_state     <= S3b;

        -- do increment ivec_cntr in this state
        when S3b =>
          -- high half-word of the current input-vector RAM word
          acc_active     <= '1';
          we_cntrl       <= '0';
          addr_cntrl     <= ivec_offset + ivec_cntr;
          i_vec          <= q_cntrl(31 downto 16);
          next_ivec_cntr <= ivec_cntr + 1;
          if ivec_cntr = (NUM_ROWS/2)-1 then
            next_state <= S4;
          else
            next_state <= S3a;
          end if;

        when S4 =>
          --Write the output back to Block RAM
          we_cntrl   <= '1';
          addr_cntrl <= ovec_offset + ovec_cntr;
          data_cntrl <= o_reg(ovec_cntr);
          if ovec_cntr = (P_NUM-1) then
            -- wrap explicitly: ovec_cntr + 1 would leave the declared range
            next_ovec_cntr <= 0;
            next_state     <= S5;
          else
            next_ovec_cntr <= ovec_cntr + 1;
            next_state     <= S4;
          end if;

        when S5 =>
          --Signal processor that the output is ready.
          we_cntrl   <= '1';
          addr_cntrl <= 2;
          data_cntrl <= FULL;
          next_state <= S6;

        when S6 =>
          -- Hand the RAM back to the bus and wait for the processor.
          ram_rq <= '1';
          if cntrl_reg(2) = EMPTY then
            --processor has successfully read back the output set.
            next_state <= S7;
          else
            next_state <= S6;
          end if;

        when S7 =>
          --Do next set of columns.
          reset_acc     <= '1';
          next_grp_cntr <= grp_cntr + P_NUM;
          next_state    <= S8;

        when S8 =>
          -- NOTE(review): grp_cntr must be able to hold NUM_COLS for this
          -- comparison to ever succeed -- verify its declared range.
          if grp_cntr = NUM_COLS then
            --Finished
            next_state <= S0;
          else
            next_state <= S1;
          end if;

        when S9a =>
          -- address the RAM to get the index of the new element of the A matrix
          addr_cntrl <= newind_offset;
          next_state <= S9b;

        when S9b =>
          -- address the seed ROM with the index retrieved from RAM
          -- NOTE(review): `data` is the RAM write-data register, not the RAM
          -- read data (q / q_cntrl); confirm which signal was intended here.
          addr_cntrl     <= to_integer(unsigned(data((ADDR_WIDTH-1) downto 0)));
          next_ivec_cntr <= 0;
          load           <= '1';
          -- Without this transition the FSM would stay in S9b forever and
          -- S9c would be unreachable.
          next_state     <= S9c;

        when S9c =>
          -- Step through NUM_ROWS entries, presenting 1 or all-ones depending
          -- on the extra LFSR's output bit.
          -- NOTE(review): we_cntrl stays '0' in this state, so data_cntrl is
          -- never written to RAM, and ivec_offset + ivec_cntr runs past the
          -- 12-bit RAM address space once ivec_cntr > 3067 -- confirm the
          -- intended behaviour of MODE1.
          addr_cntrl <= ivec_offset + ivec_cntr;
          if (extra_lfsr_bit_out = '0') then
            data_cntrl <= "00000000000000000000000000000001";
          else
            data_cntrl <= "11111111111111111111111111111111";
          end if;
          if (ivec_cntr = 4095) then
            -- wrap explicitly: ivec_cntr + 1 would leave the declared range
            next_ivec_cntr <= 0;
            next_state     <= S0;
          else
            next_ivec_cntr <= ivec_cntr + 1;
            next_state     <= S9c;
          end if;
      end case;
    end if;

  end process;

end rtl;
