// Sequential solver for a linear system held in an external memory.
// The FSM in this module performs, per column k: a partial-pivot search
// (PF_* states), a row swap (PS_* states) and row elimination (EL_* states),
// and finally back substitution (BS_* states).
//
// Memory word addressing as computed by get_G / get_I / get_v below:
//   G(i,j) -> (i << MAT_SHIFT) + j                matrix element
//   I(i)   -> (MAT_SIZE << MAT_SHIFT) + i         right-hand-side entry
//   v(i)   -> ((MAT_SIZE + 1) << MAT_SHIFT) + i   solution entry
module gaussian_elim_compat_tb
	#(parameter 
	  // 32-bit constant; not referenced anywhere in this module.
	  EPSILON = 32'b00110011110101101011111110010101,
	  // Value of fp_latency_count at which the divider result is sampled.
	  DIV_LATENCY = 11,
	  // Value of fp_latency_count at which the subtractor result is sampled.
          SUB_LATENCY = 2,
	  // Value of fp_latency_count at which the multiplier result is sampled.
	  MUL_LATENCY = 2,
	  // Left shift applied to a row index when forming a memory address.
          MAT_SHIFT = 7,
	  // Only used in the width expression of the internal `mem` signal.
          FLOAT_SHIFT = 2,
	  // Row index (before shifting) at which the I and v vectors start.
          MAT_SIZE = 15'd128)
	(input clock,

	// Sampled in GS_IDLE; a high level starts the computation.
	input logic start_in,
	// Asynchronous, active-high reset of the FSM and the FP cores.
	input logic reset,
	// Active dimension: row/column indices are compared against n.
	input logic [7:0] n,

	// Memory interface: registered address, write data and write enable.
	output logic [14:0] address,
	output logic [31:0] mem_data,
	output logic mem_wren,
	// Read data returned by the memory for a previously issued address.
	input logic [31:0] mem_result,

	// Set to 1 in GS_DONE and GS_FAILED, cleared in GS_IDLE.
	output logic done,
	// Set to 1 in GS_DONE, 0 in GS_FAILED (floating-point exception check tripped).
	output logic success,
	// Registered copy of the internal `singular` flag.
	output logic singular_out);
// FSM state encoding. Enumerator order is significant (it fixes the binary
// encoding), so new states must only be appended with care.
typedef enum logic [5:0] {
	// Global: waiting for start_in
	GS_IDLE,

	// PF_*: scan column k for the row with the largest magnitude
	PF_INIT,
	PF_INIT_MEM_WAIT,
	PF_READ_DIAG,
	PF_SCAN_CHECK,
	PF_READ_VAL,
	PF_EVALUATE,

	// PS_SWAP_G_*: exchange rows k and pivot of G, one column per pass
	PS_SWAP_G_A_PRE_BUF,
	PS_SWAP_G_A,
	PS_SWAP_G_B,
	PS_SWAP_G_WA,
	PS_SWAP_G_WB,

	// PS_SWAP_I_*: exchange entries k and pivot of I
	PS_SWAP_I_A,
	PS_SWAP_I_B,
	PS_SWAP_I_WA,
	PS_SWAP_I_WB,

	// EL_*: eliminate column k from the rows below the pivot row
	EL_INIT_READ_PIVOT,
	EL_INIT_SETUP,
	EL_READ_AIK,
	EL_DIV_START,
	EL_DIV_WAIT,
	EL_COL_SETUP,
	EL_READ_COL,
	EL_READ_ROW,
	EL_MUL_START,
	EL_MUL_WAIT,
	EL_SUB_START,
	EL_SUB_WAIT,
	EL_SUB_WAIT_BUF,
	EL_WRITE_COL,
	EL_COL_INCREMENT,
	EL_ROW_INCREMENT,

	// BS_*: back substitution, last row first
	BS_READ_I,
	BS_SETUP,
	BS_READ_A,
	BS_READ_V,
	BS_MUL_START,
	BS_MUL_WAIT,
	BS_SUB_START,
	BS_SUB_WAIT,
	BS_NEXT_CHECK,
	BS_READ_DIAG,
	BS_DIV_START,
	BS_DIV_WAIT,
	BS_DIV_WAIT_BUF,
	BS_DIV_WRITE,
	BS_ROW_DEC,

	// Global: advance k / terminal states
	GS_CHECK_K,
	GS_DONE,
	GS_FAILED
} state_t;

    // Current FSM state (single-process FSM; there is no next-state signal).
    state_t state;

    // ---- Control / status ----
    logic singular;                 // internal singular flag, copied to singular_out
    logic start;                    // read in GS_DONE / GS_FAILED; no driver in this module
    logic [4:0] fp_latency_count;   // cycles elapsed since an FP operation was started

    // NOTE(review): `+`/`-` bind tighter than `<<`, so the upper bound below is
    // (MAT_SHIFT + 2) << (MAT_SHIFT + FLOAT_SHIFT - 1). The signal is not
    // referenced anywhere else in this module -- confirm whether it is needed.
    logic [(MAT_SHIFT + 2'd2) << (MAT_SHIFT + FLOAT_SHIFT) - 1'b1:0] mem;

    // ---- Pivot search / row swap ----
    logic [7:0] k, pivot, row, j;   // current column, best row so far, scan row, swap column
    logic [31:0] max_val;           // largest magnitude seen in column k
    logic [31:0] val_buf;

    // ---- Elimination and back-substitution indices ----
    logic [7:0] ei, ej;             // elimination row / column
    logic [7:0] bi, bj;             // back-substitution row / column

    // ---- Data registers ----
    logic [31:0] pivot_val;         // G(k,k) after the swap
    logic [31:0] m;                 // row multiplier G(ei,k) / pivot_val
    logic [31:0] buf_col, buf_row;  // memory words captured in EL_READ_COL / EL_READ_ROW
    logic [31:0] sum;               // running value during back substitution
    logic [31:0] mem_I, mem_A, mem_v, mem_diag; // memory words captured during back substitution

    // Floating-point subtractor: operands, enable and result.
    // sub_output is sampled by the FSM once fp_latency_count reaches SUB_LATENCY.
    logic        sub_en;
    logic [31:0] sub_a, sub_b;
    logic [31:0] sub_output;

    // Declared status flags; not connected to the core in this module.
    logic sub_nan;
    logic sub_overflow;
    logic sub_underflow;

    fp_sub sub_u (
        .clk    (clock),
        .areset (reset),
        .en     (sub_en),
        .a      (sub_a),
        .b      (sub_b),
        .q      (sub_output)
    );

    // Floating-point multiplier: operands, enable and result.
    // mul_output is sampled by the FSM once fp_latency_count reaches MUL_LATENCY.
    logic mul_en;
    logic [31:0] mul_a;
    logic [31:0] mul_b;
    logic [31:0] mul_output;
    logic [31:0] mul_result; // declared but not used in this module

    // The core's asynchronous reset is tied to the module reset, exactly like
    // sub_u and div_u. It was previously connected to a local `areset` signal
    // that had no driver, leaving the multiplier's reset input undriven.
    fp_mult mul_u(
	    .clk(clock),
	    .areset(reset),
	    .en(mul_en),
	    .a(mul_a),
	    .b(mul_b),
	    .q(mul_output));

    // Floating-point divider: computes div_a / div_b.
    // div_output is sampled by the FSM once fp_latency_count reaches DIV_LATENCY.
    logic        div_en;
    logic [31:0] div_a, div_b;
    logic [31:0] div_output;

    fp_div div_u (
        .clk    (clock),
        .areset (reset),
        .en     (div_en),
        .a      (div_a),
        .b      (div_b),
        .q      (div_output)
    );


    // Absolute value of a 32-bit word whose bit 31 is the sign bit:
    // returns `a` with bit 31 forced to 0 and bits 30:0 unchanged.
    function [31:0] fabsf(input [31:0] a);
        fabsf = a;
        fabsf[31] = 1'b0;
    endfunction

    // Near-zero test. NOTE: despite the name, this returns 1 when the value is
    // SMALL: the 8-bit field a[30:23] is below 104. The sign bit and a[22:0]
    // are ignored. Assuming IEEE-754 single-precision layout (bias 127), that
    // corresponds to |a| < 2^-23. Callers use a 1 result to flag a singular pivot.
    function [0:0] fbig(input [31:0] a);
        logic [7:0] exp_field;
        exp_field = a[30:23];
        fbig = (exp_field < 8'd104);
    endfunction

    // Sign-magnitude "a > b" on 32-bit words (bit 31 = sign, bits 30:0 = magnitude).
    //   a >= 0, b <  0 : 1   (this includes a = +0, b = -0)
    //   both >= 0      : 1 when a's magnitude is larger
    //   both <  0      : 1 when a's magnitude is smaller
    //   a <  0, b >= 0 : 0
    // No special handling of NaN/Inf encodings is performed.
    function [0:0] fgtf(input [31:0] a, input [31:0] b);
        logic a_neg, b_neg;
        logic mag_gt, mag_lt;

        a_neg = a[31];
        b_neg = b[31];
        // Comparing {exponent, mantissa} as one 31-bit unsigned number is the
        // same as comparing the exponent first and the mantissa on a tie.
        mag_gt = (a[30:0] > b[30:0]);
        mag_lt = (a[30:0] < b[30:0]);

        fgtf = (!a_neg &&  b_neg)
            || (!a_neg && !b_neg && mag_gt)
            || ( a_neg &&  b_neg && mag_lt);
    endfunction

    // Memory address of matrix element G(i, j): the row index is shifted left
    // by MAT_SHIFT and the column index is added, all in 15 bits.
    function [14:0] get_G(input [7:0] i, input [7:0] j);
        logic [14:0] row_base;
        row_base = 15'(i) << MAT_SHIFT;
        get_G = row_base + 15'(j);
    endfunction

    // Memory address of entry i of the I vector, which is stored as the row
    // with index MAT_SIZE: (MAT_SIZE << MAT_SHIFT) + i, truncated to 15 bits.
    function [14:0] get_I(input [7:0] i);
        get_I = {7'b0, i} + (MAT_SIZE << (MAT_SHIFT));
    endfunction

    // Memory address of entry i of the v vector, which is stored as the row
    // with index MAT_SIZE + 1: ((MAT_SIZE + 1) << MAT_SHIFT) + i, truncated to 15 bits.
    function [14:0] get_v(input [7:0] i);
        get_v = {7'b0, i} + ((MAT_SIZE + 15'b1) << (MAT_SHIFT));
    endfunction

    // Sanity check on a multiplier result. Returns 1 when either
    //   * the result's field [30:23] is all ones, or
    //   * the result's sign bit is not the XOR of the operand sign bits.
    // Returns 0 otherwise.
    function [0:0] mul_exception_check(input[31:0] mul_a, input [31:0] mul_b, input [31:0] mul_output);
        logic exp_all_ones;
        logic sign_mismatch;

        exp_all_ones  = (mul_output[30:23] == 8'b1111_1111);
        // Parenthesised explicitly: `!=` binds tighter than `^`. For 1-bit
        // operands either grouping yields the same value.
        sign_mismatch = ((mul_a[31] ^ mul_b[31]) != mul_output[31]);

        mul_exception_check = 1'b0;
        if (exp_all_ones || sign_mismatch) begin
            mul_exception_check = 1'b1;
        end
    endfunction
	 
	 // Sanity check on a subtractor result (sub_output is expected to be sub_a - sub_b).
	 // Returns 0 unconditionally when both operands have zero magnitude (any signs).
	 // Otherwise returns 1 when any of the following holds:
	 //   * the result's field [30:23] is all ones;
	 //   * the operands are bit-identical but the result magnitude is non-zero;
	 //   * fgtf(sub_a, sub_b) disagrees with "result is positive and non-zero".
	 function [0:0] sub_exception_check(input[31:0] sub_a, input [31:0] sub_b, input [31:0] sub_output);
		 logic operands_both_zero;
		 logic exp_all_ones;
		 logic equal_but_nonzero;
		 logic result_pos_nonzero;
		 logic ordering_mismatch;

		 operands_both_zero = (sub_a[30:0] == 31'b0 && sub_b[30:0] == 31'b0);
		 exp_all_ones       = (sub_output[30:23] == 8'b1111_1111);
		 equal_but_nonzero  = (sub_a == sub_b && sub_output[30:0] != 31'b0);
		 result_pos_nonzero = (sub_output[31] == 1'b0 && sub_output[30:0] != 31'b0);
		 ordering_mismatch  = (fgtf(sub_a,sub_b) ^ result_pos_nonzero);

		 if (operands_both_zero)
			 sub_exception_check = 1'b0;
		 else if (exp_all_ones || equal_but_nonzero || ordering_mismatch)
			 sub_exception_check = 1'b1;
		 else
			 sub_exception_check = 1'b0;
	 endfunction

	 // Sanity check on a divider result. Returns 1 when the divisor has zero
	 // magnitude, or when mul_exception_check flags the result (all-ones field
	 // [30:23] or a sign that is not the XOR of the operand signs); 0 otherwise.
	 function [0:0] div_exception_check(input[31:0] div_a, input [31:0] div_b, input [31:0] div_output);
		 logic result_flagged;
		 logic divisor_is_zero;

		 result_flagged  = mul_exception_check(div_a,div_b,div_output);
		 divisor_is_zero = (div_b[30:0] == 31'b0);

		 div_exception_check = 1'b0;
		 if (result_flagged || divisor_is_zero) begin
			 div_exception_check = 1'b1;
		 end
	 endfunction
    // Main control FSM (single sequential process, asynchronous active-high reset).
    //
    // Phases, repeated for k = 0 .. n-1:
    //   PF_*  find the row >= k with the largest |G(row,k)| (partial pivoting)
    //   PS_*  swap rows k and pivot of G and of I (skipped when pivot == k)
    //   EL_*  for every row ei > k: m = G(ei,k)/G(k,k), then for ej = k .. n
    //         G(ei,ej) -= m * G(k,ej), where column index n addresses I
    // followed by
    //   BS_*  back substitution: v(bi) = (I(bi) - sum_j G(bi,bj)*v(bj)) / G(bi,bi)
    //
    // Memory timing used throughout: a read address assigned in one state is
    // consumed from mem_result two clock edges later, which is why many states
    // only advance `state` or pre-issue the address for the state after next.
    //
    // Terminal states: GS_DONE (done=1, success=1) and GS_FAILED (done=1,
    // success=0, entered when an *_exception_check trips). Both hold until
    // `reset`; a singular matrix ends in GS_DONE with `singular` set.
    always_ff @(posedge clock or posedge reset) begin
        if (reset) begin
            state        <= GS_IDLE;

            // Outputs. The memory interface and `singular` are reset too, so
            // that no X reaches mem_wren / singular_out after reset.
            done         <= 1'b0;
            success      <= 1'b0;
            singular     <= 1'b0;
            singular_out <= 1'b0;
            address      <= 15'd0;
            mem_data     <= 32'd0;
            mem_wren     <= 1'b0;

            // Indices
            k     <= 8'd0;
            pivot <= 8'd0;
            row   <= 8'd0;
            j     <= 8'd0;
            ei    <= 8'd0;
            ej    <= 8'd0;
            bi    <= 8'd0;
            bj    <= 8'd0;

            // Data registers
            max_val   <= 32'd0;
            val_buf   <= 32'd0;
            pivot_val <= 32'b0;
            m         <= 32'b0;
            buf_col   <= 32'b0;
            buf_row   <= 32'b0;
            sum       <= 32'b0;
            mem_I     <= 32'b0;
            mem_A     <= 32'b0;
            mem_v     <= 32'b0;
            mem_diag  <= 32'b0;

            // Floating-point core interface
            fp_latency_count <= 5'd0;
            sub_en <= 1'b0;
            mul_en <= 1'b0;
            div_en <= 1'b0;
            sub_a  <= 32'b0;
            sub_b  <= 32'b0;
            mul_a  <= 32'b0;
            mul_b  <= 32'b0;
            div_a  <= 32'b0;
            div_b  <= 32'b0;
        end
        else begin
            singular_out <= singular;

            case (state)
                // ------------------------------------------------------------
                GS_IDLE: begin
                    done     <= 1'b0;
                    success  <= 1'b0;
                    singular <= 1'b0;
                    if (start_in) begin
                        state <= PF_INIT;
                    end
                end

                // ------------------------------------------------------------
                // Pivot find: scan column k from row k downwards.
                // ------------------------------------------------------------
                PF_INIT: begin
                    row      <= k;
                    pivot    <= k;
                    address  <= get_G(k, k);
                    mem_wren <= 1'b0;
                    state    <= PF_INIT_MEM_WAIT;
                end

                PF_INIT_MEM_WAIT: begin
                    state <= PF_READ_DIAG;
                end

                PF_READ_DIAG: begin
                    max_val <= fabsf(mem_result);
                    state   <= PF_SCAN_CHECK;
                end

                PF_SCAN_CHECK: begin
                    row <= row + 8'b1;
                    if (row + 8'b1 < n) begin
                        address  <= get_G(row + 8'b1, k);
                        mem_wren <= 1'b0;
                        state    <= PF_READ_VAL;
                    end
                    else begin
                        j <= k;
                        // fbig() returns 1 when the magnitude is near zero:
                        // no usable pivot in this column.
                        if (fbig(fabsf(max_val))) begin
                            singular <= 1'b1;
                            state    <= GS_DONE;
                        end
                        else begin
                            // j only becomes k at this edge, so address (k,k) directly.
                            address  <= get_G(k, k);
                            mem_wren <= 1'b0;
                            state    <= PS_SWAP_G_A_PRE_BUF;
                        end
                    end
                end

                PF_READ_VAL: begin
                    state <= PF_EVALUATE;
                end

                PF_EVALUATE: begin
                    if (fgtf(fabsf(mem_result), max_val)) begin
                        max_val <= fabsf(mem_result);
                        pivot   <= row;
                    end
                    state <= PF_SCAN_CHECK;
                end

                // ------------------------------------------------------------
                // Row swap of G: per column j, read G(k,j) and G(pivot,j) and
                // write each back to the other location.
                // ------------------------------------------------------------
                PS_SWAP_G_A_PRE_BUF: begin
                    if (pivot != k) begin
                        address <= get_G(pivot, j);
                    end
                    else begin
                        address <= get_G(k, k);
                    end
                    mem_wren <= 1'b0;
                    state    <= PS_SWAP_G_A;
                end

                PS_SWAP_G_A: begin
                    if (pivot == k) begin
                        // Nothing to swap.
                        state <= EL_INIT_READ_PIVOT;
                    end
                    else begin
                        // mem_result holds G(k,j): store it at (pivot,j).
                        address  <= get_G(pivot, j);
                        mem_wren <= 1'b1;
                        mem_data <= mem_result;
                        state    <= PS_SWAP_G_B;
                    end
                end

                PS_SWAP_G_B: begin
                    // mem_result holds the old G(pivot,j): store it at (k,j).
                    address  <= get_G(k, j);
                    mem_wren <= 1'b1;
                    mem_data <= mem_result;
                    state    <= PS_SWAP_G_WA;
                end

                PS_SWAP_G_WA: begin
                    // Pre-issue the read of the next "row k" word.
                    if (j + 8'b1 < n) begin
                        address <= get_G(k, j + 8'b1);
                    end
                    else begin
                        address <= get_I(k);
                    end
                    mem_wren <= 1'b0;
                    state    <= PS_SWAP_G_WB;
                end

                PS_SWAP_G_WB: begin
                    j <= j + 8'b1;
                    // Pre-issue the read of the next "row pivot" word.
                    if (j + 8'b1 < n) begin
                        address <= get_G(pivot, j + 8'b1);
                        state   <= PS_SWAP_G_A;
                    end
                    else begin
                        address <= get_I(pivot);
                        state   <= PS_SWAP_I_A;
                    end
                    mem_wren <= 1'b0;
                end

                // ------------------------------------------------------------
                // Row swap of I.
                // ------------------------------------------------------------
                PS_SWAP_I_A: begin
                    // mem_result holds I(k): store it at I(pivot).
                    address  <= get_I(pivot);
                    mem_wren <= 1'b1;
                    mem_data <= mem_result;
                    state    <= PS_SWAP_I_B;
                end

                PS_SWAP_I_B: begin
                    // mem_result holds the old I(pivot): store it at I(k).
                    address  <= get_I(k);
                    mem_wren <= 1'b1;
                    mem_data <= mem_result;
                    state    <= PS_SWAP_I_WA;
                end

                PS_SWAP_I_WA: begin
                    // Pre-issue the read of the pivot element.
                    address  <= get_G(k, k);
                    mem_wren <= 1'b0;
                    state    <= PS_SWAP_I_WB;
                end

                PS_SWAP_I_WB: begin
                    state <= EL_INIT_READ_PIVOT;
                end

                // ------------------------------------------------------------
                // Elimination.
                // ------------------------------------------------------------
                EL_INIT_READ_PIVOT: begin
                    pivot_val <= mem_result;
                    if (!fbig(fabsf(mem_result))) begin
                        // Pre-issue the read of G(k+1,k) for EL_READ_AIK.
                        address  <= get_G(k + 8'b1, k);
                        mem_wren <= 1'b0;
                    end
                    state <= EL_INIT_SETUP;
                end

                EL_INIT_SETUP: begin
                    if (fbig(fabsf(pivot_val))) begin
                        singular <= 1'b1;
                        state    <= GS_DONE;
                    end
                    else begin
                        ei    <= k + 8'b1;
                        state <= EL_READ_AIK;
                    end
                end

                EL_READ_AIK: begin
                    if (ei < n) begin
                        buf_col <= mem_result;      // G(ei,k)
                        state   <= EL_DIV_START;
                    end
                    else begin
                        state <= GS_CHECK_K;
                    end
                end

                EL_DIV_START: begin
                    // Every FP start must zero the latency counter.
                    div_a            <= buf_col;
                    div_b            <= pivot_val;
                    div_en           <= 1'b1;
                    fp_latency_count <= 5'b0;
                    state            <= EL_DIV_WAIT;
                end

                EL_DIV_WAIT: begin
                    fp_latency_count <= fp_latency_count + 5'b1;
                    if (fp_latency_count == DIV_LATENCY) begin
                        div_en <= 1'b0;
                        if (div_exception_check(div_a, div_b, div_output)) begin
                            state <= GS_FAILED;
                        end
                        else begin
                            // Pre-issue the pivot-row read for column ej = k.
                            if (k < n) begin
                                address <= get_G(k, k);
                            end
                            else begin
                                address <= get_I(k);
                            end
                            mem_wren <= 1'b0;
                            state    <= EL_COL_SETUP;
                        end
                    end
                end

                EL_COL_SETUP: begin
                    m  <= div_output;
                    ej <= k;
                    // Pre-issue the target-row read for column ej = k.
                    if (k < n) begin
                        address <= get_G(ei, k);
                    end
                    else begin
                        address <= get_I(ei);
                    end
                    mem_wren <= 1'b0;
                    state    <= EL_READ_COL;
                end

                EL_READ_COL: begin
                    buf_col <= mem_result;          // pivot-row word
                    state   <= EL_READ_ROW;
                end

                EL_READ_ROW: begin
                    buf_row <= mem_result;          // target-row word
                    state   <= EL_MUL_START;
                end

                EL_MUL_START: begin
                    mul_a            <= m;
                    mul_b            <= buf_col;
                    mul_en           <= 1'b1;
                    fp_latency_count <= 5'b0;
                    state            <= EL_MUL_WAIT;
                end

                EL_MUL_WAIT: begin
                    fp_latency_count <= fp_latency_count + 5'b1;
                    if (fp_latency_count == MUL_LATENCY) begin
                        mul_en <= 1'b0;
                        if (mul_exception_check(mul_a, mul_b, mul_output)) begin
                            state <= GS_FAILED;
                        end
                        else begin
                            state <= EL_SUB_START;
                        end
                    end
                end

                EL_SUB_START: begin
                    sub_a            <= buf_row;
                    sub_b            <= mul_output;
                    sub_en           <= 1'b1;
                    fp_latency_count <= 5'b0;
                    state            <= EL_SUB_WAIT;
                end

                EL_SUB_WAIT: begin
                    fp_latency_count <= fp_latency_count + 5'b1;
                    if (fp_latency_count == SUB_LATENCY) begin
                        sub_en <= 1'b0;
                        if (sub_exception_check(sub_a, sub_b, sub_output)) begin
                            state <= GS_FAILED;
                        end
                        else begin
                            // Write the updated word back (column n means I).
                            if (ej < n) begin
                                address <= get_G(ei, ej);
                            end
                            else begin
                                address <= get_I(ei);
                            end
                            mem_wren <= 1'b1;
                            mem_data <= sub_output;
                            state    <= EL_SUB_WAIT_BUF;
                        end
                    end
                end

                EL_SUB_WAIT_BUF: begin
                    state <= EL_WRITE_COL;
                end

                EL_WRITE_COL: begin
                    // Pre-issue the pivot-row read for the next column, if any.
                    if (ej + 8'b1 <= n) begin
                        if (ej + 8'b1 < n) begin
                            address <= get_G(k, ej + 8'b1);
                        end
                        else begin
                            address <= get_I(k);
                        end
                        mem_wren <= 1'b0;
                    end
                    state <= EL_COL_INCREMENT;
                end

                EL_COL_INCREMENT: begin
                    ej <= ej + 8'b1;
                    if (ej + 8'b1 <= n) begin
                        // Pre-issue the target-row read for the next column.
                        if (ej + 8'b1 < n) begin
                            address <= get_G(ei, ej + 8'b1);
                        end
                        else begin
                            address <= get_I(ei);
                        end
                        mem_wren <= 1'b0;
                        state    <= EL_READ_COL;
                    end
                    else begin
                        // Nonblocking, like every other assignment in this
                        // process (was a blocking `=`).
                        state <= EL_ROW_INCREMENT;
                        if (ei + 8'b1 < n) begin
                            // Pre-issue G(ei+1,k) for EL_READ_AIK.
                            address  <= get_G(ei + 8'b1, k);
                            mem_wren <= 1'b0;
                        end
                    end
                end

                EL_ROW_INCREMENT: begin
                    ei <= ei + 8'b1;
                    if (ei + 8'b1 < n) begin
                        state <= EL_READ_AIK;
                    end
                    else begin
                        state <= GS_CHECK_K;
                    end
                end

                // ------------------------------------------------------------
                GS_CHECK_K: begin
                    k <= k + 8'b1;
                    if (k + 8'b1 < n) begin
                        state <= PF_INIT;
                    end
                    else begin
                        // Elimination finished: start back substitution at
                        // the last row.
                        bi       <= n - 8'b1;
                        bj       <= n;
                        address  <= get_I(n - 8'b1);
                        mem_wren <= 1'b0;
                        state    <= BS_READ_I;
                    end
                end

                // ------------------------------------------------------------
                // Back substitution.
                // ------------------------------------------------------------
                BS_READ_I: begin
                    // Pre-issue either the first off-diagonal word or, when
                    // the row has none, the diagonal.
                    if (bj < n) begin
                        address <= get_G(bi, bj);
                    end
                    else begin
                        address <= get_G(bi, bi);
                    end
                    mem_wren <= 1'b0;
                    state    <= BS_SETUP;
                end

                BS_SETUP: begin
                    sum <= mem_result;              // I(bi)
                    bj  <= bi + 8'b1;
                    if (bj < n) begin
                        address  <= get_v(bi + 8'b1);
                        mem_wren <= 1'b0;
                        state    <= BS_READ_A;
                    end
                    else begin
                        state <= BS_READ_DIAG;
                    end
                end

                BS_READ_A: begin
                    mem_A <= mem_result;            // G(bi,bj)
                    state <= BS_READ_V;
                end

                BS_READ_V: begin
                    mem_v <= mem_result;            // v(bj)
                    state <= BS_MUL_START;
                end

                BS_MUL_START: begin
                    mul_a            <= mem_A;
                    mul_b            <= mem_v;
                    mul_en           <= 1'b1;
                    fp_latency_count <= 5'b0;
                    state            <= BS_MUL_WAIT;
                end

                BS_MUL_WAIT: begin
                    fp_latency_count <= fp_latency_count + 5'b1;
                    if (fp_latency_count == MUL_LATENCY) begin
                        mul_en <= 1'b0;
                        if (mul_exception_check(mul_a, mul_b, mul_output)) begin
                            state <= GS_FAILED;
                        end
                        else begin
                            state <= BS_SUB_START;
                        end
                    end
                end

                BS_SUB_START: begin
                    sub_a            <= sum;
                    sub_b            <= mul_output;
                    sub_en           <= 1'b1;
                    fp_latency_count <= 5'b0;
                    state            <= BS_SUB_WAIT;
                end

                BS_SUB_WAIT: begin
                    fp_latency_count <= fp_latency_count + 5'b1;
                    if (fp_latency_count == SUB_LATENCY) begin
                        sub_en <= 1'b0;
                        if (sub_exception_check(sub_a, sub_b, sub_output)) begin
                            state <= GS_FAILED;
                        end
                        else begin
                            sum <= sub_output;
                            // Pre-issue the next off-diagonal word, or the
                            // diagonal once the row is exhausted.
                            if (bj + 8'b1 < n) begin
                                address <= get_G(bi, bj + 8'b1);
                            end
                            else begin
                                address <= get_G(bi, bi);
                            end
                            mem_wren <= 1'b0;
                            state    <= BS_NEXT_CHECK;
                        end
                    end
                end

                BS_NEXT_CHECK: begin
                    bj <= bj + 8'b1;
                    if (bj + 8'b1 < n) begin
                        address  <= get_v(bj + 8'b1);
                        mem_wren <= 1'b0;
                        state    <= BS_READ_A;
                    end
                    else begin
                        state <= BS_READ_DIAG;
                    end
                end

                BS_READ_DIAG: begin
                    mem_diag <= mem_result;         // G(bi,bi)
                    state    <= BS_DIV_START;
                end

                BS_DIV_START: begin
                    if (fbig(fabsf(mem_diag))) begin
                        // Near-zero diagonal: flag it like the other two
                        // singular exits instead of finishing silently.
                        singular <= 1'b1;
                        state    <= GS_DONE;
                    end
                    else begin
                        div_a            <= sum;
                        div_b            <= mem_diag;
                        div_en           <= 1'b1;
                        fp_latency_count <= 5'b0;
                        state            <= BS_DIV_WAIT;
                    end
                end

                BS_DIV_WAIT: begin
                    fp_latency_count <= fp_latency_count + 5'b1;
                    if (fp_latency_count == DIV_LATENCY) begin
                        div_en <= 1'b0;
                        if (div_exception_check(div_a, div_b, div_output)) begin
                            state <= GS_FAILED;
                        end
                        else begin
                            // Store v(bi).
                            address  <= get_v(bi);
                            mem_wren <= 1'b1;
                            mem_data <= div_output;
                            state    <= BS_DIV_WAIT_BUF;
                        end
                    end
                end

                BS_DIV_WAIT_BUF: begin
                    state <= BS_DIV_WRITE;
                end

                BS_DIV_WRITE: begin
                    mem_wren <= 1'b0;
                    // `bi - 8'b1 >= 0` is an unsigned comparison and therefore
                    // always true; test for the last row explicitly.
                    if (bi != 8'd0) begin
                        address <= get_I(bi - 8'b1);
                    end
                    state <= BS_ROW_DEC;
                end

                BS_ROW_DEC: begin
                    bi <= bi - 8'b1;
                    bj <= bi;
                    if (bi != 8'd0) begin
                        state <= BS_READ_I;
                    end
                    else begin
                        // Row 0 has just been solved.
                        state <= GS_DONE;
                    end
                end

                // ------------------------------------------------------------
                // Terminal states. They hold until `reset`: the previous exit
                // condition read `start`, which has no driver, and `reset`,
                // which is never 1 in this branch, so it could never fire.
                // ------------------------------------------------------------
                GS_DONE: begin
                    done    <= 1;
                    success <= 1;
                end

                GS_FAILED: begin
                    done    <= 1;
                    success <= 0;
                end
            endcase
        end
    end
endmodule
