Commit 74062195 authored by Paul Mackerras's avatar Paul Mackerras

execute1: Do forwarding of the CR result to the next instruction

This adds a path to allow the CR result of one instruction to be
forwarded to the next instruction, so that sequences such as
cmp; bc can avoid having a 1-cycle bubble.

Forwarding is not available for dot-form (Rc=1) instructions,
since the CR result for them is calculated in writeback.  The
decode.output_cr field is used to identify those instructions
that compute the CR result in execute1.

For some reason, the multiply instructions incorrectly had
output_cr = 1 in the decode tables.  This fixes that.
Signed-off-by: default avatarPaul Mackerras <paulus@ozlabs.org>
parent 0f057390
......@@ -151,6 +151,7 @@ package common is
bypass_data2: std_ulogic;
bypass_data3: std_ulogic;
cr: std_ulogic_vector(31 downto 0);
bypass_cr : std_ulogic;
xerc: xer_common_t;
lr: std_ulogic;
rc: std_ulogic;
......@@ -173,7 +174,7 @@ package common is
end record;
constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
(valid => '0', unit => NONE, insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
lr => '0', rc => '0', oe => '0', invert_a => '0',
bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0',
invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0',
byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), others => (others => '0'));
......
......@@ -38,6 +38,7 @@ entity control is
cr_read_in : in std_ulogic;
cr_write_in : in std_ulogic;
cr_bypassable : in std_ulogic;
valid_out : out std_ulogic;
stall_out : out std_ulogic;
......@@ -45,7 +46,8 @@ entity control is
gpr_bypass_a : out std_ulogic;
gpr_bypass_b : out std_ulogic;
gpr_bypass_c : out std_ulogic
gpr_bypass_c : out std_ulogic;
cr_bypass : out std_ulogic
);
end entity control;
......@@ -161,8 +163,10 @@ begin
cr_read_in => cr_read_in,
cr_write_in => cr_write_valid,
bypassable => cr_bypassable,
stall_out => cr_stall_out
stall_out => cr_stall_out,
use_bypass => cr_bypass
);
control0: process(clk)
......
......@@ -16,15 +16,18 @@ entity cr_hazard is
cr_read_in : in std_ulogic;
cr_write_in : in std_ulogic;
bypassable : in std_ulogic;
stall_out : out std_ulogic
stall_out : out std_ulogic;
use_bypass : out std_ulogic
);
end entity cr_hazard;
architecture behaviour of cr_hazard is
type pipeline_entry_type is record
valid : std_ulogic;
valid : std_ulogic;
bypass : std_ulogic;
end record;
constant pipeline_entry_init : pipeline_entry_type := (valid => '0');
constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0');
type pipeline_t is array(0 to PIPELINE_DEPTH) of pipeline_entry_type;
constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init);
......@@ -47,7 +50,20 @@ begin
if complete_in = '1' then
v(1).valid := '0';
end if;
stall_out <= cr_read_in and (v(0).valid or v(1).valid);
use_bypass <= '0';
stall_out <= '0';
if cr_read_in = '1' then
loop_0: for i in 0 to PIPELINE_DEPTH loop
if v(i).valid = '1' then
if r(i).bypass = '1' then
use_bypass <= '1';
else
stall_out <= '1';
end if;
end if;
end loop;
end if;
-- XXX assumes PIPELINE_DEPTH = 1
if busy_in = '0' then
......@@ -56,6 +72,7 @@ begin
end if;
if deferred = '0' and issuing = '1' then
v(0).valid := cr_write_in;
v(0).bypass := bypassable;
end if;
if flush_in = '1' then
v(0).valid := '0';
......
......@@ -60,7 +60,7 @@ architecture behaviour of decode1 is
41 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lhzu
32 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwz
33 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lwzu
7 => (ALU, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- mulli
7 => (ALU, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- mulli
24 => (ALU, OP_OR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ori
25 => (ALU, OP_OR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- oris
20 => (ALU, OP_RLC, RA, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwimi
......@@ -262,19 +262,19 @@ architecture behaviour of decode1 is
2#0010010000# => (ALU, OP_MTCRF, NONE, NONE, RS, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtcrf/mtocrf
2#0010110010# => (ALU, OP_MTMSRD, NONE, NONE, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- mtmsrd # ignore top bits and d
2#0111010011# => (ALU, OP_MTSPR, NONE, NONE, RS, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtspr
2#0001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd
2#0000001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- mulhdu
2#0001001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mulhw
2#0000001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- mulhwu
2#0001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd
2#0000001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- mulhdu
2#0001001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mulhw
2#0000001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- mulhwu
-- next 4 have reserved bit set
2#1001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd
2#1000001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- mulhdu
2#1001001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mulhw
2#1000001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- mulhwu
2#0011101001# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulld
2#1011101001# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulldo
2#0011101011# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mullw
2#1011101011# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mullwo
2#1001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd
2#1000001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- mulhdu
2#1001001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mulhw
2#1000001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- mulhwu
2#0011101001# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulld
2#1011101001# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulldo
2#0011101011# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mullw
2#1011101011# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mullwo
2#0111011100# => (ALU, OP_AND, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nand
2#0001101000# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- neg
2#1001101000# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nego
......
......@@ -213,7 +213,10 @@ architecture behaviour of decode2 is
signal gpr_c_read : gpr_index_t;
signal gpr_c_bypass : std_ulogic;
signal cr_write_valid : std_ulogic;
signal cr_write_valid : std_ulogic;
signal cr_bypass : std_ulogic;
signal cr_bypass_avail : std_ulogic;
begin
control_0: entity work.control
generic map (
......@@ -248,7 +251,9 @@ begin
gpr_c_read_in => gpr_c_read,
cr_read_in => d_in.decode.input_cr,
cr_write_in => cr_write_valid,
cr_write_in => cr_write_valid,
cr_bypass => cr_bypass,
cr_bypassable => cr_bypass_avail,
valid_out => control_valid_out,
stall_out => stall_out,
......@@ -342,6 +347,7 @@ begin
v.e.oe := decode_oe(d_in.decode.rc, d_in.insn);
end if;
v.e.cr := c_in.read_cr_data;
v.e.bypass_cr := cr_bypass;
v.e.xerc := c_in.read_xerc_data;
v.e.invert_a := d_in.decode.invert_a;
v.e.invert_out := d_in.decode.invert_out;
......@@ -388,6 +394,10 @@ begin
gpr_c_read <= gspr_to_gpr(decoded_reg_c.reg);
cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn);
cr_bypass_avail <= '0';
if EX1_BYPASS then
cr_bypass_avail <= d_in.decode.output_cr;
end if;
v.e.valid := control_valid_out;
if d_in.decode.unit = NONE then
......
......@@ -74,6 +74,7 @@ architecture behaviour of execute1 is
signal r, rin : reg_type;
signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0);
signal cr_in : std_ulogic_vector(31 downto 0);
signal valid_in : std_ulogic;
signal ctrl: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0'));
......@@ -355,6 +356,16 @@ begin
v.e.xerc := e_in.xerc;
end if;
-- CR forwarding
cr_in <= e_in.cr;
if EX1_BYPASS and e_in.bypass_cr = '1' and r.e.write_cr_enable = '1' then
for i in 0 to 7 loop
if r.e.write_cr_mask(i) = '1' then
cr_in(i * 4 + 3 downto i * 4) <= r.e.write_cr_data(i * 4 + 3 downto i * 4);
end if;
end loop;
end if;
v.lr_update := '0';
v.mul_in_progress := '0';
v.div_in_progress := '0';
......@@ -635,7 +646,7 @@ begin
v.e.write_reg := fast_spr_num(SPR_CTR);
end if;
is_branch := '1';
taken_branch := ppc_bc_taken(bo, bi, e_in.cr, a_in);
taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
abs_branch := insn_aa(e_in.insn);
when OP_BCREG =>
-- read_data1 is CTR
......@@ -648,7 +659,7 @@ begin
v.e.write_reg := fast_spr_num(SPR_CTR);
end if;
is_branch := '1';
taken_branch := ppc_bc_taken(bo, bi, e_in.cr, a_in);
taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
abs_branch := '1';
when OP_RFID =>
......@@ -675,7 +686,7 @@ begin
v.busy := '1';
when OP_ISEL =>
crbit := to_integer(unsigned(insn_bc(e_in.insn)));
if e_in.cr(31-crbit) = '1' then
if cr_in(31-crbit) = '1' then
result := a_in;
else
result := b_in;
......@@ -695,7 +706,7 @@ begin
lo := (7-i)*4;
hi := lo + 3;
if i = scrnum then
newcrf := e_in.cr(hi downto lo);
newcrf := cr_in(hi downto lo);
end if;
end loop;
for i in 0 to 7 loop
......@@ -713,14 +724,14 @@ begin
bbnum := 31 - to_integer(unsigned(bb));
-- Bits 5-8 of cr_op give the truth table of the requested
-- logical operation
cr_operands := e_in.cr(banum) & e_in.cr(bbnum);
cr_operands := cr_in(banum) & cr_in(bbnum);
crresult := cr_op(5 + to_integer(unsigned(cr_operands)));
v.e.write_cr_mask := num_to_fxm((31-btnum) / 4);
for i in 0 to 31 loop
if i = btnum then
v.e.write_cr_data(i) := crresult;
else
v.e.write_cr_data(i) := e_in.cr(i);
v.e.write_cr_data(i) := cr_in(i);
end if;
end loop;
end if;
......@@ -772,7 +783,7 @@ begin
when OP_MFCR =>
if e_in.insn(20) = '0' then
-- mfcr
result := x"00000000" & e_in.cr;
result := x"00000000" & cr_in;
else
-- mfocrf
crnum := fxm_to_num(insn_fxm(e_in.insn));
......@@ -781,7 +792,7 @@ begin
lo := (7-i)*4;
hi := lo + 3;
if crnum = i then
result(hi downto lo) := e_in.cr(hi downto lo);
result(hi downto lo) := cr_in(hi downto lo);
end if;
end loop;
end if;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment