流水线stall的设计
两种方式选一:
-
chiplab的,上一级等下一级,一级一级串起来,一旦后面的比如ex2卡住了,前面流水线全部停下。
-
我自己cpu6的,if阶段有个stallF信号,后面阶段如果要stall,就去signal这个stallF,然后流水线里就产生泡泡。这样需要各级都很清楚哪里要stall,哪里要flush。
现在看来流水线控制我已经知道了3种方式。
-
chiplab的,一级链一级,后面的不完成,前面的进不去。
-
我自己的,控制if,刷后面的id,ex,让流水线里产生泡泡。
-
OpenSPARC T1,stall 各级NIR,同时有很多个信号随着这条stall后重复的指令通过整个流水线,但在读写reg的时候都不给en,所以这条指令是没任何效果的。
如果要用OpenSPARC的方式,rf要支持,虽说rf都有wen,但我要用chiplab的rf,这个没有ren。
如果要刷流水线寄存器,那么所有的dff都还要有rst。
还没有看chiplab里面怎样刷流水线,比如branch以后已经进到流水线里的指令。OpenSPARC怎么做的更是还没看明白。 todo
先看下opensparc是怎么做的。
来自ifu外部的stallreq,一个是来自lsu,是某个队列快满了就告诉ifu不要再取指往后面执行了。另一个是来自ffu,浮点运算单元。
lsu/rtl/lsu_qctl2.v
// Determine whether cfq has crossed high-water mark. IFU must switchout all threads
// for every cycle that this is valid.
// Need to change wptr size once new cfq array description incorporated.
// Wrap bit may not be needed !!!
wire [5:0] dfq_vld_entries ;
assign dfq_vld_entries[5:0] = (dfq_wptr_w_wrap[5:0] - dfq_rptr_w_wrap[5:0]) ;
/*assign dfq_vld_entries[3:0] =
(dfq_rptr_w_wrap[4] ^ dfq_wptr_w_wrap[4]) ?
(dfq_rptr_w_wrap[3:0] - dfq_wptr_w_wrap[3:0]) : (dfq_wptr_w_wrap[3:0] - dfq_rptr_w_wrap[3:0]) ;*/
// High water mark conservatively put at 16-4 = 12
assign dfq_stall = (dfq_vld_entries[5:0] >= 6'd4) ;
assign lsu_ifu_stallreq =
dfq_stall | int_skid_stall | lsu_tlbop_force_swo ;
//dfq_stall | dfq_stall_d1 | dfq_stall_d2 | int_skid_stall | lsu_tlbop_force_swo ;
ffu/rtl/sparc_ffu_ctl.v
output ffu_ifu_stallreq; // stall pipe so blk st can issue
///////////////////
// bst starvation
//----------------
// when six bit counter saturates then a req to stall inst issue is made
// The request stays high until a bst gets issued
///////////////////
assign ffu_ifu_stallreq = bst_stall_req;
assign bst_stall_req_next = ((bst_stall_cnt[5:0] == 6'b111111) & bst_issue_c2 & ~can_issue_bst_c2 |
bst_stall_req & other_mem_op_e);
assign bst_stall_cnt_next[5:0] = (~bst_issue_c2)? 6'd0: bst_stall_cnt[5:0] + 6'd1;
来自ifu内部的stallreq也有。fcl_dtu_stall_bf,代码里已经没了sparc_ifu_dtu,不知道sparc_ifu_swl是不是就是原来的dtu。
这个stall是来自fcl,是ifu control logic提示其它的模块stall发生了。
ifu/rtl/sparc_ifu_swl.v
input fcl_dtu_stall_bf
// assign sched_nt = fcl_dtu_switch_s & ~fcl_dtu_stall_bf;
assign sched_nt = dtu_fcl_ntr_s & ~(fcl_dtu_stall_bf | ifq_swl_stallreq);
assign schedule = dtu_fcl_nextthr_bf & {4{sched_nt}};
// thread start and stop
// assign switch_out = fcl_dtu_switch_s | fcl_dtu_stall_bf | fcl_swl_swout_f;
assign switch_out = dtu_fcl_ntr_s | fcl_dtu_stall_bf | fcl_swl_swout_f |
ifq_swl_stallreq;
ifu/rtl/sparc_ifu_fcl.v
assign all_stallreq = ifq_fcl_stallreq | lsu_stallreq_d1 |
ffu_stallreq_d1 | itlb_starv_alert;
// leave out stall from ifq which goes directly to swl
assign fcl_dtu_stall_bf = lsu_stallreq_d1 | ffu_stallreq_d1 |
itlb_starv_alert | rst_stallreq;
这个all_stallreq应该是包含了所有让fetch stall的情况,看看它用在哪里。
//--------------------------
// Fetch Request and Stall
//--------------------------
// Determine if we need can continue fetching next cycle
// assign fetch_bf = (~all_stallreq & ~fcl_reset & ~rst_stallreq) &
// (switch_bf |
// ~(part_stall_thisthr_f | fdp_fcl_swc_s2));
// ~(stall_thisthr_f | fdp_fcl_swc_s2 | immu_fault_f));
assign fetch_bf = (~all_stallreq & ~fcl_reset & ~rst_stallreq) &
(switch_bf | // replace with ntr_s?
~(part_stall_thisthr_f
| fdp_fcl_swc_s2
)
);
//-------------------------
// Valid Instruction Pipe
//-------------------------
// F stage
assign inst_vld_bf = fetch_bf;
dff_s #(1) inst_vld_ff(.din (inst_vld_bf),
.clk (clk),
.q (inst_vld_f),
.se (se), .si(), .so());
assign stall_f = ~inst_vld_f | kill_curr_f;
assign stall_thisthr_f = stall_f | imsto_thisthr_s1 | // intrto_thisthr_d |
kill_thread_s2 | rb_stg_s | ~dtu_fcl_running_s |
iferrto_thisthr_d1;
assign part_stall_thisthr_f = stall_f |
imsto_thisthr_s1 |
~dtu_fcl_running_s |
ely_kill_thread_s2 |
rb_stg_s;
assign ely_stall_thisthr_f = stall_f | rb_stg_s;
//-------------------------
// Valid Instruction Pipe
//-------------------------
// F stage
assign inst_vld_bf = fetch_bf;
dff_s #(1) inst_vld_ff(.din (inst_vld_bf),
.clk (clk),
.q (inst_vld_f),
.se (se), .si(), .so());
assign stall_f = ~inst_vld_f | kill_curr_f;
assign stall_thisthr_f = stall_f | imsto_thisthr_s1 | // intrto_thisthr_d |
kill_thread_s2 | rb_stg_s | ~dtu_fcl_running_s |
iferrto_thisthr_d1;
assign part_stall_thisthr_f = stall_f |
imsto_thisthr_s1 |
~dtu_fcl_running_s |
ely_kill_thread_s2 |
rb_stg_s;
assign ely_stall_thisthr_f = stall_f | rb_stg_s;
// assign stall_s1_nxt = stall_thisthr_f | intr_vld_s | tmsto_thisthr_f;
assign stall_s1_nxt = stall_thisthr_f; //| intr_vld_s;
// S1 stage
dff_s #(1) stalld_ff(.din (stall_s1_nxt),
.clk (clk),
.q (stall_s1),
.se (se), .si(), .so());
assign inst_vld_s1 = ~stall_s1 & ~ic_miss_s1 & ~kill_curr_d;
assign val_thr_s1 = thr_s1 & {4{inst_vld_s1}}; // 4b
// S2 stage
assign val_thr_f = thr_f & {4{~stall_f & ~rb_stg_s & dtu_fcl_running_s}};
// Tag the S stage thr inst register as containing a valid inst or not
assign tinst_vld_nxt = (ifq_fcl_fill_thr |
(rb_w2 & ~rb_for_iferr_e) | // set
val_thr_s1 & ~val_thr_f |
// val_thr_s1 |
tinst_vld_s & ~val_thr_f) &
~(clear_s_d1 |
{4{erb_dtu_ifeterr_d1 & inst_vld_d1 &
~rb_stg_e}} & thr_e); // reset
dffr_s #(4) tinst_reg(.din (tinst_vld_nxt),
.clk (clk),
.rst (fcl_reset),
.q (tinst_vld_s),
.se (se), .si(), .so());
// Does current thread have valid inst in s2
assign inst_vld_s2 = ((thr_f_crit & tinst_vld_s) == 4'b0000) ?
{1'b0} : {1'b1};
assign inst_vld_s = ~switch_s2 & inst_vld_s1 |
switch_s2 & inst_vld_s2;
assign inst_vld_s_crit = ~switch_s2 & ~stall_s1 & ~kill_curr_d |
switch_s2 & inst_vld_s2;
assign valid_s = inst_vld_s & ~stall_f & // f and s2 have same thread
dtu_fcl_running_s &
~(ely_kill_thread_s2 | rb_stg_s);
assign running_s2 = inst_vld_s & ~stall_thisthr_f;// f and s2 have
// same thread
// D stage
dff_s #(1) rund_ff(.din (running_s2),
.clk (clk),
.q (inst_vld_d),
.se (se), .si(), .so());
dff_s #(1) eivd_ff(.din (running_s2),
.clk (clk),
.q (inst_vld_d_crit),
.se (se), .si(), .so());
assign fcl_erb_inst_issue_d = inst_vld_d & ~intr_vld_d;
assign running_d = inst_vld_d & ~kill_thread_d & ~rb_stg_d &
~intr_vld_d;
// E stage
dff_s #(1) rune_ff(.din (running_d),
.clk (clk),
.q (inst_vld_e),
.se (se), .si(), .so());
assign running_e = inst_vld_e & ~dtu_inst_anull_e &
~kill_curr_e & ~rb_stg_e &
~(thr_match_em & ifu_tlu_flush_m);
assign inst_vld_qual_e = inst_vld_e & ~rb_stg_e;
assign val_thr_e = thr_e_v2 & {4{inst_vld_qual_e}} & ~late_flush_w2 &
~(thr_w & {4{utrap_flush_w}});
// M stage
dff_s #(1) runm_ff(.din (running_e),
.clk (clk),
.q (inst_vld_m),
.se (se), .si(), .so());
assign running_m = (inst_vld_m | intr_vld_m) & ~kill_thread_m;
assign ifu_tlu_inst_vld_m = (inst_vld_m | intr_vld_m) & ~kill_curr_m;
// less critical
// assign ifu_lsu_inst_vld_m = ifu_tlu_inst_vld_m;
// W stage
dff_s #(1) runw_ff(.din (running_m),
.q (inst_vld_w),
.clk (clk), .se (se), .si(), .so());
dff_s #(1) iw_ff(.din (running_m),
.q (inst_vld_w_crit),
.clk (clk), .se (se), .si(), .so());
// synopsys translate_off
// wire sas_m,
// inst_done_w_for_sas;
// assign sas_m = inst_vld_m & ~kill_thread_m &
// ~(exu_ifu_ecc_ce_m & inst_vld_m & ~trap_m);
// dff #(1) sasw_ff(.din (sas_m),
// .clk (clk),
// .q (inst_done_w_for_sas),
// .se (se), .si(), .so());
// synopsys translate_on
// need to kill branch by E stage, so qual with rb_stg_X
assign fcl_dtu_inst_vld_e = inst_vld_e & ~rb_stg_e & ~kill_curr_e;
assign fcl_dtu_intr_vld_e = intr_vld_e & ~rb_stg_e & ~kill_curr_e;
assign fcl_dtu_inst_vld_d = inst_vld_d & ~kill_curr_d &
~rb_stg_d_crit & ~immu_miss_crit_d;
assign fcl_dtu_ely_inst_vld_d = inst_vld_d_crit;
assign ifu_tlu_inst_vld_w = inst_vld_w;
assign ifu_exu_inst_vld_w = inst_vld_w_crit;
assign ifu_spu_inst_vld_w = inst_vld_w;
assign ifu_exu_inst_vld_e = fcl_dtu_inst_vld_e;
assign flush_sonly_qual_e = dtu_fcl_flush_sonly_e & inst_vld_e &
// ~dec_fcl_kill4sta_e &
~rb_stg_e & ~dtu_inst_anull_e & ~kill_curr_e;
dff_s #(1) flshm_ff(.din (flush_sonly_qual_e),
.q (flush_sonly_m),
.clk (clk),
.se (se), .si(), .so());
dff_s #(1) imflshm_ff(.din (ifq_fcl_flush_sonly_e),
.q (ims_flush_sonly_m),
.clk (clk),
.se (se), .si(), .so());
// detect collision between two different types of retractions
assign ims_flush_coll_m = ims_flush_sonly_m & ~canthr_sm &
retract_iferr_e;
dff_s #(1) imflshw_ff(.din (ims_flush_coll_m),
.q (ims_flush_sonly_w),
.clk (clk),
.se (se), .si(), .so());
assign ims_flush_coll_w = ims_flush_sonly_w & ~canthr_sw;
assign flush_sonly_qual_m = (ims_flush_sonly_m & ~canthr_sm &
~retract_iferr_e |
flush_sonly_m & inst_vld_m & ~kill_local_m &
~kill_curr_m);
assign flush_sonly_all_m = (ims_flush_sonly_m & ~canthr_sm |
flush_sonly_m & inst_vld_m);
可以看到inst_vld_ 系列的信号控制流水线的各个阶段。
wire running_s2,
valid_s,
running_s1,
ely_running_s1,
running_d,
running_e,
running_m,
inst_vld_f,
inst_vld_s,
inst_vld_s_crit,
inst_vld_s1,
inst_vld_s2, // valid bit of S stage
// instruction. If this is 0,
// convert inst to no-op
inst_vld_d,
inst_vld_d_crit,
inst_vld_d1,
inst_vld_e,
inst_vld_qual_e,
inst_vld_m,
inst_vld_w;
以inst_vld_f为例,下面这几行直接说明问题,这几个信号都是从fcl传到fdp(ifu datapath)的。
// assign fcl_fdp_noswpc_sel_rst_l_bf = 1'b1;
assign fcl_fdp_noswpc_sel_tnpc_l_bf = ~ntpc_thisthr;
assign fcl_fdp_noswpc_sel_old_l_bf = ntpc_thisthr | inst_vld_f | arst_vld_f;
assign fcl_fdp_noswpc_sel_inc_l_bf = ntpc_thisthr | ~inst_vld_f & ~arst_vld_f;
如果inst_vld_f为1(不考虑其它几个信号),fcl_fdp_noswpc_sel_old_l_bf就是1,因为这个信号是_l(低有效),所以不选择old指令;而fcl_fdp_noswpc_sel_inc_l_bf为0,表示ip增加,采用下一条指令。如果inst_vld_f一直是0,那么ifu就会一直停在old指令上。
我想这时候应该配上刷下一阶段s指令寄存器的信号,这样ip不动,而流水线里是刷出来的气泡。
assign valid_s = inst_vld_s & ~stall_f & // f and s2 have same thread
dtu_fcl_running_s &
~(ely_kill_thread_s2 | rb_stg_s);
...
// interrupt and reset signal pipe
// VA hole trap has higher priority than interrupt
// - since the VA hole marker is lost once the intr is taken
assign intr_vld_s = force_intr_s & (valid_s & ~pc_oor_s |
async_intr_vld_s);
...
// thread NIR input muxes (2:1 no need to protect)
assign fcl_fdp_thr_s1_l = ~thr_s1 | {4{stall_s1}};
然后在sparc_ifu_fdp里:
ifu/rtl/sparc_ifu_fdp.v
//----------------------------------------------------------------------
// Next Instruction Datapath
//----------------------------------------------------------------------
// Thread next instruction muxes
// dp_mux2es #(33) t0nir_mux(.dout (t0nir_in),
// .in0 (icd_fdp_topdata_s1[32:0]),
// .in1 (t0nir),
// .sel (fcl_fdp_thr_s1_l[0])); // 0=new
// dp_mux2es #(33) t1nir_mux(.dout (t1nir_in),
// .in0 (icd_fdp_topdata_s1[32:0]),
// .in1 (t1nir),
// .sel (fcl_fdp_thr_s1_l[1]));
// dp_mux2es #(33) t2nir_mux(.dout (t2nir_in),
// .in0 (icd_fdp_topdata_s1[32:0]),
// .in1 (t2nir),
// .sel (fcl_fdp_thr_s1_l[2]));
// dp_mux2es #(33) t3nir_mux(.dout (t3nir_in),
// .in0 (icd_fdp_topdata_s1[32:0]),
// .in1 (t3nir),
// .sel (fcl_fdp_thr_s1_l[3]));
// Thread Next Instruction Register
wire clk_nir0;
`ifdef FPGA_SYN_CLK_EN
`else
bw_u1_ckenbuf_6x ckennir0(.rclk (rclk),
.clk (clk_nir0),
.en_l (fcl_fdp_thr_s1_l[0]),
.tm_l (~se));
`endif
`ifdef FPGA_SYN_CLK_DFF
dffe_s #(33) t0nir_reg(.din (icd_fdp_topdata_s1[32:0]),
.q (t0nir),
.en (~(fcl_fdp_thr_s1_l[0])), .clk(rclk), .se(se), .si(), .so());
`else
dff_s #(33) t0nir_reg(.din (icd_fdp_topdata_s1[32:0]),
.q (t0nir),
.clk (clk_nir0), .se(se), .si(), .so());
`endif
也就是说,如果stall_s1是1,fcl_fdp_thr_s1_l为1,不enable t0nir_reg,则t0nir不从cache里取最新的数据,而是不变,表示s1阶段的NIR全部stall住。
那这条指令在NIR里保持不变,不会被当作和上一条重复的指令再被执行一次吗?
突然想起来之前看OpenSPARC T1的一个事,感觉应该是inst_vld这个信号会被后面级的流水线寄存器一直传递下去,一直跟到最后写寄存器或者cache,让最终的修改操作无效。
比如下面这段这些ifu_exu_ren3_s什么的,如果没有inst_vld_f,连读en都没有。
//--------------------------------------------------------------
// Reg file control
//--------------------------------------------------------------
// Some decode is done here since these signals are in the crit path
// Regfile enables are only power saving features. So they don't
// have to be exact, as long as they are on, a super set of when
// they need to be on.
// Enable rs3 if store or atomic or mov
assign ifu_exu_ren3_s = inst_vld_f & fdp_fcl_op_s[1] & fdp_fcl_op3_s[2] &
(fdp_fcl_op_s[0] | fdp_fcl_op3_s[5]);
// enable rs2 if i=0 and !branch or CAS
// cas not fully decoded; i=inst[13];
assign ifu_exu_ren2_s = inst_vld_f & fdp_fcl_op_s[1] &
(~fdp_fcl_ibit_s |
fdp_fcl_op_s[0] & fdp_fcl_op3_s[5]);
// rs1 is read if this is not (a branch on cc or no-op/sethi)
assign ifu_exu_ren1_s = inst_vld_f & (fdp_fcl_op_s[1] | // not br/call
fdp_fcl_op3_s[4] & fdp_fcl_op3_s[3]); // BPR
这几个信号是ifu给exu的,最终在exu里用来读rf。
bw_r_irf irf(
.so (short_scan0_1),
.si (short_si0),
.reset_l (arst_l),
.rst_tri_en (mem_write_disable),
.rml_irf_old_e_cwp_e (rml_irf_old_e_cwp_e[1:0]),
.rml_irf_new_e_cwp_e (rml_irf_new_e_cwp_e[1:0]),
/*AUTOINST*/
// Outputs
.irf_byp_rs1_data_d_l (irf_byp_rs1_data_d_l[71:0]),
.irf_byp_rs2_data_d_l (irf_byp_rs2_data_d_l[71:0]),
.irf_byp_rs3_data_d_l (irf_byp_rs3_data_d_l[71:0]),
.irf_byp_rs3h_data_d_l (irf_byp_rs3h_data_d_l[31:0]),
// Inputs
.rclk (rclk),
.se (se),
.sehold (sehold),
.ifu_exu_tid_s2 (ifu_exu_tid_s2[1:0]),
.ifu_exu_rs1_s (ifu_exu_rs1_s[4:0]),
.ifu_exu_rs2_s (ifu_exu_rs2_s[4:0]),
.ifu_exu_rs3_s (ifu_exu_rs3_s[4:0]),
.ifu_exu_ren1_s (ifu_exu_ren1_s),
.ifu_exu_ren2_s (ifu_exu_ren2_s),
.ifu_exu_ren3_s (ifu_exu_ren3_s),
.ecl_irf_wen_w (ecl_irf_wen_w),
.ecl_irf_wen_w2 (ecl_irf_wen_w2),
可以看出,如果inst_vld_f不是1的话,这条指令传下去连寄存器都不能正确的读出来。
虽然不是流水线里的泡泡,但这条指令应该处处都不起作用。
既然寄存器都读不出,那对cache也不会产生影响,至少不可控。
inst_vld_bf没有产生什么从fcl向外的信号,说明对于cache取指来说,无论怎样都会把顺序的下一条指令取来。
只不过在下一个cycle的时候inst_vld_f决定是用新的还是old指令不变。
chiplab里的allow_in
整个流水线控制是串起来的,一级控制一级,后面级控制前面级,后面stall堵住了,前面也就堵住了。
前面有点乱,因为有3个port,有valid和accept,我把de1和de2合并成de了,不过意思差不多。
后面都是allow_in。
o_valid <– de_accept <– is_allow_in <– ex1_allow_in <– ex2_allow_in <– wb_allow_in
比如ex1_allow_in因为执行mul指令stall住了,那前面的流水线就自动stall了,不用每级分别通知。
module cpu7_ifu_fdp(
// group o
input wire [2 :0] o_allow ,
);
endmodule
o_allow需要de_accept。
module lsoc1000_stage_de(
// pipe in
output de_allow_in,
output [ 2:0] de_accept,
input is_allow_in,
);
//assign de_allow_in = (is_allow_in || bru_cancel || exception || eret) && (wait_barrier == 2'd0);
assign de_allow_in = is_allow_in || bru_cancel || exception || eret;
//assign de_accept = {port2_valid,!crash,de_allow_in};
assign de_accept = {0, 0, de_allow_in};
endmodule
module lsoc1000_stage_issue(
// pipe in
output is_allow_in,
// pipe out
input ex1_allow_in,
input ex2_allow_in,
);
assign allow_in = (port0_dispatch || !is_port0_valid) && (port1_dispatch || !is_port1_valid) && ex1_allow_in;
assign is_allow_in = allow_in && !type_crash;
endmodule
这里面ex2_allow_in输入了,但是没用到。
module ex1_stage(
// pipe in
output ex1_allow_in,
// pipe out
input ex2_allow_in,
);
assign ex1_allow_in = ex2_allow_in && change && tlb_allow_in;
always @(posedge clk) begin
if (rst) begin
ex2_port0_rf_target <= 5'd0;
ex2_port0_rf_wen <= 1'd0;
ex2_port0_ll <= 1'd0;
ex2_port0_sc <= 1'd0;
ex2_port1_rf_target <= 5'd0;
ex2_port1_rf_wen <= 1'd0;
ex2_port1_ll <= 1'd0;
ex2_port1_sc <= 1'd0;
end
else if(ex1_allow_in) begin
ex2_port0_src <= ex1_port0_src;
ex2_port0_inst <= ex1_port0_inst;
ex2_port0_pc <= ex1_port0_pc;
ex2_port0_rf_target <= ex1_port0_valid ? ex1_port0_rf_target : 5'd0;
ex2_port0_rf_wen <= ex1_port0_rf_wen;
ex2_port0_ll <= ex1_port0_ll;
ex2_port0_sc <= ex1_port0_sc;
ex2_port1_src <= ex1_port1_src;
ex2_port1_inst <= ex1_port1_inst;
ex2_port1_pc <= ex1_port1_pc;
ex2_port1_rf_target <= ex1_port1_valid ? ex1_port1_rf_target : 5'd0;
ex2_port1_rf_wen <= ex1_port1_rf_wen;
ex2_port1_ll <= ex1_port1_ll;
ex2_port1_sc <= ex1_port1_sc;
end
end
endmodule
如果ex1_allow_in为1,说明这时候ex2_allow_in也已经为1,ex1传给ex2的流水线寄存器(ex2_port0_xxx)更新。如果没有ex1_allow_in,那寄存器如ex2_port0_src就保持原数据不变。
module ex2_stage(
// pipe in
output ex2_allow_in,
// pipe out
input wb_allow_in,
);
//////basic
assign allow_in_temp = wb_allow_in && change;
assign ex2_allow_in = allow_in_temp; ////TODO: MAY WAIT FOR DEBUG
always @(posedge clk) begin
if (rst) begin
wb_port0_rf_target <= 5'd0;
wb_port0_rf_wen <= 1'd0;
wb_port1_rf_target <= 5'd0;
wb_port1_rf_wen <= 1'd0;
wb_bru_link <= 1'b0;
wb_bru_brop <= 1'b0;
wb_bru_jrop <= 1'b0;
wb_bru_jrra <= 1'b0;
wb_bru_valid <= 1'b0;
end
else if(ex2_allow_in) begin //TODO!!!!!!!!!
wb_port0_src <= ex2_port0_src ;
wb_port0_inst <= ex2_port0_inst ;
wb_port0_pc <= ex2_port0_pc ;
wb_port0_rf_target <= ex2_port0_rf_target;
wb_port0_rf_wen <= ex2_port0_rf_wen ;
wb_port1_src <= ex2_port1_src ;
wb_port1_inst <= ex2_port1_inst ;
wb_port1_pc <= ex2_port1_pc ;
wb_port1_rf_target <= ex2_port1_rf_target;
wb_port1_rf_wen <= ex2_port1_rf_wen ;
wb_bru_link <= ex2_bru_link ;
wb_bru_br_taken <= bru_taken_ex2 ;
wb_bru_brop <= ex2_bru_brop && branch_valid;
wb_bru_jrop <= ex2_bru_jrop && branch_valid;
wb_bru_jrra <= ex2_bru_jrra && branch_valid;
wb_bru_port <= ex2_bru_port ;
wb_bru_valid <= ex2_bru_valid ;
wb_bru_hint <= ex2_bru_hint ;
wb_bru_pc <= ex2_bru_pc ;
wb_bru_link_pc <= bru_link_pc_ex2 ;
end
else begin
wb_bru_link <= 1'd0;
wb_bru_brop <= 1'd0;
wb_bru_jrop <= 1'd0;
wb_bru_jrra <= 1'd0;
end
end
endmodule
ex2_allow_in如果没有好,说明ex2_stage还不能给wb发新的数据,但还是有几个信号要置0,例如wb_bru_link wb_bru_brop,不能保持不变,这跟具体实现有关。
module wb_stage(
// pipe in
output wb_allow_in,
};
//basic
assign wb_allow_in = 1'b1;
endmodule
wb阶段寄存器回写,不存在stall的时候,所以一直是1。
假如在ex1_stage stall了,is_stage也同样不会更新传给ex1的数据,但这些寄存器都没有刷新流水线寄存器的地方。
也就是说is_stage里给出的output ex1_port0_xxx,会一直保持不变,这样是不是相当于一直在向ex1_stage发出同一条指令呢?
比如一句加法指令:add x5, x5, 0x10,会不会在stall的过程中x5一直在累加。
我之前用的是flush流水线寄存器的方法,把存在管道里的指令清理成流水线泡泡。泡泡是内部NOP,传下去对后面的模块也没有影响。
chiplab里是怎么做的?
chiplab里每一级的valid信号,比如is_port0_valid, ex1_port0_valid, ex2_port0_valid。都是上一级根据后面的allow_in再给出的,相当于握手。
assign ex2_alu0_valid = ex2_port0_valid && (ex2_port0_src == `EX_ALU0);
//////basic
assign allow_in_temp = wb_allow_in && change;
assign ex2_allow_in = allow_in_temp; ////TODO: MAY WAIT FOR DEBUG
//result
wire port0_change;
wire port0_alu_change = (ex2_port0_src == `EX_ALU0 ) && ex2_alu0_valid;
wire port0_lsu_change = (ex2_port0_src == `EX_LSU ) && lsu_res_valid;
wire port0_bru_change = (ex2_port0_src == `EX_BRU ) && ex2_bru_valid;
wire port0_none_change = (ex2_port0_src == `EX_NONE0 ) && ex2_none0_valid;
wire port0_div_change = (ex2_port0_src == `EX_DIV ) && ex2_div_valid;// && (div_complete || div_completed);
wire port0_mul_change = (ex2_port0_src == `EX_MUL ) && ex2_mul_valid && ex2_mul_ready;
assign port0_change = port0_alu_change || port0_lsu_change || port0_bru_change || port0_none_change || port0_div_change || port0_mul_change || !ex2_port0_valid;
assign port1_change = port1_alu_change || port1_lsu_change || port1_bru_change || port1_none_change || port1_div_change || port1_mul_change || !ex2_port1_valid;
assign change = port0_change && port1_change;
如果没有valid,就不会设置change。而change是表明这条指令的结果是有意义的,也就是需要写回内存和寄存器的。
所以在lsu里,change是input,而最终控制的是output的lsu_res_valid,表明lsu要写内存的内容是否有效。
lsu_s2 lsu_s2(
.clk (clk ),
.resetn (resetn ),
.valid (lsu_valid ),
.lsu_op (ex2_lsu_op ),
.lsu_recv (ex2_lsu_recv ),
.lsu_shift (ex2_lsu_shift ),
.data_recv (data_recv ),
.data_scsucceed (data_scsucceed ),
.data_rdata (data_rdata ),
.data_exception (data_exception && !data_cancel_ex2),
.data_excode (data_excode ),
.data_badvaddr (data_badvaddr ),
.data_data_ok (data_data_ok ),
.read_result (ex2_lsu_res ),
.lsu_res_valid (lsu_res_valid ),
.exception (exception ),
.change (change ),
.badvaddr (badvaddr_ex2 ),
.badvaddr_valid (badvaddr_ex2_valid )
);
// signal process
always @(posedge clk) begin
if (rst || change) res_valid <= 1'd0;
else if(data_data_ok || prefetch || data_exception) res_valid <= 1'd1;
end
assign lsu_res_valid = data_data_ok || data_exception || res_valid || prefetch || !valid || (lsu_op == `LSOC1K_LSU_IDLE);
这其实和opensparc的做法一样。
看来流水线寄存器可以分为:
-
信号控制指令结果
-
流水线寄存器刷NOP
chiplab里这种控制流水线的方法叫“前递设计”。
搞错了,这是数据前递,是数据bypass,比如alu计算的offset结果给前面decode模块处理的branch指令。
不过这也提醒了我,这一串的allow_in也是组合逻辑,要传到最前面去也很慢。
CPU设计实战 p127 4.6 流水线数据的前递设计
缺点是前递的过程中会有从后面模块一直传递给ifu取指的一个非常长的组合逻辑路径。
书里的例子给出:
执行级流水线缓存的触发器Q端 –> ALU –> 前递路径 –> 译码级四选一 –> 转移指令跳转方向的判定逻辑 –> GenNextPC的四选一 –> 取指请求的虚实转换 –> 指令RAM的地址输入端口
“完全舍弃前递,主频最高,流水线效率低;最大限度地进行前递,流水线效率最高,但主频低。”
我还没搞懂流水线效率低是低在什么地方。