LSU读到数据,回写regfile时wen怎么办?
lsu读到数据后需要回写regfile:先把数据传回ecl(以后改为byp),再给到ecl_irf_rd_data_w。但这时候有个问题:对应的wen早已不再有效了。
wen一开始在ecl里维护,从ifu_exu_rf_wen_d一直到最后的wen_w,给出ecl_irf_wen_w。
277 wire [4:0] wen_d;
278 wire [4:0] wen_e;
279 wire [4:0] wen_m;
280 wire [4:0] wen_w;
281
282 assign wen_d = ifu_exu_rf_wen_d & ifu_exu_valid_d;
283
284 dff_s #(1) wen_e_reg (
285 .din (wen_d),
286 .clk (clk),
287 .q (wen_e),
288 .se(), .si(), .so());
289
290 dff_s #(1) wen_m_reg (
291 .din (wen_e),
292 .clk (clk),
293 .q (wen_m),
294 .se(), .si(), .so());
295
296 dff_s #(1) wen_w_reg (
297 .din (wen_m),
298 .clk (clk),
299 .q (wen_w),
300 .se(), .si(), .so());
301
302 assign ecl_irf_wen_w = wen_w;
但对于lsu来说,特别是load,中间在_m要等好几个cycle,而上面的流水线寄存器不会跟着停下来等,所以等到ld真正写寄存器的时候,ecl_irf_wen_w已经不是1了。
看了下OpenSPARC,在exu/rtl/sparc_exu_ecl_wb.v里,这个模块主要是维护alu的wen状态。
/*
// Module Name: sparc_exu_ecl_wb
// Description: Implements the writeback logic for the exu.
// This includes the control signals for the w1 and w2 input
// muxes as well as keeping track of the wen signal for ALU ops.
*/
看来和自己想的一样,LSU自己带着自己的wen,到_w的时候送到ecl里去mux选择。
下面代码里的rd_mux就选择来自不同地方的rd,看来就是各自模块维护自己的rd和wen。
///////////////////
// W1 port control
///////////////////
// sehold will turn off in pipe writes and put the hold functionality through
// the non inst_vld part
// Mux between load and ALU for rd, thr, and wen
assign ecl_byp_sel_load_m = ~(wb_m | wrsr_m | ecl_byp_sel_ecc_m) & ld_g2;
assign ecl_byp_sel_pipe_m = (wb_m | wrsr_m) & ~ecl_byp_sel_ecc_m;
assign ecl_byp_sel_restore_m = ~(wb_m | wrsr_m | ld_g2 | ecl_byp_sel_ecc_m);
assign wen_no_inst_vld_m = (sehold)? ecl_irf_wen_w:
((dfill_vld_g2 & ecl_byp_sel_load_m) |
(ecl_byp_sel_restore_m & restore_wen));
dff_s dff_lsu_wen_m2w(.din(wen_no_inst_vld_m), .clk(clk), .q(wen_no_inst_vld_w), .se(se), .si(),
.so());
// ecc_wen must be kept separate because it needs to check inst_vld but not flush
assign inst_vld_noflush_wen_m = ecl_byp_sel_ecc_m & ~sehold;
dff_s ecc_wen_m2w(.din(inst_vld_noflush_wen_m), .clk(clk), .q(inst_vld_noflush_wen_w), .se(se), .si(), .so());
assign ecl_irf_tid_m[1:0] = ((ecl_byp_sel_load_m)? dfill_tid_g2[1:0]:
(ecl_byp_sel_restore_m)? restore_tid[1:0]:
tid_m[1:0]);
mux4ds #(5) rd_mux(.dout(ecl_irf_rd_m[4:0]),
.in0(rd_m[4:0]),
.in1(dfill_rd_g2[4:0]),
.in2(eccctl_wb_rd_m[4:0]),
.in3(restore_rd[4:0]),
.sel0(ecl_byp_sel_pipe_m),
.sel1(ecl_byp_sel_load_m),
.sel2(ecl_byp_sel_ecc_m),
.sel3(ecl_byp_sel_restore_m));
assign wen_w_inst_vld = valid_w | inst_vld_noflush_wen_w;
assign ecl_irf_wen_w = ifu_exu_inst_vld_w & wen_w_inst_vld | wen_no_inst_vld_w;
// bypass valid logic and flops
dff_s dff_wb_d2e(.din(ifu_exu_wen_d), .clk(clk), .q(wb_e), .se(se),
.si(), .so());
dff_s dff_wb_e2m(.din(valid_e), .clk(clk), .q(wb_m), .se(se),
.si(), .so());
dffr_s dff_wb_m2w(.din(valid_m), .clk(clk), .q(wb_w), .se(se),
.si(), .so(), .rst(reset));
assign valid_e = wb_e & ~ifu_exu_kill_e & ~restore_e & ~wrsr_e;// restore doesn't finish on time
assign bypass_m = wb_m;// bypass doesn't need to check for traps or sehold
assign valid_m = bypass_m & ~rml_ecl_kill_m & ~sehold;// sehold turns off writes from this path
assign valid_w = (wb_w & ~early_flush_w & ~ifu_tlu_flush_w);// check inst_vld later
// don't check flush for bypass
assign bypass_w = wb_w | inst_vld_noflush_wen_w | wen_no_inst_vld_w;
// speculative wen for ecc injection
assign wb_eccctl_spec_wen_next = valid_m | dfill_vld_g2 | restore_request | divcntl_wb_req_g;
wen的传递路径:从最开始的ifu_exu_wen_d,经过wb_e、wb_m、wb_w(valid_e/valid_m/valid_w)逐级传递,到最后的assign ecl_irf_wen_w = ifu_exu_inst_vld_w & wen_w_inst_vld | wen_no_inst_vld_w;
不过这个好像只是ALU回写的信号,LOAD的回写在_g?
exu/rtl/sparc_exu_ecl_wb.v
////////////////////////////////////////////////////////////////
// G selection logic (picks between LOAD and MUL/DIV outputs)
////////////////////////////////////////////////////////////////
// select signals: priority LOAD, RESTORE, MUL, DIV
assign ecl_byp_sel_load_g = (ld_g2 & (wb_m | wrsr_m | ecl_byp_sel_ecc_m));
assign ecl_byp_sel_restore_g = restore_request & ((wb_m | wrsr_m | ecl_byp_sel_ecc_m) ^ ld_g2);
assign ecl_byp_sel_muldiv_g = ~(ecl_byp_sel_load_g | ecl_byp_sel_restore_g);
assign ecl_sel_mul_g = ~ecl_div_sel_div & ecl_byp_sel_muldiv_g;
assign ecl_sel_div_g = ecl_div_sel_div & ecl_byp_sel_muldiv_g;
assign wb_divcntl_ack_g = ecl_byp_sel_muldiv_g;
assign muldiv_tid[1:0] = (ecl_div_sel_div)? mdqctl_wb_divthr_g[1:0]: mdqctl_wb_multhr_g[1:0];
assign muldiv_done_g[3] = ((wb_divcntl_ack_g & divcntl_wb_req_g) &
muldiv_tid[1] & muldiv_tid[0]);
assign muldiv_done_g[2] = ((wb_divcntl_ack_g & divcntl_wb_req_g) &
muldiv_tid[1] & ~muldiv_tid[0]);
assign muldiv_done_g[1] = ((wb_divcntl_ack_g & divcntl_wb_req_g) &
~muldiv_tid[1] & muldiv_tid[0]);
assign muldiv_done_g[0] = ((wb_divcntl_ack_g & divcntl_wb_req_g) &
~muldiv_tid[1] & ~muldiv_tid[0]);
assign ecl_irf_wen_g = (sehold)? ecl_irf_wen_w2:
(ecl_byp_sel_load_g & dfill_vld_g2 |
(ecl_byp_sel_restore_g & restore_wen) |
(ecl_byp_sel_muldiv_g & divcntl_wb_req_g));
dff_s wen_w2_dff(.din(ecl_irf_wen_g), .clk(clk), .q(ecl_irf_wen_w2),
.se(se), .si(), .so());
mux4ds #(5) rd_g_mux(.dout(ecl_irf_rd_g[4:0]), .in0(dfill_rd_g2[4:0]),
.in1(mdqctl_wb_divrd_g[4:0]),
.in2(mdqctl_wb_mulrd_g[4:0]),
.in3(restore_rd[4:0]),
.sel0(ecl_byp_sel_load_g),
.sel1(ecl_sel_div_g),
.sel2(ecl_sel_mul_g),
.sel3(ecl_byp_sel_restore_g));
mux4ds #(2) thr_g_mux(.dout(ecl_irf_tid_g[1:0]), .in0(dfill_tid_g2[1:0]),
.in1(mdqctl_wb_divthr_g[1:0]),
.in2(mdqctl_wb_multhr_g[1:0]),
.in3(restore_tid[1:0]),
.sel0(ecl_byp_sel_load_g),
.sel1(ecl_sel_div_g),
.sel2(ecl_sel_mul_g),
.sel3(ecl_byp_sel_restore_g));
mux2ds setcc_g_mux(.dout(setcc_g),
.in0(mdqctl_wb_mulsetcc_g),
.in1(mdqctl_wb_divsetcc_g),
.sel0(~ecl_div_sel_div),
.sel1(ecl_div_sel_div));
dff_s #(2) dff_thr_g2w2(.din(ecl_irf_tid_g[1:0]), .clk(clk), .q(tid_w2[1:0]), .se(se),
.si(), .so());
dff_s #(5) dff_rd_g2w2(.din(ecl_irf_rd_g[4:0]), .clk(clk), .q(rd_w2[4:0]), .se(se),
.si(), .so());
// needs wen to setcc
assign wb_ccr_setcc_g = wb_divcntl_ack_g & divcntl_wb_req_g & setcc_g;
exu/rtl/sparc_exu_ecl_wb.v
dff_s wen_w2_dff(.din(ecl_irf_wen_g), .clk(clk), .q(ecl_irf_wen_w2),
.se(se), .si(), .so());
mux4ds #(5) rd_g_mux(.dout(ecl_irf_rd_g[4:0]), .in0(dfill_rd_g2[4:0]),
.in1(mdqctl_wb_divrd_g[4:0]),
.in2(mdqctl_wb_mulrd_g[4:0]),
.in3(restore_rd[4:0]),
.sel0(ecl_byp_sel_load_g),
.sel1(ecl_sel_div_g),
.sel2(ecl_sel_mul_g),
.sel3(ecl_byp_sel_restore_g));
最后load写回irf时,和alu用的是不同的write port。
regfile的端口连接如下。
.ecl_irf_wen_w (ecl_irf_wen_w),
.ecl_irf_wen_w2 (ecl_irf_wen_w2),
.ecl_irf_rd_m (ecl_irf_rd_m[4:0]),
.ecl_irf_rd_g (ecl_irf_rd_g[4:0]),
.byp_irf_rd_data_w (byp_irf_rd_data_w[71:0]),
.byp_irf_rd_data_w2 (byp_irf_rd_data_w2[71:0]),
../srams/rtl/bw_r_irf.v
/*
// Module Name: bw_r_irf
// Description: Register file with 3 read ports and 2 write ports. Has
// 32 registers per thread with 4 threads. Reading and writing
// the same register concurrently produces x.
*/
module bw_r_irf (/*AUTOARG*/
// Outputs
so, irf_byp_rs1_data_d_l, irf_byp_rs2_data_d_l,
irf_byp_rs3_data_d_l, irf_byp_rs3h_data_d_l,
// Inputs
rclk, reset_l, si, se, sehold, rst_tri_en, ifu_exu_tid_s2,
ifu_exu_rs1_s, ifu_exu_rs2_s, ifu_exu_rs3_s, ifu_exu_ren1_s,
ifu_exu_ren2_s, ifu_exu_ren3_s, ecl_irf_wen_w, ecl_irf_wen_w2,
ecl_irf_rd_m, ecl_irf_rd_g, byp_irf_rd_data_w, byp_irf_rd_data_w2,
ecl_irf_tid_m, ecl_irf_tid_g, rml_irf_old_lo_cwp_e,
rml_irf_new_lo_cwp_e, rml_irf_old_e_cwp_e, rml_irf_new_e_cwp_e,
rml_irf_swap_even_e, rml_irf_swap_odd_e, rml_irf_swap_local_e,
rml_irf_kill_restore_w, rml_irf_cwpswap_tid_e, rml_irf_old_agp,
rml_irf_new_agp, rml_irf_swap_global, rml_irf_global_tid
) ;
input rclk;
input reset_l;
input si;
input se;
input sehold;
input rst_tri_en;
input [1:0] ifu_exu_tid_s2; // s stage thread
input [4:0] ifu_exu_rs1_s; // source addresses
input [4:0] ifu_exu_rs2_s;
input [4:0] ifu_exu_rs3_s;
input ifu_exu_ren1_s; // read enables for all 3 ports
input ifu_exu_ren2_s;
input ifu_exu_ren3_s;
input ecl_irf_wen_w; // write enables for both write ports
input ecl_irf_wen_w2;
input [4:0] ecl_irf_rd_m; // w destination
input [4:0] ecl_irf_rd_g; // w2 destination
input [71:0] byp_irf_rd_data_w;// write data from w1
input [71:0] byp_irf_rd_data_w2; // write data from w2
input [1:0] ecl_irf_tid_m; // w stage thread
input [1:0] ecl_irf_tid_g; // w2 thread
没搞懂rd为什么分别在_m _g的时候给,而不是在w时和wen一起给出。
现在看来w1 w2是指两个write port,不是指两个cycle。
_g是不是_m的另一个称呼呢?
OpenSPARC ALU和LOAD指令分别用不同的write port,因为load返回的cycle不确定,那时候alu也许正有指令要wb。
我这里load的时候就先把流水线停了,等完成了再说。
《OpenSPARC Internals》里确实说W2是个stage啊。
• Load-store unit (LSU) — Processes memory referencing operation codes (opcodes) such as various types of loads, various types of stores, CAS, SWAP, LDSTUB, FLUSH, PREFETCH, and MEMBAR instructions. The LSU interfaces with all the SPARC core functional units and acts as the gateway between the SPARC core units and the CPU-cache crossbar (CCX). Through the CCX, data transfer paths can be established with the memory subsystem and the I/O subsystem (the data transfers are done with packets). The LSU includes memory and write-back stages and a data cache complex. The LSU pipeline has four stages:
• E stage – Cache and TLB setup
• M stage – Cache/Tag TLB read
• W stage – Store buffer look-up, trap detection, and execution of data bypass
• W2 stage – Generates PCX requests and write-backs to the cache
OpenSPARC T2
These pipelines:
• Eight-stage integer pipeline
• Extended pipeline for long latency operations
• Twelve-stage floating-point and graphics pipeline
C.2.3.1 Changes From OpenSPARC T1
• OpenSPARC T2 supports store pipelining. OpenSPARC T1 requires any store to receive an ACK before the next store could issue to the PCX.
• The store buffer supports eight threads.
• The load miss queue supports eight threads.
• The data Translation Lookaside Buffer (DTLB) is 128 entries.
• Only load operations access the D-cache from the pipeline. This reduces conflicts with the CPQ.
• Partial Store instructions from the VIS 2.0 ISA are supported.
• Pipeline is E/M/B/W vs. OpenSPARC T1’s E/M/W/W2. Pipeline timings are different from those of OpenSPARC T2.
• OpenSPARC T2 has additional RAS features and enhanced error detection and protection.
从ecl里的寄存器能看出来g是m的下一级流水线。
dff_s #(5) dff_ld_rd_m2g(.din(lsu_exu_rd_m[4:0]), .clk(clk), .q(ld_rd_g[4:0]), .se(se), .si(), .so());
也就是说g也可以叫做w?