u@home:~$

branch logic

exu/rtl/sparc_exu.v


   sparc_exu_ecl ecl(
                     .so                (short_so0),
                     .si                (short_scan0_1),
                     .rst_tri_en        (mux_drive_disable),
                     .byp_ecl_wrccr_data_w(byp_irf_rd_data_w[7:0]),
                     .alu_ecl_adder_out_31_e(exu_ifu_brpc_e[31]),
                     .byp_ecl_rd_data_3lsb_m(exu_tlu_wsr_data_m[2:0]),
                     .alu_ecl_adder_out_7_0_e(exu_ifu_brpc_e[7:0]),

...





   sparc_exu_alu alu(
                     .byp_alu_rs3_data_e(exu_lsu_rs3_data_e[63:0]),
                     .so                (scan0_2),
                     .si                (scan0_1),
                     .ifu_lsu_casa_e (ecl_alu_casa_e),
                     /*AUTOINST*/
                     // Outputs
                     .alu_byp_rd_data_e (alu_byp_rd_data_e[63:0]),
                     .exu_ifu_brpc_e    (exu_ifu_brpc_e[47:0]),
                     .exu_lsu_ldst_va_e (exu_lsu_ldst_va_e[47:0]),
                     .exu_lsu_early_va_e(exu_lsu_early_va_e[10:3]),
                     .exu_mmu_early_va_e(exu_mmu_early_va_e[7:0]),
                     .alu_ecl_add_n64_e (alu_ecl_add_n64_e),
                     .alu_ecl_add_n32_e (alu_ecl_add_n32_e),
                     .alu_ecl_log_n64_e (alu_ecl_log_n64_e),
                     .alu_ecl_log_n32_e (alu_ecl_log_n32_e),
...


exu_ifu_brpc_e是exu给ifu的branch地址,可以看到这部分是在_e得出的。

从文档里也看到branch address的计算也是复用了ALU,应该就是ALU里的exu_ifu_brpc_e。

而在sparc_ifu.v里却有这个

   // Branch Logic
   sparc_ifu_dcl  dcl(
                      .so               (scan0_3),
                      .si               (scan0_2),
                                  .dtu_dcl_opf2_d       (dtu_inst_d[7]),
                      .fdp_dcl_op_s     (fdp_dtu_inst_s[31:30]),
                      .fdp_dcl_op3_s    (fdp_dtu_inst_s[24:19]),

在ifu里,branch就应该是在decode阶段。

在OpenSPARC T1 Microarchitecture Specification里也有这一句。

2.3.14 Instruction Decode

    The branch condition is also evaluated in the D-stage, and the decision for annulling
    a delay slot is made in this stage as well.

sparc_ifu_dcl.v的注释里也说了这一点,这倒要好好看看opensparc里的branch是怎样实现的。

ifu/rtl/sparc_ifu_dcl.v

////////////////////////////////////////////////////////////////////////
/*
//  Module Name: sparc_ifu_dcl
//  Description:        
//   The decode control logic block does branch condition evaluation,
//   delay slot management, and appropriate condition code
//   selection.  It also executes the tcc instruction and kills the E
//   stage instruction if a move did not succeed.  The DCL block is
//   also responsible for generating the correct select signals to
//   choose the branch offset and immediate operand.
//
*/
////////////////////////////////////////////////////////////////////////

还在Microarchitecture Specification里看到这一段。

2.3.10 Thread Selection Policy

    A thread could become unavailable due to one of these reasons:
 
    1. The thread is executing one of the long latency instructions, such as load, branch,
       multiplication, division, and so on.

branch也是long latency instructions?

load是,store看来不是。


sparc_ifu_dcl里面是cc指令和FP cc。

比如ifu_ffu_mvcnd_m,这个名字已经说得很清楚了。

ifu/rtl/sparc_ifu_dcl.v


   // branch condition to FPU
   dff_s #(1) fpcond_ff(.din  (cond_brtaken_e),
                                  .q    (ifu_ffu_mvcnd_m),
                                  .clk  (clk),
                                  .se   (se), .si(), .so());

下面还有句,回头要仔细看看怎样kill write back and bypass。

   // if mov didn't succeed kill write back and bypass
   // need to check thread as well
//   assign ifu_exu_kill_e = dtu_inst_anull_e | 
//                         ~fcl_dtu_inst_vld_e;  // don't need this anymore
   assign ifu_exu_kill_e = dtu_inst_anull_e;


看来branch指令是应该放在_e。

ifu/rtl/sparc_ifu_fcl.v


   assign load_tpc[3:0] = {4{trappc_vld_w2}} & trap_thr |
                           rb_w2 |
                           {4{rb_stg_w | ims_flush_coll_w}} & thr_w |
//                     {4{dec_fcl_kill4sta_e}} & thr_e |
                           {4{flush_sonly_qual_m}} & thr_m;

   assign load_bpc[3:0] = {4{brtaken_buf_e}} & thr_e;
   assign load_pcp4[3:0] = {4{~part_stall_thisthr_f &
                              ~iferrto_thisthr_d1 |
                              arst_vld_f |
                              async_intr_vld_s}}     & thr_f;

   always @ (/*AUTOSENSE*/load_bpc or load_pcp4 or load_tpc)
     begin
//            if (fcl_reset)
//              begin // RESET PC is loaded to T0
//                 fcl_fdp_tpcbf_sel_old_bf_l = 4'b0001;
//                 fcl_fdp_tpcbf_sel_pcp4_bf_l = 4'b1110;
//                 fcl_fdp_tpcbf_sel_trap_bf_l = 4'b1111;
//                 fcl_fdp_tpcbf_sel_brpc_bf_l = 4'b1111;
//              end // if (reset)
//            else 
//              begin
              fcl_fdp_tpcbf_sel_old_bf_l = (load_bpc | load_tpc | load_pcp4);
              fcl_fdp_tpcbf_sel_brpc_bf_l = ~load_bpc | load_tpc | load_pcp4;
              fcl_fdp_tpcbf_sel_pcp4_bf_l = ~load_pcp4 | load_tpc;
              fcl_fdp_tpcbf_sel_trap_bf_l = ~load_tpc;
     end // always @ (...

看到用always的时候不多啊。


SPARC有两类branch指令。

The SPARC Architecture Manual V9

screenshot0 screenshot1 screenshot2

因为SPARC有状态寄存器,flags。

loongarch没有,所以可以参考Branch on Integer Register with Prediction (BPr)。

这个是寄存器里的简单比较,和0比。

在ALU里,

module sparc_exu_alu
(
 /*AUTOARG*/
   // Outputs
   so, alu_byp_rd_data_e, exu_ifu_brpc_e, exu_lsu_ldst_va_e, 
   exu_lsu_early_va_e, exu_mmu_early_va_e, alu_ecl_add_n64_e,
   alu_ecl_add_n32_e, alu_ecl_log_n64_e, alu_ecl_log_n32_e, 
   alu_ecl_zhigh_e, alu_ecl_zlow_e, exu_ifu_regz_e, exu_ifu_regn_e,
   alu_ecl_adderin2_63_e, alu_ecl_adderin2_31_e, 
   alu_ecl_adder_out_63_e, alu_ecl_cout32_e, alu_ecl_cout64_e_l,
   alu_ecl_mem_addr_invalid_e_l,
   // Inputs 
   rclk, se, si, byp_alu_rs1_data_e, byp_alu_rs2_data_e_l, 
   byp_alu_rs3_data_e, byp_alu_rcc_data_e, ecl_alu_cin_e, ecl_alu_rd_e, // uty: test 
   ifu_exu_invert_d, ecl_alu_log_sel_and_e, ecl_alu_log_sel_or_e,
   ecl_alu_log_sel_xor_e, ecl_alu_log_sel_move_e, 
   ecl_alu_out_sel_sum_e_l, ecl_alu_out_sel_rs3_e_l, 
   ecl_alu_out_sel_shift_e_l, ecl_alu_out_sel_logic_e_l, 
   shft_alu_shift_out_e, ecl_alu_sethi_inst_e, ifu_lsu_casa_e
   );

...

   output    exu_ifu_regz_e;              // rs1_data == 0 
   output    exu_ifu_regn_e;


...


   // Zero comparison for exu_ifu_regz_e
   sparc_exu_aluzcmp64 regzcmp(.in(byp_alu_rcc_data_e[63:0]), .zero64(exu_ifu_regz_e));
   assign     exu_ifu_regn_e = byp_alu_rcc_data_e[63];

exu_ifu_regz_e和exu_ifu_regn_e能判断出rs1_data与0的关系。

ifu/rtl/sparc_ifu_fcl.v

//------------------------------   
// Branch Control
//------------------------------
   // final portion of branch evaluation
   wire brtaken_e_l;
   bw_u1_buf_20x UZsize_bcbf(.z(fcl_dcl_regz_e),
                             .a(exu_ifu_regz_e));

   bw_u1_muxi21_6x UZsize_bcmux(.z(brtaken_e_l),
                                .d0(dcl_fcl_bcregz0_e),
                                .d1(dcl_fcl_bcregz1_e),
                                .s(exu_ifu_regz_e));

   bw_u1_inv_15x UZsize_bcinv(.z(brtaken_e), 
                              .a(brtaken_e_l));

   // Branch is taken in the E stage to thr_e.  Below we check to see
   // if this is the same as the next thread we will switch to

   // isolate non critical section
   bw_u1_buf_5x UZsize_btbuf(.z (brtaken_unq_e),
                             .a (brtaken_e));
   assign brtaken_buf_e = brtaken_unq_e & inst_vld_qual_e & ~kill_curr_e;

//   assign thr_match_ne_norst = thr_match_ne & ~rst_sw_bf;
//   assign brto_nxtthr_bf  = thr_match_ne & brtaken_e;
   bw_u1_nand2_4x UZsize_btkn_ntl(.a (brtaken_e),
                                  .b (thr_match_ne),
                                  .z (brto_nxtthr_bf_l));

//   bw_u1_inv_8x UZsize_btkn_bf(.a (brto_nxtthr_bf_l),
//                               .z (brto_nxtthr_bf));

   dff_s #(1) br_ff(.din (brtaken_buf_e),
                              .q   (brtaken_m),
                              .clk (clk),
                              .se  (se), .si(), .so());



...


   assign load_bpc[3:0] = {4{brtaken_buf_e}} & thr_e;
   assign load_pcp4[3:0] = {4{~part_stall_thisthr_f &
                              ~iferrto_thisthr_d1 |
                              arst_vld_f |
                              async_intr_vld_s}}     & thr_f;

   always @ (/*AUTOSENSE*/load_bpc or load_pcp4 or load_tpc)
     begin
//            if (fcl_reset)
//              begin // RESET PC is loaded to T0
//                 fcl_fdp_tpcbf_sel_old_bf_l = 4'b0001;
//                 fcl_fdp_tpcbf_sel_pcp4_bf_l = 4'b1110;
//                 fcl_fdp_tpcbf_sel_trap_bf_l = 4'b1111;
//                 fcl_fdp_tpcbf_sel_brpc_bf_l = 4'b1111;
//              end // if (reset)
//            else 
//              begin
              fcl_fdp_tpcbf_sel_old_bf_l = (load_bpc | load_tpc | load_pcp4);
              fcl_fdp_tpcbf_sel_brpc_bf_l = ~load_bpc | load_tpc | load_pcp4;
              fcl_fdp_tpcbf_sel_pcp4_bf_l = ~load_pcp4 | load_tpc;
              fcl_fdp_tpcbf_sel_trap_bf_l = ~load_tpc;
     end // always @ (...


可以看到exu_ifu_regz_e直接影响到load_bpc,然后影响到fcl_fdp_tpcbf_sel_brpc_bf_l。

但为什么load_bpc是控制4个线程的,还没搞懂。

奇怪的是exu_ifu_regn_e并没有和exu_ifu_regz_e一起进sparc_ifu_fcl。

而是exu_ifu_regn_e和fcl_dcl_regz_e一起进sparc_ifu_dcl。

原来fcl里的dcl_fcl_bcregz0_e dcl_fcl_bcregz1_e都是来自dcl。

ifu/rtl/sparc_ifu_dcl.v


   //////// Chandra ////////

   wire   temp0, temp1, cond_brtaken_e_l;

   // limit loading on this signal
//   wire   regz_buf_e;
//   bw_u1_buf_5x UZfix_regz_bf(.a (exu_ifu_regz_e),
//                              .z (regz_buf_e));

   assign temp0 = (r_eval0 | ccbr_taken_e);
   assign temp1 = (r_eval1 | ccbr_taken_e);

   bw_u1_muxi21_6x UZsize_cbtmux(.z(cond_brtaken_e_l),
                                  .d0(temp0),
                                  .d1(temp1),
                                  .s(fcl_dcl_regz_e));

   bw_u1_inv_20x UZsize_cbtinv(.z(cond_brtaken_e),
                                .a(cond_brtaken_e_l));

   ////////////////////////

   assign dcl_fcl_bcregz0_e = (temp0 & dbr_inst_e | ibr_inst_e |
                               call_inst_e) & ~dtu_inst_anull_e;
   assign dcl_fcl_bcregz1_e = (temp1 & dbr_inst_e | ibr_inst_e |
                               call_inst_e) & ~dtu_inst_anull_e;

这几个信号在fcl dcl里来回传的有点乱。

最终的branch判断就在这句,

   bw_u1_muxi21_6x UZsize_bcmux(.z(brtaken_e_l),
                                .d0(dcl_fcl_bcregz0_e),
                                .d1(dcl_fcl_bcregz1_e),
                                .s(exu_ifu_regz_e));
   //--------------
   // For BRZ
   // -------------
   // Calculate Cond Assuming Z=1 And Z=0.  Then Mux
//   assign r_eval1 = ((exu_ifu_regn_e | ~br_cond_e[1] | ~br_cond_e[0]) ^
//                                br_cond_e[2]) & ~cc_mvbr_e;
   assign r_eval1 = exu_ifu_regn_e ? (~br_cond_e[2] & ~cc_mvbr_e) :
                                       (((br_cond_e[1] & br_cond_e[0]) ^
                                         ~br_cond_e[2]) & ~cc_mvbr_e);

//   assign r_eval0 = ((exu_ifu_regn_e & br_cond_e[1]) ^
//                      br_cond_e[2]) & ~cc_mvbr_e;
   assign r_eval0 = exu_ifu_regn_e ? ((br_cond_e[1] ^ br_cond_e[2]) &
                                       ~cc_mvbr_e) :
                                       (br_cond_e[2] & ~cc_mvbr_e);


chiplab里的branch,首先是没有复用ALU来计算跳转地址。整个模块参数跟ALU也很像。

为了简单, 我也先不复用ALU了,等到后面搞了bypass再说。

branch模块分别在ex1_stage ex2_stage里调用了一次,还没搞清楚为什么。

ex1_stage.v


branch branch(
    .branch_valid (branch_valid     ),
    .branch_a     (bru_a            ),
    .branch_b     (bru_b            ),
    .branch_op    (ex1_bru_op       ),
    .branch_pc    (ex1_bru_pc       ),
    .branch_inst  (ex1_bru_inst     ),
    .branch_taken (ex1_bru_br_taken ),
    .branch_target(ex1_bru_br_target),
    .branch_offset(ex1_bru_offset   ),
    .cancel_allow (cancel_allow     ),
    // pc interface
    .bru_cancel   (bru_cancel       ),
    .bru_target   (bru_target       ),
    .bru_valid    (bru_valid        ),
    .bru_taken    (bru_taken        ),
    .bru_link_pc  (bru_link_pc      ),
    .bru_pc       (bru_pc           )
);

ex2_stage.v


branch bru_s2(
    .branch_valid (branch_valid     ),
    .branch_a     (bru_a            ),
    .branch_b     (bru_b            ),
    .branch_op    (ex2_bru_op       ),
    .branch_pc    (ex2_bru_pc       ),
    .branch_inst  (ex2_bru_inst     ),
    .branch_taken (ex2_bru_br_taken ),
    .branch_target(ex2_bru_br_target),
    .branch_offset(ex2_bru_offset   ),
    .cancel_allow (cancel_allow     ),
    // pc interface
    .bru_cancel   (bru_cancel_ex2   ),
    .bru_target   (bru_target_ex2   ),
    .bru_valid    (bru_valid_ex2    ),
    .bru_taken    (bru_taken_ex2    ),
    .bru_link_pc  (bru_link_pc_ex2  ),
    .bru_pc       (bru_pc_ex2       )
);

lsoc1000_mainpipe.v


wire [`GRLEN-1:0]         bru_target_input  = ex1_bru_delay ? bru_target_ex2  : bru_target ;
wire [`GRLEN-1:0]         bru_pc_input      = ex1_bru_delay ? bru_pc_ex2      : bru_pc     ;
wire [`LSOC1K_PRU_HINT:0] bru_hint_input    = ex1_bru_delay ? bru_hint_ex2    : bru_hint   ;
wire                      bru_sign_input    = ex1_bru_delay ? bru_sign_ex2    : bru_sign   ;
wire                      bru_taken_input   = ex1_bru_delay ? bru_taken_ex2   : bru_taken  ;
wire                      bru_brop_input    = ex1_bru_delay ? bru_brop_ex2    : bru_brop   ;
wire                      bru_jrop_input    = ex1_bru_delay ? bru_jrop_ex2    : bru_jrop   ;
wire                      bru_jrra_input    = ex1_bru_delay ? bru_jrra_ex2    : bru_jrra   ;
wire                      bru_link_input    = ex1_bru_delay ? bru_link_ex2    : bru_link   ;
wire [`GRLEN-1:0]         bru_link_pc_input = ex1_bru_delay ? bru_link_pc_ex2 : bru_link_pc;
wire                      bru_cancel_input  = bru_cancel_ex2 || bru_cancel;

bru_target_input就是送进front里的br_target。

ex1_bru_delay是bru_delay从issue stage传下去的。

lsoc1000_stage_issue.v


    if(allow_in) begin
        ex1_branch_valid  <= ((port0_sr_ur == `EX_BRU && is_port0_valid) || (port1_sr_ur == `EX_BRU && is_port1_valid) || is_port2_valid) && !type_crash && !bru_cancel;
        ex1_bru_delay     <= bru_delay;
        ex1_bru_link      <= is_port2_valid ? 1'b0          : port0_bru_dispatch ? (is_port0_link && is_port0_valid) : (is_port1_link && is_port1_valid);
        ex1_bru_jrra      <= is_port2_valid ? is_port2_jrra : port0_bru_dispatch ? is_port0_jrra                     : is_port1_jrra;
        ex1_bru_brop      <= is_port2_valid ? is_port2_brop : port0_bru_dispatch ? is_port0_brop                     : is_port1_brop;
        ex1_bru_jrop      <= is_port2_valid ? is_port2_jrop : port0_bru_dispatch ? is_port0_jrop                     : is_port1_jrop;
    end

而bru_delay又是从 ex1_stage生成的。

ex1_stage.v


//branch
reg first_trial;

wire port0_branch_valid  = ex1_port0_src == `EX_BRU && ex1_port0_valid;
wire port1_branch_valid  = ex1_port1_src == `EX_BRU && ex1_port1_valid;
wire branch_valid        = port0_branch_valid || port1_branch_valid || ex1_bru_port[0];
wire port0_cancel_allow  = (ex1_port0_src == `EX_BRU && ex1_port0_valid) && !ex1_port0_type && !ex1_bru_delay;
wire port1_cancel_allow  = (ex1_port1_src == `EX_BRU && ex1_port1_valid) && !ex1_port1_type && !ex1_bru_delay;
wire port2_cancel_allow  = ex1_bru_port[0]                               && !ex1_port2_type && !ex1_bru_delay;
wire cancel_allow        = (port0_cancel_allow || port1_cancel_allow || port2_cancel_allow) && first_trial;
assign bru_port          = port1_branch_valid;
wire [31:0] ex1_bru_inst = port0_branch_valid ? ex1_port0_inst  : ex1_port1_inst;
wire [`GRLEN-1:0] bru_a  = ex1_bru_a_lsu_fw   ? ex1_lsu_fw_data : ex1_bru_a     ;
wire [`GRLEN-1:0] bru_b  = ex1_bru_b_lsu_fw   ? ex1_lsu_fw_data : ex1_bru_b     ;
assign bru_delay         = branch_valid && !(port0_cancel_allow || port1_cancel_allow || port2_cancel_allow);

现在还没搞清楚branch这么复杂是为了处理什么样的情况。

发现从下面代码可以看出来在chiplab里,port2是专门用来运行branch指令的。

always @(posedge clk) begin
    if (rst) begin
        ex1_bru_br_taken  <= 1'b0;
        ex1_bru_link      <= 1'b0;
        ex1_bru_jrra      <= 1'b0;
        ex1_bru_brop      <= 1'b0;
        ex1_bru_jrop      <= 1'b0;
    end
    else if(/*(port0_bru_dispatch || port1_bru_dispatch) &&*/ is_allow_in) begin
        ex1_bru_op        <= is_port2_valid ? is_port2_op[`LSOC1K_BRU_CODE] : port0_bru_dispatch ? is_port0_op[`LSOC1K_BRU_CODE] : is_port1_op[`LSOC1K_BRU_CODE];
        ex1_bru_a         <= is_port2_valid ? rdata2_0_input               : port0_bru_dispatch ? rdata0_0_input                    : rdata1_0_input;
        ex1_bru_b         <= is_port2_valid ? rdata2_1_input               : port0_bru_dispatch ? rdata0_1_input                    : rdata1_1_input;
        ex1_bru_br_taken  <= is_port2_valid ? is_port2_br_taken            : port0_bru_dispatch ? is_port0_br_taken                 : is_port1_br_taken;
        ex1_bru_br_target <= is_port2_valid ? {is_port2_br_target,2'b0}    : port0_bru_dispatch ? {is_port0_br_target,2'b0}         : {is_port1_br_target,2'b0};
        ex1_bru_hint      <= is_port2_valid ? is_port2_hint                : port0_bru_dispatch ? is_port0_hint                     : is_port1_hint;
        ex1_bru_offset    <= is_port2_valid ? port2_offset                 : port0_bru_dispatch ? port0_offset                      : port1_offset;
        ex1_bru_pc        <= is_port2_valid ? is_port2_pc                  : port0_bru_dispatch ? is_port0_pc                       : is_port1_pc;
        ex1_bru_a_lsu_fw  <= is_port2_valid && rdata2_0_lsu_fw || port0_bru_dispatch && rdata0_0_lsu_fw || port1_bru_dispatch && rdata1_0_lsu_fw;
        ex1_bru_b_lsu_fw  <= is_port2_valid && rdata2_1_lsu_fw || port0_bru_dispatch && rdata0_1_lsu_fw || port1_bru_dispatch && rdata1_1_lsu_fw;
        ex1_bru_port[1]   <= is_port2_id || port1_bru_dispatch;
        ex1_bru_port[2]   <= is_port2_app && is_port2_valid;
    end


chiplab里的branch也包括JIRL。

在cpu7里,让bru工作在_e,也就是说,在_e完成比较,计算出跳转地址后,直接在_e里传给ifu,改变pc_bf。

并且同时发出kill_d。

也要发出stallF,exu_ifu_stall_req。

其实exu_ifu_stall_req和kill_d起作用的方式是一样的,都是把当前流水级的valid屏蔽掉:valid & (~exu_ifu_stall_req),valid & (~kill_d)。

这样,传到_d _e的valid都是0, 这条指令就没意义了。


loongarch指令里,表示偏移的部分,有叫做si ui的,也有叫offs的。

offs是bru指令里的offset,si ui是其它指令里用的。

这两种偏移的编码是不同的,并且有交集,比如有si16又有offs16。

不过loongarch32里没有用到si16的,只在loongarch64里有,比如:addu16i.d rd, rj, si16

addu16i里的si16的位置,其实和jirl bne的offs16是一样的。而offs还有offs21 offs26,si最大是si20。

感觉编码可以统一起来。

但在代码里,这两部分是分开编码的。

 11    //immediate operater prepare                                                                                                            
 12    wire [ 4:0] port0_i5  = `GET_I5(ifu_exu_inst_d);                                                                                         
 13    wire [ 5:0] port0_i6  = `GET_I6(ifu_exu_inst_d);                                                                                                    
 14    wire [11:0] port0_i12 = `GET_I12(ifu_exu_inst_d);                                                                                          
 15    wire [13:0] port0_i14 = `GET_I14(ifu_exu_inst_d);                                                                                           
 16    wire [15:0] port0_i16 = `GET_I16(ifu_exu_inst_d);                                                                                          
 17    wire [19:0] port0_i20 = `GET_I20(ifu_exu_inst_d);                                                                                          
 18                                                                                                                                                  
 19    wire [31:0] port0_i5_u  = {27'b0,port0_i5};
 20    wire [31:0] port0_i6_u  = {26'b0,port0_i6};                                                                                                  
 21    wire [31:0] port0_i12_u = {20'b0,port0_i12};                                                                                                    
 22    wire [31:0] port0_i12_s = {{20{port0_i12[11]}},port0_i12};
 23    wire [31:0] port0_i14_s = {{18{port0_i14[13]}},port0_i14};
 24    wire [31:0] port0_i16_s = {{16{port0_i16[15]}},port0_i16};
 25    wire [31:0] port0_i20_s = {{12{port0_i20[19]}},port0_i20};
 26                                                                                                                                                 
 27    wire [31:0] port0_i5_i = ifu_exu_op_d[`LSOC1K_DOUBLE_WORD] ? port0_i6_u : port0_i5_u;                                                      
 28    wire [31:0] port0_i12_i = ifu_exu_op_d[`LSOC1K_UNSIGN] ? port0_i12_u : port0_i12_s;                                                                                                                         

lsoc1000_stage_issue.v


////BRU operater push
wire branch_dispatch   = port0_bru_dispatch || port1_bru_dispatch;

wire [15:0] port0_offset16 = `GET_OFFSET16(is_port0_inst);
wire [15:0] port1_offset16 = `GET_OFFSET16(is_port1_inst);
wire [15:0] port2_offset16 = `GET_OFFSET16(is_port2_inst);
wire [20:0] port0_offset21 = `GET_OFFSET21(is_port0_inst);
wire [20:0] port1_offset21 = `GET_OFFSET21(is_port1_inst);
wire [20:0] port2_offset21 = `GET_OFFSET21(is_port2_inst);
wire [25:0] port0_offset26 = `GET_OFFSET26(is_port0_inst);
wire [25:0] port1_offset26 = `GET_OFFSET26(is_port1_inst);
wire [25:0] port2_offset26 = `GET_OFFSET26(is_port2_inst);

`ifdef LA64
wire [63:0] port0_offset = is_port0_op[`LSOC1K_RD_READ    ] ? {{46{port0_offset16[15]}},port0_offset16,2'b0} :
                           is_port0_op[`LSOC1K_HIGH_TARGET] ? {{36{port0_offset26[25]}},port0_offset26,2'b0} :
                                                              {{41{port0_offset21[20]}},port0_offset21,2'b0} ;
wire [63:0] port1_offset = is_port1_op[`LSOC1K_RD_READ    ] ? {{46{port1_offset16[15]}},port1_offset16,2'b0} :
                           is_port1_op[`LSOC1K_HIGH_TARGET] ? {{36{port1_offset26[25]}},port1_offset26,2'b0} :
                                                              {{41{port1_offset21[20]}},port1_offset21,2'b0} ;
wire [63:0] port2_offset = is_port2_op[`LSOC1K_RD_READ    ] ? {{46{port2_offset16[15]}},port2_offset16,2'b0} :
                           is_port2_op[`LSOC1K_HIGH_TARGET] ? {{36{port2_offset26[25]}},port2_offset26,2'b0} :
                                                              {{41{port2_offset21[20]}},port2_offset21,2'b0} ;
`elsif LA32
wire [31:0] port0_offset = is_port0_op[`LSOC1K_RD_READ    ] ? {{14{port0_offset16[15]}},port0_offset16,2'b0} :
                           is_port0_op[`LSOC1K_HIGH_TARGET] ? {{ 4{port0_offset26[25]}},port0_offset26,2'b0} :
                                                              {{ 9{port0_offset21[20]}},port0_offset21,2'b0} ;
wire [31:0] port1_offset = is_port1_op[`LSOC1K_RD_READ    ] ? {{14{port1_offset16[15]}},port1_offset16,2'b0} :
                           is_port1_op[`LSOC1K_HIGH_TARGET] ? {{ 4{port1_offset26[25]}},port1_offset26,2'b0} :
                                                              {{ 9{port1_offset21[20]}},port1_offset21,2'b0} ;
wire [31:0] port2_offset = is_port2_op[`LSOC1K_RD_READ    ] ? {{14{port2_offset16[15]}},port2_offset16,2'b0} :
                           is_port2_op[`LSOC1K_HIGH_TARGET] ? {{ 4{port2_offset26[25]}},port2_offset26,2'b0} :
                                                              {{ 9{port2_offset21[20]}},port2_offset21,2'b0} ;
`endif

我也先分开解码,以后看能不能再改进吧。