流水线和data bypass
现在的问题是cache miss的时候,每2个cycle取指一次。
以前cpu6都是一个cycle一条指令,所以在做data forwarding的时候没想那么多。
因为读寄存器是在_d,所以_e要把alu的数据前递给_d。而_m时lsu要把load的数据给_d。
而如果是2个cycle fetch一条指令,拿alu来说,在_e时,下一条指令还没有到_d,也就是还没有读寄存器,所以没法前递。
需要到_m时,下一条指令才到_d。
_d      _e(alu)   _m(lsu)
^_______|         |
^_________________|
比如下面这个例子,在2 cycle fetch一条指令的时候,func_uty1_ld。
obj/main.elf: file format elf32-loongarch
obj/main.elf
Disassembly of section .text:
1c000000 <_start>:
kernel_entry():
1c000000: 14380006 lu12i.w $r6,114688(0x1c000)
1c000004: 028030c6 addi.w $r6,$r6,12(0xc)
1c000008: 288000c5 ld.w $r5,$r6,0
Disassembly of section .data:
1c00000c <var1>:
var1():
1c00000c: 0000005a 0x0000005a
当第一条指令到_e,alu的result已经是1c000000了,而_d阶段pc_d显示当前ip还是1c000000。
又看了看书,是我搞错了。
在_m _w的时候把数据送向_e。_d读完寄存器应该时间挺紧张的。
_e      _m        _w
^_______|         |
^_________________|
这样的话就算chiplab的cache fill cache line需要2个cycle(外面也是SRAM),也不用担心了,_w -> _e就行了。
如果delay时间更长的话,data forwarding也不用管了。
看了下OpenSPARC,这里的data bypass是从regfile里读出来就直接在_d的时候就做mux,得到的结果再传给_e的流水线寄存器。
// rs1_data muxes: RF and E are critical paths
mux4ds #(64) mux_rs1_data_1(.dout(rs1_data_btwn_mux[63:0]),
.in0(rd_data_m[63:0]),
.in1(byp_irf_rd_data_w[63:0]),
.in2(rs1_data_w2[63:0]),
.in3({{16{ifu_exu_pc_d[47]}}, ifu_exu_pc_d[47:0]}),
.sel0(ecl_byp_rs1_mux1_sel_m),
.sel1(ecl_byp_rs1_mux1_sel_w),
.sel2(ecl_byp_rs1_mux1_sel_w2),
.sel3(ecl_byp_rs1_mux1_sel_other));
mux4ds #(64) mux_rs1_data_2(.dout(byp_alu_rs1_data_d[63:0]),
.in0(rs1_data_btwn_mux[63:0]),
.in1(irf_byp_rs1_data_d[63:0]),
.in2(alu_byp_rd_data_e[63:0]),
.in3(lsu_exu_dfill_data_g[63:0]),
.sel0(ecl_byp_rs1_mux2_sel_usemux1),
.sel1(ecl_byp_rs1_mux2_sel_rf),
.sel2(ecl_byp_rs1_mux2_sel_e),
.sel3(ecl_byp_rs1_mux2_sel_ld));
2022-12-20 add:
今天在做csr寄存器bypass的时候又在想这个问题,为什么bypass _w不把结果传递给_d,毕竟_d的时候已经从寄存器读出结果了。
OpenSPARC的byp就是做在_d。
我想来想去也觉得不应该是_d _e位置随便选吧。
应该是读regfile在哪,就应该在什么地方开始做byp。
那为什么现在做在_e,只要把_m _w的数据前递就行了呢?
特意做了个func_uty13_testbyp1cycle来测试下。
1c000000 <_start>:
kernel_entry():
1c000000: 02800403 addi.w $r3,$r0,1(0x1)
1c000004: 02800004 addi.w $r4,$r0,0
1c000008 <again>:
again():
1c000008: 02800484 addi.w $r4,$r4,1(0x1)
1c00000c: 02802805 addi.w $r5,$r0,10(0xa)
1c000010: 02800000 addi.w $r0,$r0,0
1c000014: 02800000 addi.w $r0,$r0,0
1c000018: 028140a5 addi.w $r5,$r5,80(0x50)
1c00001c: 02800000 addi.w $r0,$r0,0
1c000020: 02800000 addi.w $r0,$r0,0
1c000024: 5fffe464 bne $r3,$r4,-28(0x3ffe4) # 1c000008 <again>
r5的两次addi.w之间隔着2个nop,结果也是正确的。
现在找到的原因是cpu7里用的是chiplab的regfile。
还没有搞清楚这个regfile生成出来的是latch还是dflop。
78 // process write
79 always @(posedge clk) begin
80 // if(rst) begin
81 // regs[31] <= 32'd0;
82
83 // end
84 // else begin
85 case({wen1_input,wen2_input})
86 2'b11:begin
87 regs[waddr1] <= wdata1;
88 regs[waddr2] <= wdata2;
89 end
90 2'b10:regs[waddr1] <= wdata1;
91 2'b01:regs[waddr2] <= wdata2;
92 default: ;
93 endcase
94 // end
95 end
不过先看下上面func_uty13_testbyp1cycle测试的例子。
可以看到rd_data_w的数据0xa,已经写进去了,并且irf_ecl_rs1_data_d,也就是nop nop后面的addi指令也已经从寄存器里读出来了。
本来以为rd_data_w的数据要在下一个rising edge写进寄存器,那时候_d才能读出来,现在看来不是。
不过这个是always @(posedge clk),感觉就是和dff一样的啊。
原来是里面做了read port forwarding。
32 // read after write (RAW)
33 wire r1_1_w1_raw = wen1 && (raddr0_0 == waddr1);
34 wire r1_2_w1_raw = wen1 && (raddr0_1 == waddr1);
35 wire r1_1_w2_raw = wen2 && (raddr0_0 == waddr2);
36 wire r1_2_w2_raw = wen2 && (raddr0_1 == waddr2);
37
38 wire r2_1_w1_raw = wen1 && (raddr1_0 == waddr1);
39 wire r2_2_w1_raw = wen1 && (raddr1_1 == waddr1);
40 wire r2_1_w2_raw = wen2 && (raddr1_0 == waddr2);
41 wire r2_2_w2_raw = wen2 && (raddr1_1 == waddr2);
42
43 wire r3_1_w1_raw = wen1 && (raddr2_0 == waddr1);
44 wire r3_2_w1_raw = wen1 && (raddr2_1 == waddr1);
45 wire r3_1_w2_raw = wen2 && (raddr2_0 == waddr2);
46 wire r3_2_w2_raw = wen2 && (raddr2_1 == waddr2);
47
48 wire r1_1_raw = r1_1_w1_raw || r1_1_w2_raw; // read port need forwarding
49 wire r1_2_raw = r1_2_w1_raw || r1_2_w2_raw;
50 wire r2_1_raw = r2_1_w1_raw || r2_1_w2_raw;
51 wire r2_2_raw = r2_2_w1_raw || r2_2_w2_raw;
52 wire r3_1_raw = r3_1_w1_raw || r3_1_w2_raw;
53 wire r3_2_raw = r3_2_w1_raw || r3_2_w2_raw;
54
55 wire [`GRLEN-1:0] r1_1_raw_data = r1_1_w2_raw ? wdata2 : wdata1; // forwarding data
56 wire [`GRLEN-1:0] r1_2_raw_data = r1_2_w2_raw ? wdata2 : wdata1;
57 wire [`GRLEN-1:0] r2_1_raw_data = r2_1_w2_raw ? wdata2 : wdata1;
58 wire [`GRLEN-1:0] r2_2_raw_data = r2_2_w2_raw ? wdata2 : wdata1;
59 wire [`GRLEN-1:0] r3_1_raw_data = r3_1_w2_raw ? wdata2 : wdata1;
60 wire [`GRLEN-1:0] r3_2_raw_data = r3_2_w2_raw ? wdata2 : wdata1;
原因在这,也就是说,这个图上的_w时刻,数据还没有写进reg。
果然是这样,r5的值在下一个cycle才写进去。
那这么说,现在实现的cpu7_csr里,读csr寄存器在_d,写csr寄存器在_m,byp要把_e _m的都forwarding到_d。
以后做byp,regfile读在哪个阶段,数据就要前递到哪个阶段。当然还要看regfile的实现。
现在又有一个想法,让lsu里计算地址也复用ALU,有个好处,就是data bypass只需要处理给alu的参数了。
否则ld指令需要的参数也要考虑data bypass。
在ecl里alu和lsu的操作数都需要传过去。
144 wire alu_b_imm = (ifu_exu_op_d[`LSOC1K_I5] || ifu_exu_op_d[`LSOC1K_I12] || ifu_exu_op_d[`LSOC1K_I16] || ifu_exu_op_d[`LSOC1K_I20]) & alu_dispatch_d;
157 wire [`GRLEN-1:0] alu_b_d = alu_b_imm? ifu_exu_imm_shifted_d : irf_ecl_rs2_data_d;
252 assign user_other = alu_b_imm_e | double_read_e;
253
254 cpu7_exu_eclbyplog byplog_rs2(
255 .rs_e (rs2_e[4:0] ),
256 .rd_m (rd_m[4:0] ),
257 .rd_w (rd_w[4:0] ),
258 .wen_m (wen_m ),
259 .wen_w (wen_w ),
260 .use_other (alu_b_imm_e ),
261
262 .rs_mux_sel_rf (ecl_byp_rs2_mux_sel_rf ),
263 .rs_mux_sel_m (ecl_byp_rs2_mux_sel_m ),
264 .rs_mux_sel_w (ecl_byp_rs2_mux_sel_w )
265 );
359 assign double_read_d = ifu_exu_op_d[`LSOC1K_DOUBLE_READ] & lsu_dispatch_d;
373 assign lsu_offset_e = double_read_e ? byp_rs2_data_e : ifu_exu_imm_shifted_e;
ALU指令的时候alu_b_imm决定用rs2寄存器还是用imm。
而LSU指令时double_read_e作用差不多,也是选择imm和rs2,不过意思是反的。
最主要是alu_b_imm和double_read_e在指令里也不是同一个bit,没办法一个bit决定ALU和LSU的参数。
这俩都不是指令里的bit,而是解码的时候给出的。
比如DOUBLE_READ, alu_b_imm也是一样。
decoder.v
assign res[`LSOC1K_DOUBLE_READ ] = double_read;
wire double_read = op_preldx || op_ldx_b || op_ldx_h || op_ldx_w || op_ldx_d || op_ldx_bu || op_ldx_hu || op_ldx_wu || op_ldgt_b || op_ldgt_h || op_ldgt_w || op_ldgt_d || op_ldle_b || op_ldle_h || op_ldle_w || op_ldle_d ||
op_amswap_w || op_amswap_d || op_amadd_w || op_amadd_d || op_amand_w || op_amand_d || op_amor_w ||
op_amor_d || op_amxor_w || op_amxor_d || op_ammax_w || op_ammax_d || op_ammin_w || op_ammin_d || op_ammax_wu || op_ammax_du || op_ammin_wu || op_ammin_du ||
op_amswap_db_w || op_amswap_db_d || op_amadd_db_w || op_amadd_db_d || op_amand_db_w || op_amand_db_d || op_amor_db_w || op_amor_db_d || op_amxor_db_w ||
op_amxor_db_d || op_ammax_db_w || op_ammax_db_d || op_ammin_db_w || op_ammin_db_d || op_ammax_db_wu || op_ammax_db_du || op_ammin_db_wu || op_ammin_db_du;
所以需要“& alu_dispatch_d”和“& lsu_dispatch_d”来选择是进alu的指令还是进lsu的指令。
搞了个func_uty7_beq_testbyp1cycle_obj,测试下cache一个cycle一条指令的时候byp是否正常。
obj/main.elf: file format elf32-loongarch
obj/main.elf
Disassembly of section .text:
1c000000 <_start>:
kernel_entry():
1c000000: 02800803 addi.w $r3,$r0,2(0x2)
1c000004: 02800004 addi.w $r4,$r0,0
1c000008 <again>:
again():
1c000008: 02800484 addi.w $r4,$r4,1(0x1)
1c00000c: 02802805 addi.w $r5,$r0,10(0xa)
1c000010: 028040a5 addi.w $r5,$r5,16(0x10)
1c000014: 028040a5 addi.w $r5,$r5,16(0x10)
1c000018: 0280a406 addi.w $r6,$r0,41(0x29)
1c00001c: 028004c6 addi.w $r6,$r6,1(0x1)
1c000020: 580008c5 beq $r6,$r5,8(0x8) # 1c000028 <skip>
1c000024: 028200a5 addi.w $r5,$r5,128(0x80)
1c000028 <skip>:
skip():
1c000028: 028040a5 addi.w $r5,$r5,16(0x10)
1c00002c: 028040a5 addi.w $r5,$r5,16(0x10)
1c000030: 028040a5 addi.w $r5,$r5,16(0x10)
1c000034: 5fffd464 bne $r3,$r4,-44(0x3ffd4) # 1c000008 <again>
1c000010 1c000014这两条就应该是_m的rd bypass到_e。
之前乱改cache和tlb,搞成1cycle一条指令,但有问题,比如下一个cache line的时候 0x20以后会出问题。
就不git上传了,把patch文件贴这算了。
diff --git a/IP/myCPU/cpu7.v b/IP/myCPU/cpu7.v
index 8a14d5c..b7265d6 100644
--- a/IP/myCPU/cpu7.v
+++ b/IP/myCPU/cpu7.v
@@ -120,7 +120,10 @@ module cpu7(
wire data_req_empty ;
wire data_scsucceed ;
-
+ // uty: test
+ wire [`PABITS-1:0] itlb_paddr_dumb;
+
+ assign itlb_paddr = inst_tlb_vaddr[`PABITS-1:0];
cpu7_ifu ifu(
@@ -351,7 +354,7 @@ module cpu7(
.i_cache_rcv (itlb_cache_recv ),
.i_finish (itlb_finish ),
.i_hit (itlb_hit ),
- .i_paddr (itlb_paddr ),
+ .i_paddr (itlb_paddr_dumb ),
.i_uncached (itlb_uncache ),
.i_exccode (itlb_exccode ),
diff --git a/IP/myCPU/icache.v b/IP/myCPU/icache.v
index b1946a1..5f5942e 100644
--- a/IP/myCPU/icache.v
+++ b/IP/myCPU/icache.v
@@ -206,7 +206,8 @@ wire rfil_new_req;
reg rfil_send_cpu;
reg [`I_TAG_LEN-1 :0] rfil_ptag ;
reg [`I_INDEX_LEN-1 :0] rfil_index;
-reg [`I_OFFSET_LEN-1:0] rfil_offset;
+reg [`I_OFFSET_LEN-1:0] rfil_offset; // uty: test
+wire [`I_OFFSET_LEN-1:0] rfil_offset_uty;
reg rfil_uncache;
reg [31 :0] rfil_data [`LINE_INST_NUM-1:0];
@@ -219,6 +220,7 @@ reg [`SCWAY_LEN-1 :0] rfil_rscway;
// ------------------- Store Req & Addr ------------------
reg inst_cancel_reg; // TODO: We can cencel request at lkup_state
+// uty: test
reg [`I_INDEX_LEN-1 :0] index ;
reg [`I_OFFSET_LEN-1:0] offset;
@@ -227,11 +229,19 @@ always @(posedge clk) begin
index <= {`I_INDEX_LEN {1'b0}}; // TODO: remove rst ?
offset <= {`I_OFFSET_LEN{1'b0}};
end
- else if(inst_addr_ok) begin
+ //else if(inst_addr_ok) begin // uty: test
+ else begin
index <= inst_addr[`I_INDEX_BITS];
offset <= inst_addr[`I_OFFSET_BITS];
end
end
+//wire [`I_INDEX_LEN-1 :0] index ;
+//wire [`I_OFFSET_LEN-1:0] offset;
+//
+//assign index = inst_addr[`I_INDEX_BITS];
+//assign offset = inst_addr[`I_OFFSET_BITS];
+
+
always @(posedge clk) begin
if(rst)
@@ -541,15 +551,16 @@ end
assign data_wdata_o = {rfil_data[ 7], rfil_data[ 6], rfil_data[ 5], rfil_data[ 4],
rfil_data[ 3], rfil_data[ 2], rfil_data[ 1], rfil_data[ 0]};
+ // uty: test
assign miss_ret_data = {`INST_OUT_LEN{ rfil_uncache}} & {32'b0 , 32'b0 , 32'b0 , rfil_data[ 0]} |
- {`INST_OUT_LEN{rfil_offset == 3'b000 & !rfil_uncache}} & {rfil_data[ 3], rfil_data[ 2], rfil_data[ 1], rfil_data[ 0]} |
- {`INST_OUT_LEN{rfil_offset == 3'b001 & !rfil_uncache}} & {rfil_data[ 4], rfil_data[ 3], rfil_data[ 2], rfil_data[ 1]} |
- {`INST_OUT_LEN{rfil_offset == 3'b010 & !rfil_uncache}} & {rfil_data[ 5], rfil_data[ 4], rfil_data[ 3], rfil_data[ 2]} |
- {`INST_OUT_LEN{rfil_offset == 3'b011 & !rfil_uncache}} & {rfil_data[ 6], rfil_data[ 5], rfil_data[ 4], rfil_data[ 3]} |
- {`INST_OUT_LEN{rfil_offset == 3'b100 & !rfil_uncache}} & {rfil_data[ 7], rfil_data[ 6], rfil_data[ 5], rfil_data[ 4]} |
- {`INST_OUT_LEN{rfil_offset == 3'b101 & !rfil_uncache}} & {32'b0 , rfil_data[ 7], rfil_data[ 6], rfil_data[ 5]} |
- {`INST_OUT_LEN{rfil_offset == 3'b110 & !rfil_uncache}} & {32'b0 , 32'b0 , rfil_data[ 7], rfil_data[ 6]} |
- {`INST_OUT_LEN{rfil_offset == 3'b111 & !rfil_uncache}} & {32'b0 , 32'b0 , 32'b0 , rfil_data[ 7]} ;
+ {`INST_OUT_LEN{offset == 3'b000 & !rfil_uncache}} & {rfil_data[ 3], rfil_data[ 2], rfil_data[ 1], rfil_data[ 0]} |
+ {`INST_OUT_LEN{offset == 3'b001 & !rfil_uncache}} & {rfil_data[ 4], rfil_data[ 3], rfil_data[ 2], rfil_data[ 1]} |
+ {`INST_OUT_LEN{offset == 3'b010 & !rfil_uncache}} & {rfil_data[ 5], rfil_data[ 4], rfil_data[ 3], rfil_data[ 2]} |
+ {`INST_OUT_LEN{offset == 3'b011 & !rfil_uncache}} & {rfil_data[ 6], rfil_data[ 5], rfil_data[ 4], rfil_data[ 3]} |
+ {`INST_OUT_LEN{offset == 3'b100 & !rfil_uncache}} & {rfil_data[ 7], rfil_data[ 6], rfil_data[ 5], rfil_data[ 4]} |
+ {`INST_OUT_LEN{offset == 3'b101 & !rfil_uncache}} & {32'b0 , rfil_data[ 7], rfil_data[ 6], rfil_data[ 5]} |
+ {`INST_OUT_LEN{offset == 3'b110 & !rfil_uncache}} & {32'b0 , 32'b0 , rfil_data[ 7], rfil_data[ 6]} |
+ {`INST_OUT_LEN{offset == 3'b111 & !rfil_uncache}} & {32'b0 , 32'b0 , 32'b0 , rfil_data[ 7]} ;
`endif
assign data_addr_o = (rfil_refill )? rfil_index :
@@ -653,10 +664,17 @@ wire [1:0] miss_count;
assign hit_count = offset [2] == 1'b1 ? ~offset [1:0] : 2'b11;
assign miss_count = rfil_offset[2] == 1'b1 ? ~rfil_offset[1:0] : 2'b11;
- assign miss_data_ok[0] = rfil_send_cpu & rfil_offset == 3'b000 & rfil_record[1];
- assign miss_data_ok[1] = rfil_send_cpu & (rfil_offset == 3'b001 || rfil_offset == 3'b010) & rfil_record[2];
- assign miss_data_ok[2] = rfil_send_cpu & (rfil_offset[2] == 1'b1 || rfil_offset == 3'b011) & rfil_record[3];
- assign miss_data_ok[3] = rfil_send_cpu & (rfil_offset[2] == 1'b1 || rfil_offset == 3'b011) & rfil_record[3];
+ // uty: test
+// assign miss_data_ok[0] = rfil_send_cpu & rfil_offset == 3'b000 & rfil_record[1];
+// assign miss_data_ok[1] = rfil_send_cpu & (rfil_offset == 3'b001 || rfil_offset == 3'b010) & rfil_record[2];
+// assign miss_data_ok[2] = rfil_send_cpu & (rfil_offset[2] == 1'b1 || rfil_offset == 3'b011) & rfil_record[3];
+// assign miss_data_ok[3] = rfil_send_cpu & (rfil_offset[2] == 1'b1 || rfil_offset == 3'b011) & rfil_record[3];
+
+ assign miss_data_ok[0] = (rfil_offset_uty == 3'b000 || rfil_offset_uty == 3'b001) & rfil_record[0];
+ assign miss_data_ok[1] = (rfil_offset_uty == 3'b010 || rfil_offset_uty == 3'b011) & rfil_record[1];
+ assign miss_data_ok[2] = (rfil_offset_uty == 3'b100 || rfil_offset_uty == 3'b101) & rfil_record[2];
+ assign miss_data_ok[3] = (rfil_offset_uty == 3'b110 || rfil_offset_uty == 3'b111) & rfil_record[3];
+
`endif
assign inst_count = (uncache_data_ok)? 2'b00 : ({2{|hit_data_ok}} & hit_count | {2{|miss_data_ok}} & miss_count);
@@ -682,7 +700,9 @@ assign rfil_idle = rfil_state == 4'b0000;
assign rfil_addr_wait = rfil_state == 4'b0001;
assign rfil_data_wait = rfil_state == 4'b0010;
+// uty: test
assign rfil_hit = rfil_valid && tlb_ptag == rfil_ptag && index == rfil_index && tlb_finish && state_lkup;
+//assign rfil_hit = rfil_valid && tlb_ptag == rfil_ptag && index == rfil_index && tlb_finish;
assign rfil_alloc = rfil_idle && state_lkup && cache_miss && !rfil_hit || rfil_refill && state_blck;
@@ -724,10 +744,12 @@ always @(posedge clk) begin
if(rfil_new_req) begin
rfil_ptag <= (state_blck)? blck_ptag : tlb_ptag;
rfil_index <= index;
- rfil_offset <= offset;
+ rfil_offset <= offset;
end
end
+assign rfil_offset_uty = inst_addr[`I_OFFSET_BITS]; // uty: tet
+
// cache or uncache
always @(posedge clk) begin
if(uncache_data_ok) // TODO: remove?
@@ -768,7 +790,9 @@ always @(posedge clk) begin
end
always @(posedge clk) begin
+ // uty: test
if((!rfil_uncache && rfil_record[`RFIL_RECORD_LEN-1] && rfil_refill || rfil_uncache && rfil_record[0]) || rfil_alloc)
+ //if((!rfil_uncache && rfil_record[`RFIL_RECORD_LEN-1] || rfil_uncache && rfil_record[0]) || rfil_alloc)
rfil_record <= {`RFIL_RECORD_LEN{1'b0}};
else if(rfil_data_wait && ret_valid) begin
rfil_record[3] <= rfil_record[2]? 1'b1 : 1'b0;
@@ -873,4 +897,4 @@ assign wr_awscway = 4'b0;
assign wr_pgcl = ex_pgcl;
assign wr_fmt = `WR_FMT_ALLLINE;
// ------------------------- END --------------------------
-endmodule
\ No newline at end of file
+endmodule
diff --git a/IP/myCPU/mycpu_top.v b/IP/myCPU/mycpu_top.v
index 7e39d58..ea96a54 100644
--- a/IP/myCPU/mycpu_top.v
+++ b/IP/myCPU/mycpu_top.v
@@ -626,10 +626,15 @@ module mycpu_top (
.cache_op_badvaddr(icache_op_badvaddr ),
.tlb_ptag (itlb_paddr[`I_TAG_BITS]),
- .tlb_finish (itlb_finish ),
- .tlb_hit (itlb_hit ),
+ //.tlb_finish (itlb_finish ), // uty: test
+ //.tlb_finish (cpu_inst_tlb_req ),
+ .tlb_finish (1'b1 ),
+ //.tlb_hit (itlb_hit ), // uty: test
+ //.tlb_hit (cpu_inst_tlb_req ),
+ .tlb_hit (1'b1 ),
.tlb_cache_recv (itlb_cache_recv ),
- .tlb_uncache (itlb_uncache ),
+ //.tlb_uncache (itlb_uncache ), // uty: test
+ .tlb_uncache (1'b0 ), // uty: test
.tlb_exccode (itlb_exccode ),
////cpu_control
diff --git a/IP/myCPU/tlb.v b/IP/myCPU/tlb.v
index a2e447a..0f6128c 100644
--- a/IP/myCPU/tlb.v
+++ b/IP/myCPU/tlb.v
@@ -33,6 +33,7 @@ module tlb
input i_s_cache_rcv ,
output reg i_s_finish_his,
output reg [PABITS-1:0] i_s_paddr_his ,
+// output [PABITS-1:0] i_s_paddr_uty ,
output i_s_hit ,
output i_s_uncached ,
output [ 5:0] i_s_exccode ,
@@ -184,6 +185,10 @@ assign i_dir_map_win_hit = i_dir_map_win0_hit | i_dir_map_win1_hit;
assign i_dir_map_win0_hit = csr_CRMD_PG & i_s_vaddr[`LSOC1K_DMW_VSEG] == csr_dir_map_win0[`LSOC1K_DMW_VSEG] & csr_dir_map_win0[{3'b0,csr_CRMD_PLV}];
assign i_dir_map_win1_hit = csr_CRMD_PG & i_s_vaddr[`LSOC1K_DMW_VSEG] == csr_dir_map_win1[`LSOC1K_DMW_VSEG] & csr_dir_map_win1[{3'b0,csr_CRMD_PLV}];
+
+// uty: test
+//assign i_s_paddr_uty = i_s_vaddr[PABITS-1:0];
+
always @(posedge clk) begin
if(i_s_req && i_unmapped_search) begin
i_s_hit_his <= 1'b1;
diff --git a/sims/verilator/run_func/config-software.mak b/sims/verilator/run_func/config-software.mak
index 8a8d0ed..4ca42e4 100644
--- a/sims/verilator/run_func/config-software.mak
+++ b/sims/verilator/run_func/config-software.mak
@@ -1,4 +1,4 @@
-RUN_SOFTWARE=func/func_uty5_jirl
+RUN_SOFTWARE=func/func_uty1_ld
TRACE_COMP=n
SIMU_TRACE=y
RUN_FUNC=y
diff --git a/sims/verilator/run_func/configure.sh b/sims/verilator/run_func/configure.sh
index 55f36dc..30083d2 100755
--- a/sims/verilator/run_func/configure.sh
+++ b/sims/verilator/run_func/configure.sh
@@ -282,6 +282,11 @@ do
mkdir -p ./obj/func
mkdir -p ./log/func
;;
+ func/test_cache_loop)
+ RUN_FUNC=y
+ mkdir -p ./obj/func
+ mkdir -p ./log/func
+ ;;
my_program)
RUN_FUNC=n
RUN_C=y