CPU7
The goal is to learn from the chiplab project, including how to use Verilator to establish a development and testing framework, how the chiplab CPU's modules are organized, and how to use the AXI and APB bus protocols to connect with memory and peripheral devices.
Step 1:
Move and rename the chiplab project to another directory and compile it with minimal components.
Define CPU7_HOME variable in ~/.bashrc
export CPU7_HOME=~/prjs/cpu7
sims/verilator/run_func/Makefile
include ../../../chip/config-generator.mak
config-generator.mak -> ../../../chip/config-generator.mak
The toolchains directory is required for running any tests: loongarch32-unknown-elf-objdump is needed when converting the binary into a RAM file. Without it, the script fails with:
./script.sh: line 21: ../../../toolchains/loongarch32_gnu/install/bin/loongarch32-unknown-elf-objdump: No such file or directory
verilator 做测试框架有点不方便,测试代码是用loongarch gcc编译后转成内存文件然后模拟的。
这种方式测已经写好的cpu还挺好的,得到的trace结果和qemu的trace结果做比较,看看有什么不同。
但我想从头一条一条指令地写测试用例,这种方式就不是很方便。到时候我不想带个qemu去生成golden trace,维护整个这套流程牵扯的代码太多了。
想为每条指令写测试例子,可以自己再手写个正确的log,或者就用qemu去生成,然后跟模拟生成的log做对比。这样不需要改testbench。
我还是想按之前的方式,用verilog的testbench写测试用例。用modelsim模拟,用quartus ii综合生成bitstream。
还有一个原因,就是模拟的这套cpu接口是简化了的。
chiplab有一套测试用的框架,包括CpuAxi, CpuRam,用c++代码模拟内存。
可以从verilator编译时给出的cpu接口看到,
// simu_top: simulation top-level wrapper around soc_top.
//
// It instantiates soc_top as `soc`, forwards the DATA_WIDTH / BUS_WIDTH /
// CPU_WIDTH parameters, and exposes the SoC's RAM port, write-back debug
// signals, UART and confreg (LED / 7-segment / switch / button) pins as
// module ports. When RAND_TEST is defined it additionally packs commit and
// exception status plus all 32 general registers into `rand_test_bus`.
//
// Compile-time macros used here:
//   AXI128 / AXI64 : select DATA_WIDTH = 128 / 64 (default 32).
//   ADDR64         : select BUS_WIDTH = CPU_WIDTH = 64 (default 32).
//   CPU_2CMT       : add the second set of write-back debug ports (debug1_*).
//   RAND_TEST      : add the rand_test_bus output and its packing logic.
module simu_top
#(
// Width of the RAM data path, chosen by the AXI128 / AXI64 macros.
`ifdef AXI128
parameter DATA_WIDTH = 128,
`elsif AXI64
parameter DATA_WIDTH = 64,
`else
parameter DATA_WIDTH = 32,
`endif
// Address-bus width and debug pc/data width, chosen by the ADDR64 macro.
`ifdef ADDR64
parameter BUS_WIDTH = 64,
parameter CPU_WIDTH = 64
`else
parameter BUS_WIDTH = 32,
parameter CPU_WIDTH = 32
`endif
)(
input aclk,
input aresetn,
// Interrupt input is disabled in this wrapper (port kept as commented-out code).
//input [ 7 :0] intrpt,
// enable_delay / random_seed are passed straight through to soc_top.
input enable_delay,
input [ 22 :0] random_seed,
// ram: separate read and write channels; ram_wen has one bit per data byte
// (DATA_WIDTH/8 bits).
output ram_ren ,
output [BUS_WIDTH-1 :0] ram_raddr,
input [DATA_WIDTH-1 :0] ram_rdata,
output [DATA_WIDTH/8-1 :0] ram_wen ,
output [BUS_WIDTH-1 :0] ram_waddr,
output [DATA_WIDTH-1 :0] ram_wdata
// debug: write-back pc, register-file write enable, register number and data
,
output [CPU_WIDTH-1 :0] debug0_wb_pc ,
output debug0_wb_rf_wen ,
output [ 4 :0] debug0_wb_rf_wnum ,
output [CPU_WIDTH-1 :0] debug0_wb_rf_wdata
// Second set of debug ports, present only when CPU_2CMT is defined.
`ifdef CPU_2CMT
,
output [CPU_WIDTH-1 :0] debug1_wb_pc ,
output debug1_wb_rf_wen ,
output [ 4 :0] debug1_wb_rf_wnum ,
output [CPU_WIDTH-1 :0] debug1_wb_rf_wdata
`endif
// Packed status bus; its width comes from the RAND_TEST_BUS_WD macro,
// which is defined outside this module.
`ifdef RAND_TEST
,
output [`RAND_TEST_BUS_WD-1:0] rand_test_bus
`endif
,
// UART pins plus the uart_* signals wired to soc_top's uart0_* ports.
inout uart_rx,
inout uart_tx,
output uart_enab,
output uart_rw,
output [3 :0] uart_addr,
output [7 :0] uart_datai,
// confreg I/O: LEDs, 7-segment display, switches, buttons
output [15:0] led,
output [1 :0] led_rg0,
output [1 :0] led_rg1,
// NOTE(review): num_csn / num_a_g are declared `reg` but are driven by the
// `soc` instance's output ports below; standard Verilog expects a net on an
// instance output. Verilator appears to accept this -- confirm before
// using another simulator or synthesis tool.
output reg [7 :0] num_csn,
output reg [6 :0] num_a_g,
input [7 :0] switch,
output [3 :0] btn_key_col,
input [3 :0] btn_key_row,
input [1 :0] btn_step
);
// The SoC under simulation; the parameters are forwarded unchanged.
soc_top #(
.BUS_WIDTH(BUS_WIDTH),
.DATA_WIDTH(DATA_WIDTH),
.CPU_WIDTH(CPU_WIDTH)
)
soc(
.aclk (aclk ),
.aresetn (aresetn ),
//.intrpt (intrpt ),
.enable_delay(enable_delay),
.random_seed (random_seed ),
// ram: soc_top names these ports sram_*
.sram_ren (ram_ren ),
.sram_raddr(ram_raddr),
.sram_rdata(ram_rdata),
.sram_wen (ram_wen ),
.sram_waddr(ram_waddr),
.sram_wdata(ram_wdata)
,
.debug0_wb_pc (debug0_wb_pc ),// O, CPU_WIDTH
.debug0_wb_rf_wen (debug0_wb_rf_wen ),// O, 1
.debug0_wb_rf_wnum (debug0_wb_rf_wnum ),// O, 5
.debug0_wb_rf_wdata(debug0_wb_rf_wdata) // O, CPU_WIDTH
`ifdef CPU_2CMT
,
.debug1_wb_pc (debug1_wb_pc ),// O, CPU_WIDTH
.debug1_wb_rf_wen (debug1_wb_rf_wen ),// O, 1
.debug1_wb_rf_wnum (debug1_wb_rf_wnum ),// O, 5
.debug1_wb_rf_wdata(debug1_wb_rf_wdata) // O, CPU_WIDTH
`endif
,
.UART_RX (uart_rx ),
.UART_TX (uart_tx ),
//use for simulation
.uart0_enab (uart_enab ),
.uart0_rw (uart_rw ),
.uart0_addr (uart_addr ),
.uart0_datai (uart_datai ),
// For confreg
.led (led ),
.led_rg0 (led_rg0 ),
.led_rg1 (led_rg1 ),
.num_csn (num_csn ),
.num_a_g (num_a_g ),
.switch (switch ),
.btn_key_col (btn_key_col ),
.btn_key_row (btn_key_row ),
.btn_step (btn_step )
);
// Random-test support: collect commit information and pack it, together with
// exception status and the general registers, into rand_test_bus.
// The CMTBUS_*, CR_BADVADDR, EXBUS_* and GR_RTL macros are defined outside
// this module; presumably they expand to hierarchical references into the
// core -- verify against their definitions.
`ifdef RAND_TEST
wire cmtbus_valid0;
wire [3:0] cmtbus_cmtnum0;
wire [3:0] commit_num;
wire cmt_last_split;
assign cmtbus_valid0 = `CMTBUS_VALID0;
assign cmtbus_cmtnum0 = `CMTBUS_CMTNUM0;
`ifdef CPU_2CMT
wire cmtbus_valid1;
wire [3:0] cmtbus_cmtnum1;
assign cmtbus_valid1 = `CMTBUS_VALID1;
assign cmtbus_cmtnum1 = `CMTBUS_CMTNUM1;
// Total commit count is the 4-bit sum of both commit buses.
assign commit_num = cmtbus_cmtnum0 + cmtbus_cmtnum1;
// Bus 1 takes priority: if it is valid, the flag is (cmtnum1 == 0);
// otherwise, if bus 0 is valid, (cmtnum0 == 0); otherwise 0.
assign cmt_last_split = cmtbus_valid1 ? (cmtbus_cmtnum1 == 0) :
cmtbus_valid0 ? (cmtbus_cmtnum0 == 0) :
1'b0;
`else
// Single commit bus: use its count directly.
assign commit_num = cmtbus_cmtnum0;
assign cmt_last_split = cmtbus_valid0 ? (cmtbus_cmtnum0 == 0) :
1'b0;
`endif
// Bus layout, most-significant field first. The scalar fields are
// zero-extended by the literal padding shown (commit_num and cmt_last_split
// each occupy 32 bits); the 26'b0 padding suggests EXBUS_EXCODE is 6 bits wide
// -- TODO confirm. GR_RTL[0] ends up in the least-significant position.
assign rand_test_bus ={
{28'b0,commit_num},
{31'b0,cmt_last_split},
{`CR_BADVADDR},
{`EXBUS_EPC},
{26'b0,`EXBUS_EXCODE},
{31'b0,`EXBUS_ERET},
{31'b0,`EXBUS_EX},
`GR_RTL[31],`GR_RTL[30],`GR_RTL[29],`GR_RTL[28],
`GR_RTL[27],`GR_RTL[26],`GR_RTL[25],`GR_RTL[24],
`GR_RTL[23],`GR_RTL[22],`GR_RTL[21],`GR_RTL[20],
`GR_RTL[19],`GR_RTL[18],`GR_RTL[17],`GR_RTL[16],
`GR_RTL[15],`GR_RTL[14],`GR_RTL[13],`GR_RTL[12],
`GR_RTL[11],`GR_RTL[10],`GR_RTL[ 9],`GR_RTL[ 8],
`GR_RTL[ 7],`GR_RTL[ 6],`GR_RTL[ 5],`GR_RTL[ 4],
`GR_RTL[ 3],`GR_RTL[ 2],`GR_RTL[ 1],`GR_RTL[ 0]
};
`endif
endmodule
simu_top是verilator编译时指定的模块入口。
SIMU_TOP_NAME=simu_top
...
verilator:${VERILATOR_SRC}
@echo "============================================================================================================="
@echo "============================================================================================================="
@echo "COMPILING verilog..."
@echo "============================================================================================================="
@echo "============================================================================================================="
verilator ${VERILATOR_INCLUDE} ${WAVEOPTION} --savable --threads ${THREAD} -O3 -Wno-fatal -DSIMULATION=1 -Wall --trace -cc ${VFLAGS} ${SIMU_TOP_NAME}.v ${VERILATOR_SRC} 2>&1 | tee log/compile.log
make -C ${OBJ_DIR} -f "V${SIMU_TOP_NAME}.mk"
内存接口和ALTERA的1-port block ram接口差不多。
// ram
output ram_ren ,
output [BUS_WIDTH-1 :0] ram_raddr,
input [DATA_WIDTH-1 :0] ram_rdata,
output [DATA_WIDTH/8-1 :0] ram_wen ,
output [BUS_WIDTH-1 :0] ram_waddr,
output [DATA_WIDTH-1 :0] ram_wdata
verilator testbench里的c++部分通过读内存文件满足这个接口。
里面的soc_top是一样的接口。
soc_top是在chip/soc_demo/sim/soc_top.v里定义的。
module soc_top#(
parameter BUS_WIDTH = 32,
parameter DATA_WIDTH = 64,
parameter CPU_WIDTH = 32
)
(
input wire aresetn ,
input wire aclk ,
input wire enable_delay ,
input wire [22:0] random_seed ,
output wire [CPU_WIDTH-1:0] debug0_wb_pc ,
output wire [CPU_WIDTH-1:0] debug0_wb_rf_wdata,
output wire debug0_wb_rf_wen ,
output wire [4 :0] debug0_wb_rf_wnum ,
`ifdef CPU_2CMT
output wire [CPU_WIDTH-1:0] debug1_wb_pc ,
output wire [CPU_WIDTH-1:0] debug1_wb_rf_wdata,
output wire debug1_wb_rf_wen ,
output wire [4 :0] debug1_wb_rf_wnum ,
`endif
//------gpio----------------
output [15:0] led,
output [1 :0] led_rg0,
output [1 :0] led_rg1,
output [7 :0] num_csn,
output [6 :0] num_a_g,
input [7 :0] switch,
output [3 :0] btn_key_col,
input [3 :0] btn_key_row,
input [1 :0] btn_step,
//ram
output wire [BUS_WIDTH -1:0] sram_raddr ,
input wire [DATA_WIDTH-1:0] sram_rdata ,
output wire sram_ren ,
output wire [BUS_WIDTH -1:0] sram_waddr ,
output wire [DATA_WIDTH-1:0] sram_wdata ,
output wire [DATA_WIDTH/8-1:0] sram_wen ,
//use for simulation
output uart0_enab,
output uart0_rw,
output [3 :0] uart0_addr,
output [7 :0] uart0_datai,
//------uart-------
inout UART_RX,
inout UART_TX
/*
//------DDR3 interface------
inout [15:0] ddr3_dq,
output [12:0] ddr3_addr,
output [2 :0] ddr3_ba,
output ddr3_ras_n,
output ddr3_cas_n,
output ddr3_we_n,
output ddr3_odt,
output ddr3_reset_n,
output ddr3_cke,
output [1:0] ddr3_dm,
inout [1:0] ddr3_dqs_p,
inout [1:0] ddr3_dqs_n,
output ddr3_ck_p,
output ddr3_ck_n,
//------mac controller-------
//TX
input mtxclk_0,
output mtxen_0,
output [3:0] mtxd_0,
output mtxerr_0,
//RX
input mrxclk_0,
input mrxdv_0,
input [3:0] mrxd_0,
input mrxerr_0,
input mcoll_0,
input mcrs_0,
// MIIM
output mdc_0,
inout mdio_0,
output phy_rstn,
//------EJTAG-------
input EJTAG_TRST,
input EJTAG_TCK,
input EJTAG_TDI,
input EJTAG_TMS,
output EJTAG_TDO,
//------nand-------
output NAND_CLE ,
output NAND_ALE ,
input NAND_RDY ,
inout [7:0] NAND_DATA,
output NAND_RD ,
output NAND_CE , //low active
output NAND_WR ,
//------spi flash-------
output SPI_CLK,
output SPI_CS,
inout SPI_MISO,
inout SPI_MOSI
*/
);
可以看到soc的引脚应该是ddr3,uart,EJTAG,nand,spi flash这些。mac是网卡吗?从端口名看(mtxclk/mtxd/mrxclk/mrxd/mcoll/mcrs以及mdc/mdio),它是以太网MAC控制器的MII接口,也就是网卡的控制器部分,外面还需要接一个PHY芯片。
ram在soc的外面是为了模拟测试,模拟的是内存,而不是cache用的sram。
soc_top里面包含core_top,理解为一个核心,核心里面包括cache和processor core。
// Instantiate core_top as `cpu`: interrupt input, clock/reset, the five AXI
// channel signal groups (AR, AW, R, W, B) wired to cpu_* nets, and the
// write-back debug outputs.
// NOTE(review): the "I/O, width" annotations in the trailing comments are kept
// from the original source; the actual widths depend on declarations that are
// not visible here -- confirm against core_top's port list.
core_top cpu
(
.intrpt (interrupt ),// I, 8
.aclk (aclk ),// I, 1
.aresetn (aresetn ),// I, 1
// AXI read address channel (AR)
.arid (cpu_arid ),// O, 4
.araddr (cpu_araddr ),// O, 64
.arlen (cpu_arlen ),// O, 4
.arsize (cpu_arsize ),// O, 3
.arburst (cpu_arburst ),// O, 2
.arlock (cpu_arlock ),// O, 2
.arcache (cpu_arcache ),// O, 4
.arprot (cpu_arprot ),// O, 3
.arvalid (cpu_arvalid ),// O, 1
.arready (cpu_arready ),// I, 1
// AXI write address channel (AW)
.awid (cpu_awid ),// O, 4
.awaddr (cpu_awaddr ),// O, 64
.awlen (cpu_awlen ),// O, 4
.awsize (cpu_awsize ),// O, 3
.awburst (cpu_awburst ),// O, 2
.awlock (cpu_awlock ),// O, 2
.awcache (cpu_awcache ),// O, 4
.awprot (cpu_awprot ),// O, 3
.awvalid (cpu_awvalid ),// O, 1
.awready (cpu_awready ),// I, 1
// AXI read data channel (R)
.rid (cpu_rid ),// I, 4
.rdata (cpu_rdata ),// I, 128
.rresp (cpu_rresp ),// I, 2
.rlast (cpu_rlast ),// I, 1
.rvalid (cpu_rvalid ),// I, 1
.rready (cpu_rready ),// O, 1
// AXI write data channel (W)
.wid (cpu_wid ),// O, 4
.wdata (cpu_wdata ),// O, 128
.wstrb (cpu_wstrb ),// O, 16
.wlast (cpu_wlast ),// O, 1
.wvalid (cpu_wvalid ),// O, 1
.wready (cpu_wready ),// I, 1
// AXI write response channel (B)
.bid (cpu_bid ),// I, 4
.bresp (cpu_bresp ),// I, 2
.bvalid (cpu_bvalid ),// I, 1
.bready (cpu_bready ) // O, 1
,
// Write-back debug outputs, forwarded to the enclosing module's debug0_* ports.
.debug0_wb_pc (debug0_wb_pc ),// O, 64
.debug0_wb_rf_wen (debug0_wb_rf_wen ),// O, 1
.debug0_wb_rf_wnum (debug0_wb_rf_wnum ),// O, 5
.debug0_wb_rf_wdata(debug0_wb_rf_wdata) // O, 64
// Second set of debug outputs, connected only when CPU_2CMT is defined.
`ifdef CPU_2CMT
,
.debug1_wb_pc (debug1_wb_pc ),// O, 64
.debug1_wb_rf_wen (debug1_wb_rf_wen ),// O, 1
.debug1_wb_rf_wnum (debug1_wb_rf_wnum ),// O, 5
.debug1_wb_rf_wdata(debug1_wb_rf_wdata) // O, 64
`endif
);
+------------+
| | +--------+ +------------------+
| core_top |__master____slave__| random |__master_____slave__| |
| axi_master | | delay | | axi_slave_mux | (master)
| | | | | |
+------------+ +--------+ +------------------+
(master) (slave) | | | | |
| | | | |
s0| |s1 |s2 |s3|s4
| | | | |________________________
________________| | | |___________ |
| | | | |
+------------+ +------+ +-------+ +------------+ +--------+
|soc_axi_sram| | | |axi2apb| |soc_axi_sram| | |
| _bridge | | spi | |_misc | |_bridge | | mac |
+------------+ | | +-------+ +------------+ | |
| +------+ | | +--------+
+------------+ +------+ +------------+
| | | | | conf |
| sram | | uart | | sram |
| | | | +------------+
+------------+ +------+
core_top对外的信号主要是一个axi master。
通过一个axi_slave_mux连接几个slave。这里为了简单,用sram作为内存;
在实际硬件上,这个位置应该对应一个ddr controller。
如果不考虑测试,不连这个random delay,core_top应该就直接连mux了。
+----------------------------------------------------------------------------------------+
| soc_top |
| | +------------+
| +-------------------------------------------+ +------+ +------------+ | | |
| | core_top | | | |soc_axi_sram|---------| sram |
| | | | a | | _bridge | | | |
| | +--------------------+ +--------+ | | x | +------------+ | +------------+
| | | mycpu_top | | | | | i | |
| | | | | icache | | | _ | |
| | | | | | | | s | +------------+ | +------------+
| | | | +--------+ | | l | |soc_axi_sram|---------| conf |
| | | | | | a | | _bridge | | | sram |
| | | | +--------+ | | v | +------------+ | +------------+
| | | | | | | | e | |
| | | | | dcache | | | _ | |
| | +--------------------+ | | | | m | |
| | +--------+ | | u | +-------+ | +------+
| | | | x | |axi2apb|--------------| |
| +-------------------------------------------+ | | |_misc | | | uart |
| | | +-------+ | | |
| +-----------------------------------------------+ | | +------+
| | | |
| | | |
| +------------------------------------------------------+ |
| | | | |
+-----------------|-----------------|-----------------|----------------------------------+
| | |
| | |
+----------------+ +------+ +--------+
| | | | | |
| ddr controller | | spi | | mac |
| | | | | |
+----------------+ +------+ +--------+
没搞明白,从注释掉的代码上看,为什么spi,mac是直接连在axi_slave_mux,而uart和sram都需要再实现一个bridge。
ddr controller应该怎样连呢,需要看一个实现了ddr与core用axi连接的例子才行。