基于AXI4总线卷积FPGA加速IP核的尝试
来源:互联网 发布:java毫秒数转时间 在线 编辑:程序博客网 时间:2024/05/21 10:29
本文先总结不同AXI IP核的实现方法、性能对比、性能差异的分析,以及可能改进的方面。使用的硬件平台是Zedboard。
不同的AXI总线卷积加速模块的概况
这次实现并逐渐优化了三个版本的卷积加速模块,先简要描述各个版本的主要内容。
版本一
版本一主要是用来测试AXI总线IP核的实现可能。
- 该模块拥有19个32位寄存器
- 其中前9个寄存器用来保存需要计算的值
- 后面9个寄存器用来保存卷积核
- 在读取第19个寄存器的地址的时候计算9个寄存器的卷积和(该计算可以在一个时钟周期内完成)
- 9个寄存器单独赋值,程序中分别向对应地址写入内容,通过总线进行传输。
- 故乐观的来算,需要10个总线周期可以获取一个输出
可以从驱动的书写简单理解一下:
/*
 * Version 1 driver: for every output cell, write the nine window values to
 * the accelerator one register at a time, then read the result back.
 *
 * Window element arr[i - r][j - c] (r, c in 0..2) goes to byte offset
 * 4 * (3 * r + c) from the AXI base address; the result is read from byte
 * offset 72 and stored in the global res[i][j].
 *
 * `filter` is not used here.
 * NOTE(review): presumably the kernel registers are loaded by the caller
 * beforehand - confirm.
 *
 * A '=' is printed after every row whose index is a multiple of 15.
 */
void Conv_HW(int filter[3][3], int arr[100][100], int filterW, int filterH, int arrW, int arrH) {
    const int row_limit = filterH + arrH - 3;
    const int col_limit = filterW + arrW - 3;
    int row = 2;

    while (row < row_limit) {
        int col;

        for (col = 2; col < col_limit; col++) {
            int back_r, back_c;

            /* Same write order as the unrolled form: offsets 0, 4, ..., 32. */
            for (back_r = 0; back_r < 3; back_r++) {
                for (back_c = 0; back_c < 3; back_c++) {
                    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 4 * (3 * back_r + back_c),
                              arr[row - back_r][col - back_c]);
                }
            }
            res[row][col] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);
        }

        if (row % 15 == 0) {
            printf("=");
        }
        row++;
    }
}
版本一性能
- 版本一性能最惨,由于没有时间戳,目测软件计算速度远远快于FPGA核心运算速度。
- 版本一的改进方向就是引入滑动窗口,能够最大程度减少总线周期。
版本二
版本二引入滑动窗口,和初期设计的概念相同。
该模块拥有19个32位寄存器
其中前9个寄存器用来保存需要计算的值
后面9个寄存器用来保存卷积核
在读取第19个寄存器的地址的时候计算9个寄存器的卷积和(该计算可以在一个时钟周期内完成)
三个寄存器滑动赋值,该计算窗口在计算矩阵上滑动。除了每行开始时需要额外预载两列数据(每列3次写入)之外,后面的每一个计算只需要四个总线周期(3次写入加1次读取)。
可以通过写的驱动简单理解一下:
void Conv_HW(int filter[3][3], int arr[100][100], int arrW, int arrH) {int i, j;i = 2; j = 2;for (i = 2; i < arrH; i++) { //pre load Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j - 1]); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j - 1]); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j - 1]); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j]); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j]); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j]); for (j = 2; j < arrW; j++) { Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j + 1]); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j + 1]); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j + 1]); res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72); }}}
版本二性能
测试样本为 500*500 的32bit单位的矩阵,计算200次。
软件消耗33.78秒,卷积IP核心消耗40.25秒。
这样的结果还是非常不乐观,分析可能有两种因素限制了IP核的速度。
- 两个寄存器的乘法LUT太大,无法硬件优化
- 总线周期太慢太慢
版本三对于这两种可能进行探索。
版本二的FPGA部分核心代码
// Memory-mapped register select and write logic.
// Write data is accepted when axi_awready, S_AXI_AWVALID, axi_wready and
// S_AXI_WVALID are all asserted. Write strobes select which byte lanes of the
// addressed register are updated. All registers are cleared while the
// active-low reset is applied.
//
// Sliding window: a write to register 2, 5 or 8 also shifts that register row
// one place (reg1 -> reg0, reg2 -> reg1, and likewise for 3..5 and 6..8), so
// the newly written word enters at the end of the row.

// Returns `current` with every byte whose strobe bit is set replaced by the
// matching byte of `incoming`.
// NOTE(review): assumes the slv_reg* registers are C_S_AXI_DATA_WIDTH bits
// wide; their declarations are not visible here - confirm.
function [C_S_AXI_DATA_WIDTH-1:0] strobed_merge;
  input [C_S_AXI_DATA_WIDTH-1:0]     current;
  input [C_S_AXI_DATA_WIDTH-1:0]     incoming;
  input [(C_S_AXI_DATA_WIDTH/8)-1:0] strobes;
  integer lane;
  begin
    strobed_merge = current;
    for ( lane = 0; lane <= (C_S_AXI_DATA_WIDTH/8)-1; lane = lane+1 )
      if ( strobes[lane] == 1 )
        strobed_merge[(lane*8) +: 8] = incoming[(lane*8) +: 8];
  end
endfunction

// Write enable: address and data are both valid and both accepted.
assign slv_reg_wren = axi_wready && S_AXI_WVALID && axi_awready && S_AXI_AWVALID;

always @( posedge S_AXI_ACLK )
begin
  if ( S_AXI_ARESETN == 1'b0 )
    begin
      slv_reg0  <= 0;
      slv_reg1  <= 0;
      slv_reg2  <= 0;
      slv_reg3  <= 0;
      slv_reg4  <= 0;
      slv_reg5  <= 0;
      slv_reg6  <= 0;
      slv_reg7  <= 0;
      slv_reg8  <= 0;
      slv_reg9  <= 0;
      slv_reg10 <= 0;
      slv_reg11 <= 0;
      slv_reg12 <= 0;
      slv_reg13 <= 0;
      slv_reg14 <= 0;
      slv_reg15 <= 0;
      slv_reg16 <= 0;
      slv_reg17 <= 0;
      // slv_reg18 is not a stored register; its address is a computed value on reads.
    end
  else if ( slv_reg_wren )
    begin
      case ( axi_awaddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
        // Window row 0
        5'h00: slv_reg0 <= strobed_merge(slv_reg0, S_AXI_WDATA, S_AXI_WSTRB);
        5'h01: slv_reg1 <= strobed_merge(slv_reg1, S_AXI_WDATA, S_AXI_WSTRB);
        5'h02:
          begin
            slv_reg0 <= slv_reg1;
            slv_reg1 <= slv_reg2;
            slv_reg2 <= strobed_merge(slv_reg2, S_AXI_WDATA, S_AXI_WSTRB);
          end
        // Window row 1
        5'h03: slv_reg3 <= strobed_merge(slv_reg3, S_AXI_WDATA, S_AXI_WSTRB);
        5'h04: slv_reg4 <= strobed_merge(slv_reg4, S_AXI_WDATA, S_AXI_WSTRB);
        5'h05:
          begin
            slv_reg3 <= slv_reg4;
            slv_reg4 <= slv_reg5;
            slv_reg5 <= strobed_merge(slv_reg5, S_AXI_WDATA, S_AXI_WSTRB);
          end
        // Window row 2
        5'h06: slv_reg6 <= strobed_merge(slv_reg6, S_AXI_WDATA, S_AXI_WSTRB);
        5'h07: slv_reg7 <= strobed_merge(slv_reg7, S_AXI_WDATA, S_AXI_WSTRB);
        5'h08:
          begin
            slv_reg6 <= slv_reg7;
            slv_reg7 <= slv_reg8;
            slv_reg8 <= strobed_merge(slv_reg8, S_AXI_WDATA, S_AXI_WSTRB);
          end
        // Registers 9..17: plain strobed writes (multiplied with registers 0..8 on a read of address 5'h12)
        5'h09: slv_reg9  <= strobed_merge(slv_reg9,  S_AXI_WDATA, S_AXI_WSTRB);
        5'h0A: slv_reg10 <= strobed_merge(slv_reg10, S_AXI_WDATA, S_AXI_WSTRB);
        5'h0B: slv_reg11 <= strobed_merge(slv_reg11, S_AXI_WDATA, S_AXI_WSTRB);
        5'h0C: slv_reg12 <= strobed_merge(slv_reg12, S_AXI_WDATA, S_AXI_WSTRB);
        5'h0D: slv_reg13 <= strobed_merge(slv_reg13, S_AXI_WDATA, S_AXI_WSTRB);
        5'h0E: slv_reg14 <= strobed_merge(slv_reg14, S_AXI_WDATA, S_AXI_WSTRB);
        5'h0F: slv_reg15 <= strobed_merge(slv_reg15, S_AXI_WDATA, S_AXI_WSTRB);
        5'h10: slv_reg16 <= strobed_merge(slv_reg16, S_AXI_WDATA, S_AXI_WSTRB);
        5'h11: slv_reg17 <= strobed_merge(slv_reg17, S_AXI_WDATA, S_AXI_WSTRB);
        // Any other address (including 5'h12): every register keeps its value.
        default: ;
      endcase
    end
end

// Implement memory mapped register select and read logic generation
// Slave register read enable is asserted when valid address is available
// and the slave is ready to accept the read address.
// Read enable: a read address has been accepted and no read response is
// currently pending.
assign slv_reg_rden = axi_arready & S_AXI_ARVALID & ~axi_rvalid;

// Read address decoding (combinational).
// Addresses 5'h00..5'h11 return the stored registers. Address 5'h12 returns
// the sum of products of registers 0..8 with registers 9..17, evaluated
// combinationally from their current values.
// NOTE(review): the sum is presumably truncated to the width of reg_data_out;
// its declaration is not visible here - confirm.
//
// Blocking assignments are used because this is a combinational block; the
// original used nonblocking (<=) assignments here.
always @(*)
begin
  case ( axi_araddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
    5'h00   : reg_data_out = slv_reg0;
    5'h01   : reg_data_out = slv_reg1;
    5'h02   : reg_data_out = slv_reg2;
    5'h03   : reg_data_out = slv_reg3;
    5'h04   : reg_data_out = slv_reg4;
    5'h05   : reg_data_out = slv_reg5;
    5'h06   : reg_data_out = slv_reg6;
    5'h07   : reg_data_out = slv_reg7;
    5'h08   : reg_data_out = slv_reg8;
    5'h09   : reg_data_out = slv_reg9;
    5'h0A   : reg_data_out = slv_reg10;
    5'h0B   : reg_data_out = slv_reg11;
    5'h0C   : reg_data_out = slv_reg12;
    5'h0D   : reg_data_out = slv_reg13;
    5'h0E   : reg_data_out = slv_reg14;
    5'h0F   : reg_data_out = slv_reg15;
    5'h10   : reg_data_out = slv_reg16;
    5'h11   : reg_data_out = slv_reg17;
    5'h12   : reg_data_out = slv_reg0 * slv_reg9
                           + slv_reg1 * slv_reg10
                           + slv_reg2 * slv_reg11
                           + slv_reg3 * slv_reg12
                           + slv_reg4 * slv_reg13
                           + slv_reg5 * slv_reg14
                           + slv_reg6 * slv_reg15
                           + slv_reg7 * slv_reg16
                           + slv_reg8 * slv_reg17;
    default : reg_data_out = 0;
  endcase
end
版本三
先尝试生成更小的LUT
- 该模块拥有19个32位寄存器
- 其中前9个寄存器用来保存需要计算的值
- 卷积核固定在Verilog中,用来生成更小的LUT
- 一个计算只需要四个总线周期
性能测试
仍然是软件消耗33秒,卷积IP核心消耗40秒。
基本可以否决是LUT的问题。
下面测试AXI总线问题:
假设所有数据均来自于FPGA,无需从总线写入:
/*
 * Version 3 bus experiment: only the result read is performed.
 *
 * For every cell (row, col) with 2 <= row < arrH and 2 <= col < arrW, one
 * 32-bit read of byte offset 72 from the AXI base address is stored in the
 * global res[row][col]. Nothing is written to the accelerator; `filter` and
 * `arr` are not used.
 */
void Conv_HW(int filter[3][3], int arr[100][100], int arrW, int arrH) {
    int row = 2;

    while (row < arrH) {
        int col = 2;

        while (col < arrW) {
            res[row][col] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);
            col++;
        }
        row++;
    }
}
只需要9.47秒即可完成计算,并传回CPU!
总结
至此,基本上可以否决利用AXI传数据的可能(本文使用的是由CPU逐个寄存器读写的AXI4-Lite从接口),所有需要利用AXI总线传输数据的模块均会被总线周期所连累,在优化了传输后,仍然无法解决该问题。确实需要一个更快的方式来传输数据。
在Altera的NIOS2中,直接利用IO口传输数据,无需总线周期,再加上NIOS II内核没有流水线优化,所以硬件确实比较快。
附1:AXI4 总线的 FPGA 接口部分
先看总线接口:
// Tail of the AXI4-Lite slave port list, followed by the internal registers
// that hold the slave-side channel signals. The module header and parameter
// list precede this excerpt and are not shown.

// Users to add ports here

// User ports ends
// Do not modify the ports beyond this line

// Global Clock Signal
input wire S_AXI_ACLK,
// Global Reset Signal. This Signal is Active LOW
input wire S_AXI_ARESETN,
// Write address (issued by master, accepted by Slave)
input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_AWADDR,
// Write channel Protection type. This signal indicates the
// privilege and security level of the transaction, and whether
// the transaction is a data access or an instruction access.
input wire [2 : 0] S_AXI_AWPROT,
// Write address valid. This signal indicates that the master signaling
// valid write address and control information. High means the address is valid.
input wire S_AXI_AWVALID,
// Write address ready. This signal indicates that the slave is ready
// to accept an address and associated control signals.
// High: the slave is idle and can take the address; low: the slave is busy.
// Note: this is the ADDRESS channel; the data channel follows below.
output wire S_AXI_AWREADY,
// Write data (issued by master, accepted by Slave)
input wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_WDATA,
// Write strobes. This signal indicates which byte lanes hold
// valid data. There is one write strobe bit for each eight
// bits of the write data bus.
input wire [(C_S_AXI_DATA_WIDTH/8)-1 : 0] S_AXI_WSTRB,
// Write valid. This signal indicates that valid write
// data and strobes are available. High means the data is valid.
input wire S_AXI_WVALID,
// Write ready. This signal indicates that the slave
// can accept the write data.
// High: the slave is idle and can take the data; low: the slave is busy.
output wire S_AXI_WREADY,
// Write response. This signal indicates the status
// of the write transaction.
output wire [1 : 0] S_AXI_BRESP,
// Write response valid. This signal indicates that the channel
// is signaling a valid write response.
output wire S_AXI_BVALID,
// Response ready. This signal indicates that the master
// can accept a write response.
// High: the master is idle and can take the response; low: the master is busy.
input wire S_AXI_BREADY,
// Read address (issued by master, accepted by Slave)
input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_ARADDR,
// Protection type. This signal indicates the privilege
// and security level of the transaction, and whether the
// transaction is a data access or an instruction access.
// (The original author's note recommends driving 000.)
input wire [2 : 0] S_AXI_ARPROT,
// Read address valid. This signal indicates that the channel
// is signaling valid read address and control information.
input wire S_AXI_ARVALID,
// Read address ready. This signal indicates that the slave is
// ready to accept an address and associated control signals.
// High: the slave is idle and can take the address; low: the slave is busy.
output wire S_AXI_ARREADY,
// Read data (issued by slave)
output wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_RDATA,
// Read response. This signal indicates the status of the
// read transfer.
output wire [1 : 0] S_AXI_RRESP,
// Read valid. This signal indicates that the channel is
// signaling the required read data.
output wire S_AXI_RVALID,
// Read ready. This signal indicates that the master can
// accept the read data and response information.
input wire S_AXI_RREADY
);

// AXI4LITE signals
reg [C_S_AXI_ADDR_WIDTH-1 : 0] axi_awaddr;
reg axi_awready;
reg axi_wready;
reg [1 : 0] axi_bresp;
reg axi_bvalid;
reg [C_S_AXI_ADDR_WIDTH-1 : 0] axi_araddr;
reg axi_arready;
reg [C_S_AXI_DATA_WIDTH-1 : 0] axi_rdata;
reg [1 : 0] axi_rresp;
reg axi_rvalid;
其中最为重要的读取总线信号寻址的部分:
// Register write logic for the ten-register variant (slv_reg0..slv_reg9).
// A write is performed when address and data are both valid and both
// accepted; write strobes select which byte lanes of the addressed register
// are updated. All registers are cleared while the active-low reset is
// applied.

// Returns `current` with every byte whose strobe bit is set replaced by the
// matching byte of `incoming`; a byte is only taken when its strobe bit is 1.
// NOTE(review): assumes the slv_reg* registers are C_S_AXI_DATA_WIDTH bits
// wide; their declarations are not visible here - confirm.
function [C_S_AXI_DATA_WIDTH-1:0] wstrb_merge;
  input [C_S_AXI_DATA_WIDTH-1:0]     current;
  input [C_S_AXI_DATA_WIDTH-1:0]     incoming;
  input [(C_S_AXI_DATA_WIDTH/8)-1:0] strobes;
  integer lane;
  begin
    wstrb_merge = current;
    for ( lane = 0; lane <= (C_S_AXI_DATA_WIDTH/8)-1; lane = lane+1 )
      if ( strobes[lane] == 1 )
        wstrb_merge[(lane*8) +: 8] = incoming[(lane*8) +: 8];
  end
endfunction

assign slv_reg_wren = axi_wready && S_AXI_WVALID && axi_awready && S_AXI_AWVALID;

always @( posedge S_AXI_ACLK )
begin
  if ( S_AXI_ARESETN == 1'b0 )
    begin
      slv_reg0 <= 0;
      slv_reg1 <= 0;
      slv_reg2 <= 0;
      slv_reg3 <= 0;
      slv_reg4 <= 0;
      slv_reg5 <= 0;
      slv_reg6 <= 0;
      slv_reg7 <= 0;
      slv_reg8 <= 0;
      slv_reg9 <= 0;
    end
  else if ( slv_reg_wren )
    begin
      // Address decoding (translated from the original author's notes):
      //  - with 32-bit registers the low 2 address bits address the 4 bytes
      //    inside a word, so ADDR_LSB = 2;
      //  - with 64-bit registers it is 3 bits / 8 bytes, so ADDR_LSB = 3;
      //  - OPT_MEM_ADDR_BITS covers the register index; ten registers need a
      //    4-bit index. The parameter's actual value is not visible here.
      case ( axi_awaddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
        4'h0: slv_reg0 <= wstrb_merge(slv_reg0, S_AXI_WDATA, S_AXI_WSTRB);
        4'h1: slv_reg1 <= wstrb_merge(slv_reg1, S_AXI_WDATA, S_AXI_WSTRB);
        4'h2: slv_reg2 <= wstrb_merge(slv_reg2, S_AXI_WDATA, S_AXI_WSTRB);
        4'h3: slv_reg3 <= wstrb_merge(slv_reg3, S_AXI_WDATA, S_AXI_WSTRB);
        4'h4: slv_reg4 <= wstrb_merge(slv_reg4, S_AXI_WDATA, S_AXI_WSTRB);
        4'h5: slv_reg5 <= wstrb_merge(slv_reg5, S_AXI_WDATA, S_AXI_WSTRB);
        4'h6: slv_reg6 <= wstrb_merge(slv_reg6, S_AXI_WDATA, S_AXI_WSTRB);
        4'h7: slv_reg7 <= wstrb_merge(slv_reg7, S_AXI_WDATA, S_AXI_WSTRB);
        4'h8: slv_reg8 <= wstrb_merge(slv_reg8, S_AXI_WDATA, S_AXI_WSTRB);
        4'h9: slv_reg9 <= wstrb_merge(slv_reg9, S_AXI_WDATA, S_AXI_WSTRB);
        // Any other address: every register keeps its value.
        default: ;
      endcase
    end
end
附2:AXI4的测试模块与仿真测试
`timescale 1ns/1ns

// Testbench for conv_v1_0_S00_AXI: writes nine values to registers 0..8
// (byte addresses 0x00..0x20), then reads byte address 0x24 and dumps all
// waveforms to test.vcd.
//
// Changes relative to the original:
//  - `timescale and `module` are on separate lines again.
//  - The clock loop uses `i = i + 1` instead of `i++`, which is not
//    Verilog-2001/2005.
//  - awprot/arprot/bready/rready were declared but never driven (Z); they are
//    now tied to constants so the response channels can complete.
//  - Stimulus used fixed #4 delays that changed signals exactly on the rising
//    clock edge and never checked READY. Writes and reads now go through
//    tasks that drive on the falling edge and wait for the handshake.
//  - The VALID signals are initialised and de-asserted after each transfer,
//    and the read data is printed.
module conv_axi_test();

    parameter integer C_S00_AXI_DATA_WIDTH = 32;
    parameter integer C_S00_AXI_ADDR_WIDTH = 6;

    reg                                   s00_axi_aclk;
    // Global reset, active low
    reg                                   s00_axi_aresetn;
    reg  [C_S00_AXI_ADDR_WIDTH-1 : 0]     s00_axi_awaddr;
    wire [2 : 0]                          s00_axi_awprot;
    reg                                   s00_axi_awvalid;
    wire                                  s00_axi_awready;
    reg  [C_S00_AXI_DATA_WIDTH-1 : 0]     s00_axi_wdata;
    reg  [(C_S00_AXI_DATA_WIDTH/8)-1 : 0] s00_axi_wstrb;
    reg                                   s00_axi_wvalid;
    wire                                  s00_axi_wready;
    wire [1 : 0]                          s00_axi_bresp;
    wire                                  s00_axi_bvalid;
    wire                                  s00_axi_bready;
    reg  [C_S00_AXI_ADDR_WIDTH-1 : 0]     s00_axi_araddr;
    wire [2 : 0]                          s00_axi_arprot;
    reg                                   s00_axi_arvalid;
    wire                                  s00_axi_arready;
    wire [C_S00_AXI_DATA_WIDTH-1 : 0]     s00_axi_rdata;
    wire [1 : 0]                          s00_axi_rresp;
    wire                                  s00_axi_rvalid;
    wire                                  s00_axi_rready;

    // The testbench master always accepts responses and uses protection type 000.
    assign s00_axi_awprot = 3'b000;
    assign s00_axi_arprot = 3'b000;
    assign s00_axi_bready = 1'b1;
    assign s00_axi_rready = 1'b1;

    conv_v1_0_S00_AXI # (
        .C_S_AXI_DATA_WIDTH(C_S00_AXI_DATA_WIDTH),
        .C_S_AXI_ADDR_WIDTH(C_S00_AXI_ADDR_WIDTH)
    ) conv_v1_0_S00_AXI_inst (
        .S_AXI_ACLK(s00_axi_aclk),
        .S_AXI_ARESETN(s00_axi_aresetn),
        .S_AXI_AWADDR(s00_axi_awaddr),
        .S_AXI_AWPROT(s00_axi_awprot),
        .S_AXI_AWVALID(s00_axi_awvalid),
        .S_AXI_AWREADY(s00_axi_awready),
        .S_AXI_WDATA(s00_axi_wdata),
        .S_AXI_WSTRB(s00_axi_wstrb),
        .S_AXI_WVALID(s00_axi_wvalid),
        .S_AXI_WREADY(s00_axi_wready),
        .S_AXI_BRESP(s00_axi_bresp),
        .S_AXI_BVALID(s00_axi_bvalid),
        .S_AXI_BREADY(s00_axi_bready),
        .S_AXI_ARADDR(s00_axi_araddr),
        .S_AXI_ARPROT(s00_axi_arprot),
        .S_AXI_ARVALID(s00_axi_arvalid),
        .S_AXI_ARREADY(s00_axi_arready),
        .S_AXI_RDATA(s00_axi_rdata),
        .S_AXI_RRESP(s00_axi_rresp),
        .S_AXI_RVALID(s00_axi_rvalid),
        .S_AXI_RREADY(s00_axi_rready)
    );

    // Clock: starts high and toggles every 1 ns (2 ns period) for 1000
    // toggles, then the simulation ends.
    initial
    begin : clock_gen
        integer i;
        s00_axi_aclk = 1;
        for (i = 0; i < 1000; i = i + 1)
        begin
            #1 s00_axi_aclk = ~s00_axi_aclk;
        end
        $finish;
    end

    // Single write with all byte lanes enabled. Signals are driven on the
    // falling edge so the slave samples stable values on the rising edge;
    // VALID is held until a rising edge at which both AWREADY and WREADY
    // were high.
    task axi_write;
        input [C_S00_AXI_ADDR_WIDTH-1 : 0] addr;
        input [C_S00_AXI_DATA_WIDTH-1 : 0] data;
        begin
            @(negedge s00_axi_aclk);
            s00_axi_awaddr  = addr;
            s00_axi_wdata   = data;
            s00_axi_wstrb   = {(C_S00_AXI_DATA_WIDTH/8){1'b1}};
            s00_axi_awvalid = 1;
            s00_axi_wvalid  = 1;
            @(posedge s00_axi_aclk);
            while (!(s00_axi_awready && s00_axi_wready))
                @(posedge s00_axi_aclk);
            @(negedge s00_axi_aclk);
            s00_axi_awvalid = 0;
            s00_axi_wvalid  = 0;
        end
    endtask

    // Single read: hold ARVALID until the address is accepted, then wait for
    // RVALID and print the returned word.
    task axi_read;
        input [C_S00_AXI_ADDR_WIDTH-1 : 0] addr;
        begin
            @(negedge s00_axi_aclk);
            s00_axi_araddr  = addr;
            s00_axi_arvalid = 1;
            @(posedge s00_axi_aclk);
            while (!s00_axi_arready)
                @(posedge s00_axi_aclk);
            @(negedge s00_axi_aclk);
            s00_axi_arvalid = 0;
            while (!s00_axi_rvalid)
                @(negedge s00_axi_aclk);
            $display("read addr=%h data=%0d", addr, s00_axi_rdata);
        end
    endtask

    // Stimulus: hold reset for 4 ns, write nine values, read the tenth register.
    initial
    begin
        s00_axi_aresetn = 0;
        s00_axi_awvalid = 0;
        s00_axi_wvalid  = 0;
        s00_axi_arvalid = 0;
        s00_axi_awaddr  = 0;
        s00_axi_wdata   = 0;
        s00_axi_wstrb   = 0;
        s00_axi_araddr  = 0;
        #4 s00_axi_aresetn = 1;

        axi_write(6'b000000, 3);
        axi_write(6'b000100, 21);
        axi_write(6'b001000, 19);
        axi_write(6'b001100, 22);
        axi_write(6'b010000, 20);
        axi_write(6'b010100, 13);
        axi_write(6'b011000, 16);
        axi_write(6'b011100, 14);
        axi_write(6'b100000, 7);

        axi_read(6'b100100);
    end

    // Waveform dump for GTKWave.
    initial
    begin
        $dumpfile("test.vcd");
        $dumpvars();
    end

endmodule
利用iverilog进行仿真,GTKwave显示测试波形如下
新建IP核
如下:
工程顶层图如下:
附3:软件驱动
#include <stdio.h>#include "platform.h"#include "xbasic_types.h"#include "xparameters.h"#include "xil_io.h"#define test_speedint res[1000][1000];void delay() { int i, j, k; for (i = 0; i < 1000; i++) { for (j = 0; j < 1000; j++) { for (k = 0; k < 100; k++) ; } }}void show_reg() { int i; u32 result; printf("\n============SHOW REG ================\n"); for (i = 0; i < 9; i++) { result = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 4 * i); printf("Reg %3d : %u\n", i, result); }}void load_kernel(int filter[3][3]) { UINTPTR kernel_addr = (UINTPTR) XPAR_CONV_0_S00_AXI_BASEADDR + 36; Xil_Out32(kernel_addr, filter[0][0]); kernel_addr = kernel_addr + 0x4; Xil_Out32(kernel_addr, filter[0][1]); kernel_addr = kernel_addr + 0x4; Xil_Out32(kernel_addr, filter[0][2]); kernel_addr = kernel_addr + 0x4; Xil_Out32(kernel_addr, filter[1][0]); kernel_addr = kernel_addr + 0x4; Xil_Out32(kernel_addr, filter[1][1]); kernel_addr = kernel_addr + 0x4; Xil_Out32(kernel_addr, filter[1][2]); kernel_addr = kernel_addr + 0x4; Xil_Out32(kernel_addr, filter[2][0]); kernel_addr = kernel_addr + 0x4; Xil_Out32(kernel_addr, filter[2][1]); kernel_addr = kernel_addr + 0x4; Xil_Out32(kernel_addr, filter[2][2]);}void test_set() { Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 3); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 22); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 16); printf("1\n"); show_reg(); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 21); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 20); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 14); printf("2\n"); show_reg(); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 19); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 13); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 7); printf("3\n"); show_reg();}void Conv_SW(int filter[3][3], int arr[100][100], int arrW, int arrH) { int i, j; i = 2; j = 2; for (i = 2; i < arrH; i++) { for (j = 2; j < arrW;j++){ res[i][j] = 0; res[i][j] += filter[0][0] * arr[i - 1][j - 1]; res[i][j] += filter[0][1] * arr[i - 1][j]; res[i][j] += 
filter[0][2] * arr[i - 1][j + 1]; res[i][j] += filter[1][0] * arr[i][j - 1]; res[i][j] += filter[1][1] * arr[i][j]; res[i][j] += filter[1][2] * arr[i][j + 1]; res[i][j] += filter[2][0] * arr[i + 1][j - 1]; res[i][j] += filter[2][1] * arr[i + 1][j]; res[i][j] += filter[2][2] * arr[i + 1][j + 1]; } }}void Conv_HW(int filter[3][3], int arr[100][100], int arrW, int arrH) { int i, j; i = 2; j = 2; for (i = 2; i < arrH; i++) { //pre load Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j - 1]); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j - 1]); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j - 1]); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j]); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j]); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j]); for (j = 2; j < arrW; j++) { Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j + 1]); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j + 1]); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j + 1]); res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72); } }}int main() { printf("HELLO WORLD"); u32 result; int filterW = 3; int filterH = 3; int arrW = 5; int arrH = 5; int resW = filterW + arrW - 1; int resH = filterH + arrH - 1; int i, j; int pFilter[3][3]; int arr[100][100]; UINTPTR cur_addr = (UINTPTR) XPAR_CONV_0_S00_AXI_BASEADDR; pFilter[0][0] = 1; pFilter[0][1] = 3; pFilter[0][2] = 1; pFilter[1][0] = 0; pFilter[1][1] = 5; pFilter[1][2] = 0; pFilter[2][0] = 2; pFilter[2][1] = 1; pFilter[2][2] = 2; init_platform(); for (i = 0; i < 9; i++) { Xil_Out32(cur_addr, 0); cur_addr = cur_addr + 4; } load_kernel(pFilter); printf("Kernel Loaded\n");#ifdef test_single test_set(); result = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72); printf("Test Set Result %u", result); show_reg();#endif#ifdef test_func srand(10); arrW = 20; arrH = 20; resH = filterH + arrH - 1; resW = filterW + arrW - 1; for (i = 0; i < arrH; i++) { for (j = 0; j < arrW; j++) { arr[i][j] = rand() % 
20; } } printf("*********************************************** \n"); printf("Filter: \n"); for (i = filterH - 1; i >= 0; i--) { for (j = filterW - 1; j >= 0; j--) { printf("%d ", pFilter[i][j]); } printf("\n"); } printf("*********************************************** \n"); printf("Matrix: \n"); for (i = 0; i < arrH; i++) { for (j = 0; j < arrW; j++) { printf("%4d ", arr[i][j]); } printf("\n"); } printf("*********************************************** \n"); printf("Software Start!\n"); Conv_SW(pFilter, arr, arrW, arrH); printf("\nSoftware end!\n"); printf("*********************************************** \n"); printf("Result1: \n"); for (i = 0; i < resH; i++) { for (j = 0; j < resW; j++) { printf("%5d ", res[i][j]); } printf("\n"); } for (i = 0; i < resH; i++) { for (j = 0; j < resW; j++) { res[i][j] = 0; } } printf("*********************************************** \n"); printf("HardWare Start!\n"); Conv_HW(pFilter, arr, arrW, arrH); printf("\nHardWare end!"); printf("Result2: \n"); for (i = 0; i < resH; i++) { for (j = 0; j < resW; j++) { printf("%5d ", res[i][j]); } printf("\n"); } printf("*********************************************** \n");#endif#ifdef test_speed arrW = 500; arrH = 500; resH = filterH + arrH - 1; resW = filterW + arrW - 1; printf("Software Start!\n"); for(i = 0; i< 200;i++) { Conv_SW(pFilter, arr, arrW, arrH); } printf("\nSoftware end!\n"); printf("HardWare Start!\n"); for(i = 0; i< 200;i++) { Conv_HW(pFilter, arr, arrW, arrH); } printf("\nHardWare end!"); cleanup_platform();#endif return 0;}
- 基于AXI4总线卷积FPGA加速IP核的尝试
- Zynq 的AXI4 总线应用
- AXI4、AXI4-Lite、AXI-Stream总线协议的简单认识
- 基于IP核的fpga调试经验
- 【FPGA黑金开发板】NIOSII那些事儿--基于AVALON总线的IP定制(十七)
- AXI4总线协议
- FPGA的IP核
- 优化基于FPGA的深度卷积神经网络的加速器设计
- 优化基于FPGA的深度卷积神经网络的加速器设计
- 优化基于FPGA的深度卷积神经网络的加速器设计
- 优化基于FPGA的深度卷积神经网络的加速器设计
- 优化基于FPGA的深度卷积神经网络的加速器设计
- 优化基于FPGA的深度卷积神经网络的加速器设计
- 优化基于FPGA的深度卷积神经网络的加速器设计
- 基于FSL总线的ip核封装(DES)
- 基于FPGA的GoogLeNet加速器-卷积层/汇聚层
- 基于XILINX FPGA的卷积神经网络(一)
- 基于XILINX FPGA的卷积神经网络(二)
- HDU_2838_Cow Sorting_树状数组
- Java实现Spark词配对Wordcount计数
- 视频直播质量的评测和实现分享---很好视频评测知识
- 【luogu1134】阶乘问题(数论)
- 索引优化MAX()
- 基于AXI4总线卷积FPGA加速IP核的尝试
- 条款 13
- 多维数组取值
- WSN 之定时器1
- 一款基于Vue2.0高仿微信App的单页应用
- Q&A——资源管理(五)
- 递归
- RelativeLayout一些属性
- 分析比较多表查询中的IN与JOIN