基于AXI4总线卷积FPGA加速IP核的尝试

来源:互联网 发布:java毫秒数转时间 在线 编辑:程序博客网 时间:2024/05/21 10:29

本文先总结不同AXI IP核的实现的方法,性能的对比,性能差异的分析,可能改进的方面。使用的硬件平台是Zedboard

不同的AXI总线卷积加速模块的概况

这次实现并逐渐优化了三个版本的卷积加速模块,先简要描述各个版本的主要内容。

版本一

版本一主要是用来测试AXI总线IP核的实现可能。

  • 该模块拥有19个32位寄存器
  • 其中前9个寄存器用来保存需要计算的值
  • 后面9个寄存器用来保存卷积核
  • 在读取第19个寄存器的地址的时候计算9个寄存器的卷积和(该计算可以在一个时钟周期内完成)
  • 9个寄存器单独赋值,程序中分别向对应地址写入内容,通过总线进行传输。
  • 故乐观的来算,需要10个总线周期可以获取一个输出

可以从驱动的书写简单理解一下:

void Conv_HW(int filter[3][3], int arr[100][100],        int filterW, int filterH, int arrW, int arrH) {    int i, j;    for (i = 2; i < filterH + arrH - 3; i++) {        for (j = 2; j < filterW + arrW - 3; j++) {            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR, arr[i][j]);            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+4, arr[i][j - 1]);            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+8, arr[i][j - 2]);            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+12, arr[i - 1][j]);            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+16, arr[i - 1][j - 1]);            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+20, arr[i - 1][j - 2]);            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+24, arr[i - 2][j]);            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+28, arr[i - 2][j - 1]);            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+32, arr[i - 2][j - 2]);            res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);        }        if (i % 15 == 0)            printf("=");    }}

版本一性能

  • 版本一性能最惨,由于没有时间戳,目测软件计算速度远远快于FPGA核心运算速度。
  • 版本一的改进速度就是引入滑动窗口,能够最大程度减少总线周期。

版本二

版本二引入滑动窗口,和初期设计的概念相同。

  • 该模块拥有19个32位寄存器

  • 其中前9个寄存器用来保存需要计算的值

  • 后面9个寄存器用来保存卷积核

  • 在读取第19个寄存器的地址的时候计算9个寄存器的卷积和(该计算可以在一个时钟周期内完成)

  • 三个寄存器滑动赋值,该计算窗口在计算矩阵上滑动 除了冷启动多余两个周期用来预载寄存器,后面的每一个计算只需要四个总线周期

    可以通过写的驱动简单理解一下:

    void Conv_HW(int filter[3][3], int arr[100][100], int arrW, int arrH) {int i, j;i = 2; j = 2;for (i = 2; i < arrH; i++) {    //pre load    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j - 1]);    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j - 1]);    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j - 1]);    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j]);    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j]);    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j]);    for (j = 2; j <  arrW; j++) {        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j + 1]);        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j + 1]);        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j + 1]);        res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);    }}}

版本二性能

测试样本 500*50032bit单位的矩阵 计算200次。

软件消耗33.78秒,卷积IP核心40.25

这样的结果还是非常不乐观,分析可能有两种限制了IP核的速度。

  • 两个寄存器的乘法LUT太大,无法硬件优化
  • 总线周期太慢太慢

版本三对于这两种可能进行探索。

版本二的FPGA部分核心代码

    // Implement memory mapped register select and write logic generation    // The write data is accepted and written to memory mapped registers when    // axi_awready, S_AXI_WVALID, axi_wready and S_AXI_WVALID are asserted. Write strobes are used to    // select byte enables of slave registers while writing.    // These registers are cleared when reset (active low) is applied.    // Slave register write enable is asserted when valid address and data are available    // and the slave is ready to accept the write address and write data.    assign slv_reg_wren = axi_wready && S_AXI_WVALID && axi_awready && S_AXI_AWVALID;    always @( posedge S_AXI_ACLK )    begin      if ( S_AXI_ARESETN == 1'b0 )        begin          slv_reg0 <= 0;          slv_reg1 <= 0;          slv_reg2 <= 0;          slv_reg3 <= 0;          slv_reg4 <= 0;          slv_reg5 <= 0;          slv_reg6 <= 0;          slv_reg7 <= 0;          slv_reg8 <= 0;          slv_reg9 <= 0;          slv_reg10 <= 0;          slv_reg11 <= 0;          slv_reg12 <= 0;          slv_reg13 <= 0;          slv_reg14 <= 0;          slv_reg15 <= 0;          slv_reg16 <= 0;          slv_reg17 <= 0;//        slv_reg18 <= 0;        end       else begin        if (slv_reg_wren)          begin            case ( axi_awaddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )              5'h00:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 0                    slv_reg0[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                5'h01:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 1                    slv_reg1[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                5'h02:              begin                slv_reg0 <= slv_reg1;                slv_reg1 <= slv_reg2;                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 2                    slv_reg2[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                   end                end              5'h03:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 3                    slv_reg3[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                5'h04:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 4                    slv_reg4[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                5'h05:              begin                slv_reg3 <= slv_reg4;                slv_reg4 <= slv_reg5;                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 5                    slv_reg5[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                end              5'h06:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 6                    slv_reg6[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                5'h07:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 7                    slv_reg7[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                5'h08:              begin                slv_reg6 <= slv_reg7;                slv_reg7 <= slv_reg8;                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 8                    slv_reg8[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                end              5'h09:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 9                    slv_reg9[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                5'h0A:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 10                    slv_reg10[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                5'h0B:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 11                    slv_reg11[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                5'h0C:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 12                    slv_reg12[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                5'h0D:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 13                    slv_reg13[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                5'h0E:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 14                    slv_reg14[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                5'h0F:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 15                    slv_reg15[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                5'h10:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 16                    slv_reg16[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                5'h11:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 17                    slv_reg17[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end  //            5'h12://              for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )//                if ( S_AXI_WSTRB[byte_index] == 1 ) begin//                  // Respective byte enables are asserted as per write strobes //                  // Slave register 18//                  slv_reg18[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];//                end                default : begin                          slv_reg0 <= slv_reg0;                          slv_reg1 <= slv_reg1;                          slv_reg2 <= slv_reg2;                          slv_reg3 <= slv_reg3;                          slv_reg4 <= slv_reg4;                          slv_reg5 <= slv_reg5;                          slv_reg6 <= slv_reg6;                          slv_reg7 <= slv_reg7;                          slv_reg8 <= slv_reg8;                          slv_reg9 <= slv_reg9;                          slv_reg10 <= slv_reg10;                          slv_reg11 <= slv_reg11;                          slv_reg12 <= slv_reg12;                          slv_reg13 <= slv_reg13;                          slv_reg14 <= slv_reg14;                          slv_reg15 <= slv_reg15;                          slv_reg16 <= slv_reg16;                          slv_reg17 <= slv_reg17;                        end            endcase          end      end    end        // Implement memory mapped register select and read logic generation    // Slave register read enable is asserted when valid address is available    // and the slave is ready to accept the read address.    assign slv_reg_rden = axi_arready & S_AXI_ARVALID & ~axi_rvalid;    always @(*)    begin          // Address decoding for reading registers          case ( axi_araddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )            5'h00   : reg_data_out <= slv_reg0;            5'h01   : reg_data_out <= slv_reg1;            5'h02   : reg_data_out <= slv_reg2;            5'h03   : reg_data_out <= slv_reg3;            5'h04   : reg_data_out <= slv_reg4;            5'h05   : reg_data_out <= slv_reg5;            5'h06   : reg_data_out <= slv_reg6;            5'h07   : reg_data_out <= slv_reg7;            5'h08   : reg_data_out <= slv_reg8;            5'h09   : reg_data_out <= slv_reg9;            5'h0A   : reg_data_out <= slv_reg10;            5'h0B   : reg_data_out <= slv_reg11;            5'h0C   : reg_data_out <= slv_reg12;            5'h0D   : reg_data_out <= slv_reg13;            5'h0E   : reg_data_out <= slv_reg14;            5'h0F   : reg_data_out <= slv_reg15;            5'h10   : reg_data_out <= slv_reg16;            5'h11   : reg_data_out <= slv_reg17;            5'h12   : reg_data_out <= slv_reg0 * slv_reg9 +                                      slv_reg1 * slv_reg10 +                                       slv_reg2 * slv_reg11 +                                       slv_reg3 * slv_reg12 +                                      slv_reg4 * slv_reg13 +                                      slv_reg5 * slv_reg14 +                                      slv_reg6 * slv_reg15 +                                      slv_reg7 * slv_reg16 +                                      slv_reg8 * slv_reg17;            default : reg_data_out <= 0;          endcase    end

版本三

先尝试生成更小的LUT

  • 该模块拥有19个32位寄存器
  • 其中前9个寄存器用来保存需要计算的值
  • 卷积核固定在Verilog中,用来生成更小的LUT
  • 一个计算只需要四个总线周期

性能测试

仍然软件消耗33秒,卷积IP核心40

基本否决是LUT问题。

下面测试AXI总线问题:

假设所有数据均来自于FPGA,无需从总线写入:

void Conv_HW(int filter[3][3], int arr[100][100], int arrW, int arrH) {    int i, j;    i = 2; j = 2;    for (i = 2; i < arrH; i++) {        for (j = 2; j <  arrW; j++) {            res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);        }    }}

只需要9.47秒即可完成计算,并传回CPU !!!

总结

至此,基本上可以否决利用AXI传数据的可能,所有需要利用AXI总线传输数据的模块均会被总线周期所连累,在优化了传输后,仍然无法解决该问题。确实需要一个更快的方式来传输数据。

AlteraNIOS2中,直接利用IO口传输数据,无需总线周期,再因为NIOS II内核没有流水线优化,所以硬件确实比较快。

附1:AXI4 总线的 FPGA 接口部分

先看总线接口:

        // Users to add ports here        // User ports ends        // Do not modify the ports beyond this line        // Global Clock Signal        // 全局时钟        input wire  S_AXI_ACLK,        // Global Reset Signal. This Signal is Active LOW        // 全局复位信号        input wire  S_AXI_ARESETN,        // Write address (issued by master, acceped by Slave)        // 写地址         input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_AWADDR,        // 写地址的保护模式 包括privilege和security level        // Write channel Protection type. This signal indicates the    // privilege and security level of the transaction, and whether    // the transaction is a data access or an instruction access.        input wire [2 : 0] S_AXI_AWPROT,        // 写地址有效信号。为高指示地址有效。        // Write address valid. This signal indicates that the master signaling    // valid write address and control information.        input wire  S_AXI_AWVALID,        // 写地址准备信号。为高表示从设备空闲,准备接收地址;为低表示从设备忙。        // ********** 注意 这里是地址 下面是数据 ********        // Write address ready. This signal indicates that the slave is ready    // to accept an address and associated control signals.        output wire  S_AXI_AWREADY,        // 写数据,32位到1024位宽        // 从主设备来的数据 从设备接收        // Write data (issued by master, acceped by Slave)         input wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_WDATA,        // 写字节选通,用于表示更新存储器的字节通道,对于数据总线的每8位数据有一位写选通信号。        // Write strobes. This signal indicates which byte lanes hold    // valid data. There is one write strobe bit for each eight    // bits of the write data bus.            input wire [(C_S_AXI_DATA_WIDTH/8)-1 : 0] S_AXI_WSTRB,        // 写有效。为高指示数据有效。        // Write valid. This signal indicates that valid write    // data and strobes are available.        input wire  S_AXI_WVALID,        // 写准备。为高表示从设备空闲,准备接收数据;为低表示从设备忙。        // Write ready. This signal indicates that the slave    // can accept the write data.        output wire  S_AXI_WREADY,        // 写响应。该信号表示写状态,可允许相应的表示为OKAY\EXOKAY\SLVERR\DECERR。        // Write response. This signal indicates the status    // of the write transaction.        output wire [1 : 0] S_AXI_BRESP,        // 写响应有效。为高指示响应数据有效        // Write response valid. This signal indicates that the channel    // is signaling a valid write response.        output wire  S_AXI_BVALID,        // 写响应准备。为高表示主设备空闲,准备接收写响应;为低表示主设备忙。        // Response ready. This signal indicates that the master    // can accept a write response.        input wire  S_AXI_BREADY,        //         // 读地址。读地址给出突发数据传输的第一个传输地址。        // Read address (issued by master, acceped by Slave)        input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_ARADDR,        // 保护类型,建议值为000。        // Protection type. This signal indicates the privilege    // and security level of the transaction, and whether the    // transaction is a data access or an instruction access.        input wire [2 : 0] S_AXI_ARPROT,        //         // Read address valid. This signal indicates that the channel    // is signaling valid read address and control information.        input wire  S_AXI_ARVALID,        // 读地址准备信号。为高表示从设备空闲,准备接收地址;为低表示从设备忙。        // Read address ready. This signal indicates that the slave is    // ready to accept an address and associated control signals.        output wire  S_AXI_ARREADY,        // Read data (issued by slave)        output wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_RDATA,        // Read response. This signal indicates the status of the            // read transfer.        output wire [1 : 0] S_AXI_RRESP,        // Read valid. This signal indicates that the channel is            // signaling the required read data.        output wire  S_AXI_RVALID,        // Read ready. This signal indicates that the master can            // accept the read data and response information.        input wire  S_AXI_RREADY    );    // AXI4LITE signals    reg [C_S_AXI_ADDR_WIDTH-1 : 0]  axi_awaddr;    reg     axi_awready;    reg     axi_wready;    reg [1 : 0]     axi_bresp;    reg     axi_bvalid;    reg [C_S_AXI_ADDR_WIDTH-1 : 0]  axi_araddr;    reg     axi_arready;    reg [C_S_AXI_DATA_WIDTH-1 : 0]  axi_rdata;    reg [1 : 0]     axi_rresp;    reg     axi_rvalid;

其中最为重要的读取总线信号寻址的部分:

assign slv_reg_wren = axi_wready && S_AXI_WVALID && axi_awready && S_AXI_AWVALID;    always @( posedge S_AXI_ACLK )    begin      if ( S_AXI_ARESETN == 1'b0 )        begin          slv_reg0 <= 0;          slv_reg1 <= 0;          slv_reg2 <= 0;          slv_reg3 <= 0;          slv_reg4 <= 0;          slv_reg5 <= 0;          slv_reg6 <= 0;          slv_reg7 <= 0;          slv_reg8 <= 0;          slv_reg9 <= 0;        end       else begin        if (slv_reg_wren)          begin                  // 进行寻址                    // 地址寻址 是这么玩的                    // 当寄存器是32位的 最后就是 2位 4个Byte ADDR_LSB = 2                    // 当寄存器是64位的 最后就是 3位 8个Byte ADDR_LSB = 3                    // OPT_MEM_ADDR_BITS 用来寻址寄存器 这里选了十个寄存器 所以这里就是4位            case ( axi_awaddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )              4'h0:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                              // 只有在对应的Bit位置为1的时候才能开始读取                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 0                    slv_reg0[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                4'h1:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 1                    slv_reg1[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                4'h2:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 2                    slv_reg2[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                4'h3:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 3                    slv_reg3[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                4'h4:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 4                    slv_reg4[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                4'h5:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 5                    slv_reg5[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                4'h6:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 6                    slv_reg6[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                4'h7:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 7                    slv_reg7[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                4'h8:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 8                    slv_reg8[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                4'h9:                for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )                  if ( S_AXI_WSTRB[byte_index] == 1 ) begin                    // Respective byte enables are asserted as per write strobes                     // Slave register 9                    slv_reg9[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];                  end                default : begin                          slv_reg0 <= slv_reg0;                          slv_reg1 <= slv_reg1;                          slv_reg2 <= slv_reg2;                          slv_reg3 <= slv_reg3;                          slv_reg4 <= slv_reg4;                          slv_reg5 <= slv_reg5;                          slv_reg6 <= slv_reg6;                          slv_reg7 <= slv_reg7;                          slv_reg8 <= slv_reg8;                          slv_reg9 <= slv_reg9;                        end            endcase          end      end    end  

附2:AXI4的测试模块与仿真测试

`timescale 1ns/1nsmodule conv_axi_test();    parameter integer C_S00_AXI_DATA_WIDTH  = 32;    parameter integer C_S00_AXI_ADDR_WIDTH  = 6;         reg  s00_axi_aclk;        // 全局复位信号         reg  s00_axi_aresetn;         reg [C_S00_AXI_ADDR_WIDTH-1 : 0] s00_axi_awaddr;         wire [2 : 0] s00_axi_awprot;         reg  s00_axi_awvalid;         wire  s00_axi_awready;         reg [C_S00_AXI_DATA_WIDTH-1 : 0] s00_axi_wdata;         reg [(C_S00_AXI_DATA_WIDTH/8)-1 : 0] s00_axi_wstrb;         reg  s00_axi_wvalid;         wire  s00_axi_wready;         wire [1 : 0] s00_axi_bresp;         wire  s00_axi_bvalid;         wire  s00_axi_bready;         reg [C_S00_AXI_ADDR_WIDTH-1 : 0] s00_axi_araddr;         wire [2 : 0] s00_axi_arprot;         reg  s00_axi_arvalid;         wire  s00_axi_arready;         wire [C_S00_AXI_DATA_WIDTH-1 : 0] s00_axi_rdata;         wire [1 : 0] s00_axi_rresp;         wire  s00_axi_rvalid;         wire  s00_axi_rready;    conv_v1_0_S00_AXI # (         .C_S_AXI_DATA_WIDTH(C_S00_AXI_DATA_WIDTH),        .C_S_AXI_ADDR_WIDTH(C_S00_AXI_ADDR_WIDTH)    ) conv_v1_0_S00_AXI_inst (        .S_AXI_ACLK(s00_axi_aclk),        .S_AXI_ARESETN(s00_axi_aresetn),        .S_AXI_AWADDR(s00_axi_awaddr),        .S_AXI_AWPROT(s00_axi_awprot),        .S_AXI_AWVALID(s00_axi_awvalid),        .S_AXI_AWREADY(s00_axi_awready),        .S_AXI_WDATA(s00_axi_wdata),        .S_AXI_WSTRB(s00_axi_wstrb),        .S_AXI_WVALID(s00_axi_wvalid),        .S_AXI_WREADY(s00_axi_wready),        .S_AXI_BRESP(s00_axi_bresp),        .S_AXI_BVALID(s00_axi_bvalid),        .S_AXI_BREADY(s00_axi_bready),        .S_AXI_ARADDR(s00_axi_araddr),        .S_AXI_ARPROT(s00_axi_arprot),        .S_AXI_ARVALID(s00_axi_arvalid),        .S_AXI_ARREADY(s00_axi_arready),        .S_AXI_RDATA(s00_axi_rdata),        .S_AXI_RRESP(s00_axi_rresp),        .S_AXI_RVALID(s00_axi_rvalid),        .S_AXI_RREADY(s00_axi_rready)    );initialbegin:d    integer i;    s00_axi_aclk = 1;    for(i = 0; i< 1000;i++)    begin        #1 s00_axi_aclk = ~ s00_axi_aclk;    end    $finish();endinitialbegin    s00_axi_aresetn = 0;    s00_axi_arvalid = 0;#4  s00_axi_aresetn = 1;    s00_axi_awvalid = 1;    s00_axi_wvalid = 1;    s00_axi_awaddr = 0;     s00_axi_wstrb = 4'b1111;    s00_axi_wdata = 3;#4  s00_axi_awaddr = 6'b000100;    s00_axi_wdata = 21;#4  s00_axi_awaddr = 6'b001000;    s00_axi_wdata = 19;#4  s00_axi_awaddr = 6'b001100;    s00_axi_wdata = 22;#4  s00_axi_awaddr = 6'b010000;    s00_axi_wdata = 20;#4  s00_axi_awaddr = 6'b010100;    s00_axi_wdata = 13;#4  s00_axi_awaddr = 6'b011000;    s00_axi_wdata = 16;#4  s00_axi_awaddr = 6'b011100;    s00_axi_wdata = 14;#4  s00_axi_awaddr = 6'b100000;    s00_axi_wdata = 7;#4    s00_axi_arvalid = 1;    s00_axi_araddr = 6'b100100;endinitialbegin    $dumpfile("test.vcd");    $dumpvars();endendmodule

利用iverilog进行仿真GTKwave显示测试波形如下

新建IP核如下:

工程顶层图如下:

附3:软件驱动

#include <stdio.h>#include "platform.h"#include "xbasic_types.h"#include "xparameters.h"#include "xil_io.h"#define test_speedint res[1000][1000];void delay() {    int i, j, k;    for (i = 0; i < 1000; i++) {        for (j = 0; j < 1000; j++) {            for (k = 0; k < 100; k++)                ;        }    }}void show_reg() {    int i;    u32 result;    printf("\n============SHOW REG ================\n");    for (i = 0; i < 9; i++) {        result = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 4 * i);        printf("Reg %3d : %u\n", i, result);    }}void load_kernel(int filter[3][3]) {    UINTPTR kernel_addr = (UINTPTR) XPAR_CONV_0_S00_AXI_BASEADDR + 36;    Xil_Out32(kernel_addr, filter[0][0]);    kernel_addr = kernel_addr + 0x4;    Xil_Out32(kernel_addr, filter[0][1]);    kernel_addr = kernel_addr + 0x4;    Xil_Out32(kernel_addr, filter[0][2]);    kernel_addr = kernel_addr + 0x4;    Xil_Out32(kernel_addr, filter[1][0]);    kernel_addr = kernel_addr + 0x4;    Xil_Out32(kernel_addr, filter[1][1]);    kernel_addr = kernel_addr + 0x4;    Xil_Out32(kernel_addr, filter[1][2]);    kernel_addr = kernel_addr + 0x4;    Xil_Out32(kernel_addr, filter[2][0]);    kernel_addr = kernel_addr + 0x4;    Xil_Out32(kernel_addr, filter[2][1]);    kernel_addr = kernel_addr + 0x4;    Xil_Out32(kernel_addr, filter[2][2]);}void test_set() {    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 3);    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 22);    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 16);    printf("1\n");    show_reg();    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 21);    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 20);    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 14);    printf("2\n");    show_reg();    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 19);    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 13);    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 7);    printf("3\n");    show_reg();}void Conv_SW(int filter[3][3], int arr[100][100], int arrW, int arrH) {    int i, j;    i = 2; j = 2;    for (i = 2; i < arrH; i++) {        for (j = 2; j < arrW;j++){            res[i][j] = 0;            res[i][j] += filter[0][0] * arr[i - 1][j - 1];            res[i][j] += filter[0][1] * arr[i - 1][j];            res[i][j] += filter[0][2] * arr[i - 1][j + 1];            res[i][j] += filter[1][0] * arr[i][j - 1];            res[i][j] += filter[1][1] * arr[i][j];            res[i][j] += filter[1][2] * arr[i][j + 1];            res[i][j] += filter[2][0] * arr[i + 1][j - 1];            res[i][j] += filter[2][1] * arr[i + 1][j];            res[i][j] += filter[2][2] * arr[i + 1][j + 1];        }    }}void Conv_HW(int filter[3][3], int arr[100][100], int arrW, int arrH) {    int i, j;    i = 2; j = 2;    for (i = 2; i < arrH; i++) {        //pre load        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j - 1]);        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j - 1]);        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j - 1]);        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j]);        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j]);        Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j]);        for (j = 2; j <  arrW; j++) {            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j + 1]);            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j + 1]);            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j + 1]);            res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);        }    }}int main() {    printf("HELLO WORLD");    u32 result;    int filterW = 3;    int filterH = 3;    int arrW = 5;    int arrH = 5;    int resW = filterW + arrW - 1;    int resH = filterH + arrH - 1;    int i, j;    int pFilter[3][3];    int arr[100][100];    UINTPTR cur_addr = (UINTPTR) XPAR_CONV_0_S00_AXI_BASEADDR;    pFilter[0][0] = 1;    pFilter[0][1] = 3;    pFilter[0][2] = 1;    pFilter[1][0] = 0;    pFilter[1][1] = 5;    pFilter[1][2] = 0;    pFilter[2][0] = 2;    pFilter[2][1] = 1;    pFilter[2][2] = 2;    init_platform();    for (i = 0; i < 9; i++) {        Xil_Out32(cur_addr, 0);        cur_addr = cur_addr + 4;    }    load_kernel(pFilter);    printf("Kernel Loaded\n");#ifdef test_single    test_set();    result = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);    printf("Test Set Result %u", result);    show_reg();#endif#ifdef test_func    srand(10);    arrW = 20;    arrH = 20;    resH = filterH + arrH - 1;    resW = filterW + arrW - 1;    for (i = 0; i < arrH; i++) {        for (j = 0; j < arrW; j++) {            arr[i][j] = rand() % 20;        }    }    printf("*********************************************** \n");    printf("Filter: \n");    for (i = filterH - 1; i >= 0; i--) {        for (j = filterW - 1; j >= 0; j--) {            printf("%d ", pFilter[i][j]);        }        printf("\n");    }    printf("*********************************************** \n");    printf("Matrix: \n");    for (i = 0; i < arrH; i++) {        for (j = 0; j < arrW; j++) {            printf("%4d ", arr[i][j]);        }        printf("\n");    }    printf("*********************************************** \n");    printf("Software Start!\n");    Conv_SW(pFilter, arr, arrW, arrH);    printf("\nSoftware end!\n");    printf("*********************************************** \n");    printf("Result1: \n");    for (i = 0; i < resH; i++) {        for (j = 0; j < resW; j++) {            printf("%5d ", res[i][j]);        }        printf("\n");    }    for (i = 0; i < resH; i++) {        for (j = 0; j < resW; j++) {            res[i][j] = 0;        }    }    printf("*********************************************** \n");    printf("HardWare Start!\n");    Conv_HW(pFilter, arr, arrW, arrH);    printf("\nHardWare end!");    printf("Result2: \n");    for (i = 0; i < resH; i++) {        for (j = 0; j < resW; j++) {            printf("%5d ", res[i][j]);        }        printf("\n");    }    printf("*********************************************** \n");#endif#ifdef test_speed    arrW = 500;    arrH = 500;    resH = filterH + arrH - 1;    resW = filterW + arrW - 1;    printf("Software Start!\n");    for(i = 0; i< 200;i++) {        Conv_SW(pFilter, arr, arrW, arrH);    }    printf("\nSoftware end!\n");    printf("HardWare Start!\n");    for(i = 0; i< 200;i++) {        Conv_HW(pFilter, arr, arrW, arrH);    }    printf("\nHardWare end!");    cleanup_platform();#endif    return 0;}