DPDK l2fwd Application Flow Analysis

int
MAIN(int argc, char **argv)
{
    struct lcore_queue_conf *qconf;
    struct rte_eth_dev_info dev_info;
    int ret;
    uint8_t nb_ports;
    uint8_t nb_ports_available;
    uint8_t portid, last_port;
    unsigned lcore_id, rx_lcore_id;
    unsigned nb_ports_in_mask = 0;

    /* init EAL */
    ret = rte_eal_init(argc, argv);
    if (ret < 0)
        rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
    argc -= ret;
    argv += ret;

    /* parse application arguments (after the EAL ones) */
    ret = l2fwd_parse_args(argc, argv);
    if (ret < 0)
        rte_exit(EXIT_FAILURE, "Invalid L2FWD arguments\n");

    /* create the mbuf pool */
    l2fwd_pktmbuf_pool =
        rte_mempool_create("mbuf_pool", NB_MBUF,
                   MBUF_SIZE, 32,
                   sizeof(struct rte_pktmbuf_pool_private),
                   rte_pktmbuf_pool_init, NULL,
                   rte_pktmbuf_init, NULL,
                   rte_socket_id(), 0);
    if (l2fwd_pktmbuf_pool == NULL)
        rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");

    /* init driver(s) */
    if (rte_pmd_init_all() < 0)
        rte_exit(EXIT_FAILURE, "Cannot init pmd\n");

    if (rte_eal_pci_probe() < 0)
        rte_exit(EXIT_FAILURE, "Cannot probe PCI\n");

    nb_ports = rte_eth_dev_count();
    if (nb_ports == 0)
        rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");

    if (nb_ports > RTE_MAX_ETHPORTS)
        nb_ports = RTE_MAX_ETHPORTS;

    /* reset l2fwd_dst_ports */
    for (portid = 0; portid < RTE_MAX_ETHPORTS; portid++)
        l2fwd_dst_ports[portid] = 0;
    last_port = 0;

    /* port0 forwards to port1 and port1 to port0: enabled ports are
     * paired up and each pair forwards to each other */
    /*
     * Each logical core is assigned a dedicated TX queue on each port.
     */
    for (portid = 0; portid < nb_ports; portid++) {
        /* skip ports that are not enabled */
        if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
            continue;

        if (nb_ports_in_mask % 2) {
            l2fwd_dst_ports[portid] = last_port;
            l2fwd_dst_ports[last_port] = portid;
        }
        else
            last_port = portid;

        nb_ports_in_mask++;

        rte_eth_dev_info_get(portid, &dev_info);
    }
    if (nb_ports_in_mask % 2) {
        printf("Notice: odd number of ports in portmask.\n");
        l2fwd_dst_ports[last_port] = last_port;
    }

    rx_lcore_id = 0;
    qconf = NULL;

    /* each lcore polls up to l2fwd_rx_queue_per_lcore ports; each port
     * (strictly speaking each queue -- every port has a single queue
     * here) is polled by exactly one lcore */
    /* Initialize the port/queue configuration of each logical core */
    for (portid = 0; portid < nb_ports; portid++) {
        /* skip ports that are not enabled */
        if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
            continue;

        /* get the lcore_id for this port */
        while (rte_lcore_is_enabled(rx_lcore_id) == 0 ||
               lcore_queue_conf[rx_lcore_id].n_rx_port ==
               l2fwd_rx_queue_per_lcore) {
            rx_lcore_id++;
            if (rx_lcore_id >= RTE_MAX_LCORE)
                rte_exit(EXIT_FAILURE, "Not enough cores\n");
        }

        if (qconf != &lcore_queue_conf[rx_lcore_id])
            /* Assigned a new logical core in the loop above. */
            qconf = &lcore_queue_conf[rx_lcore_id];

        qconf->rx_port_list[qconf->n_rx_port] = portid;
        qconf->n_rx_port++;
        printf("Lcore %u: RX port %u\n", rx_lcore_id, (unsigned) portid);
    }

    nb_ports_available = nb_ports;

    /* initialize the RX/TX queues of each port */
    /* Initialise each port */
    for (portid = 0; portid < nb_ports; portid++) {
        /* skip ports that are not enabled */
        if ((l2fwd_enabled_port_mask & (1 << portid)) == 0) {
            printf("Skipping disabled port %u\n", (unsigned) portid);
            nb_ports_available--;
            continue;
        }

        /* init port */
        printf("Initializing port %u... ", (unsigned) portid);
        fflush(stdout);
        ret = rte_eth_dev_configure(portid, 1, 1, &port_conf);
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "Cannot configure device: err=%d, port=%u\n",
                  ret, (unsigned) portid);

        rte_eth_macaddr_get(portid, &l2fwd_ports_eth_addr[portid]);

        /* init one RX queue */
        fflush(stdout);
        ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
                         rte_eth_dev_socket_id(portid), &rx_conf,
                         l2fwd_pktmbuf_pool);
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup:err=%d, port=%u\n",
                  ret, (unsigned) portid);

        /* init one TX queue on each port */
        fflush(stdout);
        ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
                rte_eth_dev_socket_id(portid), &tx_conf);
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "rte_eth_tx_queue_setup:err=%d, port=%u\n",
                ret, (unsigned) portid);

        /* Start device */
        ret = rte_eth_dev_start(portid);
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "rte_eth_dev_start:err=%d, port=%u\n",
                  ret, (unsigned) portid);

        printf("done: \n");

        rte_eth_promiscuous_enable(portid);

        printf("Port %u, MAC address: %02X:%02X:%02X:%02X:%02X:%02X\n\n",
                (unsigned) portid,
                l2fwd_ports_eth_addr[portid].addr_bytes[0],
                l2fwd_ports_eth_addr[portid].addr_bytes[1],
                l2fwd_ports_eth_addr[portid].addr_bytes[2],
                l2fwd_ports_eth_addr[portid].addr_bytes[3],
                l2fwd_ports_eth_addr[portid].addr_bytes[4],
                l2fwd_ports_eth_addr[portid].addr_bytes[5]);

        /* initialize port stats */
        memset(&port_statistics, 0, sizeof(port_statistics));
    }

    if (!nb_ports_available) {
        rte_exit(EXIT_FAILURE,
            "All available ports are disabled. Please set portmask.\n");
    }

    check_all_ports_link_status(nb_ports, l2fwd_enabled_port_mask);

    /* launch the l2fwd loop on every lcore */
    /* launch per-lcore init on every lcore */
    rte_eal_mp_remote_launch(l2fwd_launch_one_lcore, NULL, CALL_MASTER);
    RTE_LCORE_FOREACH_SLAVE(lcore_id) {
        if (rte_eal_wait_lcore(lcore_id) < 0)
            return -1;
    }

    return 0;
}
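
To make the port-pairing loop concrete, here is a minimal standalone sketch of the same logic, using a hypothetical portmask of 0xf (four enabled ports); it is not part of l2fwd, just an illustration:

#include <stdio.h>
#include <stdint.h>

#define MAX_PORTS 8

int main(void)
{
    uint32_t portmask = 0xf;          /* hypothetical: ports 0-3 enabled */
    uint8_t dst[MAX_PORTS] = {0};
    unsigned nb_in_mask = 0;
    uint8_t last = 0, p;

    for (p = 0; p < MAX_PORTS; p++) {
        if ((portmask & (1u << p)) == 0)
            continue;
        if (nb_in_mask % 2) {         /* every second enabled port closes a pair */
            dst[p] = last;
            dst[last] = p;
        } else
            last = p;
        nb_in_mask++;
    }
    if (nb_in_mask % 2)               /* odd port count: last port sends to itself */
        dst[last] = last;

    for (p = 0; p < MAX_PORTS; p++)
        if (portmask & (1u << p))
            printf("port %u -> port %u\n", p, dst[p]);
    /* prints: 0->1, 1->0, 2->3, 3->2 */
    return 0;
}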


The following sections examine port initialization in detail. For each port, rte_eth_dev_configure is called first to configure the number of RX/TX queues and to allocate the queue control blocks.

int
rte_eth_dev_configure(uint8_t port_id, uint16_t nb_rx_q, uint16_t nb_tx_q,
              const struct rte_eth_conf *dev_conf)
{
    struct rte_eth_dev *dev;
    struct rte_eth_dev_info dev_info;
    int diag;

    /* may only be called from the primary process */
    /* This function is only safe when called from the primary process
     * in a multi-process setup*/
    PROC_PRIMARY_OR_ERR_RET(-E_RTE_SECONDARY);

    if (port_id >= nb_ports || port_id >= RTE_MAX_ETHPORTS) {
        PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
        return (-EINVAL);
    }
    dev = &rte_eth_devices[port_id];

    /* during PMD init, the E1000 ops were registered as eth_em_ops */
    FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
    FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP);

    /* rte_eth_dev_start sets this flag to 1 */
    if (dev->data->dev_started) {
        PMD_DEBUG_TRACE(
            "port %d must be stopped to allow configuration\n", port_id);
        return (-EBUSY);
    }

    /* eth_em_infos_get reports the TX/RX queue limits; in this example
     * max_rx_queues = 1 and max_tx_queues = 1 */
    /*
     * Check that the numbers of RX and TX queues are not greater
     * than the maximum number of RX and TX queues supported by the
     * configured device.
     */
    (*dev->dev_ops->dev_infos_get)(dev, &dev_info);
    if (nb_rx_q > dev_info.max_rx_queues) {
        PMD_DEBUG_TRACE("ethdev port_id=%d nb_rx_queues=%d > %d\n",
                port_id, nb_rx_q, dev_info.max_rx_queues);
        return (-EINVAL);
    }
    if (nb_rx_q == 0) {
        PMD_DEBUG_TRACE("ethdev port_id=%d nb_rx_q == 0\n", port_id);
        return (-EINVAL);
    }

    if (nb_tx_q > dev_info.max_tx_queues) {
        PMD_DEBUG_TRACE("ethdev port_id=%d nb_tx_queues=%d > %d\n",
                port_id, nb_tx_q, dev_info.max_tx_queues);
        return (-EINVAL);
    }
    if (nb_tx_q == 0) {
        PMD_DEBUG_TRACE("ethdev port_id=%d nb_tx_q == 0\n", port_id);
        return (-EINVAL);
    }

    /* dev_conf holds the TX/RX mode configuration */
    /* Copy the dev_conf parameter into the dev structure */
    memcpy(&dev->data->dev_conf, dev_conf, sizeof(dev->data->dev_conf));

    /* jumbo frame support; usually not needed */
    /*
     * If jumbo frames are enabled, check that the maximum RX packet
     * length is supported by the configured device.
     */
    if (dev_conf->rxmode.jumbo_frame == 1) {
        if (dev_conf->rxmode.max_rx_pkt_len >
            dev_info.max_rx_pktlen) {
            PMD_DEBUG_TRACE("ethdev port_id=%d max_rx_pkt_len %u"
                " > max valid value %u\n",
                port_id,
                (unsigned)dev_conf->rxmode.max_rx_pkt_len,
                (unsigned)dev_info.max_rx_pktlen);
            return (-EINVAL);
        }
        else if (dev_conf->rxmode.max_rx_pkt_len < ETHER_MIN_LEN) {
            PMD_DEBUG_TRACE("ethdev port_id=%d max_rx_pkt_len %u"
                " < min valid value %u\n",
                port_id,
                (unsigned)dev_conf->rxmode.max_rx_pkt_len,
                (unsigned)ETHER_MIN_LEN);
            return (-EINVAL);
        }
    } else
        /* Use default value */
        dev->data->dev_conf.rxmode.max_rx_pkt_len = ETHER_MAX_LEN;

    /* multi-queue mode check; what exactly do the various DCB/RSS modes
     * mean here? */
    /* multipe queue mode checking */
    diag = rte_eth_dev_check_mq_mode(port_id, nb_rx_q, nb_tx_q, dev_conf);
    if (diag != 0) {
        PMD_DEBUG_TRACE("port%d rte_eth_dev_check_mq_mode = %d\n",
                port_id, diag);
        return diag;
    }

    /*
     * Setup new number of RX/TX queues and reconfigure device.
     */
    /* allocate the RX queue control-block array */
    diag = rte_eth_dev_rx_queue_config(dev, nb_rx_q);
    if (diag != 0) {
        PMD_DEBUG_TRACE("port%d rte_eth_dev_rx_queue_config = %d\n",
                port_id, diag);
        return diag;
    }

    /* allocate the TX queue control-block array */
    diag = rte_eth_dev_tx_queue_config(dev, nb_tx_q);
    if (diag != 0) {
        PMD_DEBUG_TRACE("port%d rte_eth_dev_tx_queue_config = %d\n",
                port_id, diag);
        rte_eth_dev_rx_queue_config(dev, 0);
        return diag;
    }

    /* eth_em_configure: sets intr->flags |= E1000_FLAG_NEED_LINK_UPDATE */
    diag = (*dev->dev_ops->dev_configure)(dev);
    if (diag != 0) {
        PMD_DEBUG_TRACE("port%d dev_configure = %d\n",
                port_id, diag);
        rte_eth_dev_rx_queue_config(dev, 0);
        rte_eth_dev_tx_queue_config(dev, 0);
        return diag;
    }

    return 0;
}
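
The queue-config helpers referenced above just size the per-port arrays of opaque queue handles. A condensed sketch of the first-configuration path of rte_eth_dev_rx_queue_config (the reconfiguration path, which releases old queues and rte_realloc's the array, is omitted; this is not the verbatim DPDK source):

/* Sketch only: allocate one void* slot per RX queue; the PMD later fills
 * these handles in through its rx_queue_setup callback. */
static int
rx_queue_config_sketch(struct rte_eth_dev *dev, uint16_t nb_queues)
{
    if (dev->data->rx_queues == NULL) {
        dev->data->rx_queues = rte_zmalloc("ethdev->rx_queues",
                sizeof(void *) * nb_queues, CACHE_LINE_SIZE);
        if (dev->data->rx_queues == NULL)
            return -(ENOMEM);
    }
    dev->data->nb_rx_queues = nb_queues;
    return 0;
}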


RX queue setup


int
rte_eth_rx_queue_setup(uint8_t port_id, uint16_t rx_queue_id,
               uint16_t nb_rx_desc, unsigned int socket_id,
               const struct rte_eth_rxconf *rx_conf,
               struct rte_mempool *mp)
{
    struct rte_eth_dev *dev;
    struct rte_pktmbuf_pool_private *mbp_priv;
    struct rte_eth_dev_info dev_info;

    /* This function is only safe when called from the primary process
     * in a multi-process setup*/
    PROC_PRIMARY_OR_ERR_RET(-E_RTE_SECONDARY);

    if (port_id >= nb_ports) {
        PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
        return (-EINVAL);
    }
    dev = &rte_eth_devices[port_id];
    if (rx_queue_id >= dev->data->nb_rx_queues) {
        PMD_DEBUG_TRACE("Invalid RX queue_id=%d\n", rx_queue_id);
        return (-EINVAL);
    }

    if (dev->data->dev_started) {
        PMD_DEBUG_TRACE(
            "port %d must be stopped to allow configuration\n", port_id);
        return -EBUSY;
    }

    FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_infos_get, -ENOTSUP);
    FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_setup, -ENOTSUP);

    /*
     * Check the size of the mbuf data buffer.
     * This value must be provided in the private data of the memory pool.
     * First check that the memory pool has a valid private data.
     */
    (*dev->dev_ops->dev_infos_get)(dev, &dev_info);
    if (mp->private_data_size < sizeof(struct rte_pktmbuf_pool_private)) {
        PMD_DEBUG_TRACE("%s private_data_size %d < %d\n",
                mp->name, (int) mp->private_data_size,
                (int) sizeof(struct rte_pktmbuf_pool_private));
        return (-ENOSPC);
    }

    /* the usable mbuf data area (2048 here) must exceed the device
     * minimum (256) */
    mbp_priv = rte_mempool_get_priv(mp);
    if ((uint32_t) (mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM) <
        dev_info.min_rx_bufsize) {
        PMD_DEBUG_TRACE("%s mbuf_data_room_size %d < %d "
                "(RTE_PKTMBUF_HEADROOM=%d + min_rx_bufsize(dev)"
                "=%d)\n",
                mp->name,
                (int)mbp_priv->mbuf_data_room_size,
                (int)(RTE_PKTMBUF_HEADROOM +
                      dev_info.min_rx_bufsize),
                (int)RTE_PKTMBUF_HEADROOM,
                (int)dev_info.min_rx_bufsize);
        return (-EINVAL);
    }

    /* eth_em_rx_queue_setup: initializes the RX descriptors */
    return (*dev->dev_ops->rx_queue_setup)(dev, rx_queue_id, nb_rx_desc,
                           socket_id, rx_conf, mp);
}
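
To see the data-room check in concrete numbers for l2fwd's pool: the constants below follow the l2fwd defaults of this DPDK generation (MBUF_SIZE of 2048 plus headroom, e1000's min_rx_bufsize of 256); treat the exact values as illustrative:

#include <stdio.h>

int main(void)
{
    const unsigned headroom  = 128;            /* RTE_PKTMBUF_HEADROOM default */
    const unsigned data_room = 2048 + headroom; /* mbuf_data_room_size from MBUF_SIZE */
    const unsigned min_rx    = 256;            /* dev_info.min_rx_bufsize for e1000 */

    /* the same comparison rte_eth_rx_queue_setup performs */
    printf("usable buffer %u >= min %u ? %s\n",
           data_room - headroom, min_rx,
           (data_room - headroom) >= min_rx ? "yes" : "no");
    /* prints: usable buffer 2048 >= 256 ? yes */
    return 0;
}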


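The dispatch above lands in the e1000 PMD's eth_em_rx_queue_setup, which this post does not reproduce. It mirrors the TX-side setup shown in the next section; a condensed, hedged sketch follows (structure inferred from the TX path and this driver generation, validation and error paths trimmed, not verbatim source):

/* Sketch of eth_em_rx_queue_setup: reserve the hardware ring, allocate the
 * queue control block and software ring, record the register address and
 * ring addresses, then reset the ring. Simplified. */
int
eth_em_rx_queue_setup_sketch(struct rte_eth_dev *dev, uint16_t queue_idx,
        uint16_t nb_desc, unsigned int socket_id,
        const struct rte_eth_rxconf *rx_conf, struct rte_mempool *mp)
{
    const struct rte_memzone *rz;
    struct em_rx_queue *rxq;
    struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);

    /* reserve a memzone large enough for the maximum ring size */
    rz = ring_dma_zone_reserve(dev, "rx_ring", queue_idx,
            sizeof(rxq->rx_ring[0]) * EM_MAX_RING_DESC, socket_id);
    if (rz == NULL)
        return -(ENOMEM);

    /* queue control block + software ring (one mbuf pointer per desc) */
    rxq = rte_zmalloc("ethdev RX queue", sizeof(*rxq), CACHE_LINE_SIZE);
    rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
            sizeof(rxq->sw_ring[0]) * nb_desc, CACHE_LINE_SIZE);

    rxq->mb_pool = mp;                 /* mbufs are replenished from this pool */
    rxq->nb_rx_desc = nb_desc;
    rxq->rx_free_thresh = rx_conf->rx_free_thresh;
    rxq->queue_id = queue_idx;
    rxq->port_id = dev->data->port_id;
    rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(queue_idx));
    rxq->rx_ring_phys_addr = (uint64_t) rz->phys_addr;   /* for the NIC */
    rxq->rx_ring = (struct e1000_rx_desc *) rz->addr;    /* for the CPU */

    em_reset_rx_queue(rxq);            /* zero descriptors, reset indices */
    dev->data->rx_queues[queue_idx] = rxq;
    return 0;
}
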
TX queue setup


int
rte_eth_tx_queue_setup(uint8_t port_id, uint16_t tx_queue_id,
               uint16_t nb_tx_desc, unsigned int socket_id,
               const struct rte_eth_txconf *tx_conf)
{
    struct rte_eth_dev *dev;

    /* This function is only safe when called from the primary process
     * in a multi-process setup*/
    PROC_PRIMARY_OR_ERR_RET(-E_RTE_SECONDARY);

    if (port_id >= RTE_MAX_ETHPORTS || port_id >= nb_ports) {
        PMD_DEBUG_TRACE("Invalid port_id=%d\n", port_id);
        return (-EINVAL);
    }
    dev = &rte_eth_devices[port_id];
    if (tx_queue_id >= dev->data->nb_tx_queues) {
        PMD_DEBUG_TRACE("Invalid TX queue_id=%d\n", tx_queue_id);
        return (-EINVAL);
    }

    /* must be configured before the device is started */
    if (dev->data->dev_started) {
        PMD_DEBUG_TRACE(
            "port %d must be stopped to allow configuration\n", port_id);
        return -EBUSY;
    }

    /* dispatch to the PMD's tx_queue_setup */
    FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_queue_setup, -ENOTSUP);
    return (*dev->dev_ops->tx_queue_setup)(dev, tx_queue_id, nb_tx_desc,
                           socket_id, tx_conf);
}


int
eth_em_tx_queue_setup(struct rte_eth_dev *dev,
             uint16_t queue_idx,
             uint16_t nb_desc,
             unsigned int socket_id,
             const struct rte_eth_txconf *tx_conf)
{
    const struct rte_memzone *tz;
    struct em_tx_queue *txq;
    struct e1000_hw     *hw;
    uint32_t tsize;
    uint16_t tx_rs_thresh, tx_free_thresh;

    hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);

    /* the TX descriptor ring must be cache-line aligned */
    /*
     * Validate number of transmit descriptors.
     * It must not exceed hardware maximum, and must be multiple
     * of EM_ALIGN.
     */
    if (((nb_desc * sizeof(*txq->tx_ring)) % EM_ALIGN) != 0 ||
            (nb_desc > EM_MAX_RING_DESC) ||
            (nb_desc < EM_MIN_RING_DESC)) {
        return -(EINVAL);
    }

    /* threshold configuration */
    tx_free_thresh = tx_conf->tx_free_thresh;
    if (tx_free_thresh == 0)
        tx_free_thresh = (uint16_t)RTE_MIN(nb_desc / 4,
                    DEFAULT_TX_FREE_THRESH);

    tx_rs_thresh = tx_conf->tx_rs_thresh;
    if (tx_rs_thresh == 0)
        tx_rs_thresh = (uint16_t)RTE_MIN(tx_free_thresh,
                    DEFAULT_TX_RS_THRESH);

    if (tx_free_thresh >= (nb_desc - 3)) {
        RTE_LOG(ERR, PMD, "tx_free_thresh must be less than the "
            "number of TX descriptors minus 3. (tx_free_thresh=%u "
            "port=%d queue=%d)\n", (unsigned int)tx_free_thresh,
                (int)dev->data->port_id, (int)queue_idx);
        return -(EINVAL);
    }
    if (tx_rs_thresh > tx_free_thresh) {
        RTE_LOG(ERR, PMD, "tx_rs_thresh must be less than or equal to "
            "tx_free_thresh. (tx_free_thresh=%u tx_rs_thresh=%u "
            "port=%d queue=%d)\n", (unsigned int)tx_free_thresh,
            (unsigned int)tx_rs_thresh, (int)dev->data->port_id,
                            (int)queue_idx);
        return -(EINVAL);
    }

    /*
     * If rs_bit_thresh is greater than 1, then TX WTHRESH should be
     * set to 0. If WTHRESH is greater than zero, the RS bit is ignored
     * by the NIC and all descriptors are written back after the NIC
     * accumulates WTHRESH descriptors.
     */
    if (tx_conf->tx_thresh.wthresh != 0 && tx_rs_thresh != 1) {
        RTE_LOG(ERR, PMD, "TX WTHRESH must be set to 0 if "
            "tx_rs_thresh is greater than 1. (tx_rs_thresh=%u "
            "port=%d queue=%d)\n", (unsigned int)tx_rs_thresh,
                (int)dev->data->port_id, (int)queue_idx);
        return -(EINVAL);
    }

    /* if a txq already exists, free its mbufs and the queue itself */
    /* Free memory prior to re-allocation if needed... */
    if (dev->data->tx_queues[queue_idx] != NULL) {
        em_tx_queue_release(dev->data->tx_queues[queue_idx]);
        dev->data->tx_queues[queue_idx] = NULL;
    }

    /* reserve a memzone named rte_em_pmd_tx_ring_p_q large enough for
     * EM_MAX_RING_DESC TX descriptors */
    /*
     * Allocate TX ring hardware descriptors. A memzone large enough to
     * handle the maximum ring size is allocated in order to allow for
     * resizing in later calls to the queue setup function.
     */
    tsize = sizeof (txq->tx_ring[0]) * EM_MAX_RING_DESC;
    if ((tz = ring_dma_zone_reserve(dev, "tx_ring", queue_idx, tsize,
            socket_id)) == NULL)
        return (-ENOMEM);

    /* allocate the txq control block */
    /* Allocate the tx queue data structure. */
    if ((txq = rte_zmalloc("ethdev TX queue", sizeof(*txq),
            CACHE_LINE_SIZE)) == NULL)
        return (-ENOMEM);

    /* allocate the txq software ring */
    /* Allocate software ring */
    if ((txq->sw_ring = rte_zmalloc("txq->sw_ring",
            sizeof(txq->sw_ring[0]) * nb_desc,
            CACHE_LINE_SIZE)) == NULL) {
        em_tx_queue_release(txq);
        return (-ENOMEM);
    }

    txq->nb_tx_desc = nb_desc;
    txq->tx_free_thresh = tx_free_thresh;
    txq->tx_rs_thresh = tx_rs_thresh;
    txq->pthresh = tx_conf->tx_thresh.pthresh;
    txq->hthresh = tx_conf->tx_thresh.hthresh;
    txq->wthresh = tx_conf->tx_thresh.wthresh;
    txq->queue_id = queue_idx;
    txq->port_id = dev->data->port_id;
    txq->tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(queue_idx));

    /* physical address of the tx_ring */
#ifndef RTE_LIBRTE_XEN_DOM0
    txq->tx_ring_phys_addr = (uint64_t) tz->phys_addr;
#else
    txq->tx_ring_phys_addr = rte_mem_phy2mch(tz->memseg_id, tz->phys_addr);
#endif
    /* virtual address of the tx_ring */
    txq->tx_ring = (struct e1000_data_desc *) tz->addr;

    PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64"\n",
        txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);

    /* initialize the circular ring: each entry's next points to the
     * following one, and the last entry wraps back to the first */
    em_reset_tx_queue(txq);

    dev->data->tx_queues[queue_idx] = txq;
    return (0);
}
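
The final em_reset_tx_queue call builds the circular linkage the comment describes. A hedged sketch of that linking logic (simplified from the driver; the hardware-descriptor zeroing is abbreviated):

/* Sketch of em_reset_tx_queue's ring linking: each sw_ring entry points to
 * the next, and the last entry wraps to index 0. Simplified. */
static void
em_reset_tx_queue_sketch(struct em_tx_queue *txq)
{
    uint16_t i, prev;

    prev = (uint16_t)(txq->nb_tx_desc - 1);
    for (i = 0; i < txq->nb_tx_desc; i++) {
        txq->sw_ring[i].mbuf = NULL;
        txq->sw_ring[i].last_id = i;
        txq->sw_ring[prev].next_id = i;   /* prev -> i, wrapping at the end */
        prev = i;
    }

    /* reset the ring indices and free-descriptor accounting */
    txq->tx_tail = 0;
    txq->nb_tx_used = 0;
    txq->last_desc_cleaned = (uint16_t)(txq->nb_tx_desc - 1);
    txq->nb_tx_free = (uint16_t)(txq->nb_tx_desc - 1);
}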


The last step of port initialization is enabling RX/TX on the port, which mainly announces the tx ring and rx ring addresses to the E1000 hardware; the lower-level details are not traced further here.

void
eth_em_tx_init(struct rte_eth_dev *dev)
{
    struct e1000_hw     *hw;
    struct em_tx_queue *txq;
    uint32_t tctl;
    uint32_t txdctl;
    uint16_t i;

    hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);

    /* announce each queue's tx ring physical address to the E1000 hardware */
    /* Setup the Base and Length of the Tx Descriptor Rings. */
    for (i = 0; i < dev->data->nb_tx_queues; i++) {
        uint64_t bus_addr;

        txq = dev->data->tx_queues[i];
        bus_addr = txq->tx_ring_phys_addr;
        E1000_WRITE_REG(hw, E1000_TDLEN(i),
                txq->nb_tx_desc *
                sizeof(*txq->tx_ring));
        E1000_WRITE_REG(hw, E1000_TDBAH(i),
                (uint32_t)(bus_addr >> 32));
        E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);

        /* Setup the HW Tx Head and Tail descriptor pointers. */
        E1000_WRITE_REG(hw, E1000_TDT(i), 0);
        E1000_WRITE_REG(hw, E1000_TDH(i), 0);

        /* Setup Transmit threshold registers. */
        txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
        /*
         * bit 22 is reserved, on some models should always be 0,
         * on others  - always 1.
         */
        txdctl &= E1000_TXDCTL_COUNT_DESC;
        txdctl |= txq->pthresh & 0x3F;
        txdctl |= (txq->hthresh & 0x3F) << 8;
        txdctl |= (txq->wthresh & 0x3F) << 16;
        txdctl |= E1000_TXDCTL_GRAN;
        E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
    }

    /* Program the Transmit Control Register. */
    tctl = E1000_READ_REG(hw, E1000_TCTL);
    tctl &= ~E1000_TCTL_CT;
    tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
         (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));

    /* This write will effectively turn on the transmit unit. */
    E1000_WRITE_REG(hw, E1000_TCTL, tctl);
}


int
eth_em_rx_init(struct rte_eth_dev *dev)
{
    struct e1000_hw *hw;
    struct em_rx_queue *rxq;
    uint32_t rctl;
    uint32_t rfctl;
    uint32_t rxcsum;
    uint32_t rctl_bsize;
    uint16_t i;
    int ret;

    hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);

    /*
     * Make sure receives are disabled while setting
     * up the descriptor ring.
     */
    rctl = E1000_READ_REG(hw, E1000_RCTL);
    E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);

    rfctl = E1000_READ_REG(hw, E1000_RFCTL);

    /* Disable extended descriptor type. */
    rfctl &= ~E1000_RFCTL_EXTEN;
    /* Disable accelerated acknowledge */
    if (hw->mac.type == e1000_82574)
        rfctl |= E1000_RFCTL_ACK_DIS;

    E1000_WRITE_REG(hw, E1000_RFCTL, rfctl);

    /*
     * XXX TEMPORARY WORKAROUND: on some systems with 82573
     * long latencies are observed, like Lenovo X60. This
     * change eliminates the problem, but since having positive
     * values in RDTR is a known source of problems on other
     * platforms another solution is being sought.
     */
    if (hw->mac.type == e1000_82573)
        E1000_WRITE_REG(hw, E1000_RDTR, 0x20);

    dev->rx_pkt_burst = (eth_rx_burst_t)eth_em_recv_pkts;

    /* compute the packet buffer size */
    /* Determine RX bufsize. */
    rctl_bsize = EM_MAX_BUF_SIZE;
    for (i = 0; i < dev->data->nb_rx_queues; i++) {
        struct rte_pktmbuf_pool_private *mbp_priv;
        uint32_t buf_size;

        rxq = dev->data->rx_queues[i];
        mbp_priv = rte_mempool_get_priv(rxq->mb_pool);
        buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;
        rctl_bsize = RTE_MIN(rctl_bsize, buf_size);
    }

    rctl |= em_rctl_bsize(hw->mac.type, &rctl_bsize);

    /* Configure and enable each RX queue. */
    for (i = 0; i < dev->data->nb_rx_queues; i++) {
        uint64_t bus_addr;
        uint32_t rxdctl;

        rxq = dev->data->rx_queues[i];

        /* allocate mbufs from the pool, fill them into rxq->sw_ring, and
         * record each buffer's physical address in rxq->rx_ring */
        /* Allocate buffers for descriptor rings and setup queue */
        ret = em_alloc_rx_queue_mbufs(rxq);
        if (ret)
            return ret;

        /* announce the rx ring physical address to the E1000 hardware */
        /*
         * Reset crc_len in case it was changed after queue setup by a
         *  call to configure
         */
        rxq->crc_len =
            (uint8_t)(dev->data->dev_conf.rxmode.hw_strip_crc ?
                            0 : ETHER_CRC_LEN);

        bus_addr = rxq->rx_ring_phys_addr;
        E1000_WRITE_REG(hw, E1000_RDLEN(i),
                rxq->nb_rx_desc *
                sizeof(*rxq->rx_ring));
        E1000_WRITE_REG(hw, E1000_RDBAH(i),
                (uint32_t)(bus_addr >> 32));
        E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);

        E1000_WRITE_REG(hw, E1000_RDH(i), 0);
        E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);

        rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(0));
        rxdctl &= 0xFE000000;
        rxdctl |= rxq->pthresh & 0x3F;
        rxdctl |= (rxq->hthresh & 0x3F) << 8;
        rxdctl |= (rxq->wthresh & 0x3F) << 16;
        rxdctl |= E1000_RXDCTL_GRAN;
        E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);

        /* receive path used for large packets */
        /*
         * Due to EM devices not having any sort of hardware
         * limit for packet length, jumbo frame of any size
         * can be accepted, thus we have to enable scattered
         * rx if jumbo frames are enabled (or if buffer size
         * is too small to accomodate non-jumbo packets)
         * to avoid splitting packets that don't fit into
         * one buffer.
         */
        if (dev->data->dev_conf.rxmode.jumbo_frame ||
                rctl_bsize < ETHER_MAX_LEN) {
            dev->rx_pkt_burst =
                (eth_rx_burst_t)eth_em_recv_scattered_pkts;
            dev->data->scattered_rx = 1;
        }
    }

    /* remainder omitted */
    ...

    return 0;
}
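
em_alloc_rx_queue_mbufs is where the RX ring gets pre-populated with buffers. A hedged sketch of its core loop (simplified from the e1000 PMD of this era; logging and cleanup on failure are trimmed):

/* Sketch of em_alloc_rx_queue_mbufs: one mbuf per descriptor, with each
 * buffer's DMA address written into the hardware ring. Simplified. */
static int
em_alloc_rx_queue_mbufs_sketch(struct em_rx_queue *rxq)
{
    struct em_rx_entry *rxe = rxq->sw_ring;
    uint64_t dma_addr;
    unsigned i;

    for (i = 0; i < rxq->nb_rx_desc; i++) {
        volatile struct e1000_rx_desc *rxd;
        struct rte_mbuf *mbuf = rte_rxmbuf_alloc(rxq->mb_pool);

        if (mbuf == NULL)
            return -(ENOMEM);        /* ring cannot be fully populated */

        dma_addr = rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mbuf));

        /* hand the buffer to the NIC and remember the mbuf for retrieval */
        rxd = &rxq->rx_ring[i];
        rxd->status = 0;
        rxd->buffer_addr = dma_addr;
        rxe[i].mbuf = mbuf;
    }
    return 0;
}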


At this point port initialization is complete and the ports have been started. Back in main, the packet-processing loop is launched on every lcore:

/* launch per-lcore init on every lcore */
rte_eal_mp_remote_launch(l2fwd_launch_one_lcore, NULL, CALL_MASTER);
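
In the l2fwd example, l2fwd_launch_one_lcore is just a thin trampoline into the main loop:

static int
l2fwd_launch_one_lcore(__attribute__((unused)) void *dummy)
{
    l2fwd_main_loop();
    return 0;
}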

The per-lcore main loop is as follows:


/* main processing loop */
static void
l2fwd_main_loop(void)
{
    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    struct rte_mbuf *m;
    unsigned lcore_id;
    uint64_t prev_tsc, diff_tsc, cur_tsc, timer_tsc;
    unsigned i, j, portid, nb_rx;
    struct lcore_queue_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;

    prev_tsc = 0;
    timer_tsc = 0;

    lcore_id = rte_lcore_id();
    qconf = &lcore_queue_conf[lcore_id];

    if (qconf->n_rx_port == 0) {
        RTE_LOG(INFO, L2FWD, "lcore %u has nothing to do\n", lcore_id);
        return;
    }

    RTE_LOG(INFO, L2FWD, "entering main loop on lcore %u\n", lcore_id);

    /* which ports (queues) this lcore is responsible for */
    for (i = 0; i < qconf->n_rx_port; i++) {
        portid = qconf->rx_port_list[i];
        RTE_LOG(INFO, L2FWD, " -- lcoreid=%u portid=%u\n", lcore_id,
            portid);
    }

    while (1) {

        cur_tsc = rte_rdtsc();

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        /* only periodically flush pending TX packets and print statistics */
        if (unlikely(diff_tsc > drain_tsc)) {

            for (portid = 0; portid < RTE_MAX_ETHPORTS; portid++) {
                /* nothing pending on this port */
                if (qconf->tx_mbufs[portid].len == 0)
                    continue;
                /* call the device TX function and count the sent packets */
                l2fwd_send_burst(&lcore_queue_conf[lcore_id],
                         qconf->tx_mbufs[portid].len,
                         (uint8_t) portid);
                /* everything pending on this port has been sent: reset len */
                qconf->tx_mbufs[portid].len = 0;
            }

            /* if timer is enabled */
            if (timer_period > 0) {

                /* advance the timer */
                timer_tsc += diff_tsc;

                /* if timer has reached its timeout */
                if (unlikely(timer_tsc >= (uint64_t) timer_period)) {

                    /* do this only on master core */
                    if (lcore_id == rte_get_master_lcore()) {
                        print_stats();
                        /* reset the timer */
                        timer_tsc = 0;
                    }
                }
            }

            prev_tsc = cur_tsc;
        }

        /* the queues this lcore must poll */
        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->n_rx_port; i++) {

            portid = qconf->rx_port_list[i];
            /* each port only has queue 0 here */
            nb_rx = rte_eth_rx_burst((uint8_t) portid, 0,
                         pkts_burst, MAX_PKT_BURST);

            /* update the RX statistics */
            port_statistics[portid].rx += nb_rx;

            /* rewrite the destination MAC of every received packet and
             * append it to the TX queue */
            for (j = 0; j < nb_rx; j++) {
                m = pkts_burst[j];
                /* pull the packet data into cache; the RX path seems to
                 * have prefetched it already */
                rte_prefetch0(rte_pktmbuf_mtod(m, void *));
                /* forward */
                l2fwd_simple_forward(m, portid);
            }
        }
    }
}
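
l2fwd_simple_forward is not reproduced in this post; in the l2fwd example of this era it rewrites the Ethernet header and queues the mbuf for the paired port, roughly like this (a hedged sketch; the 02:00:00:00:00:xx destination address follows the example's convention):

static void
l2fwd_simple_forward(struct rte_mbuf *m, unsigned portid)
{
    struct ether_hdr *eth;
    void *tmp;
    unsigned dst_port;

    /* the paired port computed in main() */
    dst_port = l2fwd_dst_ports[portid];
    eth = rte_pktmbuf_mtod(m, struct ether_hdr *);

    /* 02:00:00:00:00:xx -- locally administered address, xx = dst port */
    tmp = &eth->d_addr.addr_bytes[0];
    *((uint64_t *)tmp) = 0x000000000002 + ((uint64_t)dst_port << 40);

    /* source address: MAC of the TX port */
    ether_addr_copy(&l2fwd_ports_eth_addr[dst_port], &eth->s_addr);

    /* enqueue; l2fwd_send_packet flushes once MAX_PKT_BURST accumulate */
    l2fwd_send_packet(m, (uint8_t) dst_port);
}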



First, look at how packets are received: rte_eth_rx_burst dispatches to the device's rx_pkt_burst.


static inline uint16_t
rte_eth_rx_burst(uint8_t port_id, uint16_t queue_id,
         struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
    struct rte_eth_dev *dev;

    dev = &rte_eth_devices[port_id];
    return (*dev->rx_pkt_burst)(dev->data->rx_queues[queue_id], rx_pkts, nb_pkts);
}


The PMD receive function is as follows:

uint16_t
eth_em_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
        uint16_t nb_pkts)
{
    /* volatile prevents compiler optimization: every access must re-read
     * memory instead of reusing a value cached in a register */
    volatile struct e1000_rx_desc *rx_ring;
    volatile struct e1000_rx_desc *rxdp;
    struct em_rx_queue *rxq;
    struct em_rx_entry *sw_ring;
    struct em_rx_entry *rxe;
    struct rte_mbuf *rxm;
    struct rte_mbuf *nmb;
    struct e1000_rx_desc rxd;
    uint64_t dma_addr;
    uint16_t pkt_len;
    uint16_t rx_id;
    uint16_t nb_rx;
    uint16_t nb_hold;
    uint8_t status;

    rxq = rx_queue;

    nb_rx = 0;
    nb_hold = 0;
    rx_id = rxq->rx_tail;       /* current RX position */
    rx_ring = rxq->rx_ring;     /* rx descriptors */
    sw_ring = rxq->sw_ring;     /* mbufs */
    /* receive up to nb_pkts (32) packets in one call */
    while (nb_rx < nb_pkts) {
        /*
         * The order of operations here is important as the DD status
         * bit must not be read after any other descriptor fields.
         * rx_ring and rxdp are pointing to volatile data so the order
         * of accesses cannot be reordered by the compiler. If they were
         * not volatile, they could be reordered which could lead to
         * using invalid descriptor fields when read from rxd.
         */
        /* descriptor of the current packet */
        rxdp = &rx_ring[rx_id];
        /* done flag; must be read first */
        status = rxdp->status;
        if (! (status & E1000_RXD_STAT_DD))
            break;
        /* take a copy */
        rxd = *rxdp;

        /*
         * End of packet.
         *
         * If the E1000_RXD_STAT_EOP flag is not set, the RX packet is
         * likely to be invalid and to be dropped by the various
         * validation checks performed by the network stack.
         *
         * Allocate a new mbuf to replenish the RX ring descriptor.
         * If the allocation fails:
         *    - arrange for that RX descriptor to be the first one
         *      being parsed the next time the receive function is
         *      invoked [on the same queue].
         *
         *    - Stop parsing the RX ring and return immediately.
         *
         * This policy do not drop the packet received in the RX
         * descriptor for which the allocation of a new mbuf failed.
         * Thus, it allows that packet to be later retrieved if
         * mbuf have been freed in the mean time.
         * As a side effect, holding RX descriptors instead of
         * systematically giving them back to the NIC may lead to
         * RX ring exhaustion situations.
         * However, the NIC can gracefully prevent such situations
         * to happen by sending specific "back-pressure" flow control
         * frames to its peer(s).
         */
        PMD_RX_LOG(DEBUG, "\nport_id=%u queue_id=%u rx_id=%u "
            "status=0x%x pkt_len=%u\n",
            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
            (unsigned) rx_id, (unsigned) status,
            (unsigned) rte_le_to_cpu_16(rxd.length));

        /* allocate a new mbuf to hand to the driver */
        nmb = rte_rxmbuf_alloc(rxq->mb_pool);
        if (nmb == NULL) {
            PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
                "queue_id=%u\n",
                (unsigned) rxq->port_id,
                (unsigned) rxq->queue_id);
            rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
            break;
        }

        /* this descriptor is now held by the upper-layer software */
        nb_hold++;
        /* the mbuf just received */
        rxe = &sw_ring[rx_id];
        /* advance the RX position, wrapping around the circular ring */
        rx_id++;
        if (rx_id == rxq->nb_rx_desc)
            rx_id = 0;

        /* pull the mbuf into cache for the next loop iteration */
        /* Prefetch next mbuf while processing current one. */
        rte_em_prefetch(sw_ring[rx_id].mbuf);

        /* prefetch the next descriptor and mbuf pointers for upcoming
         * iterations; one cache line holds 4 descriptors (64 bytes) */
        /*
         * When next RX descriptor is on a cache-line boundary,
         * prefetch the next 4 RX descriptors and the next 8 pointers
         * to mbufs.
         */
        if ((rx_id & 0x3) == 0) {
            rte_em_prefetch(&rx_ring[rx_id]);
            rte_em_prefetch(&sw_ring[rx_id]);
        }

        /* Rearm RXD: attach new mbuf and reset status to zero. */
        /* swap the new mbuf into the sw_ring entry */
        rxm = rxe->mbuf;
        rxe->mbuf = nmb;
        dma_addr =
            rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
        rxdp->buffer_addr = dma_addr;
        /* reset the status of the current descriptor */
        rxdp->status = 0;

        /*
         * Initialize the returned mbuf.
         * 1) setup generic mbuf fields:
         *    - number of segments,
         *    - next segment,
         *    - packet length,
         *    - RX port identifier.
         * 2) integrate hardware offload data, if any:
         *    - RSS flag & hash,
         *    - IP checksum flag,
         *    - VLAN TCI, if any,
         *    - error flags.
         */
        pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.length) -
                rxq->crc_len);
        rxm->pkt.data = (char*) rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
        rte_packet_prefetch(rxm->pkt.data);
        rxm->pkt.nb_segs = 1;
        rxm->pkt.next = NULL;
        rxm->pkt.pkt_len = pkt_len;
        rxm->pkt.data_len = pkt_len;
        rxm->pkt.in_port = rxq->port_id;

        rxm->ol_flags = rx_desc_status_to_pkt_flags(status);
        rxm->ol_flags = (uint16_t)(rxm->ol_flags |
                rx_desc_error_to_pkt_flags(rxd.errors));

        /* Only valid if PKT_RX_VLAN_PKT set in pkt_flags */
        rxm->pkt.vlan_macip.f.vlan_tci = rte_le_to_cpu_16(rxd.special);

        /* return the received mbuf to the caller */
        /*
         * Store the mbuf address into the next entry of the array
         * of returned packets.
         */
        rx_pkts[nb_rx++] = rxm;
    }
    /* update the RX position */
    rxq->rx_tail = rx_id;

    /* update the count of descriptors held by the upper-layer software */
    /*
     * If the number of free RX descriptors is greater than the RX free
     * threshold of the queue, advance the Receive Descriptor Tail (RDT)
     * register.
     * Update the RDT with the value of the last processed RX descriptor
     * minus 1, to guarantee that the RDT register is never equal to the
     * RDH register, which creates a "full" ring situtation from the
     * hardware point of view...
     */
    nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
    if (nb_hold > rxq->rx_free_thresh) {
        PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
            "nb_hold=%u nb_rx=%u\n",
            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
            (unsigned) rx_id, (unsigned) nb_hold,
            (unsigned) nb_rx);
        rx_id = (uint16_t) ((rx_id == 0) ?
            (rxq->nb_rx_desc - 1) : (rx_id - 1));
        E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
        nb_hold = 0;
    }
    rxq->nb_rx_hold = nb_hold;
    return (nb_rx);
}


The transmit function:

static inline uint16_t
rte_eth_tx_burst(uint8_t port_id, uint16_t queue_id,
         struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
    struct rte_eth_dev *dev;

    dev = &rte_eth_devices[port_id];
    return (*dev->tx_pkt_burst)(dev->data->tx_queues[queue_id], tx_pkts, nb_pkts);
}


which dispatches to the PMD transmit function:


uint16_t
eth_em_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
        uint16_t nb_pkts)
{
    struct em_tx_queue *txq;
    struct em_tx_entry *sw_ring;
    struct em_tx_entry *txe, *txn;
    volatile struct e1000_data_desc *txr;
    volatile struct e1000_data_desc *txd;
    struct rte_mbuf     *tx_pkt;
    struct rte_mbuf     *m_seg;
    uint64_t buf_dma_addr;
    uint32_t popts_spec;
    uint32_t cmd_type_len;
    uint16_t slen;
    uint16_t ol_flags;
    uint16_t tx_id;
    uint16_t tx_last;
    uint16_t nb_tx;
    uint16_t nb_used;
    uint16_t tx_ol_req;
    uint32_t ctx;
    uint32_t new_ctx;
    union rte_vlan_macip hdrlen;

    txq = tx_queue;
    sw_ring = txq->sw_ring;
    txr     = txq->tx_ring;
    /* TX position */
    tx_id   = txq->tx_tail;
    /* reclaim already-transmitted mbufs first, then fill in the new ones */
    txe = &sw_ring[tx_id];

    /* clean up if too few TX descriptors are available */
    /* Determine if the descriptor ring needs to be cleaned. */
    if ((txq->nb_tx_desc - txq->nb_tx_free) > txq->tx_free_thresh) {
        em_xmit_cleanup(txq);
    }

    /* nb_pkts is the total number of packets to send (32) */
    /* TX loop */
    for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
        new_ctx = 0;
        /* pointer to the mbuf to transmit */
        tx_pkt = *tx_pkts++;
        /* pull into L1/L2 cache, used when freeing the mbuf */
        RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);

        /*
         * Determine how many (if any) context descriptors
         * are needed for offload functionality.
         */
        ol_flags = tx_pkt->ol_flags;

        /* If hardware offload required */
        tx_ol_req = (uint16_t)(ol_flags & (PKT_TX_IP_CKSUM |
                            PKT_TX_L4_MASK));
        if (tx_ol_req) {
            hdrlen = tx_pkt->pkt.vlan_macip;
            /* check whether a new context descriptor is needed */
            /* If new context to be built or reuse the exist ctx. */
            ctx = what_ctx_update(txq, tx_ol_req, hdrlen);

            /* Only allocate context descriptor if required*/
            new_ctx = (ctx == EM_CTX_NUM);
        }

        /* descriptors needed = number of packet segments, plus one if a
         * context descriptor is required */
        /*
         * Keep track of how many descriptors are used this loop
         * This will always be the number of segments + the number of
         * Context descriptors required to transmit the packet
         */
        nb_used = (uint16_t)(tx_pkt->pkt.nb_segs + new_ctx);

        /* last position used; allocation starts at tx_id, hence the -1 */
        /*
         * The number of descriptors that must be allocated for a
         * packet is the number of segments of that packet, plus 1
         * Context Descriptor for the hardware offload, if any.
         * Determine the last TX descriptor to allocate in the TX ring
         * for the packet, starting from the current position (tx_id)
         * in the ring.
         */
        tx_last = (uint16_t) (tx_id + nb_used - 1);

        /* wrap around */
        /* Circular ring */
        if (tx_last >= txq->nb_tx_desc)
            tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);

        PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
            " tx_first=%u tx_last=%u\n",
            (unsigned) txq->port_id,
            (unsigned) txq->queue_id,
            (unsigned) tx_pkt->pkt.pkt_len,
            (unsigned) tx_id,
            (unsigned) tx_last);

        /*
         * Make sure there are enough TX descriptors available to
         * transmit the entire packet.
         * nb_used better be less than or equal to txq->tx_rs_thresh
         */
        while (unlikely (nb_used > txq->nb_tx_free)) {
            PMD_TX_FREE_LOG(DEBUG,
                    "Not enough free TX descriptors "
                    "nb_used=%4u nb_free=%4u "
                    "(port=%d queue=%d)",
                    nb_used, txq->nb_tx_free,
                    txq->port_id, txq->queue_id);

            if (em_xmit_cleanup(txq) != 0) {
                /* Could not clean any descriptors */
                if (nb_tx == 0)
                    return (0);
                goto end_of_tx;
            }
        }

        /*
         * By now there are enough free TX descriptors to transmit
         * the packet.
         */

        /*
         * Set common flags of all TX Data Descriptors.
         *
         * The following bits must be set in all Data Descriptors:
         *    - E1000_TXD_DTYP_DATA
         *    - E1000_TXD_DTYP_DEXT
         *
         * The following bits must be set in the first Data Descriptor
         * and are ignored in the other ones:
         *    - E1000_TXD_POPTS_IXSM
         *    - E1000_TXD_POPTS_TXSM
         *
         * The following bits must be set in the last Data Descriptor
         * and are ignored in the other ones:
         *    - E1000_TXD_CMD_VLE
         *    - E1000_TXD_CMD_IFCS
         *
         * The following bits must only be set in the last Data
         * Descriptor:
         *   - E1000_TXD_CMD_EOP
         *
         * The following bits can be set in any Data Descriptor, but
         * are only set in the last Data Descriptor:
         *   - E1000_TXD_CMD_RS
         */
        cmd_type_len = E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D |
            E1000_TXD_CMD_IFCS;
        popts_spec = 0;

        /* Set VLAN Tag offload fields. */
        if (ol_flags & PKT_TX_VLAN_PKT) {
            cmd_type_len |= E1000_TXD_CMD_VLE;
            popts_spec = tx_pkt->pkt.vlan_macip.f.vlan_tci <<
                E1000_TXD_VLAN_SHIFT;
        }

        if (tx_ol_req) {
            /*
             * Setup the TX Context Descriptor if required
             */
            if (new_ctx) {
                volatile struct e1000_context_desc *ctx_txd;

                /* if a context descriptor is needed, it occupies the
                 * descriptor slot at tx_id */
                ctx_txd = (volatile struct e1000_context_desc *)
                    &txr[tx_id];
                /* next tx descriptor */
                txn = &sw_ring[txe->next_id];
                RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);

                if (txe->mbuf != NULL) {
                    rte_pktmbuf_free_seg(txe->mbuf);
                    txe->mbuf = NULL;
                }

                /* store the context values into txq */
                em_set_xmit_ctx(txq, ctx_txd, tx_ol_req,
                    hdrlen);

                txe->last_id = tx_last;
                /* advance tx_id and txe to the next entry */
                tx_id = txe->next_id;
                txe = txn;
            }

            /*
             * Setup the TX Data Descriptor,
             * This path will go through
             * whatever new/reuse the context descriptor
             */
            popts_spec |= tx_desc_cksum_flags_to_upper(ol_flags);
        }

        m_seg = tx_pkt;
        do {
            txd = &txr[tx_id];
            txn = &sw_ring[txe->next_id];

            /* reclaim the previously transmitted mbuf: its buffer address
             * was already written into the TX descriptor, so the mbuf is
             * no longer needed */
            if (txe->mbuf != NULL)
                rte_pktmbuf_free_seg(txe->mbuf);
            /* attach the current mbuf to txe */
            txe->mbuf = m_seg;

            /*
             * Set up Transmit Data Descriptor.
             */
            slen = m_seg->pkt.data_len;
            buf_dma_addr = RTE_MBUF_DATA_DMA_ADDR(m_seg);

            txd->buffer_addr = rte_cpu_to_le_64(buf_dma_addr);
            txd->lower.data = rte_cpu_to_le_32(cmd_type_len | slen);
            txd->upper.data = rte_cpu_to_le_32(popts_spec);

            txe->last_id = tx_last;
            /* advance tx_id */
            tx_id = txe->next_id;
            txe = txn;
            m_seg = m_seg->pkt.next;
        } while (m_seg != NULL);

        /* driver-specific flags: VLAN, IP checksum and the like; skipped */
        /*
         * The last packet data descriptor needs End Of Packet (EOP)
         */
        cmd_type_len |= E1000_TXD_CMD_EOP;
        txq->nb_tx_used = (uint16_t)(txq->nb_tx_used + nb_used);
        txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_used);

        /* Set RS bit only on threshold packets' last descriptor */
        if (txq->nb_tx_used >= txq->tx_rs_thresh) {
            PMD_TX_FREE_LOG(DEBUG,
                    "Setting RS bit on TXD id="
                    "%4u (port=%d queue=%d)",
                    tx_last, txq->port_id, txq->queue_id);
            cmd_type_len |= E1000_TXD_CMD_RS;
            /* Update txq RS bit counters */
            txq->nb_tx_used = 0;
        }
        txd->lower.data |= rte_cpu_to_le_32(cmd_type_len);
    }
end_of_tx:
    rte_wmb();

    /* notify the hardware that packets are pending */
    /*
     * Set the Transmit Descriptor Tail (TDT)
     */
    PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
        (unsigned) txq->port_id, (unsigned) txq->queue_id,
        (unsigned) tx_id, (unsigned) nb_tx);
    E1000_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
    /* update the TX queue position */
    txq->tx_tail = tx_id;

    return (nb_tx);
}
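
em_xmit_cleanup, called twice above, reclaims descriptors in batches of tx_rs_thresh by checking the DD bit on the last descriptor of the next batch. A hedged sketch of that logic (simplified; not the verbatim driver source):

/* Sketch of em_xmit_cleanup: if the DD bit is set on the last descriptor of
 * the next tx_rs_thresh batch, the whole batch has completed and its
 * descriptors can be handed back to software. Simplified. */
static int
em_xmit_cleanup_sketch(struct em_tx_queue *txq)
{
    struct em_tx_entry *sw_ring = txq->sw_ring;
    volatile struct e1000_data_desc *txr = txq->tx_ring;
    uint16_t last_desc_cleaned = txq->last_desc_cleaned;
    uint16_t nb_tx_desc = txq->nb_tx_desc;
    uint16_t desc_to_clean_to;
    uint16_t nb_tx_to_clean;

    /* last descriptor of the batch that must have completed */
    desc_to_clean_to = (uint16_t)(last_desc_cleaned + txq->tx_rs_thresh);
    if (desc_to_clean_to >= nb_tx_desc)
        desc_to_clean_to = (uint16_t)(desc_to_clean_to - nb_tx_desc);

    /* a multi-segment packet may end past the batch boundary */
    desc_to_clean_to = sw_ring[desc_to_clean_to].last_id;
    if (!(txr[desc_to_clean_to].upper.fields.status & E1000_TXD_STAT_DD))
        return -1;    /* hardware has not finished this batch yet */

    /* number of descriptors freed, accounting for ring wrap-around */
    if (desc_to_clean_to > last_desc_cleaned)
        nb_tx_to_clean = (uint16_t)(desc_to_clean_to - last_desc_cleaned);
    else
        nb_tx_to_clean = (uint16_t)(nb_tx_desc -
                last_desc_cleaned + desc_to_clean_to);

    /* clear the status so this descriptor is not seen as done again */
    txr[desc_to_clean_to].upper.fields.status = 0;

    txq->last_desc_cleaned = desc_to_clean_to;
    txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + nb_tx_to_clean);
    return 0;
}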

