网络数据包发送之调度层处理

来源：互联网　发布：js隐藏div的几种方式　编辑：程序博客网　时间：2024/05/22 10:27

一、网络数据包在流量调度层的路径分析

离开网络层后，内核调用dev_queue_xmit函数进入流量调度层处理，因此下面的分析都以该函数为依据。

1、首先调用netdev_pick_tx函数选择输出传输队列。如果该传输队列上的Qdisc提供了入队方法（q->enqueue），则通过__dev_xmit_skb把数据包交给调度层：数据包被插入队列，或者直接传递给dev_hard_start_xmit函数，并且调用__qdisc_run函数选择队列上的数据包发送出去，当配额用完以后当前调度结束。如果队列上的数据包还没有传输完成，则把该Qdisc挂到当前CPU的softnet_data的output_queue链表上，并触发NET_TX_SOFTIRQ软中断，等待下一次软中断被调度时继续发送。

/** *__dev_queue_xmit - transmit a buffer *@skb: buffer to transmit *@accel_priv: private data used for L2 forwarding offload * *Queue a buffer for transmission to a network device. The caller must *have set the device and priority and built the buffer before calling *this function. The function can be called from an interrupt. * *A negative errno code is returned on a failure. A success does not *guarantee the frame will be transmitted as it may be dropped due *to congestion or traffic shaping. * * ----------------------------------------------------------------------------------- *      I notice this method can also return errors from the queue disciplines, *      including NET_XMIT_DROP, which is a positive value.  So, errors can also *      be positive. * *      Regardless of the return value, the skb is consumed, so it is currently *      difficult to retry a send to this method.  (You can bump the ref count *      before sending to hold a reference for retry if you are careful.) * *      When calling this method, interrupts MUST be enabled.  This is because *      the BH enable code must have IRQs enabled so that it will not deadlock. *          --BLG */static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv){struct net_device *dev = skb->dev;struct netdev_queue *txq;struct Qdisc *q;int rc = -ENOMEM;skb_reset_mac_header(skb);/* Disable soft irqs for various locks below. Also * stops preemption for RCU. */rcu_read_lock_bh();skb_update_prio(skb);/* 选择该设备上发送的传输队列 */

txq = netdev_pick_tx(dev, skb, accel_priv);q = rcu_dereference_bh(txq->qdisc);#ifdef CONFIG_NET_CLS_ACTskb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);#endiftrace_net_dev_queue(skb);/* 如果Qdisc提供了入队列方法，则需要通过调度才能发送数据包 */

if (q->enqueue) {rc = __dev_xmit_skb(skb, q, dev, txq);goto out;}/* The device has no queue. Common case for software devices:   loopback, all the sorts of tunnels...   Really, it is unlikely that netif_tx_lock protection is necessary   here.  (f.e. loopback and IP tunnels are clean ignoring statistics   counters.)   However, it is possible, that they rely on protection   made by us here.   Check this and shot the lock. It is not prone from deadlocks.   Either shot noqueue qdisc, it is even simpler 8) */if (dev->flags & IFF_UP) {int cpu = smp_processor_id(); /* ok because BHs are off */if (txq->xmit_lock_owner != cpu) {if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)goto recursion_alert;HARD_TX_LOCK(dev, txq, cpu);if (!netif_xmit_stopped(txq)) {__this_cpu_inc(xmit_recursion);rc = dev_hard_start_xmit(skb, dev, txq);__this_cpu_dec(xmit_recursion);if (dev_xmit_complete(rc)) {HARD_TX_UNLOCK(dev, txq);goto out;}}HARD_TX_UNLOCK(dev, txq);net_crit_ratelimited("Virtual device %s asks to queue packet!\n",     dev->name);} else {/* Recursion is detected! It is possible, * unfortunately */recursion_alert:net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",     dev->name);}}rc = -ENETDOWN;rcu_read_unlock_bh();kfree_skb(skb);return rc;out:rcu_read_unlock_bh();return rc;}

2、当网络发送软中断被激活时，net_tx_action函数被调用。该函数先回收completion_queue上已经发送完成的skb，然后依次处理output_queue链表上此前未发送完、等待调度的各个Qdisc，每个Qdisc根据分配的配额发送数据包给驱动层。

static void net_tx_action(struct softirq_action *h){struct softnet_data *sd = &__get_cpu_var(softnet_data);/* 对于已经发送出去的数据包，需要回收释放skb相关内存空间 */

if (sd->completion_queue) {struct sk_buff *clist;local_irq_disable();clist = sd->completion_queue;sd->completion_queue = NULL;local_irq_enable();while (clist) {struct sk_buff *skb = clist;clist = clist->next;WARN_ON(atomic_read(&skb->users));if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))trace_consume_skb(skb);elsetrace_kfree_skb(skb, net_tx_action);__kfree_skb(skb);}}/* 如果有待发送的数据包，则在此处理 */

if (sd->output_queue) {struct Qdisc *head;local_irq_disable();head = sd->output_queue;sd->output_queue = NULL;sd->output_queue_tailp = &sd->output_queue;local_irq_enable();while (head) {struct Qdisc *q = head;spinlock_t *root_lock;head = head->next_sched;root_lock = qdisc_lock(q);if (spin_trylock(root_lock)) {smp_mb__before_clear_bit();clear_bit(__QDISC_STATE_SCHED,  &q->state);/* 根据分配的配额调度发送该Qdisc管理的数据包 */

qdisc_run(q);spin_unlock(root_lock);} else {if (!test_bit(__QDISC_STATE_DEACTIVATED,      &q->state)) {__netif_reschedule(q);} else {smp_mb__before_clear_bit();clear_bit(__QDISC_STATE_SCHED,  &q->state);}}}}}

二、流量调度算法和网络设备、传输接收队列的关系

简单来说流量调度算法与网络接口设备、设备上的传输/接收队列绑在一起。

三、应用程序API接口

调度层通过netlink接口为应用程序提供控制管理接口，从以下注册代码可知：

/* Qdisc messages. Note that tc_get_qdisc is registered for both
 * RTM_DELQDISC and RTM_GETQDISC; only the GET entry also has a dump
 * callback (tc_dump_qdisc).
 */
rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);

/* Traffic-class messages. All three go to tc_ctl_tclass; only the GET
 * entry also has a dump callback (tc_dump_tclass).
 */
rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);

四、流量调度算法

1、注册/反注册

相关函数如下，调度算法采用模块方式加载到内核中，由于比较简单，不再赘述：

int register_qdisc(struct Qdisc_ops *qops){struct Qdisc_ops *q, **qp;int rc = -EEXIST;write_lock(&qdisc_mod_lock);for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)if (!strcmp(qops->id, q->id))goto out;if (qops->enqueue == NULL)qops->enqueue = noop_qdisc_ops.enqueue;if (qops->peek == NULL) {if (qops->dequeue == NULL)qops->peek = noop_qdisc_ops.peek;elsegoto out_einval;}if (qops->dequeue == NULL)qops->dequeue = noop_qdisc_ops.dequeue;if (qops->cl_ops) {const struct Qdisc_class_ops *cops = qops->cl_ops;if (!(cops->get && cops->put && cops->walk && cops->leaf))goto out_einval;if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))goto out_einval;}qops->next = NULL;*qp = qops;rc = 0;out:write_unlock(&qdisc_mod_lock);return rc;out_einval:rc = -EINVAL;goto out;}EXPORT_SYMBOL(register_qdisc);int unregister_qdisc(struct Qdisc_ops *qops){struct Qdisc_ops *q, **qp;int err = -ENOENT;write_lock(&qdisc_mod_lock);for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)if (q == qops)break;if (q) {*qp = q->next;q->next = NULL;err = 0;}write_unlock(&qdisc_mod_lock);return err;}EXPORT_SYMBOL(unregister_qdisc);

2、流量调度算法简析

内核提供了非常多的流量调度算法，分别针对各种情景进行了优化，比如先进先出算法、黑洞算法、分类调度算法、随机早期检测算法、优先级调度算法、随机公平调度算法等，相关内容在单独章节介绍。

0 0