深入理解dpdk rte_ring无锁队列

来源：互联网　发布：matlab gui 编程实例　编辑：程序博客网　时间：2024/06/07 06:37

一、rte_ring简介

rte_ring的实质是FIFO的无锁环形队列，无锁队列的出队入队操作是rte_ring实现的关键。常用于多线程/多进程之间的通信。

ring的特点：

无锁出入队（除了cas(compare and swap)操作）
多消费/生产者同时出入队

使用方法：

1.创建一个ring对象。

接口：struct rte_ring *rte_ring_create(const char *name, unsigned count, int socket_id, unsigned flags)
其中：
name：ring的名字
count：ring队列的长度，必须是2的幂次方
socket_id：ring所在的socket
flags：指定创建的ring的属性：单/多生产者、单/多消费者两者之间的组合；0表示使用默认属性（多生产者、多消费者），不同的属性出入队的操作会有所不同
例如：struct rte_ring *r = rte_ring_create("MY_RING", 1024, rte_socket_id(), 0);

2.出入队
有不同的出入队方式（单个、bulk、burst），相关接口都在rte_ring.h中。
例如：rte_ring_enqueue和rte_ring_dequeue

这种数据结构与链表队列相比：

优点如下：

更快：只需要对sizeof(void *)大小的数据执行单次Compare-And-Swap指令，而不需要执行多次双字长的Compare-And-Swap（double-CAS）指令
比完全无锁队列简单
适用于批量入队/出队操作。因为指针存储在表中，多个对象出队并不会像链表队列那样产生大量的缓存未命中，此外，多个对象批量出队不会比单个对象出队开销大
CAS(Compare and Swap)是个原子操作

缺点如下：

大小固定
使用大量的环时，内存方面的开销比链表队列更高。空环至少包含N个指针。

二、rte_ring实现多进程间通信

rte_ring需要与rte_mempool配合使用，通过rte_mempool来共享内存。

首先primary进程创建ring和mempool，secondary进程在primary进程启动后，通过rte_ring_lookup和rte_mempool_lookup来获取ring和mempool的地址：

primary:

struct rte_ring *ring = rte_ring_create("message_ring",        ring_size, rte_socket_id(), flags);struct rte_mempool *message_pool = rte_mempool_create(        "message_pool", pool_size,        string_size, pool_cache, 0,        NULL, NULL, NULL, NULL,        rte_socket_id(), flags);

secondary:

struct rte_ring *ring = rte_ring_lookup("message_ring");struct rte_mempool *message_pool = rte_mempool_lookup(        "message_pool");

使用时，rte_mempool_get从mempool中获取一个对象，然后使用rte_ring_enqueue入队列，另一个进程通过rte_ring_dequeue来出队列，使用完成后需要rte_mempool_put将对象放回mempool：

sender:

void *msg = NULL;
if (rte_mempool_get(message_pool, &msg) < 0)
    rte_panic("Failed to get message buffer\n");
snprintf((char *)msg, string_size, "%s", "helloworld");
if (rte_ring_enqueue(ring, msg) < 0) {
    rte_mempool_put(message_pool, msg);
}

receiver:

while (!quit){    void *msg;    if (rte_ring_dequeue(recv_ring, &msg) < 0){        usleep(5);        continue;    }    printf("Received: '%s'\n", (char *)msg);    rte_mempool_put(message_pool, msg);}

三、rte_ring结构体分析

无锁环形队列的结构体如下：

struct rte_ring {
    /*
     * Note: this field kept the RTE_MEMZONE_NAMESIZE size due to ABI
     * compatibility requirements, it could be changed to RTE_RING_NAMESIZE
     * next time the ABI changes
     */
    TAILQ_ENTRY(rte_ring) next;     /**< Next in list. */
    char name[RTE_MEMZONE_NAMESIZE];    /**< Name of the ring. */
    int flags;                       /**< Flags supplied at creation. */
    const struct rte_memzone *memzone;
            /**< Memzone, if any, containing the rte_ring */

    /** Ring producer status. */
    struct prod {
        uint32_t watermark;      /**< Maximum items before EDQUOT. */
        uint32_t sp_enqueue;     /**< True, if single producer. */
        uint32_t size;           /**< Size of ring. */
        uint32_t mask;           /**< Mask (size-1) of ring. */
        /* 生产者头尾指针，生产完成后都指向队尾 */
        volatile uint32_t head;  /**< Producer head. 预生产到的地方 */
        volatile uint32_t tail;  /**< Producer tail. 实际生产完成的地方 */
    } prod __rte_cache_aligned;

    /** Ring consumer status. */
    struct cons {
        uint32_t sc_dequeue;     /**< True, if single consumer. */
        uint32_t size;           /**< Size of the ring. */
        uint32_t mask;           /**< Mask (size-1) of ring. */
        /* 消费者头尾指针，消费完成后都指向队头 */
        volatile uint32_t head;  /**< Consumer head. cgm预出队的地方 */
        volatile uint32_t tail;  /**< Consumer tail. 实际出队的地方 */
#ifdef RTE_RING_SPLIT_PROD_CONS
    } cons __rte_cache_aligned;
#else
    } cons;
#endif

#ifdef RTE_LIBRTE_RING_DEBUG
    struct rte_ring_debug_stats stats[RTE_MAX_LCORE];
#endif

    /* 队列中保存的所有对象 */
    void *ring[] __rte_cache_aligned;   /**< Memory space of ring starts here.
                                         * not volatile so need to be careful
                                         * about compiler re-ordering */
};

这里写图片描述

dpdk在rte_ring_list链表中创建一个rte_tailq_entry节点，在memzone中根据队列的大小count申请一块内存(rte_ring的大小加上count*sizeof(void *))。紧邻着rte_ring结构的void *数组用于放置入队的对象（单纯的赋值指针值）。rte_ring结构中有生产者结构prod、消费者结构cons，初始化参数之后，把rte_tailq_entry的data节点指向rte_ring结构地址。

可以注意到cons.head、cons.tail、prod.head、prod.tail的类型都是uint32_t。除此之外，队列的大小count被限制为2的幂次方。这两个条件放到一起构成了一个很巧妙的情景。因为队列的大小一般不会有2的32次方那么大，所以可以把队列看作32位索引空间中的一个窗口；当窗口的大小是2的幂次方时，32位索引空间恰好包含整数个窗口。这样，用来存放ring对象的void *指针数组只需申请一个窗口大小的空间，实际下标由索引与mask（即size-1）按位与得到。根据无符号整数运算的回环性，可以直接用(uint32_t)(prod_tail - cons_tail)计算队列中有多少已生产的对象（即使索引溢出回绕也不会出错，如 (uint32_t)(5 - 4294967295U) = 6）。

这里写图片描述

四、实现多生产/消费者同时生产/消费（同时出入队）

这里写图片描述

移动prod.head表示生产者预定的生产数量
当该生产者生产结束，且在此之前的生产也都结束后，移动prod.tail表示实际生产的位置
同样，移动cons.head表示消费者预定的消费数量
当该消费者消费结束，且在此之前的消费也都结束后，移动cons.tail表示实际消费的位置

1、多生产者入队流程：

/** * @internal Enqueue several objects on the ring (multi-producers safe).  *  * This function uses a "compare and set" instruction to move the  * producer index atomically.  *  * @param r  *   A pointer to the ring structure.  * @param obj_table  *   A pointer to a table of void * pointers (objects).  * @param n  *   The number of objects to add in the ring from the obj_table.  * @param behavior  *   RTE_RING_QUEUE_FIXED:    Enqueue a fixed number of items from a ring  *   RTE_RING_QUEUE_VARIABLE: Enqueue as many items a possible from ring  * @return  *   Depend on the behavior value  *   if behavior = RTE_RING_QUEUE_FIXED  *   - 0: Success; objects enqueue.  *   - -EDQUOT: Quota exceeded. The objects have been enqueued, but the  *     high water mark is exceeded.  *   - -ENOBUFS: Not enough room in the ring to enqueue, no object is enqueued.  *   if behavior = RTE_RING_QUEUE_VARIABLE  *   - n: Actual number of objects enqueued.  */  static inline int __attribute__((always_inline))  __rte_ring_mp_do_enqueue(struct rte_ring *r, void * const *obj_table,               unsigned n, enum rte_ring_queue_behavior behavior)  {      uint32_t prod_head, prod_next;      uint32_t cons_tail, free_entries;      const unsigned max = n;      int success;      unsigned i, rep = 0;      uint32_t mask = r->prod.mask;      int ret;      /* Avoid the unnecessary cmpset operation below, which is also      * potentially harmful when n equals 0. */      if (n == 0)          return 0;      /* move prod.head atomically */      do {          /* Reset n to the initial burst count */          n = max;          /* 1. 抢占移动prod.head */        prod_head = r->prod.head;          cons_tail = r->cons.tail;          /* The subtraction is done between two unsigned 32bits value          * (the result is always modulo 32 bits even if we have          * prod_head > cons_tail). So 'free_entries' is always between 0          * and size(ring)-1. 
*/        /* 2.检查free空间是否足够 */        free_entries = (mask + cons_tail - prod_head);          /* check that we have enough room in ring */          if (unlikely(n > free_entries)) {              if (behavior == RTE_RING_QUEUE_FIXED) {                  __RING_STAT_ADD(r, enq_fail, n);                  return -ENOBUFS;              }              else {                  /* No free entry available */                  if (unlikely(free_entries == 0)) {                      __RING_STAT_ADD(r, enq_fail, n);                      return 0;                  }                  n = free_entries;              }          }          /* 3.利用cas操作，移动r->prod.head，预约生产*/        prod_next = prod_head + n;          success = rte_atomic32_cmpset(&r->prod.head, prod_head,                            prod_next);      } while (unlikely(success == 0));      /* write entries in ring */      ENQUEUE_PTRS();      rte_smp_wmb();      /* if we exceed the watermark */      /*4.检查是否到了阈值，并添加到统计中*/     if (unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {          ret = (behavior == RTE_RING_QUEUE_FIXED) ? -EDQUOT :                  (int)(n | RTE_RING_QUOT_EXCEED);          __RING_STAT_ADD(r, enq_quota, n);      }      else {          ret = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : n;          __RING_STAT_ADD(r, enq_success, n);      }      /*      * If there are other enqueues in progress that preceded us,      * we need to wait for them to complete      */      /*5.等待之前的入队操作完成，移动实际位置*/    while (unlikely(r->prod.tail != prod_head)) {          rte_pause();          /* Set RTE_RING_PAUSE_REP_COUNT to avoid spin too long waiting          * for other thread finish. It gives pre-empted thread a chance          * to proceed and finish with ring dequeue operation. */          if (RTE_RING_PAUSE_REP_COUNT &&              ++rep == RTE_RING_PAUSE_REP_COUNT) {              rep = 0;              sched_yield();          }      }      r->prod.tail = prod_next;      return ret;  }

下面介绍当两个生产者同时添加对象到ring时发生了什么。

1）在初始状态， prod_head 和 prod_tail指向相同的位置：

这里写图片描述

在两个生产者core中（这个core可以理解成同时运行的线程或进程），各自的局部变量都保存ring->prod_head 和 ring->cons_tail。各自的局部变量prod_next索引指向ring->prod_head的下一个元素，如果是批量入队，指向下几个元素。假如ring里没有足够的空间（检查cons_tail获知），入队函数将返回error：

        prod_head = r->prod.head;          cons_tail = r->cons.tail;          ...        free_entries = (mask + cons_tail - prod_head);          ...        prod_next = prod_head + n;

2）第二步是修改ring结构体里的ring->prod_head 索引，将它指向上面提到的局部变量prod_next指向的位置：

这里写图片描述
这个操作是通过使用 Compare And Swap (CAS)执行完成的，rte_atomic32_cmpset()所做的就是CAS(compare and set)操作，是无锁队列实现的关键。Compare And Swap (CAS)包含以下原子操作：

如果ring->prod_head索引和局部变量prod_head索引不相等，CAS操作失败，代码将重新从第一步开始执行。
若相等，将ring->prod_head索引指向局部变量prod_next的位置，CAS操作成功，继续下一步处理。

在上图中，生产者core1执行成功后，生产者core2重新运行后成功。

do {
    ...
    prod_head = r->prod.head;
    cons_tail = r->cons.tail;
    ...
    success = rte_atomic32_cmpset(&r->prod.head, prod_head, prod_next);
    ...
} while (unlikely(success == 0));

3）生产者core2中CAS指令重试成功

生产者core1更新对象obj4到ring中，生产者core2更新对象obj5到ring中（CAS指令重试后执行成功的）。

这里写图片描述

   /* write entries in ring */      ENQUEUE_PTRS();      rte_smp_wmb();

4）现在每个生产者core都想更新 ring->prod_tail索引。生产者core代码中，只有ring->prod_tail等于自己局部变量prod_head才能被更新，显然从上图中可知，只有生产者core1才能满足，生产者core1完成了入队操作。

这里写图片描述

5) 一旦生产者core1更新了ring->prod_tail后，生产者core2也可以更新ring->prod_tail了。生产者core2也完成了入队操作

这里写图片描述

(4)(5)两步对应代码：

    /*      * If there are other enqueues in progress that preceded us,      * we need to wait for them to complete      */     while (unlikely(r->prod.tail != prod_head)) {          rte_pause();          /* Set RTE_RING_PAUSE_REP_COUNT to avoid spin too long waiting           * for other thread finish. It gives pre-empted thread a chance           * to proceed and finish with ring dequeue operation. */          if (RTE_RING_PAUSE_REP_COUNT &&              ++rep == RTE_RING_PAUSE_REP_COUNT) {              rep = 0;              sched_yield();          }      }      r->prod.tail = prod_next;

2、多消费者出队流程：

static inline int __attribute__((always_inline))
__rte_ring_mc_do_dequeue(struct rte_ring *r, void **obj_table,
         unsigned n, enum rte_ring_queue_behavior behavior)
{
    uint32_t cons_head, prod_tail;
    uint32_t cons_next, entries;
    const unsigned max = n;
    int success;
    unsigned i, rep = 0;
    uint32_t mask = r->prod.mask;

    /* Avoid the unnecessary cmpset operation below, which is also
     * potentially harmful when n equals 0. */
    if (n == 0)
        return 0;

    /* move cons.head atomically
     * cgm
     * 1.检查可消费的对象是否足够
     * 2.CAS消费预约 */
    do {
        /* Restore n as it may change every loop */
        n = max;

        cons_head = r->cons.head;
        prod_tail = r->prod.tail;
        /* The subtraction is done between two unsigned 32bits value
         * (the result is always modulo 32 bits even if we have
         * cons_head > prod_tail). So 'entries' is always between 0
         * and size(ring)-1. */
        entries = (prod_tail - cons_head);

        /* Set the actual entries for dequeue */
        if (n > entries) {
            if (behavior == RTE_RING_QUEUE_FIXED) {
                __RING_STAT_ADD(r, deq_fail, n);
                return -ENOENT;
            }
            else {
                if (unlikely(entries == 0)){
                    __RING_STAT_ADD(r, deq_fail, n);
                    return 0;
                }
                n = entries;
            }
        }

        cons_next = cons_head + n;
        success = rte_atomic32_cmpset(&r->cons.head, cons_head,
                          cons_next);
    } while (unlikely(success == 0));

    /* copy in table */
    DEQUEUE_PTRS();
    rte_smp_rmb();

    /*
     * If there are other dequeues in progress that preceded us,
     * we need to wait for them to complete
     * cgm 等待之前的出队操作完成
     */
    while (unlikely(r->cons.tail != cons_head)) {
        rte_pause();

        /* Set RTE_RING_PAUSE_REP_COUNT to avoid spin too long waiting
         * for other thread finish. It gives pre-empted thread a chance
         * to proceed and finish with ring dequeue operation. */
        if (RTE_RING_PAUSE_REP_COUNT &&
            ++rep == RTE_RING_PAUSE_REP_COUNT) {
            rep = 0;
            sched_yield();
        }
    }
    __RING_STAT_ADD(r, deq_success, n);
    r->cons.tail = cons_next;

    return behavior == RTE_RING_QUEUE_FIXED ? 0 : n;
}

同生产者一个道理，代码中加了点注释，就不详细解释了。

阅读全文

0 0