【OVS 2.5.0 Source Code Analysis】Upcall Handler Thread (1)


The upcall handler threads process packets that the datapath sends up to user space via the netlink mechanism. Their entry point is udpif_upcall_handler(). In this first installment we analyze how such a thread receives upcall messages.
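For context: OVS starts a pool of these handler threads, one per handler_id, and each thread later polls its own set of netlink sockets. A minimal standalone sketch of that pattern, using plain pthreads rather than OVS's thread wrappers (the struct and function names here are illustrative, not OVS's):

/* One "handler" struct per thread, each carrying a handler_id that later
 * selects the thread's own set of netlink channels. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct handler {
    uint32_t handler_id;   /* index used to pick this thread's channels */
    pthread_t thread;
};

static void *
handler_main(void *arg)
{
    struct handler *h = arg;
    printf("handler %u running\n", h->handler_id);
    /* ... per-thread receive loop would go here ... */
    return NULL;
}

int
main(void)
{
    enum { N_HANDLERS = 4 };
    struct handler handlers[N_HANDLERS];

    for (uint32_t i = 0; i < N_HANDLERS; i++) {
        handlers[i].handler_id = i;
        pthread_create(&handlers[i].thread, NULL, handler_main, &handlers[i]);
    }
    for (uint32_t i = 0; i < N_HANDLERS; i++) {
        pthread_join(handlers[i].thread, NULL);
    }
    return 0;
}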

1. The udpif_upcall_handler function

/* The upcall handler thread tries to read a batch of UPCALL_MAX_BATCH
 * upcalls from dpif, processes the batch and installs corresponding flows
 * in dpif. */
static void *
udpif_upcall_handler(void *arg)
{
    struct handler *handler = arg;
    struct udpif *udpif = handler->udpif;

    while (!latch_is_set(&handler->udpif->exit_latch)) {
        if (recv_upcalls(handler)) {
            poll_immediate_wake();    // don't sleep: more upcalls are likely pending
        } else {
            dpif_recv_wait(udpif->dpif, handler->handler_id);    // arrange to wake up on netlink input
            latch_wait(&udpif->exit_latch);
        }
        poll_block();    // block in the poll loop
    }

    return NULL;
}
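The loop is built on OVS's poll-loop module: when recv_upcalls() returns non-zero, poll_immediate_wake() arranges for the following poll_block() to return at once so the thread can fetch another batch; otherwise dpif_recv_wait() registers the handler's netlink fds and poll_block() sleeps until one becomes readable (or the exit latch fires). A minimal analogue of this wake-or-wait idiom with plain poll(2); try_recv_batch() is a hypothetical stand-in for recv_upcalls():

#include <poll.h>
#include <stdbool.h>

bool try_recv_batch(int fd);   /* hypothetical stand-in for recv_upcalls() */

static void
handler_loop(int sock_fd, volatile bool *exiting)   /* sock_fd stands in for the handler's netlink sockets */
{
    while (!*exiting) {
        bool more_work = try_recv_batch(sock_fd);

        struct pollfd pfd = { .fd = sock_fd, .events = POLLIN };
        /* timeout 0 plays the role of poll_immediate_wake(): don't sleep,
         * loop again at once; timeout -1 plays the role of dpif_recv_wait()
         * followed by poll_block(): sleep until the fd is readable. */
        poll(&pfd, 1, more_work ? 0 : -1);
    }
}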
2. The recv_upcalls function

static size_t
recv_upcalls(struct handler *handler)
{
    struct udpif *udpif = handler->udpif;
    uint64_t recv_stubs[UPCALL_MAX_BATCH][512 / 8];
    struct ofpbuf recv_bufs[UPCALL_MAX_BATCH];
    struct dpif_upcall dupcalls[UPCALL_MAX_BATCH];
    struct upcall upcalls[UPCALL_MAX_BATCH];
    struct flow flows[UPCALL_MAX_BATCH];
    size_t n_upcalls, i;

    n_upcalls = 0;
    while (n_upcalls < UPCALL_MAX_BATCH) {
        struct ofpbuf *recv_buf = &recv_bufs[n_upcalls];
        struct dpif_upcall *dupcall = &dupcalls[n_upcalls];
        struct upcall *upcall = &upcalls[n_upcalls];
        struct flow *flow = &flows[n_upcalls];
        unsigned int mru;
        int error;

        ofpbuf_use_stub(recv_buf, recv_stubs[n_upcalls],
                        sizeof recv_stubs[n_upcalls]);
        if (dpif_recv(udpif->dpif, handler->handler_id, dupcall, recv_buf)) {    // receive one upcall from the datapath
            ofpbuf_uninit(recv_buf);
            break;
        }

        if (odp_flow_key_to_flow(dupcall->key, dupcall->key_len, flow)
            == ODP_FIT_ERROR) {
            goto free_dupcall;
        }

        if (dupcall->mru) {
            mru = nl_attr_get_u16(dupcall->mru);
        } else {
            mru = 0;
        }

        error = upcall_receive(upcall, udpif->backer, &dupcall->packet,
                               dupcall->type, dupcall->userdata, flow, mru,
                               &dupcall->ufid, PMD_ID_NULL);
        if (error) {
            if (error == ENODEV) {
                /* Received packet on datapath port for which we couldn't
                 * associate an ofproto.  This can happen if a port is removed
                 * while traffic is being received.  Print a rate-limited
                 * message in case it happens frequently. */
                dpif_flow_put(udpif->dpif, DPIF_FP_CREATE, dupcall->key,
                              dupcall->key_len, NULL, 0, NULL, 0,
                              &dupcall->ufid, PMD_ID_NULL, NULL);
                VLOG_INFO_RL(&rl, "received packet on unassociated datapath "
                             "port %"PRIu32, flow->in_port.odp_port);
            }
            goto free_dupcall;
        }

        upcall->key = dupcall->key;
        upcall->key_len = dupcall->key_len;
        upcall->ufid = &dupcall->ufid;

        upcall->out_tun_key = dupcall->out_tun_key;
        upcall->actions = dupcall->actions;

        if (vsp_adjust_flow(upcall->ofproto, flow, &dupcall->packet)) {
            upcall->vsp_adjusted = true;
        }

        pkt_metadata_from_flow(&dupcall->packet.md, flow);
        flow_extract(&dupcall->packet, flow);

        error = process_upcall(udpif, upcall,
                               &upcall->odp_actions, &upcall->wc);
        if (error) {
            goto cleanup;
        }

        n_upcalls++;
        continue;

cleanup:
        upcall_uninit(upcall);
free_dupcall:
        dp_packet_uninit(&dupcall->packet);
        ofpbuf_uninit(recv_buf);
    }

    if (n_upcalls) {
        handle_upcalls(handler->udpif, upcalls, n_upcalls);
        for (i = 0; i < n_upcalls; i++) {
            dp_packet_uninit(&dupcalls[i].packet);
            ofpbuf_uninit(&recv_bufs[i]);
            upcall_uninit(&upcalls[i]);
        }
    }

    return n_upcalls;
}
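Worth noting is the recv_stubs array: ofpbuf_use_stub() points each recv_buf at a 512-byte stack stub, so receiving a typically-sized upcall costs no heap allocation; only a message that outgrows the stub forces a switch to malloc'd storage, which the matching ofpbuf_uninit() then frees. A simplified sketch of that stub-buffer idea (the types and names below are mine, not OVS's ofpbuf API):

#include <stdlib.h>
#include <string.h>

struct stub_buf {
    void *base;        /* current storage (stub or heap) */
    size_t allocated;  /* bytes available */
    int on_heap;       /* 1 once we had to malloc */
};

static void
stub_buf_init(struct stub_buf *b, void *stub, size_t stub_size)   /* like ofpbuf_use_stub() */
{
    b->base = stub;
    b->allocated = stub_size;
    b->on_heap = 0;
}

static void
stub_buf_reserve(struct stub_buf *b, size_t n)
{
    if (n > b->allocated) {
        /* Outgrew the stub: move to the heap.  (A real implementation
         * would copy only the bytes actually in use.) */
        void *heap = malloc(n);
        memcpy(heap, b->base, b->allocated);
        if (b->on_heap) {
            free(b->base);
        }
        b->base = heap;
        b->allocated = n;
        b->on_heap = 1;
    }
}

static void
stub_buf_uninit(struct stub_buf *b)   /* like ofpbuf_uninit(): a no-op for pure stub storage */
{
    if (b->on_heap) {
        free(b->base);
    }
}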
3. The dpif_recv function

/* Polls for an upcall from 'dpif' for an upcall handler.  Since there
 * can be multiple poll loops, 'handler_id' is needed as index to
 * identify the corresponding poll loop.  If successful, stores the upcall
 * into '*upcall', using 'buf' for storage.  Should only be called if
 * 'recv_set' has been used to enable receiving packets from 'dpif'.
 *
 * 'upcall->key' and 'upcall->userdata' point into data in the caller-provided
 * 'buf', so their memory cannot be freed separately from 'buf'.
 *
 * The caller owns the data of 'upcall->packet' and may modify it.  If
 * packet's headroom is exhausted as it is manipulated, 'upcall->packet'
 * will be reallocated.  This requires the data of 'upcall->packet' to be
 * released with ofpbuf_uninit() before 'upcall' is destroyed.  However,
 * when an error is returned, the 'upcall->packet' may be uninitialized
 * and should not be released.
 *
 * Returns 0 if successful, otherwise a positive errno value.  Returns EAGAIN
 * if no upcall is immediately available. */
int
dpif_recv(struct dpif *dpif, uint32_t handler_id, struct dpif_upcall *upcall,
          struct ofpbuf *buf)
{
    int error = EAGAIN;

    if (dpif->dpif_class->recv) {
        error = dpif->dpif_class->recv(dpif, handler_id, upcall, buf);    // for the kernel datapath this calls dpif_netlink_recv()
        if (!error) {
            dpif_print_packet(dpif, upcall);
        } else if (error != EAGAIN) {
            log_operation(dpif, "recv", error);
        }
    }
    return error;
}
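dpif_class is a table of function pointers, one per datapath provider, which is why dpif_recv() itself is provider-agnostic; for the kernel datapath the recv slot points at dpif_netlink_recv(), analyzed next. A compact sketch of that dispatch pattern (types are simplified and the my_ names are illustrative):

#include <errno.h>
#include <stdint.h>

struct my_dpif;     /* simplified stand-ins for struct dpif,      */
struct my_upcall;   /* struct dpif_upcall and struct ofpbuf       */
struct my_buf;

struct my_dpif_class {
    const char *type;
    int (*recv)(struct my_dpif *, uint32_t handler_id,
                struct my_upcall *, struct my_buf *);
};

struct my_dpif {
    const struct my_dpif_class *dpif_class;
};

int
my_dpif_recv(struct my_dpif *dpif, uint32_t handler_id,
             struct my_upcall *upcall, struct my_buf *buf)
{
    /* Dispatch through the provider's vtable, as dpif_recv() does. */
    return dpif->dpif_class->recv
           ? dpif->dpif_class->recv(dpif, handler_id, upcall, buf)
           : EAGAIN;   /* provider does not support upcalls */
}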
4. The dpif_netlink_recv function

static int
dpif_netlink_recv(struct dpif *dpif_, uint32_t handler_id,
                  struct dpif_upcall *upcall, struct ofpbuf *buf)
{
    struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
    int error;

    fat_rwlock_rdlock(&dpif->upcall_lock);
#ifdef _WIN32
    error = dpif_netlink_recv_windows(dpif, handler_id, upcall, buf);
#else
    error = dpif_netlink_recv__(dpif, handler_id, upcall, buf);   // Linux path
#endif
    fat_rwlock_unlock(&dpif->upcall_lock);

    return error;
}
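fat_rwlock_rdlock() takes the upcall lock in shared mode, so multiple handler threads can sit in the receive path concurrently; exclusive (write) acquisition is presumably reserved for reconfiguring the channel set, e.g. when ports change. The same locking discipline with a plain pthread rwlock:

#include <pthread.h>

static pthread_rwlock_t upcall_lock = PTHREAD_RWLOCK_INITIALIZER;

static int
recv_with_lock(void)
{
    int error = 0;

    pthread_rwlock_rdlock(&upcall_lock);   /* shared: many receivers at once */
    /* ... per-handler receive would run here ... */
    pthread_rwlock_unlock(&upcall_lock);
    return error;
}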
5. The dpif_netlink_recv__ function

static int
dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
                    struct dpif_upcall *upcall, struct ofpbuf *buf)
    OVS_REQ_RDLOCK(dpif->upcall_lock)
{
    struct dpif_handler *handler;
    int read_tries = 0;

    if (!dpif->handlers || handler_id >= dpif->n_handlers) {
        return EAGAIN;
    }

    handler = &dpif->handlers[handler_id];
    if (handler->event_offset >= handler->n_events) {
        int retval;

        handler->event_offset = handler->n_events = 0;
        do {
            retval = epoll_wait(handler->epoll_fd, handler->epoll_events,    // timeout 0: poll for ready channels without blocking
                                dpif->uc_array_size, 0);
        } while (retval < 0 && errno == EINTR);

        if (retval < 0) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
            VLOG_WARN_RL(&rl, "epoll_wait failed (%s)", ovs_strerror(errno));
        } else if (retval > 0) {
            handler->n_events = retval;    // number of channels with pending events
        }
    }

    while (handler->event_offset < handler->n_events) {
        int idx = handler->epoll_events[handler->event_offset].data.u32;    // channel index, stored in the epoll data at registration time
        struct dpif_channel *ch = &dpif->handlers[handler_id].channels[idx];    // multiple channels, each with its own sock
        handler->event_offset++;

        for (;;) {
            int dp_ifindex;
            int error;

            if (++read_tries > 50) {
                return EAGAIN;
            }

            error = nl_sock_recv(ch->sock, buf, false);    // receive one upcall
            if (error == ENOBUFS) {
                /* ENOBUFS typically means that we've received so many
                 * packets that the buffer overflowed.  Try again
                 * immediately because there's almost certainly a packet
                 * waiting for us. */
                report_loss(dpif, ch, idx, handler_id);
                continue;
            }

            ch->last_poll = time_msec();
            if (error) {
                if (error == EAGAIN) {
                    break;
                }
                return error;
            }

            error = parse_odp_packet(dpif, buf, upcall, &dp_ifindex);
            if (!error && dp_ifindex == dpif->dp_ifindex) {
                return 0;
            } else if (error) {
                return error;
            }
        }
    }

    return EAGAIN;
}
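Two details are worth calling out. First, epoll_wait() is invoked with a timeout of 0, so this function never sleeps; the actual blocking happens in poll_block() back in section 1. Second, the u32 value read back from each ready event is the channel index that was stored in the epoll data when the channel's socket was registered. A standalone sketch of that bookkeeping, using pipes as stand-ins for the per-vport netlink sockets:

#include <sys/epoll.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    enum { N_CHANNELS = 3, MAX_EVENTS = 8 };
    int epfd = epoll_create1(0);
    int pipes[N_CHANNELS][2];

    /* Register each channel's read fd, tagging it with its index in
     * event.data.u32 -- this is what dpif_netlink_recv__() reads back
     * as 'idx'. */
    for (uint32_t idx = 0; idx < N_CHANNELS; idx++) {
        pipe(pipes[idx]);
        struct epoll_event ev = { .events = EPOLLIN, .data.u32 = idx };
        epoll_ctl(epfd, EPOLL_CTL_ADD, pipes[idx][0], &ev);
    }

    write(pipes[1][1], "x", 1);   /* make channel 1 readable */

    struct epoll_event events[MAX_EVENTS];
    /* timeout 0, as in dpif_netlink_recv__(): poll without sleeping. */
    int n = epoll_wait(epfd, events, MAX_EVENTS, 0);
    for (int i = 0; i < n; i++) {
        printf("channel %u is readable\n", events[i].data.u32);
    }
    return 0;
}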
6. The nl_sock_recv function

/* Tries to receive a Netlink message from the kernel on 'sock' into 'buf'.
 * If 'wait' is true, waits for a message to be ready.  Otherwise, fails with
 * EAGAIN if the 'sock' receive buffer is empty.
 *
 * The caller must have initialized 'buf' with an allocation of at least
 * NLMSG_HDRLEN bytes.  For best performance, the caller should allocate
 * enough space for a "typical" message.
 *
 * On success, returns 0 and replaces 'buf''s previous content by the received
 * message.  This function expands 'buf''s allocated memory, as necessary, to
 * hold the actual size of the received message.
 *
 * On failure, returns a positive errno value and clears 'buf' to zero length.
 * 'buf' retains its previous memory allocation.
 *
 * Regardless of success or failure, this function resets 'buf''s headroom to
 * 0. */
int
nl_sock_recv(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
{
    return nl_sock_recv__(sock, buf, wait);
}
7. The nl_sock_recv__ function

static int
nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
{
    /* We can't accurately predict the size of the data to be received.  The
     * caller is supposed to have allocated enough space in 'buf' to handle the
     * "typical" case.  To handle exceptions, we make available enough space in
     * 'tail' to allow Netlink messages to be up to 64 kB long (a reasonable
     * figure since that's the maximum length of a Netlink attribute). */
    struct nlmsghdr *nlmsghdr;
    uint8_t tail[65536];
    struct iovec iov[2];
    struct msghdr msg;
    ssize_t retval;
    int error;

    ovs_assert(buf->allocated >= sizeof *nlmsghdr);
    ofpbuf_clear(buf);

    iov[0].iov_base = buf->base;
    iov[0].iov_len = buf->allocated;
    iov[1].iov_base = tail;
    iov[1].iov_len = sizeof tail;

    memset(&msg, 0, sizeof msg);
    msg.msg_iov = iov;
    msg.msg_iovlen = 2;

    /* Receive a Netlink message from the kernel.
     *
     * This works around a kernel bug in which the kernel returns an error code
     * as if it were the number of bytes read.  It doesn't actually modify
     * anything in the receive buffer in that case, so we can initialize the
     * Netlink header with an impossible message length and then, upon success,
     * check whether it changed. */
    nlmsghdr = buf->base;
    do {
        nlmsghdr->nlmsg_len = UINT32_MAX;
#ifdef _WIN32
        DWORD bytes;
        if (!DeviceIoControl(sock->handle, sock->read_ioctl,
                             NULL, 0, tail, sizeof tail, &bytes, NULL)) {
            VLOG_DBG_RL(&rl, "fatal driver failure in transact: %s",
                ovs_lasterror_to_string());
            retval = -1;
            /* XXX: Map to a more appropriate error. */
            errno = EINVAL;
        } else {
            retval = bytes;
            if (retval == 0) {
                retval = -1;
                errno = EAGAIN;
            } else {
                if (retval >= buf->allocated) {
                    ofpbuf_reinit(buf, retval);
                    nlmsghdr = buf->base;
                    nlmsghdr->nlmsg_len = UINT32_MAX;
                }
                memcpy(buf->data, tail, retval);
                buf->size = retval;
            }
        }
#else
        retval = recvmsg(sock->fd, &msg, wait ? 0 : MSG_DONTWAIT);    // receive the netlink message via the recvmsg() syscall
#endif
        error = (retval < 0 ? errno
                 : retval == 0 ? ECONNRESET /* not possible? */
                 : nlmsghdr->nlmsg_len != UINT32_MAX ? 0
                 : retval);
    } while (error == EINTR);
    if (error) {
        if (error == ENOBUFS) {
            /* Socket receive buffer overflow dropped one or more messages that
             * the kernel tried to send to us. */
            COVERAGE_INC(netlink_overflow);
        }
        return error;
    }

    if (msg.msg_flags & MSG_TRUNC) {
        VLOG_ERR_RL(&rl, "truncated message (longer than %"PRIuSIZE" bytes)",
                    sizeof tail);
        return E2BIG;
    }

    if (retval < sizeof *nlmsghdr
        || nlmsghdr->nlmsg_len < sizeof *nlmsghdr
        || nlmsghdr->nlmsg_len > retval) {
        VLOG_ERR_RL(&rl, "received invalid nlmsg (%"PRIuSIZE" bytes < %"PRIuSIZE")",
                    retval, sizeof *nlmsghdr);
        return EPROTO;
    }

#ifndef _WIN32
    buf->size = MIN(retval, buf->allocated);
    if (retval > buf->allocated) {
        COVERAGE_INC(netlink_recv_jumbo);
        ofpbuf_put(buf, tail, retval - buf->allocated);
    }
#endif

    log_nlmsg(__func__, 0, buf->data, buf->size, sock->protocol);
    COVERAGE_INC(netlink_received);
    return 0;
}
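The core trick here is the two-element iovec: recvmsg() fills the caller's buffer (iov[0]) first and spills any excess, up to 64 kB, into the stack-allocated tail (iov[1]), which is copied back into buf only when actually needed; MSG_TRUNC in msg_flags reveals a message too large even for that. A standalone demonstration of the scatter read, using a UNIX datagram socketpair instead of a netlink socket:

#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>
#include <stdio.h>

int
main(void)
{
    int fds[2];
    socketpair(AF_UNIX, SOCK_DGRAM, 0, fds);

    char msg[100];
    memset(msg, 'A', sizeof msg);
    send(fds[1], msg, sizeof msg, 0);

    char head[16], tail[4096];          /* "typical" buffer + overflow space */
    struct iovec iov[2] = {
        { .iov_base = head, .iov_len = sizeof head },
        { .iov_base = tail, .iov_len = sizeof tail },
    };
    struct msghdr mh = { .msg_iov = iov, .msg_iovlen = 2 };

    /* The kernel fills iov[0] first, then spills the rest into iov[1]. */
    ssize_t n = recvmsg(fds[0], &mh, MSG_DONTWAIT);
    printf("got %zd bytes; %zd spilled into tail; truncated: %s\n",
           n, n > (ssize_t) sizeof head ? n - (ssize_t) sizeof head : 0,
           (mh.msg_flags & MSG_TRUNC) ? "yes" : "no");
    return 0;
}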
In summary, user space retrieves netlink messages with the recvmsg() system call, while on the kernel side the genl (generic netlink) framework makes handling netlink messages more convenient. How the sock for each channel is created is left for the next article.
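For orientation until then: beneath OVS's wrappers (the creation path goes through nl_sock_create() in lib/netlink-socket.c), such a sock is an ordinary AF_NETLINK socket. A minimal sketch of the syscall-level steps, with error handling and multicast-group setup trimmed:

#include <linux/netlink.h>
#include <sys/socket.h>
#include <string.h>
#include <unistd.h>

int
open_genl_sock(void)
{
    /* Raw netlink socket speaking the generic netlink protocol. */
    int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
    if (fd < 0) {
        return -1;
    }

    struct sockaddr_nl addr;
    memset(&addr, 0, sizeof addr);
    addr.nl_family = AF_NETLINK;   /* nl_pid 0: kernel assigns a unique id */

    if (bind(fd, (struct sockaddr *) &addr, sizeof addr) < 0) {
        close(fd);
        return -1;
    }
    return fd;
}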


