ovs的upcall及ofproto-dpif处理细节

来源:互联网 发布:spss 23 for mac 破解 编辑:程序博客网 时间:2024/06/18 17:58

无论是内核态datapath还是基于dpdk的用户态datapath,当flow table查不到之后都会进入upcall的处理(我喜欢管这条路径叫做慢速路径,那么datapath里就是快速路径啦~~)

upcall的处理函数udpif_upcall_handler会在udpif_start_threads里面初始化,同时创建的还有udpif_revalidator的线程

/* Starts the handler and revalidator threads, must be enclosed in * ovsrcu quiescent state. */static voidudpif_start_threads(struct udpif *udpif, size_t n_handlers,                    size_t n_revalidators){    if (udpif && n_handlers && n_revalidators) {        size_t i;        bool enable_ufid;        udpif->n_handlers = n_handlers;        udpif->n_revalidators = n_revalidators;        udpif->handlers = xzalloc(udpif->n_handlers * sizeof *udpif->handlers);        for (i = 0; i < udpif->n_handlers; i++) {            struct handler *handler = &udpif->handlers[i];            handler->udpif = udpif;            handler->handler_id = i;            handler->thread = ovs_thread_create(                "handler", udpif_upcall_handler, handler);        }        enable_ufid = ofproto_dpif_get_enable_ufid(udpif->backer);        atomic_init(&udpif->enable_ufid, enable_ufid);        dpif_enable_upcall(udpif->dpif);        ovs_barrier_init(&udpif->reval_barrier, udpif->n_revalidators);        ovs_barrier_init(&udpif->pause_barrier, udpif->n_revalidators + 1);        udpif->reval_exit = false;        udpif->pause = false;        udpif->revalidators = xzalloc(udpif->n_revalidators                                      * sizeof *udpif->revalidators);        for (i = 0; i < udpif->n_revalidators; i++) {            struct revalidator *revalidator = &udpif->revalidators[i];            revalidator->udpif = udpif;            revalidator->thread = ovs_thread_create(                "revalidator", udpif_revalidator, revalidator);        }    }}

udpif_upcall_handler通过fd poll的方式等待触发,如果有upcall上送,则进入recv_upcalls的处理函数中
先看下几个相关的数据结构,struct udpif是ofproto-dpif中与upcall处理相关的数据结构,分为upcall处理和flow回收(revalidation)两部分

/* An upcall handler for ofproto_dpif.
 *
 * udpif keeps records of two kinds of logically separate units:
 *
 * upcall handling
 * ---------------
 *
 *    - An array of 'struct handler's for upcall handling and flow
 *      installation.
 *
 * flow revalidation
 * -----------------
 *
 *    - Revalidation threads which read the datapath flow table and maintain
 *      it.
 */
struct udpif {
    struct ovs_list list_node;         /* In all_udpifs list. */

    struct dpif *dpif;                 /* Datapath handle. */
    struct dpif_backer *backer;        /* Opaque dpif_backer pointer. */

    struct handler *handlers;          /* Upcall handlers. */
    size_t n_handlers;                 /* Number of upcall handlers. */

    struct revalidator *revalidators;  /* Flow revalidators. */
    size_t n_revalidators;             /* Number of flow revalidators. */

    struct latch exit_latch;           /* Tells child threads to exit. */

    /* There are 'N_UMAPS' maps containing 'struct udpif_key' elements.
     *
     * During the flow dump phase, revalidators insert into these with a random
     * distribution. During the garbage collection phase, each revalidator
     * takes care of garbage collecting a slice of these maps. */
    struct umap *ukeys;
};

struct umap是cuckoo hash实现的大规模hash表,用于通过struct udpif_key查找datapath flow,struct udpif创建时一共会创建N_UMAPS个这样的哈希表

struct dp_packet是实际报文的封装,如果是在dpdk的dp下,会在mbuf后面的线性内存存放这些元数据

/* Buffer for holding packet data.  A dp_packet is automatically reallocated * as necessary if it grows too large for the available memory. */struct dp_packet {#ifdef DPDK_NETDEV    struct rte_mbuf mbuf;       /* DPDK mbuf */#else    void *base_;                /* First byte of allocated space. */    uint16_t allocated_;        /* Number of bytes allocated. */    uint16_t data_ofs;          /* First byte actually in use. */    uint32_t size_;             /* Number of bytes in use. */    uint32_t rss_hash;          /* Packet hash. */    bool rss_hash_valid;        /* Is the 'rss_hash' valid? */#endif    enum dp_packet_source source;  /* Source of memory allocated as 'base'. */    uint8_t l2_pad_size;           /* Detected l2 padding size.                                    * Padding is non-pullable. */    uint16_t l2_5_ofs;             /* MPLS label stack offset, or UINT16_MAX */    uint16_t l3_ofs;               /* Network-level header offset,                                    * or UINT16_MAX. */    uint16_t l4_ofs;               /* Transport-level header offset,                                      or UINT16_MAX. */    uint32_t cutlen;               /* length in bytes to cut from the end. */    union {        struct pkt_metadata md;        uint64_t data[DP_PACKET_CONTEXT_SIZE / 8];    };};/* Datapath packet metadata */struct pkt_metadata {    uint32_t recirc_id;         /* Recirculation id carried with the                                   recirculating packets. 0 for packets                                   received from the wire. */    uint32_t dp_hash;           /* hash value computed by the recirculation                                   action. */    uint32_t skb_priority;      /* Packet priority for QoS. */    uint32_t pkt_mark;          /* Packet mark. */    uint16_t ct_state;          /* Connection state. */    uint16_t ct_zone;           /* Connection zone. */    uint32_t ct_mark;           /* Connection mark. 
*/    ovs_u128 ct_label;          /* Connection label. */    union flow_in_port in_port; /* Input port. */    struct flow_tnl tunnel;     /* Encapsulating tunnel parameters. Note that                                 * if 'ip_dst' == 0, the rest of the fields may                                 * be uninitialized. */};/* Tunnel information used in flow key and metadata. */struct flow_tnl {    ovs_be32 ip_dst;    struct in6_addr ipv6_dst;    ovs_be32 ip_src;    struct in6_addr ipv6_src;    ovs_be64 tun_id;    uint16_t flags;    uint8_t ip_tos;    uint8_t ip_ttl;    ovs_be16 tp_src;    ovs_be16 tp_dst;    ovs_be16 gbp_id;    uint8_t  gbp_flags;    uint8_t  pad1[5];        /* Pad to 64 bits. */    struct tun_metadata metadata;};

struct dpif_upcall代表了一个报文的upcall,除了报文内容还有upcall带上来的netlink属性数据

/* A packet passed up from the datapath to userspace.
 *
 * The 'packet', 'key' and 'userdata' may point into data in a buffer
 * provided by the caller, so the buffer should be released only after the
 * upcall processing has been finished.
 *
 * While being processed, the 'packet' may be reallocated, so the packet must
 * be separately released with ofpbuf_uninit().
 *
 * NOTE(review): 'packet' is declared as a 'struct dp_packet' below, so the
 * reference to ofpbuf_uninit() looks stale -- confirm the correct release
 * function. */
struct dpif_upcall {
    /* All types. */
    enum dpif_upcall_type type;
    struct dp_packet packet;       /* Packet data. */
    struct nlattr *key;         /* Flow key. */
    size_t key_len;             /* Length of 'key' in bytes. */
    ovs_u128 ufid;              /* Unique flow identifier for 'key'. */
    struct nlattr *mru;         /* Maximum receive unit. */
    struct nlattr *cutlen;      /* Number of bytes shrink from the end. */

    /* DPIF_UC_ACTION only. */
    struct nlattr *userdata;    /* Argument to OVS_ACTION_ATTR_USERSPACE. */
    struct nlattr *out_tun_key;    /* Output tunnel key. */
    struct nlattr *actions;    /* Argument to OVS_ACTION_ATTR_USERSPACE.
                                * NOTE(review): identical to the 'userdata'
                                * description -- confirm intended meaning. */
};

recv_upcalls会一次处理UPCALL_MAX_BATCH个请求,我们以单个请求的处理为例子,首先调用的是dpif_recv,实际调用了dpif_class->recv注册的函数。接收的数据会放到struct dpif_upcall和struct ofpbuf里面

/* Polls for an upcall from 'dpif' for an upcall handler.  Since there * there can be multiple poll loops, 'handler_id' is needed as index to * identify the corresponding poll loop.  If successful, stores the upcall * into '*upcall', using 'buf' for storage.  Should only be called if * 'recv_set' has been used to enable receiving packets from 'dpif'. * * 'upcall->key' and 'upcall->userdata' point into data in the caller-provided * 'buf', so their memory cannot be freed separately from 'buf'. * * The caller owns the data of 'upcall->packet' and may modify it.  If * packet's headroom is exhausted as it is manipulated, 'upcall->packet' * will be reallocated.  This requires the data of 'upcall->packet' to be * released with ofpbuf_uninit() before 'upcall' is destroyed.  However, * when an error is returned, the 'upcall->packet' may be uninitialized * and should not be released. * * Returns 0 if successful, otherwise a positive errno value.  Returns EAGAIN * if no upcall is immediately available. */intdpif_recv(struct dpif *dpif, uint32_t handler_id, struct dpif_upcall *upcall,          struct ofpbuf *buf){    int error = EAGAIN;    if (dpif->dpif_class->recv) {        error = dpif->dpif_class->recv(dpif, handler_id, upcall, buf);        if (!error) {            dpif_print_packet(dpif, upcall);        } else if (error != EAGAIN) {            log_operation(dpif, "recv", error);        }    }    return error;}

第二步是调用upcall_receive,该函数用于构造一个struct upcall结构体

static intupcall_receive(struct upcall *upcall, const struct dpif_backer *backer,               const struct dp_packet *packet, enum dpif_upcall_type type,               const struct nlattr *userdata, const struct flow *flow,               const unsigned int mru,               const ovs_u128 *ufid, const unsigned pmd_id){    int error;    error = xlate_lookup(backer, flow, &upcall->ofproto, &upcall->ipfix,                         &upcall->sflow, NULL, &upcall->in_port);    if (error) {        return error;    }    upcall->recirc = NULL;    upcall->have_recirc_ref = false;    upcall->flow = flow;    upcall->packet = packet;    upcall->ufid = ufid;    upcall->pmd_id = pmd_id;    upcall->type = type;    upcall->userdata = userdata;    ofpbuf_use_stub(&upcall->odp_actions, upcall->odp_actions_stub,                    sizeof upcall->odp_actions_stub);    ofpbuf_init(&upcall->put_actions, 0);    upcall->xout_initialized = false;    upcall->ukey_persists = false;    upcall->ukey = NULL;    upcall->key = NULL;    upcall->key_len = 0;    upcall->mru = mru;    upcall->out_tun_key = NULL;    upcall->actions = NULL;    return 0;}/* Given a datapath and flow metadata ('backer', and 'flow' respectively), * optionally populates 'ofproto' with the ofproto_dpif, 'ofp_in_port' with the * openflow in_port, and 'ipfix', 'sflow', and 'netflow' with the appropriate * handles for those protocols if they're enabled.  Caller may use the returned * pointers until quiescing, for longer term use additional references must * be taken. * * Returns 0 if successful, ENODEV if the parsed flow has no associated ofproto. */intxlate_lookup(const struct dpif_backer *backer, const struct flow *flow,             struct ofproto_dpif **ofprotop, struct dpif_ipfix **ipfix,             struct dpif_sflow **sflow, struct netflow **netflow,             ofp_port_t *ofp_in_port)

最后调用process_upcall来处理这个struct upcall,根据upcall类型不同处理方式也不同,我们这里只看MISS_UPCALL的处理,会调用到upcall_xlate
upcall_xlate首先初始化xlate_in, struct xlate_in结构体如下

/* Input parameters for a flow translation performed by xlate_actions(). */
struct xlate_in {
    struct ofproto_dpif *ofproto;

    /* Flow to which the OpenFlow actions apply.  xlate_actions() will modify
     * this flow when actions change header fields. */
    struct flow flow;

    /* The packet corresponding to 'flow', or a null pointer if we are
     * revalidating without a packet to refer to. */
    const struct dp_packet *packet;

    /* Should OFPP_NORMAL update the MAC learning table?  Should "learn"
     * actions update the flow table?
     *
     * We want to update these tables if we are actually processing a packet,
     * or if we are accounting for packets that the datapath has processed, but
     * not if we are just revalidating. */
    bool may_learn;

    /* The rule initiating translation or NULL. If both 'rule' and 'ofpacts'
     * are NULL, xlate_actions() will do the initial rule lookup itself. */
    struct rule_dpif *rule;

    /* The actions to translate.  If 'rule' is not NULL, these may be NULL. */
    const struct ofpact *ofpacts;
    size_t ofpacts_len;

    /* Union of the set of TCP flags seen so far in this flow.  (Used only by
     * NXAST_FIN_TIMEOUT.  Set to zero to avoid updating rules'
     * timeouts.) */
    uint16_t tcp_flags;

    /* If nonnull, flow translation calls this function just before executing a
     * resubmit or OFPP_TABLE action.  In addition, disables logging of traces
     * when the recursion depth is exceeded.
     *
     * 'rule' is the rule being submitted into.  It will be null if the
     * resubmit or OFPP_TABLE action didn't find a matching rule.
     *
     * 'indentation' is the resubmit recursion depth at time of invocation,
     * suitable for indenting the output.
     *
     * This is normally null so the client has to set it manually after
     * calling xlate_in_init(). */
    void (*resubmit_hook)(struct xlate_in *, struct rule_dpif *rule,
                          int indentation);

    /* If nonnull, flow translation calls this function to report some
     * significant decision, e.g. to explain why OFPP_NORMAL translation
     * dropped a packet.  'indentation' is the resubmit recursion depth at time
     * of invocation, suitable for indenting the output. */
    void (*report_hook)(struct xlate_in *, int indentation,
                        const char *format, va_list args);

    /* If nonnull, flow translation credits the specified statistics to each
     * rule reached through a resubmit or OFPP_TABLE action.
     *
     * This is normally null so the client has to set it manually after
     * calling xlate_in_init(). */
    const struct dpif_flow_stats *resubmit_stats;

    /* Counters carried over from a pre-existing translation of a related flow.
     * This can occur due to, e.g., the translation of an ARP packet that was
     * generated as the result of outputting to a tunnel port.  In that case,
     * the original flow going to the tunnel is the related flow.  Since the
     * two flows are different, they should not use the same xlate_ctx
     * structure.  However, we still need to limit the maximum recursion across
     * the entire translation.
     *
     * These fields are normally set to zero, so the client has to set them
     * manually after calling xlate_in_init().  In that case, they should be
     * copied from the same-named fields in the related flow's xlate_ctx.
     *
     * These fields are really implementation details; the client doesn't care
     * about what they mean.  See the corresponding fields in xlate_ctx for
     * real documentation. */
    int indentation;
    int depth;
    int resubmits;

    /* If nonnull, flow translation populates this cache with references to all
     * modules that are affected by translation. This 'xlate_cache' may be
     * passed to xlate_push_stats() to perform the same function as
     * xlate_actions() without the full cost of translation.
     *
     * This is normally null so the client has to set it manually after
     * calling xlate_in_init(). */
    struct xlate_cache *xcache;

    /* If nonnull, flow translation puts the resulting datapath actions in this
     * buffer.  If null, flow translation will not produce datapath actions. */
    struct ofpbuf *odp_actions;

    /* If nonnull, flow translation populates this with wildcards relevant in
     * translation.  Any fields that were used to calculate the action are set,
     * to allow caching and kernel wildcarding to work.  For example, if the
     * flow lookup involved performing the "normal" action on IPv4 and ARP
     * packets, 'wc' would have the 'in_port' (always set), 'dl_type' (flow
     * match), 'vlan_tci' (normal action), and 'dl_dst' (normal action) fields
     * set. */
    struct flow_wildcards *wc;

    /* The frozen state to be resumed, as returned by xlate_lookup(). */
    const struct frozen_state *frozen_state;
};

之后调用xlate_actions,生成datapath需要的struct xlate_out,xlate_actions函数比较复杂,其中最重要的调用是通过rule_dpif_lookup_from_table查找到匹配的流表规则,进而生成actions
rule_dpif_lookup_from_table又会通过流表的级联一个个顺序查找,每单个流表都会调用rule_dpif_lookup_in_table

/* Look up 'flow' in 'ofproto''s classifier version 'version', starting from
 * table '*table_id'.  Returns the rule that was found, which may be one of the
 * special rules according to packet miss handling.  If 'may_packet_in' is
 * false, returning of the miss_rule (which issues packet ins for the
 * controller) is avoided.  Updates 'wc', if nonnull, to reflect the fields
 * that were used during the lookup.
 *
 * If 'honor_table_miss' is true, the first lookup occurs in '*table_id', but
 * if none is found then the table miss configuration for that table is
 * honored, which can result in additional lookups in other OpenFlow tables.
 * In this case the function updates '*table_id' to reflect the final OpenFlow
 * table that was searched.
 *
 * If 'honor_table_miss' is false, then only one table lookup occurs, in
 * '*table_id'.
 *
 * If 'stats' is nonnull, 'stats->n_packets' is added to the 'n_matched' or
 * 'n_missed' counter of every table that is consulted.
 *
 * The rule is returned in '*rule', which is valid at least until the next
 * RCU quiescent period.  If the '*rule' needs to stay around longer, the
 * caller must take a reference.
 *
 * 'in_port' allows the lookup to take place as if the in port had the value
 * 'in_port'.  This is needed for resubmit action support.
 *
 * 'flow' is non-const to allow for temporary modifications during the lookup.
 * Any changes are restored before returning. */
struct rule_dpif *
rule_dpif_lookup_from_table(struct ofproto_dpif *ofproto,
                            ovs_version_t version, struct flow *flow,
                            struct flow_wildcards *wc,
                            const struct dpif_flow_stats *stats,
                            uint8_t *table_id, ofp_port_t in_port,
                            bool may_packet_in, bool honor_table_miss)
{
    /* Saved so that the temporary edits to 'flow' below can be undone at
     * 'out'. */
    ovs_be16 old_tp_src = flow->tp_src, old_tp_dst = flow->tp_dst;
    ofp_port_t old_in_port = flow->in_port.ofp_port;
    enum ofputil_table_miss miss_config;
    struct rule_dpif *rule;
    uint8_t next_id;

    /* We always unwildcard nw_frag (for IP), so they
     * need not be unwildcarded here. */
    if (flow->nw_frag & FLOW_NW_FRAG_ANY
        && ofproto->up.frag_handling != OFPUTIL_FRAG_NX_MATCH) {
        if (ofproto->up.frag_handling == OFPUTIL_FRAG_NORMAL) {
            /* We must pretend that transport ports are unavailable. */
            flow->tp_src = htons(0);
            flow->tp_dst = htons(0);
        } else {
            /* Must be OFPUTIL_FRAG_DROP (we don't have OFPUTIL_FRAG_REASM).
             * Use the drop_frags_rule (which cannot disappear). */
            rule = ofproto->drop_frags_rule;
            if (stats) {
                struct oftable *tbl = &ofproto->up.tables[*table_id];
                unsigned long orig;

                atomic_add_relaxed(&tbl->n_matched, stats->n_packets, &orig);
            }
            /* 'flow' has not been modified on this path, so returning
             * without passing through 'out' is fine. */
            return rule;
        }
    }

    /* Look up a flow with 'in_port' as the input port.  Then restore the
     * original input port (otherwise OFPP_NORMAL and OFPP_IN_PORT will
     * have surprising behavior). */
    flow->in_port.ofp_port = in_port;

    /* Our current implementation depends on n_tables == N_TABLES, and
     * TBL_INTERNAL being the last table. */
    BUILD_ASSERT_DECL(N_TABLES == TBL_INTERNAL + 1);

    miss_config = OFPUTIL_TABLE_MISS_CONTINUE;

    /* The increment steps to the next table and, if that lands on
     * TBL_INTERNAL, steps once more so that TBL_INTERNAL is never reached by
     * continuing from an earlier table.  (It is still searched if the lookup
     * starts there.) */
    for (next_id = *table_id;
         next_id < ofproto->up.n_tables;
         next_id++, next_id += (next_id == TBL_INTERNAL))
    {
        *table_id = next_id;
        rule = rule_dpif_lookup_in_table(ofproto, version, next_id, flow, wc);
        if (stats) {
            struct oftable *tbl = &ofproto->up.tables[next_id];
            unsigned long orig;

            atomic_add_relaxed(rule ? &tbl->n_matched : &tbl->n_missed,
                               stats->n_packets, &orig);
        }
        if (rule) {
            goto out;   /* Match. */
        }
        if (honor_table_miss) {
            miss_config = ofproto_table_get_miss_config(&ofproto->up,
                                                        *table_id);
            if (miss_config == OFPUTIL_TABLE_MISS_CONTINUE) {
                continue;
            }
        }
        /* Either table-miss handling is not honored or this table's
         * configuration does not ask to continue: stop searching. */
        break;
    }

    /* Miss. */
    rule = ofproto->no_packet_in_rule;
    if (may_packet_in) {
        if (miss_config == OFPUTIL_TABLE_MISS_CONTINUE
            || miss_config == OFPUTIL_TABLE_MISS_CONTROLLER) {
            struct ofport_dpif *port;

            /* The packet-in decision uses the original input port, not the
             * 'in_port' override. */
            port = ofp_port_to_ofport(ofproto, old_in_port);
            if (!port) {
                VLOG_WARN_RL(&rl, "packet-in on unknown OpenFlow port %"PRIu16,
                             old_in_port);
            } else if (!(port->up.pp.config & OFPUTIL_PC_NO_PACKET_IN)) {
                rule = ofproto->miss_rule;
            }
        } else if (miss_config == OFPUTIL_TABLE_MISS_DEFAULT &&
                   connmgr_wants_packet_in_on_miss(ofproto->up.connmgr)) {
            rule = ofproto->miss_rule;
        }
    }
out:
    /* Restore port numbers, as they may have been modified above. */
    flow->tp_src = old_tp_src;
    flow->tp_dst = old_tp_dst;
    /* Restore the old in port. */
    flow->in_port.ofp_port = old_in_port;

    return rule;
}

而对于rule_dpif_lookup_in_table而言,实际调用了classifier_lookup来在流表中查找rule,struct classifier的细节后面再分析

/* Searches 'cls' for the highest-priority rule that both matches 'flow' and
 * is visible in 'version'.  The result is a null pointer when nothing in
 * 'cls' matches.  When several matching rules share the top priority, any one
 * of them may be returned.
 *
 * On a successful lookup with a nonnull 'wc', the bits that mattered to the
 * lookup are bitwise-OR'ed into 'wc'.  The caller is expected to have
 * initialized 'wc' beforehand (e.g., with flow_wildcards_init_catchall()).
 *
 * 'flow' is taken non-const because the lookup may modify it temporarily;
 * every such change is undone before this function returns. */
const struct cls_rule *
classifier_lookup(const struct classifier *cls, ovs_version_t version,
                  struct flow *flow, struct flow_wildcards *wc)
{
    const struct cls_rule *match;

    /* The last argument is always 'true' for this public entry point; its
     * meaning is defined by classifier_lookup__(). */
    match = classifier_lookup__(cls, version, flow, wc, true);

    return match;
}

xlate_actions最终调用do_xlate_actions针对每种ACTION_ATTR对flow执行不同操作。

好了,下面我们回到recv_upcalls了,最后会调用handle_upcalls,用于向datapath下发flow,handle_upcalls最终调用的是dpif_operate来下发flow,后者调用的是dpif_class->operate,该接口针对不同的dpif实现,可以是dpif_netdev_operate或者dpif_netlink_operate

0 1