OVS vswitchd 模块分析(1)

来源:互联网 发布:福建顶点软件 知乎 编辑:程序博客网 时间:2024/05/22 01:37

1.开启RPC服务。
在当前进程中启动一个worker process作为子进程,当前只支持一个,所以要确保只被调用一次并且要在客户程序主循环中呼叫 worker_run() 和worker_wait() 。worker_start 定义在lib/worker.c,工作流程:通过socketpair函数(仅适用于Unix域套接字)创建一个流管道work_fds[0],work_fds[1],然后通过fcntl设置为非阻塞状态;接下来 fork_and_clean_up在fork子进程中做一些工作。daemonize_post_detach()如果有守护进程的相关配置,那么这个方法就会响应这些,即如果有 --detach ,--no-chdir的话那么detach=true,chdir_=false (关于这些定义在daemon.c中),然后关闭标准文件描述符。接下来就是子进程作为RPC server和主进程沟通(child:work_fd[1]<->parent:work_fds[0]) 。

/* Starts the single worker (RPC server) subprocess for this process.
 * Must be called at most once; afterwards the client program must call
 * worker_run() and worker_wait() from its main loop.
 *
 * Creates a nonblocking AF_UNIX stream socket pair and forks:
 *   - child: detaches, closes the parent's end, and serves RPC requests on
 *     work_fds[1] inside worker_main(), which never returns;
 *   - parent: closes the child's end and keeps work_fds[0] in the global
 *     'client_sock' for issuing RPC requests. */
void  worker_start(void){
    int work_fds[2];

    /* 'client_sock' >= 0 would mean the worker was already started. */
    assert(client_sock < 0);

    xsocketpair(AF_UNIX, SOCK_STREAM, 0, work_fds);   /* wrapper around socketpair() */
    xset_nonblocking(work_fds[0]);  /* wrapper around fcntl(F_SETFL, O_NONBLOCK) */
    xset_nonblocking(work_fds[1]);

    if (!fork_and_clean_up()) {
        /* In child (worker) process. */
        daemonize_post_detach();
        close(work_fds[0]);
        worker_main(work_fds[1]);
        NOT_REACHED();   /* worker_main() exits the process itself. */
    }

    /* In parent (main) process. */
    close(work_fds[1]);
    client_sock = work_fds[0];
    rxbuf_init(&client_rx);
}

fork_and_clean_up定义在lib/daemon.c中,调用fork函数,并且在子进程中:启动内部计时器,确保即使没有调用time_refresh()时间也会前进,fork生成的子进程不会继承父进程的内部计时器,所以要确保在fork()之后调用这个函数;lockfile_postfork()确保现在锁定的lockfile解开(就是关闭相应的文件描述符,而后从hmap locktable中移除),这在fork之后调用很有意义,因为被fork创建的子进程不再持有父进程的锁。
/* Forks the current process.  Post-fork, but before returning, this function
 * calls a few other functions that are generally useful if the child isn't
 * planning to exec a new process:
 *   - in the child: time_postfork() restarts the internal timer (not
 *     inherited across fork) and lockfile_postfork() drops lockfiles held by
 *     the parent, since the child no longer owns them;
 *   - in the parent: fatal_signal_fork() informs the fatal-signal module
 *     about the new child.
 *
 * Returns the child's pid in the parent and 0 in the child; logs a fatal
 * error and aborts if fork() fails. */
pid_t fork_and_clean_up(void){
    pid_t pid;   /* FIX: 'pid' was used without a declaration in the excerpt. */

    pid = fork();
    if (pid > 0) {
        /* Running in parent process. */
        fatal_signal_fork();   /* register the fork with the fatal-signal module */
    } else if (!pid) {
        /* Running in child process. */
        time_postfork();
        lockfile_postfork();
    } else {
        /* fork() failed; errno describes why. */
        VLOG_FATAL("fork failed (%s)", strerror(errno));
    }
    return pid;
}

在worker.c中定义的RPC 请求/回复header 和 接口实际上就是RPC server和client的通信协议(在这里header和payload都符合openflow的那一套)。在worker_main中,初始化一个rxbuf,接着rxbuf_run()就会接收信息并构造rx;然后就调用 request callback func(具体实现没看到?)。

/* Main loop of the worker (RPC server) subprocess.  'fd' is the child's end
 * of the socket pair created by worker_start().  Receives worker_request
 * messages from the main process, dispatches each to its request_cb, and
 * exits cleanly when the main process closes the socket.  Never returns.
 * NOTE(review): 'server_sock', 'subprogram_name', 'request' and
 * 'expect_reply' appear to be file-scope variables in worker.c — confirm. */
static void  worker_main(int fd){
    struct rxbuf rx;
    server_sock = fd;
    subprogram_name = "worker";
    proctitle_set("worker process for pid %lu", (unsigned long int) getppid());
    VLOG_INFO("worker process started");

    rxbuf_init(&rx);
    for (;;) {
        int error;

        /* Try to assemble one complete request (header + payload + fds). */
        error = rxbuf_run(&rx, server_sock, sizeof(struct worker_request));
        if (!error) {
            request = *(struct worker_request *) rx.header.data;

            /* request_cb is expected to send a reply iff reply_cb is set;
             * the assertion below verifies that it did. */
            expect_reply = request.reply_cb != NULL;
            request.request_cb(&rx.payload, rx.fds, rx.n_fds);
            assert(!expect_reply);

            rxbuf_clear(&rx);
        } else if (error == EOF && !rx.header.size) {
            /* Main process closed the IPC socket.  Exit cleanly. */
            break;
        } else if (error != EAGAIN) {
            VLOG_FATAL("RPC receive failed (%s)", strerror(error));
        }

        /* Sleep until more data arrives on the IPC socket. */
        poll_fd_wait(server_sock, POLLIN);
        poll_block();
    }

    VLOG_INFO("worker process exiting");
    exit(0);
}

/* Receive buffer for an RPC request or reply: fixed-size header, any file
 * descriptors passed over the Unix socket, then a variable-size payload. */
struct rxbuf {
    /* Header. */
    struct ofpbuf header;       /* Header data. */
    int fds[SOUTIL_MAX_FDS];    /* File descriptors received with the header. */
    size_t n_fds;               /* Number of valid entries in 'fds'. */

    /* Payload. */
    struct ofpbuf payload;      /* Payload data. */
};

/* Buffer for holding arbitrary data.  An ofpbuf is automatically reallocated
 * as necessary if it grows too large for the available memory.  The in-use
 * region ['data', 'data' + 'size') lies inside the allocation
 * ['base', 'base' + 'allocated'). */
struct ofpbuf {
    void *base;                 /* First byte of allocated space. */
    size_t allocated;           /* Number of bytes allocated. */
    enum ofpbuf_source source;  /* Source of memory allocated as 'base'. */

    void *data;                 /* First byte actually in use. */
    size_t size;                /* Number of bytes in use. */

    void *l2;                   /* Link-level header. */
    void *l3;                   /* Network-level header. */
    void *l4;                   /* Transport-level header. */
    void *l7;                   /* Application data. */

    struct list list_node;      /* Private list element for use by owner. */
    void *private_p;            /* Private pointer for use by owner. */
};

如果rx->header.size=0说明rpc缓存中没有数据,就调用recv_data_and_fds从Unix 域套接字sock(流管道)中接收数据和文件描述符;如果rx->header.size < header_len说明rxbuf header 此时非空但不足以容纳一个rpc request header,说明之前的某个时机已经接收了一部分,所以接着接收请求头;在构造完成一个request header之后就要接收实际的数据payload。

/* Incrementally receives one RPC message into 'rx' from 'sock'.
 *
 * State machine keyed on how much of the fixed-size header ('header_len'
 * bytes) has been buffered so far:
 *   - header empty: receive initial bytes plus any ancillary fds;
 *   - header partial: keep reading until 'header_len' bytes are buffered;
 *   - header complete: its first size_t is the payload length; read payload
 *     bytes until complete.
 *
 * Returns 0 once a full message is buffered, EOF on orderly close with no
 * partial header pending, or a positive errno value (e.g. EAGAIN) on error. */
static int rxbuf_run(struct rxbuf *rx, int sock, size_t header_len){
    for (;;) {
        if (!rx->header.size) {
            int retval;

            /* Nothing buffered yet: reset and make room for a header. */
            ofpbuf_clear(&rx->header);
            ofpbuf_prealloc_tailroom(&rx->header, header_len);

            /* The first read may also carry passed file descriptors. */
            retval = recv_data_and_fds(sock, rx->header.data, header_len, rx->fds, &rx->n_fds);
            if (retval <= 0) {
                /* 0 means the peer closed the socket; negative is -errno. */
                return retval ? -retval : EOF;
            }
            rx->header.size += retval;
        } else if (rx->header.size < header_len) {
            /* Partial header: read the remaining header bytes. */
            size_t bytes_read;
            int error;

            error = read_fully(sock, ofpbuf_tail(&rx->header), header_len - rx->header.size, &bytes_read);
            rx->header.size += bytes_read;
            if (error) {
                return error;
            }
        } else {
            /* Header complete: its first field is the payload length.
             * NOTE(review): reads a raw size_t straight from the byte
             * stream, so this assumes a same-host, same-ABI peer. */
            size_t payload_len = *(size_t *) rx->header.data;

            if (rx->payload.size < payload_len) {
                size_t left = payload_len - rx->payload.size;
                size_t bytes_read;
                int error;

                ofpbuf_prealloc_tailroom(&rx->payload, left);
                error = read_fully(sock, ofpbuf_tail(&rx->payload), left, &bytes_read);
                rx->payload.size += bytes_read;
                if (error) {
                    return error;
                }
            } else {
                /* Full message (header + payload) buffered. */
                return 0;
            }
        }
    }
    return EAGAIN;   /* Unreachable: the loop only exits via the returns above. */
}


2. 创建unix socket control server 监听的sun_path 是path,可能的形式是:NULL,会默认使用<rundir>/<program>.<pid>.ctl;none,成功返回,但没有创建域套接字;不以'/'开头的name,会默认放在 <rundir>下;绝对路径名,比如说 /usr/local/var/run/openvswitch/db.sock。一个程序应该在守护进程化之后(*after* daemonization)调用unixctl_server_create,使得socket 名字包含的是守护进程的pid 而不是已经退出的程序的pid。ovs-appctl --target=<program>命令是告诉appctl去连接哪个daemon,发送命令并接收响应;默认情况下,每个daemon都会监听一个unix domain socket /usr/local/var/run/program.pid.ctl ,比如ovs-vswitchd.5408.ctl

/* Creates a unixctl control server listening at 'path' and stores it in
 * '*serverp'.  'path' may be:
 *   - NULL: use the default <rundir>/<program>.<pid>.ctl;
 *   - "none": return success without creating any socket (*serverp = NULL);
 *   - a relative name: resolved under <rundir>;
 *   - an absolute path: used as-is.
 * Call this *after* daemonization so the default socket name contains the
 * daemon's pid.  Returns 0 on success, a positive errno value on failure. */
int unixctl_server_create(const char *path, struct unixctl_server **serverp){
    struct unixctl_server *server;
    struct pstream *listener;
    char *punix_path;
    int error;

    *serverp = NULL;
    if (path && !strcmp(path, "none")) {
        /* Caller explicitly asked for no control socket. */
        return 0;
    }

    if (path) {
        /* Resolve 'path' against the run directory if it is not absolute. */
        char *abs_path = abs_file_name(ovs_rundir(), path);
        punix_path = xasprintf("punix:%s", abs_path);
        free(abs_path);
    } else {
        /* Default name: <rundir>/<program>.<pid>.ctl. */
        punix_path = xasprintf("punix:%s/%s.%ld.ctl", ovs_rundir(), program_name, (long int) getpid());
    }

    error = pstream_open(punix_path, &listener, 0);
    if (error) {
        ovs_error(error, "could not initialize control socket %s", punix_path);
        goto exit;
    }

    /* Built-in commands available on every unixctl server. */
    unixctl_command_register("help", "", 0, 0, unixctl_help, NULL);
    unixctl_command_register("version", "", 0, 0, unixctl_version, NULL);

    server = xmalloc(sizeof *server);
    server->listener = listener;
    list_init(&server->conns);
    *serverp = server;

exit:
    free(punix_path);
    return error;
}

pstream_open(定义在lib/stream.c中)开始监听 remote stream connections,参数name形如TYPE:ARGS,TYPE是passive stream class(如,punix, pssl, ptcp),ARGS是对应流类型特定的参数,如果成功就将连接更新到 * pstreamp中。
/* Attempts to start listening for remote stream connections.  'name' has the
 * form "TYPE:ARGS", where TYPE names a passive stream class (e.g. "punix",
 * "ptcp", "pssl") and ARGS are class-specific arguments.
 *
 * Returns 0 on success and stores the new pstream in '*pstreamp'; returns a
 * positive errno value and stores NULL in '*pstreamp' on failure. */
int pstream_open(const char *name, struct pstream **pstreamp, uint8_t dscp){
    const struct pstream_class *class;
    struct pstream *pstream;
    char *suffix_copy;
    int error;   /* FIX: 'error' was used without a declaration in the excerpt. */

    COVERAGE_INC(pstream_open);   /* bump the coverage counter for this entry point */

    /* Look up TYPE in pstream_classes (type name -> pstream_class). */
    error = pstream_lookup_class(name, &class);
    if (!class) {
        goto error;
    }

    /* Extract ARGS (everything after the first ':') and hand it to the
     * class's listen function. */
    suffix_copy = xstrdup(strchr(name, ':') + 1);
    error = class->listen(name, suffix_copy, &pstream, dscp);
    free(suffix_copy);
    if (error) {
        goto error;
    }

    *pstreamp = pstream;
    return 0;

error:
    *pstreamp = NULL;
    return error;
}

/* All registered passive stream classes; pstream_lookup_class() matches the
 * TYPE prefix of a "TYPE:ARGS" name against these. */
static const struct pstream_class *pstream_classes[] = {
    &ptcp_pstream_class,
    &punix_pstream_class,
#ifdef HAVE_OPENSSL
    &pssl_pstream_class,    /* Only compiled in when OpenSSL is available. */
#endif
};


比如看 punix_pstream_class (定义在 stream-unix.c中)的open函数其实就是创建unix域套接字并设为阻塞态,然后监听来自客户端的连接,呼叫new_fd_pstream 处理新的连接。
/* Passive "punix:PATH" class: listens on a Unix domain socket at PATH. */
const struct pstream_class punix_pstream_class = {
    "punix",   /* name */
    false,     /* needs_probes */
    punix_open,   /* listen */
    NULL,    NULL,    NULL,    NULL,   /* remaining ops unused — presumably close/accept/wait/set_dscp; confirm against pstream_class in stream-provider.h */
};

/* "listen" implementation for the "punix" pstream class: creates a Unix
 * domain stream socket bound to 'suffix' (the PATH part of "punix:PATH"),
 * marks it listening, and wraps it in an fd-based pstream.
 *
 * Returns 0 on success and stores the new pstream in '*pstreamp'; returns a
 * positive errno value on failure. */
static int punix_open(const char *name OVS_UNUSED, char *suffix, struct pstream **pstreamp, uint8_t dscp OVS_UNUSED)
{
    int fd;      /* FIX: 'fd' was used without a declaration in the excerpt. */
    int error;   /* FIX: 'error' was used without a declaration. */

    /* make_unix_socket() returns the new fd, or a negative errno value. */
    fd = make_unix_socket(SOCK_STREAM, true, suffix, NULL);
    if (fd < 0) {
        /* FIX: the excerpt called listen() even when socket creation
         * failed; check and bail out first (matches upstream OVS). */
        VLOG_ERR("%s: binding failed: %s", suffix, strerror(-fd));
        return -fd;
    }

    if (listen(fd, 10) < 0) {
        error = errno;
        VLOG_ERR("%s: listen: %s", name, strerror(error));
        close(fd);
        return error;
    }

    /* punix_accept will be invoked for each accepted connection; the copied
     * 'suffix' path is unlinked when the pstream is closed. */
    return new_fd_pstream(name, fd, punix_accept, NULL, xstrdup(suffix), pstreamp);
}

new_fd_pstream创建一个新的名为name的pstream在fd上接收新的socket连接,存储在*pstreamp 中;当一个连接被accept之后,就会调用 accept_cb (里面的参数是accept()返回的新的fd和客户端的地址信息);成功之后会初始化一个 *streamp 来和客户端沟通(就像平时我们fork子进程来处理)。

int new_fd_pstream(const char *name, int fd,
               int (*accept_cb)(int fd, const struct sockaddr *sa,
                                size_t sa_len, struct stream **streamp),
               int (*set_dscp_cb)(int fd, uint8_t dscp),
               char *unlink_path, struct pstream **pstreamp)
{
    struct fd_pstream *ps = xmalloc(sizeof *ps);
    pstream_init(&ps->pstream, &fd_pstream_class, name);
    ps->fd = fd;
    ps->accept_cb = accept_cb;
    ps->set_dscp_cb = set_dscp_cb;
    ps->unlink_path = unlink_path;
    *pstreamp = &ps->pstream;
    return 0;
}

/* Accept callback for Unix-domain pstreams: names the new stream
 * "unix:<peer path>" (or plain "unix" when the peer is unnamed) and wraps
 * the accepted 'fd' in an fd-based stream stored in '*streamp'. */
static int punix_accept(int fd, const struct sockaddr *sa, size_t sa_len, struct stream **streamp){
    const struct sockaddr_un *unix_addr = (const struct sockaddr_un *) sa;
    char stream_name[128];
    int path_len = get_unix_name_len(sa_len);

    if (path_len <= 0) {
        /* Unnamed (e.g. unbound) peer socket. */
        strcpy(stream_name, "unix");
    } else {
        snprintf(stream_name, sizeof stream_name, "unix:%.*s", path_len, unix_addr->sun_path);
    }

    return new_fd_stream(stream_name, fd, 0, streamp);
}

new_fd_stream (lib/stream-fd.c) 创建形如 unix:....的stream结构体,pstream_class 里面是fd, name , listen, accept 之类的,而stream_class 里面的域有 name, open , connect , close, recv , send 等。构造完成的 *streamp 通过其recv 和 send 在新fd上收发数据。

/* Wraps file descriptor 'fd' in a stream named 'name' using the generic
 * fd-based stream class.  'connect_status' is 0 if 'fd' is already
 * connected, or an errno-style status if connection is still in progress.
 * Stores the new stream in '*streamp' and returns 0. */
int new_fd_stream(const char *name, int fd, int connect_status, struct stream **streamp){
    struct stream_fd *sfd = xmalloc(sizeof *sfd);   /* stream_fd = stream + fd */

    stream_init(&sfd->stream, &stream_fd_class, connect_status, name);
    sfd->fd = fd;

    *streamp = &sfd->stream;
    return 0;
}

/* Active file descriptor stream: a generic stream plus the underlying fd. */
struct stream_fd
{
    struct stream stream;   /* Base stream (must be first for casting). */
    int fd;                 /* Underlying file descriptor. */
};

/* Active stream connection.  This structure should be treated as opaque by
 * stream implementations. */
struct stream {
    const struct stream_class *class;    /* Vtable; see lib/stream-provider.h. */
    int state;                           /* Connection state machine state. */
    int error;                           /* Last error, if any. */
    ovs_be32 remote_ip;                  /* Peer IP (network byte order). */
    ovs_be16 remote_port;                /* Peer port (network byte order). */
    ovs_be32 local_ip;                   /* Local IP (network byte order). */
    ovs_be16 local_port;                 /* Local port (network byte order). */
    char *name;                          /* Human-readable name, e.g. "unix:...". */
};

比如 unix_stream_class  的定义为,unix_open相当于客户端,构造域套接字,去连接服务器端即可。不论是server还是client最终都会对每个session构造 stream class ,利用 stream_fd_class (lib/stream-fd.c)提供的方法来通信。
/* Active "unix:PATH" class: unix_open connects a client-side Unix domain
 * socket; the per-connection I/O ops are inherited from stream_fd_class. */
const struct stream_class unix_stream_class = {
    "unix",                     /* name */
    false,                      /* needs_probes */
    unix_open,                  /* open */
    NULL,                       /* close */
    NULL,                       /* connect */
    NULL,                       /* recv */
    NULL,                       /* send */
    NULL,                       /* run */
    NULL,                       /* run_wait */
    NULL,                       /* wait */
};

/* Generic class for streams backed by a plain file descriptor; supplies the
 * actual connect/recv/send/wait implementations used by fd-based streams. */
static const struct stream_class stream_fd_class = {
    "fd",                       /* name */
    false,                      /* needs_probes */
    NULL,                       /* open */
    fd_close,                   /* close */
    fd_connect,                 /* connect */
    fd_recv,                    /* recv */
    fd_send,                    /* send */
    NULL,                       /* run */
    NULL,                       /* run_wait */
    fd_wait,                    /* wait */
};

比如 fd_send 就是BSD socket 发送数据。
/* "send" implementation for fd-backed streams: write() up to 'n' bytes from
 * 'buffer'.  Returns the number of bytes written (> 0), -EAGAIN if nothing
 * could be written, or -errno on error.  STRESS(stream_flaky_send) injects
 * artificial -EIO failures under stress testing. */
static ssize_t fd_send(struct stream *stream, const void *buffer, size_t n){
    struct stream_fd *sfd = stream_fd_cast(stream);
    ssize_t n_written;

    if (STRESS(stream_flaky_send)) {
        return -EIO;
    }

    n_written = write(sfd->fd, buffer, n);
    if (n_written > 0) {
        return n_written;
    }
    /* write() returning 0 means "try again"; otherwise report -errno. */
    return n_written == 0 ? -EAGAIN : -errno;
}

0 0
原创粉丝点击