Python bytecode 的生成以及投放到虚拟机的过程

来源：互联网发布：mac上怎么编辑文档编辑：程序博客网时间：2024/05/03 03:18

当我们在python shell 下面输入1+1, shell 会返回一个2给我们，那具体过程是什么样的呢？

如果打开源代码，其思路是非常清晰的：

// parsemp_parse_tree_t parse_tree = mp_parse(lex, input_kind);        // compile         mp_obj_t module_fun = mp_compile(&parse_tree, source_name, emit_opt, is_repl);        // execute it        mp_call_function_0(module_fun);

第一步生成 parse tree，第二步根据生成的 parse tree 产生 module_fun，最后执行之。

parse tree

的长相是这样的：

// Parse tree as produced by mp_parse() and consumed by mp_compile().
typedef struct _mp_parse_t {
    // root node of the tree
    mp_parse_node_t root;
    // NOTE(review): presumably the memory chunks backing the tree nodes -- confirm
    struct _mp_parse_chunk_t *chunk;
} mp_parse_tree_t;

之前有文章说到了 parse tree 的作用：它将产生式用树的形式表示出来。

module_fun 就比较扯了，就是一个uint64.

一路按F11, 我们来到了mp_compile

// Compile a parse tree and return a callable object for the outer module.
//
// All four arguments are forwarded unchanged to mp_compile_to_raw_code();
// the resulting raw code is then wrapped into a function object, with both
// remaining arguments of mp_make_function_from_raw_code() set to MP_OBJ_NULL.
mp_obj_t mp_compile(mp_parse_tree_t *parse_tree, qstr source_file, uint emit_opt, bool is_repl) {
    mp_raw_code_t *rc = mp_compile_to_raw_code(parse_tree, source_file, emit_opt, is_repl);
    // return function that executes the outer module
    return mp_make_function_from_raw_code(rc, MP_OBJ_NULL, MP_OBJ_NULL);
}

可以看到，compile 主要的功能是生成raw code , 然后根据生成的raw code 去make 成相应的函数。

接下来，我们看看raw code 到底是什么，是传说中的bytecode 吗？

// Compiled code for one unit: three bit-field headers followed by a union
// holding either a bytecode description (u_byte) or a native-code
// description (u_native).
// NOTE(review): presumably `kind` selects which union member is valid -- confirm.
typedef struct _mp_raw_code_t {
    mp_raw_code_kind_t kind : 3;
    mp_uint_t scope_flags : 7;
    mp_uint_t n_pos_args : 11;
    union {
        struct {
            const byte *bytecode;
            const mp_uint_t *const_table;
            // the following fields exist only when MICROPY_PERSISTENT_CODE_SAVE is enabled
            #if MICROPY_PERSISTENT_CODE_SAVE
            mp_uint_t bc_len;
            uint16_t n_obj;
            uint16_t n_raw_code;
            #endif
        } u_byte;
        struct {
            void *fun_data;
            const mp_uint_t *const_table;
            mp_uint_t type_sig; // for viper, compressed as 2-bit types; ret is MSB, then arg0, arg1, etc
        } u_native;
    } data;
} mp_raw_code_t;

其中, bytecode 作为它的核心成员（变量）。

继续按F11, 我们看parse_tree, 是如何一步步生成raw code 的。

代码比较长，我们分段阅读：

    // put compiler state on the stack, it's relatively small    compiler_t comp_state = {0};    compiler_t *comp = &comp_state;    comp->source_file = source_file;    comp->is_repl = is_repl;    // create the module scope    scope_t *module_scope = scope_new_and_link(comp, SCOPE_MODULE, parse_tree->root, emit_opt);    // create standard emitter; it's used at least for MP_PASS_SCOPE    emit_t *emit_bc = emit_bc_new();

第一步：创建 module_scope。scope 类似于其他语言的 block，它可以是一个模块、一个函数、一个 lambda 表达式、一个列表/字典/集合推导式、一个生成器表达式或者一个类，诸如此类（见下面的 scope_kind_t）。

// scope is a "block" in Python parlancetypedef enum { SCOPE_MODULE, SCOPE_FUNCTION, SCOPE_LAMBDA, SCOPE_LIST_COMP, SCOPE_DICT_COMP, SCOPE_SET_COMP, SCOPE_GEN_EXPR, SCOPE_CLASS } scope_kind_t;typedef struct _scope_t {    scope_kind_t kind;    struct _scope_t *parent;    struct _scope_t *next;    mp_parse_node_t pn;    qstr source_file;    qstr simple_name;    mp_raw_code_t *raw_code;    uint8_t scope_flags;  // see runtime0.h    uint8_t emit_options; // see compile.h    uint16_t num_pos_args;    uint16_t num_kwonly_args;    uint16_t num_def_pos_args;    uint16_t num_locals;    uint16_t stack_size;     // maximum size of the locals stack    uint16_t exc_stack_size; // maximum size of the exception stack    uint16_t id_info_alloc;    uint16_t id_info_len;    id_info_t *id_info;} scope_t;

大家有没有注意到，里面有个成员： mp_raw_code_t *raw_code.

第二步，创建标准的emitter, 翻译成中文就是发射器，是不是说，最终生成的byte code 由它“发射” 出去呢？

struct _emit_t {    // Accessed as mp_obj_t, so must be aligned as such, and we rely on the    // memory allocator returning a suitably aligned pointer.    // Should work for cases when mp_obj_t is 64-bit on a 32-bit machine.    byte dummy_data[DUMMY_DATA_SIZE];    pass_kind_t pass : 8;    mp_uint_t last_emit_was_return_value : 8;    int stack_size;    scope_t *scope;    mp_uint_t last_source_line_offset;    mp_uint_t last_source_line;    mp_uint_t max_num_labels;    mp_uint_t *label_offsets;    size_t code_info_offset;    size_t code_info_size;    size_t bytecode_offset;    size_t bytecode_size;    byte *code_base; // stores both byte code and code info    #if MICROPY_PERSISTENT_CODE    uint16_t ct_cur_obj;    uint16_t ct_num_obj;    uint16_t ct_cur_raw_code;    #endif    mp_uint_t *const_table;};

emit_t 有个成员叫 byte *code_base , 存放byte code ,以及code info。

目前看起来，这几个结构体的关系是compiler_t 包含emit, emit 里面有byte code .

接下来，我们就要进入第一遍编译了：

    // compile pass 1    comp->emit = emit_bc;    #if MICROPY_EMIT_NATIVE    comp->emit_method_table = &emit_bc_method_table;    #endif    uint max_num_labels = 0;    for (scope_t *s = comp->scope_head; s != NULL && comp->compile_error == MP_OBJ_NULL; s = s->next) {        if (false) {#if MICROPY_EMIT_INLINE_THUMB        } else if (s->emit_options == MP_EMIT_OPT_ASM_THUMB) {            compile_scope_inline_asm(comp, s, MP_PASS_SCOPE);#endif        } else {            compile_scope(comp, s, MP_PASS_SCOPE);        }        // update maximim number of labels needed        if (comp->next_label > max_num_labels) {            max_num_labels = comp->next_label;        }    }    // compute some things related to scope and identifiers    for (scope_t *s = comp->scope_head; s != NULL && comp->compile_error == MP_OBJ_NULL; s = s->next) {        scope_compute_things(s);    }    // set max number of labels now that it's calculated    emit_bc_set_max_num_labels(emit_bc, max_num_labels);

在第一遍编译的时候, compile_scope 充当了挑大梁的人。我们进去看看，它到底做了什么。

// Call `fun` with zero positional arguments, zero keyword arguments and a
// NULL argument array, by delegating to mp_call_function_n_kw().
mp_obj_t mp_call_function_0(mp_obj_t fun) {
    const mp_obj_t *no_args = NULL;
    return mp_call_function_n_kw(fun, 0, 0, no_args);
}

不管传进来的参数有多少个，都来到了这个函数：

// args contains, eg: arg0  arg1  key0  value0  key1  value1mp_obj_t mp_call_function_n_kw(mp_obj_t fun_in, mp_uint_t n_args, mp_uint_t n_kw, const mp_obj_t *args) {    // TODO improve this: fun object can specify its type and we parse here the arguments,    // passing to the function arrays of fixed and keyword arguments    DEBUG_OP_printf("calling function %p(n_args=" UINT_FMT ", n_kw=" UINT_FMT ", args=%p)\n", fun_in, n_args, n_kw, args);    // get the type    mp_obj_type_t *type = mp_obj_get_type(fun_in);    // do the call    if (type->call != NULL) {        return type->call(fun_in, n_args, n_kw, args);    }    if (MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE) {        nlr_raise(mp_obj_new_exception_msg(&mp_type_TypeError,            "object not callable"));    } else {        nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError,            "'%s' object is not callable", mp_obj_get_type_str(fun_in)));    }}

如果不考虑下面的异常处理，整个函数也就两句话，get the type, do the call.

在 objint.c 的 mp_obj_int_formatted 里面，我们第一次看到了计算结果。

char *mp_obj_int_formatted(char **buf, mp_uint_t *buf_size, mp_uint_t *fmt_size, mp_const_obj_t self_in,                           int base, const char *prefix, char base_char, char comma) {    fmt_int_t num;    if (MP_OBJ_IS_SMALL_INT(self_in)) {        // A small int; get the integer value to format.        num = mp_obj_get_int(self_in);

执行的第一个bytecode 是MP_BC_LOAD_NAME

光有MP_BC_LOAD_NAME肯定是不行的，因为我们还要做计算，即“+”， micropython 里面，有一个函数用来处理所有的二元操作：mp_binary_op() 位于runtime.c

 case MP_BINARY_OP_ADD:                case MP_BINARY_OP_INPLACE_ADD: lhs_val += rhs_val; break;                case MP_BINARY_OP_SUBTRACT:                case MP_BINARY_OP_INPLACE_SUBTRACT: lhs_val -= rhs_val; break;                case MP_BINARY_OP_MULTIPLY:                case MP_BINARY_OP_INPLACE_MULTIPLY: {                    // If long long type exists and is larger than mp_int_t, then                    // we can use the following code to perform overflow-checked multiplication.                    // Otherwise (eg in x64 case) we must use mp_small_int_mul_overflow.                    #if 0                    // compute result using long long precision                    long long res = (long long)lhs_val * (long long)rhs_val;                    if (res > MP_SMALL_INT_MAX || res < MP_SMALL_INT_MIN) {                        // result overflowed SMALL_INT, so return higher precision integer                        return mp_obj_new_int_from_ll(res);                    } else {                        // use standard precision                        lhs_val = (mp_int_t)res;                    }                    #endif

可以看到，加减乘除，都是在这处理的。

0 0