Lua虚拟机分析之一

来源：互联网发布：淘宝店铺信誉提升平台编辑：程序博客网时间：2024/06/11 11:01

可能游戏圈的朋友对lua会更熟悉些，灵活、高效、跨平台、易用等诸多优点决定了这门语言将会得到更广泛的应用，而不只局限于游戏，已经有许多的技术大牛对这门语言做过许多的分析，这其中就包括云风，关于lua的更多介绍，可以查看官网，以及各种搜索引擎。本文旨在对lua虚拟机进行简单的分析，包含的内容有虚拟机中一部分基本的数据结构。

在应用的角度来看，lua有以下几种数据类型：nil、布尔类型、[light]UserData、Number、字符串、表、函数、线程（不同于操作系统的线程），而所有这些类型的表示，都需要一个被称为TValue的类型支撑，如下：

#define numfield/* no such field; numbers are the entire struct */typedef int (*lua_CFunction) (lua_State *L); // 先看成普通的C函数指针吧union Value {  GCObject *gc;    /* collectable objects */ // 这期先不管这个，当成指针来看就ok了  void *p;         /* light userdata */  int b;           /* booleans */  lua_CFunction f; /* light C functions */  numfield         /* numbers */};#define TValuefields  \union { struct { int tt__; Value v__; } i; double d__; } ustruct lua_TValue {  TValuefields;};typedef struct lua_TValue TValue;

看起来有点绕，对吧？基础的数据类型TValue其实就是一个union Value（在32位平台上占4字节）外加一个类型标记tt__，两者合起来又与一个double类型的d__共用同一块内存：当值是数字时，整个结构直接被当作double来用。至于类型标记tt__，上面已经介绍过lua有哪几种类型，各种具体类型对应的id如下：

/*
 * Basic type tags: one small integer per Lua type.
 */
#define LUA_TNONE           (-1)  /* tutorial note: think of it as one more "empty" type */

#define LUA_TNIL            0
#define LUA_TBOOLEAN        1
#define LUA_TLIGHTUSERDATA  2
#define LUA_TNUMBER         3
#define LUA_TSTRING         4
#define LUA_TTABLE          5
#define LUA_TFUNCTION       6
#define LUA_TUSERDATA       7
#define LUA_TTHREAD         8

关于TValue似乎已经可以不用再介绍了。关于一些lua的类型如TString、UData可阅读lobject.h，不一一介绍，这里只介绍表类型，用过lua的人，一定会喜欢lua的表类型，而其相关的定义如下：

/*
 * Key of a hash-part entry.  Viewed as `tvk` it is a plain TValue; viewed as
 * `nk` it is the same tagged-value fields followed by the link to the next
 * node of the collision chain.
 */
typedef union TKey {
  struct {
    TValuefields;
    struct Node *next;  /* for chaining */
  } nk;
  TValue tvk;
} TKey;


/* One hash-part entry: a value together with its key. */
typedef struct Node {
  TValue i_val;
  TKey i_key;
} Node;


/*
 * A Lua table: an array part (`array`, `sizearray` slots) plus a hash part
 * (`node`, whose size is stored as a log2 in `lsizenode`).
 */
typedef struct Table {
  CommonHeader;  /* #define CommonHeader GCObject *next; lu_byte tt; lu_byte marked */
  lu_byte flags;  /* 1<<p means tagmethod(p) is not present */
  lu_byte lsizenode;  /* log2 of size of `node' array */
  /* metatable: itself a table; some of its keys may hold functions, and
     accessing those keys triggers a function call */
  struct Table *metatable;
  TValue *array;  /* array part */
  Node *node;  /* hash part */
  Node *lastfree;  /* any free position is before this position */
  GCObject *gclist;
  int sizearray;  /* size of `array' array */
} Table;

Node似乎很好理解，一个key，对应一个value,用一个Node数组来表示哈希表，如果不同键的哈希值相同时，则将相同哈希的节点以链表的形式存放（又称哈希桶），除了Node数组，Table还有个名为array的TValue数组，原来尽管哈希表的访问速度非常之快，终究也快不过线性数组的访问，考虑在实际应用中，以连续数字直接作为键值的情况并不少见，Table将这种优化也纳入到设计中，但问题是这个数组被设计成多大才合适呢？答案是多大都不合理，具体情况具体对待，在需要扩展的时候增加是最合适的，所以在ltable.c中有一个函数luaH_resize，重新设置哈希表与数组的大小。现在已知表的两种存储形式，根据key值来找到对应的位置，也自然需要区分对待，以下是根据key来找对应位置的代码：

/*
 * Map a traversal key to its position in the table's combined index space
 * (excerpt from ltable.c; StkId is TValue*).
 *
 * Returns -1 for a nil key, `i-1` when the key is the integer `i` inside the
 * array part, and `sizearray + (node index)` when the key is found in the
 * hash part.  Raises a Lua error through luaG_runerror when the key is not
 * present in its collision chain.
 */
static int findindex (lua_State *L, Table *t, StkId key) {
  int i;
  /* nil is the "start of traversal" sentinel; it can play that role because
     a table never stores nil as a key */
  if (ttisnil(key)) return -1;  /* first iteration */
  i = arrayindex(key);
  if (0 < i && i <= t->sizearray)  /* is `key' inside array part? */
    return i-1;  /* yes; that's the index (corrected to C) */
  else {
    Node *n = mainposition(t, key);  /* first locate the key's slot in the node array */
    for (;;) {  /* check whether `key' is somewhere in the chain */
      /* key may be dead already, but it is ok to use it in `next' */
      if (luaV_rawequalobj(gkey(n), key) ||
            (ttisdeadkey(gkey(n)) && iscollectable(key) &&
             deadvalue(gkey(n)) == gcvalue(key))) {
        i = cast_int(n - gnode(t, 0));  /* key index in hash table */
        /* hash elements are numbered after array ones */
        return i + t->sizearray;
      }
      else n = gnext(n);
      if (n == NULL)
        luaG_runerror(L, "invalid key to " LUA_QL("next"));  /* key not found */
    }
  }
}

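从应用的角度来看，lua的一些类型的设计其实有很多内容都有学习与借鉴的价值，关于数据类型本文就先介绍到这里。在介绍lua_State类型时，我在思考它与我所熟悉的什么事物比较接近，从命名来看，似乎是某种状态机，实际上当计算机处于单核时代，线程确实是用状态机实现，而lua_State所代表的就是Lua虚拟机里的线程，这个线程上下文包含了栈、当前函数地址、pc、UpValue（函数或者叫闭包的外部变量）。既然是状态机，肯定会有状态标志。需要注意的是，下面这组CIST_*标志位实际记录在每个调用帧CallInfo的callstatus字段中，描述的是某一次函数调用的状态；lua_State自身的status字段保存的则是线程状态（LUA_OK、LUA_YIELD或错误码）。调用状态的具体定义如下：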

/*
 * Call-status bit flags (CIST_*).  Each flag occupies its own bit, so
 * several of them can be combined in a single status byte.
 */
#define CIST_LUA        (1<<0)  /* call is running a Lua function */
#define CIST_HOOKED     (1<<1)  /* call is running a debug hook */
#define CIST_REENTRY    (1<<2)  /* call is running on same invocation of
                                   luaV_execute of previous call */
#define CIST_YIELDED    (1<<3)  /* call reentered after suspension */
#define CIST_YPCALL     (1<<4)  /* call is a yieldable protected call */
#define CIST_STAT       (1<<5)  /* call has an error status (pcall) */
#define CIST_TAIL       (1<<6)  /* call was tail called */
#define CIST_HOOKYIELD  (1<<7)  /* last hook called yielded */

lua状态机的定义如下：

/*
 * Per-thread ("coroutine") state of the Lua VM (excerpt from lstate.h; the
 * trailing fields are omitted by the tutorial).
 */
struct lua_State {
  CommonHeader;
  /* NOTE(review): thread status (LUA_OK, LUA_YIELD or an error code); the
     CIST_* bits are kept in CallInfo.callstatus instead -- confirm against
     lstate.h */
  lu_byte status;
  StkId top;  /* first free slot in the stack */
  global_State *l_G;  /* tutorial note: think of it as the "process"; it manages
                         memory allocation and collection */
  CallInfo *ci;  /* call info for current function */
  const Instruction *oldpc;  /* last pc traced */
  StkId stack_last;  /* last free slot in the stack */
  StkId stack;  /* stack base */
  int stacksize;  /* stack size */
  unsigned short nny;  /* number of non-yieldable calls in stack */
  unsigned short nCcalls;  /* number of nested C calls */
  /* ... remaining fields omitted ... */
};

x86里，栈是自高地址向低地址增长，而lua的栈则是一个自低地址向高地址增长的TValue数组。初始化state对象时，会为栈分配40个TValue（注意是TValue本身，而不是TValue*），如下所示（函数所在文件lstate.c）：

/*
 * Allocate and initialise the stack of thread L1 (excerpt from lstate.c).
 * L is the state passed to the allocator.
 *
 * After the call: the stack holds BASIC_STACK_SIZE nil slots, slot 0 is the
 * 'function' entry of the base CallInfo, L1->top points at slot 1, and the
 * base CallInfo's frame extends LUA_MINSTACK slots above that.
 */
static void stack_init (lua_State *L1, lua_State *L) {
  int i; CallInfo *ci;
  /* initialize stack array */
  /* tutorial note: BASIC_STACK_SIZE is 40, so this allocates 40 TValue slots */
  L1->stack = luaM_newvector(L, BASIC_STACK_SIZE, TValue);
  L1->stacksize = BASIC_STACK_SIZE;
  for (i = 0; i < BASIC_STACK_SIZE; i++)
    setnilvalue(L1->stack + i);  /* erase new stack */
  L1->top = L1->stack;
  L1->stack_last = L1->stack + L1->stacksize - EXTRA_STACK;
  /* initialize first ci */
  ci = &L1->base_ci;
  ci->next = ci->previous = NULL;
  ci->callstatus = 0;
  ci->func = L1->top;
  setnilvalue(L1->top++);  /* 'function' entry for this 'ci' */
  /* tutorial note: LUA_MINSTACK is 20, so the current function is given a
     frame of 20 TValue slots */
  ci->top = L1->top + LUA_MINSTACK;
  L1->ci = ci;
}

在打开一个lua_State时（lstate.c:f_luaopen），除了一些基本的初始化（包括栈的初始化），还会初始化注册表（lstate.c:init_registry）、语法保留字（llex.c:luaX_init）以及元方法名称（ltm.c:luaT_init）。我们只知道lua_State很重要，但还不知道怎么去用。初始化与销毁都很简单，这部分的代码分别在lstate.c:lua_newstate与lstate.c:lua_close。lua_State里保存了运行中的数据，怎么得到想要的数据呢？相关代码在lapi.c:index2addr，列出如下：

/*
 * Translate an API index into the address of the TValue it designates
 * (excerpt from lapi.c).
 *
 *   idx > 0                  slot counted upwards from the current function
 *                            (ci->func + idx); NONVALIDVALUE if at or above
 *                            L->top
 *   negative, not pseudo     slot counted downwards from L->top
 *   LUA_REGISTRYINDEX        the registry stored in the global state
 *   below LUA_REGISTRYINDEX  upvalue of the running C closure, or
 *                            NONVALIDVALUE if there is no such upvalue
 */
static TValue *index2addr (lua_State *L, int idx) {
  CallInfo *ci = L->ci;
  if (idx > 0) {
    TValue *o = ci->func + idx;
    api_check(L, idx <= ci->top - (ci->func + 1), "unacceptable index");
    if (o >= L->top) return NONVALIDVALUE;
    else return o;
  }
  else if (!ispseudo(idx)) {  /* negative index (tutorial note: idx > -1001000) */
    api_check(L, idx != 0 && -idx <= L->top - (ci->func + 1), "invalid index");
    return L->top + idx;
  }
  else if (idx == LUA_REGISTRYINDEX)  /* tutorial note: -1001000 */
    return &G(L)->l_registry;
  else {  /* upvalues */
    idx = LUA_REGISTRYINDEX - idx;  /* convert to a 1-based upvalue number */
    api_check(L, idx <= MAXUPVAL + 1, "upvalue index too large");
    if (ttislcf(ci->func))  /* light C function? */
      return NONVALIDVALUE;  /* it has no upvalues */
    else {
      CClosure *func = clCvalue(ci->func);
      return (idx <= func->nupvalues) ? &func->upvalue[idx-1] : NONVALIDVALUE;
    }
  }
}

在栈初始化时，已知L->top = ci->func+1，后续每往栈里压入一个值，L->top++。所以如果从一开始按顺序分别压入了a、b、c三个值，则压栈结束后，这几个值用负数索引（相对栈顶L->top）表示分别为-3、-2、-1，用正数索引（相对ci->func）表示则分别为1、2、3。

0 0