Lua-5.3.4代码分析(二)TString字符串

屈健柏

2023-12-01

先看LUA中关于字符串TString的源码:

/*
** Header for string value; string bytes follow the end of this structure
** (aligned according to 'UTString'; see next).
**
** 'extra' is written in three places in the code that uses this header:
** 'createstrobj' initializes it to 0, 'luaX_init' stores a non-zero
** (1-based) reserved-word index in it for reserved words, and
** 'luaS_hashlongstr' sets it to 1 once a long string has had its hash
** computed.
*/
typedef struct TString {
  CommonHeader;  /* header of a collectable object (macro defined elsewhere) */
  lu_byte extra;  /* reserved words for short strings; "has hash" for longs */
  lu_byte shrlen;  /* length for short strings */
  /* hash of the string; a long string holds the seed here until
     'luaS_hashlongstr' replaces it with the real hash */
  unsigned int hash;
  /* the two members share storage: 'internshrstr' uses 'hnext' for short
     strings, 'luaS_createlngstrobj' uses 'lnglen' for long strings */
  union {
    size_t lnglen;  /* length for long strings */
    struct TString *hnext;  /* linked list for hash table */
  } u;
} TString;


/*
** Ensures that address after this type is always fully aligned.
** 'L_Umaxalign' (defined elsewhere) is included as a member only so that
** the union, and therefore the 'TString' inside it, takes on that type's
** alignment; the 'dummy' member itself is never read in the code shown.
*/
typedef union UTString {
  L_Umaxalign dummy;  /* ensures maximum alignment for strings */
  TString tsv;
} UTString;

对于短字符串,extra值非0时表示是虚拟机中保留的字符串,也就是关键字(luaX_init中把它设为关键字的序号i+1),是不支持自动回收的,在GC过程中会略过对这个字符串的处理.

/*
** Textual names of the lexer tokens. 'luaX_init' interns the first
** NUM_RESERVED entries as reserved words and stores index+1 in their
** 'extra' field.
** NOTE(review): the "ORDER RESERVED" tag presumably means the order here
** must match an enumeration declared elsewhere -- confirm before
** reordering or inserting entries.
*/
/* ORDER RESERVED */
static const char *const luaX_tokens [] = {
    "and", "break", "do", "else", "elseif",
    "end", "false", "for", "function", "goto", "if",
    "in", "local", "nil", "not", "or", "repeat",
    "return", "then", "true", "until", "while",
    "//", "..", "...", "==", ">=", "<=", "~=",
    "<<", ">>", "::", "<eof>",
    "<number>", "<integer>", "<name>", "<string>"
};

创建新的字符串:

/*
** new string (with explicit length)
**
** Returns a string object holding the 'l' bytes starting at 'str'.
** - l <= LUAI_MAXSHORTLEN: delegated to 'internshrstr', which reuses an
**   existing equal short string when there is one.
** - otherwise: a fresh long-string object is created and the bytes are
**   copied into it; no lookup for an existing equal string is done here.
*/
TString *luaS_newlstr (lua_State *L, const char *str, size_t l) {
  if (l <= LUAI_MAXSHORTLEN)  /* short string? */
    return internshrstr(L, str, l);
  else {
    TString *ts;
    /* reject lengths for which header + bytes would not fit in MAX_SIZE;
       NOTE(review): 'luaM_toobig' presumably raises an error and does not
       return -- confirm against its definition */
    if (l >= (MAX_SIZE - sizeof(TString))/sizeof(char))
      luaM_toobig(L);
    ts = luaS_createlngstrobj(L, l);
    memcpy(getstr(ts), str, l * sizeof(char));
    return ts;
  }
}

lua中字符串分为长字符串和短字符串,这两个处理方式略有不同. 当l<=LUAI_MAXSHORTLEN时属于短字符串,LUAI_MAXSHORTLEN默认是40.

先看短字符吧:

通过str和长度以及seed(种子)算出该str的hash值.根据hash值在全局字符串表中找到对应的链表(短字符串存在global_State的全局字符串表strt中).如果链表中已有长度和所有字符都相同的字符串,就重复利用(若它已被标记为死亡但尚未回收,则先将其复活);否则需要新建一个字符串对象. 从代码中看出,如果字符串表中已有的字符串总数(nuse)不小于桶的个数(size),并且size不超过MAX_INT/2,就先调用luaS_resize把桶的个数扩大为之前的2倍,再创建新字符串.

/*
** checks whether short string exists and reuses it or creates a new one
**
** The string table lives in the global state ('g->strt'): an array of
** 'size' buckets, each a chain of TStrings linked through 'u.hnext', with
** 'nuse' counting the strings stored in the whole table.
** Returns the existing string when one with the same length and bytes is
** found in the bucket selected by the hash; otherwise creates a new short
** string, links it at the head of that bucket and returns it.
*/
static TString *internshrstr (lua_State *L, const char *str, size_t l) {
  TString *ts;
  global_State *g = G(L);
  unsigned int h = luaS_hash(str, l, g->seed);  /* hash uses the global seed */
  TString **list = &g->strt.hash[lmod(h, g->strt.size)];  /* bucket for 'h' */
  lua_assert(str != NULL);  /* otherwise 'memcmp'/'memcpy' are undefined */
  /* search the bucket's chain for an identical string */
  for (ts = *list; ts != NULL; ts = ts->u.hnext) {
    if (l == ts->shrlen &&
        (memcmp(str, getstr(ts), l * sizeof(char)) == 0)) {
      /* found! */
      if (isdead(g, ts))  /* dead (but not collected yet)? */
        changewhite(ts);  /* resurrect it */
      return ts;
    }
  }
  /* not found: double the number of buckets first if the table already
     holds at least as many strings as it has buckets (the MAX_INT/2 test
     keeps 'size * 2' from overflowing) */
  if (g->strt.nuse >= g->strt.size && g->strt.size <= MAX_INT/2) {
    luaS_resize(L, g->strt.size * 2);
    list = &g->strt.hash[lmod(h, g->strt.size)];  /* recompute with new size */
  }
  ts = createstrobj(L, l, LUA_TSHRSTR, h);
  memcpy(getstr(ts), str, l * sizeof(char));
  ts->shrlen = cast_byte(l);
  /* insert the new string at the head of its bucket */
  ts->u.hnext = *list;
  *list = ts;
  g->strt.nuse++;
  return ts;
}

createstrobj是真正创建lua字符串的函数:

/*
** creates a new string object
**
** Allocates a collectable object of type 'tag' with room for 'l' string
** bytes (size given by 'sizelstring(l)'), stores 'h' in its 'hash' field,
** clears 'extra' and writes the terminating '\0' at index 'l'.
** The string bytes themselves and the length field ('shrlen' or
** 'u.lnglen') are NOT set here; the callers fill them in.
*/
static TString *createstrobj (lua_State *L, size_t l, int tag, unsigned int h) {
  TString *ts;
  GCObject *o;
  size_t totalsize;  /* total size of TString object */
  totalsize = sizelstring(l);
  o = luaC_newobj(L, tag, totalsize);
  ts = gco2ts(o);  /* view the generic object as a TString */
  ts->hash = h;
  ts->extra = 0;  /* not a reserved word / hash not yet computed */
  getstr(ts)[l] = '\0';  /* ending 0 */
  return ts;
}

sizelstring(l)是宏,在l的基础上加上sizeof(UTString)+1, 这个1是为'\0'准备的. 宏gco2ts是将GCObject对象转换为TString对象.在extra的赋值上可以看到它被赋值为0,表示不是虚拟机保留的字符串.

这区别于之前luaX_tokens中虚拟机保留的字符串,其初始化源码如下:

llex.c

/*
** Interns the environment name (LUA_ENV) and the first NUM_RESERVED
** entries of 'luaX_tokens', passing each one to 'luaC_fix' right after it
** is created. Each reserved word additionally gets its 1-based index in
** 'luaX_tokens' stored in 'extra', so 'extra' is non-zero for reserved
** words.
*/
void luaX_init (lua_State *L) {
  int i;
  TString *e = luaS_newliteral(L, LUA_ENV);  /* create env name */
  luaC_fix(L, obj2gco(e));  /* never collect this name */
  for (i=0; i<NUM_RESERVED; i++) {
    TString *ts = luaS_new(L, luaX_tokens[i]);
    luaC_fix(L, obj2gco(ts));  /* reserved words are never collected */
    ts->extra = cast_byte(i+1);  /* reserved word */
  }
}

luaC_fix保证保留字符串不被gc

lgc.c

/*
** Moves object 'o' from the head of the global 'allgc' list to the head
** of the global 'fixedgc' list and turns it gray.
** Precondition (asserted): 'o' is currently the first element of 'allgc',
** so it can be unlinked without walking the list.
** NOTE(review): objects on 'fixedgc' are presumably skipped by the
** collector's sweep -- that code is not shown here.
*/
void luaC_fix (lua_State *L, GCObject *o) {
  global_State *g = G(L);
  lua_assert(g->allgc == o);  /* object must be 1st in 'allgc' list! */
  white2gray(o);  /* they will be gray forever */
  g->allgc = o->next;  /* remove object from 'allgc' list */
  o->next = g->fixedgc;  /* link it to 'fixedgc' list */
  g->fixedgc = o;
}

fixedgc是不可gc的链表.

长字符串创建:

/*
** Creates a long-string object with room for 'l' bytes and records 'l' in
** 'u.lnglen'. The string bytes are left for the caller to fill in.
** The 'hash' field is initialized with the global seed rather than a real
** hash; 'luaS_hashlongstr' later uses that stored value as the seed when
** the hash is first requested.
*/
TString *luaS_createlngstrobj (lua_State *L, size_t l) {
  TString *ts = createstrobj(L, l, LUA_TLNGSTR, G(L)->seed);
  ts->u.lnglen = l;
  return ts;
}

长字符串的长度是保存在联合结构体内的lnglen中,和短字符串不同.

接下来看看hash的取法:

/*
** Computes a hash for the 'l' bytes at 'str', starting from 'seed' mixed
** with the length.
** Bytes are visited from the end of the string towards the beginning, in
** strides of 'step'. 'step' is 1 while (l >> LUAI_HASHLIMIT) is zero, so
** every byte is hashed; for longer strings 'step' grows with the length
** and only every 'step'-th byte contributes to the result.
*/
unsigned int luaS_hash (const char *str, size_t l, unsigned int seed) {
  unsigned int h = seed ^ cast(unsigned int, l);
  size_t step = (l >> LUAI_HASHLIMIT) + 1;
  for (; l >= step; l -= step)  /* 'l' doubles as the running index */
    h ^= ((h<<5) + (h>>2) + cast_byte(str[l - 1]));
  return h;
}

/*
** Returns the hash of long string 'ts', computing it on first use.
** While 'extra' is 0 the 'hash' field still holds the seed stored at
** creation; the first call hashes the string bytes with that seed, caches
** the result in 'hash' and sets 'extra' to 1 so later calls return the
** cached value directly.
*/
unsigned int luaS_hashlongstr (TString *ts) {
  lua_assert(ts->tt == LUA_TLNGSTR);
  if (ts->extra == 0) {  /* no hash? */
    ts->hash = luaS_hash(getstr(ts), ts->u.lnglen, ts->hash);
    ts->extra = 1;  /* now it has its hash */
  }
  return ts->hash;
}

从源码上看长字符串是属于惰性求hash值,如果已经存在hash值,就直接返回,不再重新求.

在求hash上, 最新版本新加了种子,全局表中的随机种子： g->seed = makeseed(L); 就是防止产生的hash值相同的太多,导致生成的链表过长,加大了查找和插入的时间.

lua中字符串hash用的是JSHash,关于字符串的各种hash函数,可以参考:

http://blog.csdn.net/u014269285/article/details/79518334

字符串的比较:

/*
** equality for short strings, which are always internalized
** (equal short strings are therefore the same object, so comparing the
** two pointers is enough). Only 'a' has its type tag tested.
** NOTE(review): 'check_exp' is defined elsewhere; presumably it asserts
** its first argument and yields its second -- confirm.
*/
#define eqshrstr(a,b)	check_exp((a)->tt == LUA_TSHRSTR, (a) == (b))

短字符串的比较直接比较地址,因为在lua中相同的短字符串只会存在一份.

/*
** equality for long strings
**
** Returns non-zero when 'a' and 'b' are the same object, or when they
** have the same length and identical bytes; returns 0 otherwise.
** Both arguments must be long strings (asserted).
*/
int luaS_eqlngstr (TString *a, TString *b) {
  size_t len = a->u.lnglen;
  lua_assert(a->tt == LUA_TLNGSTR && b->tt == LUA_TLNGSTR);
  return (a == b) ||  /* same instance or... */
    ((len == b->u.lnglen) &&  /* equal length and ... */
     (memcmp(getstr(a), getstr(b), len) == 0));  /* equal contents */
}

长字符串如果不是同一个对象,需要先比较长度,再比较内容.

短字符串的比较直接比较地址,因为在lua中相同的短字符串只会存在一份.

Lua-5.3.4代码分析(二)TString字符串

相关阅读

相关文章

相关问答

相关文档