先看LUA中关于字符串TString的源码:
/*
** Header for string value; string bytes follow the end of this structure
** (aligned according to 'UTString'; see next).
**
** Short and long strings share this header. The union 'u' holds the
** hash-chain link for short strings (set by 'internshrstr') or the
** length for long strings (set by 'luaS_createlngstrobj').
*/
typedef struct TString {
CommonHeader; /* header of collectable (GC) objects */
lu_byte extra; /* reserved words for short strings; "has hash" for longs */ /* short strings: non-zero marks a reserved word (luaX_init stores the token index + 1) that is never collected; long strings: 1 means 'hash' has already been computed */
lu_byte shrlen; /* length for short strings */
unsigned int hash; /* hash of the string; for long strings it starts as the seed and is computed lazily by luaS_hashlongstr */
union {
size_t lnglen; /* length for long strings */
struct TString *hnext; /* linked list for hash table */ /* next string in the same bucket chain */
} u;
} TString;
/*
** Ensures that address after this type is always fully aligned.
** The 'dummy' member of type L_Umaxalign is never used for data; it is
** there so that the TString held in UTString is aligned to that type.
*/
typedef union UTString {
L_Umaxalign dummy; /* ensures maximum alignment for strings */
TString tsv; /* the actual string header */
} UTString;
对短字符串来说,extra值不为0时表示是虚拟机中保留的字符串,也就是关键字(luaX_init中把它设为关键字在luaX_tokens中的下标加1);这些字符串不支持自动回收,在GC过程中会略过对它们的处理.
/* ORDER RESERVED */
/*
** Textual form of every token. 'luaX_init' walks the first NUM_RESERVED
** entries, interns each as a string and stores (index + 1) in its 'extra'
** field, so the order of this table must not change.
** NOTE(review): the remaining entries look like operators and token-class
** names that are not interned as reserved words -- confirm against the
** definition of NUM_RESERVED.
*/
static const char *const luaX_tokens [] = {
"and", "break", "do", "else", "elseif",
"end", "false", "for", "function", "goto", "if",
"in", "local", "nil", "not", "or", "repeat",
"return", "then", "true", "until", "while",
"//", "..", "...", "==", ">=", "<=", "~=",
"<<", ">>", "::", "<eof>",
"<number>", "<integer>", "<name>", "<string>"
};
创建新的字符串:
/*
** Builds a Lua string from the first 'l' bytes of 'str' (explicit length).
** Lengths up to LUAI_MAXSHORTLEN go through 'internshrstr'; anything
** longer gets its own long-string object with the bytes copied into it.
*/
TString *luaS_newlstr (lua_State *L, const char *str, size_t l) {
  TString *longstr;
  if (l <= LUAI_MAXSHORTLEN) {
    /* short string: interned in the global string table */
    return internshrstr(L, str, l);
  }
  /* long string: lengths at or above this limit are reported as too big */
  if (l >= (MAX_SIZE - sizeof(TString))/sizeof(char)) {
    luaM_toobig(L);
  }
  longstr = luaS_createlngstrobj(L, l);
  memcpy(getstr(longstr), str, l * sizeof(char));
  return longstr;
}
lua中字符串分为长字符串和短字符串,这两个处理方式略有不同. 当l<=LUAI_MAXSHORTLEN时属于短字符串,LUAI_MAXSHORTLEN默认是40.
先看短字符吧:
通过str和长度以及seed(种子)算出该str的hash值.根据hash值在全局字符串表中找到对应的链表(短字符串存在global_State的全局字符串表strt中).如果链表中已有长度和所有字符都相同的字符串,就重复利用(如果它已经被标记为dead但还没被回收,则先把它复活);否则需要重新生成. 从代码中看出,如果全局表中字符串对象的总数nuse不小于桶的个数size(且size不超过MAX_INT/2),就调用luaS_resize,把表的大小扩大为之前的2倍,并按新的size重新计算链表的位置.
/*
** Interns a short string: returns the object already stored in the
** global string table when one with the same length and bytes exists,
** otherwise creates a new short string, links it at the head of its
** bucket chain and counts it in 'nuse'.
*/
static TString *internshrstr (lua_State *L, const char *str, size_t l) {
  global_State *g = G(L);
  unsigned int h = luaS_hash(str, l, g->seed);
  TString **bucket = &g->strt.hash[lmod(h, g->strt.size)];
  TString *node;
  lua_assert(str != NULL);  /* otherwise 'memcmp'/'memcpy' are undefined */
  /* scan the chain for an identical string */
  node = *bucket;
  while (node != NULL) {
    if (l == node->shrlen &&
        (memcmp(str, getstr(node), l * sizeof(char)) == 0)) {
      /* hit: bring it back if it is dead but not collected yet */
      if (isdead(g, node))
        changewhite(node);
      return node;
    }
    node = node->u.hnext;
  }
  /* miss: double the table first when it holds at least 'size' strings */
  if (g->strt.nuse >= g->strt.size && g->strt.size <= MAX_INT/2) {
    luaS_resize(L, g->strt.size * 2);
    bucket = &g->strt.hash[lmod(h, g->strt.size)];  /* size changed */
  }
  node = createstrobj(L, l, LUA_TSHRSTR, h);
  memcpy(getstr(node), str, l * sizeof(char));
  node->shrlen = cast_byte(l);
  /* push the new string at the front of its chain */
  node->u.hnext = *bucket;
  *bucket = node;
  g->strt.nuse++;
  return node;
}
createstrobj是真正创建lua字符串的函数:
/*
** Allocates a string object with room for 'l' bytes plus a terminator.
** Stores the given hash 'h', clears 'extra' and writes the final '\0';
** the caller fills in the bytes and the length field.
*/
static TString *createstrobj (lua_State *L, size_t l, int tag, unsigned int h) {
  GCObject *raw = luaC_newobj(L, tag, sizelstring(l));  /* header + bytes */
  TString *str = gco2ts(raw);
  str->extra = 0;
  str->hash = h;
  getstr(str)[l] = '\0';  /* ending 0 */
  return str;
}
sizelstring(l)是宏,在l的基础上加上sizeof(UTString)+1, 这个1是为'\0'准备的. 宏gco2ts是将GCObject对象转换为TString对象.从extra的赋值上可以看到它被赋值为0,表示不是虚拟机需要保留的字符串.
这区别于之前luaX_tokens中虚拟机保留的字符串.源码如下:
llex.c
/*
** Creates the LUA_ENV name and the first NUM_RESERVED entries of
** 'luaX_tokens' as strings, passing each to 'luaC_fix' so it is never
** collected. Each reserved word gets its 1-based table position stored
** in 'extra'.
*/
void luaX_init (lua_State *L) {
  int pos;
  TString *envname = luaS_newliteral(L, LUA_ENV);  /* create env name */
  luaC_fix(L, obj2gco(envname));  /* never collect this name */
  for (pos = 1; pos <= NUM_RESERVED; pos++) {
    TString *word = luaS_new(L, luaX_tokens[pos - 1]);
    luaC_fix(L, obj2gco(word));  /* reserved words are never collected */
    word->extra = cast_byte(pos);  /* reserved word */
  }
}
luaC_fix保证保留字符串不被gc
lgc.c
/*
** Moves object 'o' from the head of the 'allgc' list to the head of
** the 'fixedgc' list and turns it gray.
*/
void luaC_fix (lua_State *L, GCObject *o) {
  global_State *g = G(L);
  GCObject *successor = o->next;  /* what follows 'o' in 'allgc' */
  lua_assert(g->allgc == o);  /* object must be 1st in 'allgc' list! */
  white2gray(o);  /* they will be gray forever */
  g->allgc = successor;  /* unlink 'o' from 'allgc' */
  o->next = g->fixedgc;  /* push 'o' onto 'fixedgc' */
  g->fixedgc = o;
}
fixedgc是不可gc的链表.
长字符串创建:
/*
** Creates a long-string object able to hold 'l' bytes. The 'hash' field
** starts out as the global seed and the length is recorded in 'u.lnglen';
** the caller copies the bytes in.
*/
TString *luaS_createlngstrobj (lua_State *L, size_t l) {
  global_State *g = G(L);
  TString *longstr = createstrobj(L, l, LUA_TLNGSTR, g->seed);
  longstr->u.lnglen = l;
  return longstr;
}
长字符串的长度是保存在联合结构体内的lnglen中,和短字符串不同.
接下来看看hash的取法:
/*
** Hashes the 'l' bytes at 'str', starting from 'seed' xor-ed with the
** length. Bytes are visited from the last one backwards in strides of
** (l >> LUAI_HASHLIMIT) + 1, so longer inputs are sampled, not fully read.
*/
unsigned int luaS_hash (const char *str, size_t l, unsigned int seed) {
  const size_t stride = (l >> LUAI_HASHLIMIT) + 1;
  unsigned int acc = seed ^ cast(unsigned int, l);
  size_t pos = l;
  while (pos >= stride) {
    acc ^= ((acc<<5) + (acc>>2) + cast_byte(str[pos - 1]));
    pos -= stride;
  }
  return acc;
}
/*
** Returns the hash of a long string, computing it on first request.
** 'extra' == 0 means the hash is still pending; in that case the value
** currently held in 'hash' is used as the seed, and 'extra' is set to 1.
*/
unsigned int luaS_hashlongstr (TString *ts) {
  lua_assert(ts->tt == LUA_TLNGSTR);
  if (ts->extra != 0)
    return ts->hash;  /* already computed */
  ts->hash = luaS_hash(getstr(ts), ts->u.lnglen, ts->hash);
  ts->extra = 1;  /* now it has its hash */
  return ts->hash;
}
从源码上看长字符串是属于惰性求hash值,如果已经存在hash值(extra为1),就直接返回,不再重新求.
在求hash上, 最新版本新加了种子,全局表中的随机种子: g->seed = makeseed(L); 就是防止产生的hash值相同的太多,导致生成的链表过长,加大了查找和插入的时间.
lua中字符串hash用的是JSHash,关于字符串的各种hash函数,可以参考:
http://blog.csdn.net/u014269285/article/details/79518334
/*
** equality for short strings, which are always internalized
** Compares the two pointers only; the check_exp condition tests that
** 'a' is tagged as a short string.
*/
#define eqshrstr(a,b) check_exp((a)->tt == LUA_TSHRSTR, (a) == (b))
/*
** Equality for long strings. Returns 1 when 'a' and 'b' are the same
** object, or when they have the same length and the same bytes;
** returns 0 otherwise.
*/
int luaS_eqlngstr (TString *a, TString *b) {
  size_t len = a->u.lnglen;
  lua_assert(a->tt == LUA_TLNGSTR && b->tt == LUA_TLNGSTR);
  if (a == b)
    return 1;  /* same instance */
  if (len != b->u.lnglen)
    return 0;  /* different lengths can never match */
  return memcmp(getstr(a), getstr(b), len) == 0;  /* equal contents? */
}
长字符串先判断是否是同一个对象;如果不是,需要先比较长度,再比较内容.