glib库 hash表实现分析

鲁品

2023-12-01

Hash Table的原理

哈希表的目的简单来说是为了实现存储多个key=>value关系（注意，此处是单项推导，不支持反向查找），一个比较简单的模型实现是用一个数组来存储这些关系，但是在插入数据时，并不在index最小的数组位置插入，而是直接通过函数算出这个key-value应该存储的位置，这样可以避免查找时遍历查找。

一个比较简单的实现方法是这样：

typedef struct
{
    T_Val value;
    T_Pos next;
}T_Item;
struct _HashTable
{
    T_Item items[];
    T_Pos hash_list_Headers[];
}T_HashTable;

将具有相同hash值的条目组成一个链表，用T_Item::next域表示链表的下一个的位置，当插入的时候，首先计算出hash值，以该值作为索引，通过T_HashTable::hash_list_headers，找到该哈希链表的地一个位置T_Pos，然后遍历该链表，找到需要的key对应的value。

这种方法有什么缺点呢？
1. 链表在物理内存中不连续，造成cache命中减小
2. 如果用单项链表，则删除节点十分麻烦
3. 效率很低

一群牛人们在glib库中重新写了hash算法的模板，本文在此分析一下他的运行过程

大体算法：

  gpointer        *keys;
  guint           *hashes;
  gpointer        *values;

这三个数组长度相同，下标为x的三个元素 key[x] hashes[x] values[x]，共同描述这同一个key-value关系。
不会出现当x,y,z任意两个不想等时，key[x] hashes[y] values[z]，共同描述一个key-value关系。当插入一个
key-value关系时，伪代码如下：

find(key)
{
    hash_val = get_hash_by_key( key);
    for ( i = hash_val; HASH_IS_USED( hashes[i] ); i++ )
    {
        if ( HASH_VAL_TOMB == hash[i] )
            continue;
        if ( hash[i] == hash_val && keys[i] == key)
            return values[i];
   }
}

首先计算该key的hash值，以该哈希值x为起始数组位置，找到hashes[x]与keys[x]同时匹配时，返回value

先来看一下最重要的数据结构，这个_GHashTable数据结构写在c文件中，对外不暴露细节，在glib.h文件中被
typedef为 GHashTable

struct GHashTable
{
  gint             size;
  gint             mod;
  guint            mask;
  gint             nnodes;
  gint             noccupied;  /* nnodes + tombstones */

  gpointer        *keys;
  guint           *hashes;
  gpointer        *values;

  GHashFunc        hash_func;
  GEqualFunc       key_equal_func;
  gint             ref_count;
#ifndef G_DISABLE_ASSERT
  /*
   * Tracks the structure of the hash table, not its contents: is only
   * incremented when a node is added or removed (is not incremented
   * when the key or data of a node is modified).
   */
  int              version;
#endif
  GDestroyNotify   key_destroy_func;
  GDestroyNotify   value_destroy_func;
};

GHashTable *
g_hash_table_new (GHashFunc  hash_func,
                  GEqualFunc key_equal_func)
{
  return g_hash_table_new_full (hash_func, key_equal_func, NULL, NULL);
}

GHashTable *
g_hash_table_new_full (GHashFunc      hash_func,
                       GEqualFunc     key_equal_func,
                       GDestroyNotify key_destroy_func,
                       GDestroyNotify value_destroy_func)
{
  GHashTable *hash_table;
  hash_table = g_slice_new (GHashTable);
  /*g_slice_new和new是一样的*/
  g_hash_table_set_shift (hash_table, HASH_TABLE_MIN_SHIFT);
  /*这个函数很有意思，下文着重分析*/
  hash_table->nnodes             = 0;
  /*这个哈希表中，实际被真正占用的有多少个*/
  hash_table->noccupied          = 0;
  /*noccupied  = nnodes + tombs
    tomb 坟墓，也就是被删除了的元素！
    */
  hash_table->hash_func          = hash_func ? hash_func : g_direct_hash;
  hash_table->key_equal_func     = key_equal_func;
  hash_table->ref_count          = 1;
#ifndef G_DISABLE_ASSERT
  hash_table->version            = 0;
#endif
  hash_table->key_destroy_func   = key_destroy_func;
  hash_table->value_destroy_func = value_destroy_func;
  hash_table->keys               = g_new0 (gpointer, hash_table->size);
  hash_table->values             = hash_table->keys;
  /*初始化时，将keys和values指向同一段内存，这是为了防止有key与value一直相等的情况，这样做可以节省内存*/
  hash_table->hashes             = g_new0 (guint, hash_table->size);
/*g_new0就是new*/
  return hash_table;
}

插入过程

gboolean
g_hash_table_insert (GHashTable *hash_table,
                     gpointer    key,
                     gpointer    value)
{
  return g_hash_table_insert_internal (hash_table, key, value, FALSE);
}

static gboolean
g_hash_table_insert_internal (GHashTable *hash_table,
                              gpointer    key,
                              gpointer    value,
                              gboolean    keep_new_key)
{
  guint key_hash;
  guint node_index;

  g_return_val_if_fail (hash_table != NULL, FALSE);
  /*hash_table==NULL 直接退出*/
  node_index = g_hash_table_lookup_node (hash_table, key, &key_hash);
  /*这个函数要么找到命中的node_index，如果没找到，则返回一个坟墓或者空项目的指针  */
  return g_hash_table_insert_node (hash_table, node_index, key_hash, key, value, keep_new_key, FALSE);
}

static inline guint
g_hash_table_lookup_node (GHashTable    *hash_table,
                          gconstpointer  key,
                          guint         *hash_return)
{
  guint node_index;
  guint node_hash;
  guint hash_value;
  guint first_tombstone = 0;
  gboolean have_tombstone = FALSE;
  guint step = 0;

  g_assert (hash_table->ref_count > 0);

  /*这里计算哈希值之后，又对哈希值做了处理，使之不能大于等于2*/
  /*原因在于 0 表示该hash数组元素没有使用，1表示是坟墓*/
  hash_value = hash_table->hash_func (key);
  if (G_UNLIKELY (!HASH_IS_REAL (hash_value)))
    hash_value = 2;

  *hash_return = hash_value;

  node_index = hash_value % hash_table->mod;
  node_hash = hash_table->hashes[node_index];

  /*tomb的hash为1 算是已使用，
  正常的hash >= 2
  未使用hash = 0*/
  /*这里不会产生死循环，因为在要满之前，就会扩充内存，所以总会保证能够有一个空的哈希元素的
  g_hash_table_resize*/
  while (!HASH_IS_UNUSED (node_hash))
    {
      if (node_hash == hash_value)
        {
        /*哈希命中，则比对key*/
          gpointer node_key = hash_table->keys[node_index];

          if (hash_table->key_equal_func)
            {
              if (hash_table->key_equal_func (node_key, key))
                return node_index;
            }
          else if (node_key == key)
            {
              return node_index;
            }
        }
      else if (HASH_IS_TOMBSTONE (node_hash) && !have_tombstone)
        {
        /*如果是一个坟墓，则把下标记录下来，这样这个下标就可以作为插入的时候参考用*/
          first_tombstone = node_index;
          have_tombstone = TRUE;
        }
     /*只要不return 或者遇到没有使用的hash，则一直循环*/
      step++;
      node_index += step;
      node_index &= hash_table->mask;
      node_hash = hash_table->hashes[node_index];
    }

  /*走到这里发现，一直持续循环最后还是没命中，node_index*/
  if (have_tombstone)
    return first_tombstone;

  return node_index;
}

g_hash_table_lookup (GHashTable    *hash_table,
                     gconstpointer  key)
{
  guint node_index;
  guint node_hash;

  g_return_val_if_fail (hash_table != NULL, NULL);

 /*这里查询key又调用了这个函数*/
  node_index = g_hash_table_lookup_node (hash_table, key, &node_hash);

  return HASH_IS_REAL (hash_table->hashes[node_index])
    ? hash_table->values[node_index]
    : NULL;
}

glib库 hash表实现分析

Hash Table的原理

相关阅读

相关文章

相关问答

相关文档