当前位置: 首页 > 工具软件 > PJ2 > 使用案例 >

数据库 PJ-2 基于B+树的简单数据库

关翰
2023-12-01
1930713****

实验内容

已有一个数据库 myjql 的框架,现在在这个框架中写一棵组织在文件里的 B+ 树,以此实现数据库基本功能。

数据库文件要可持久化存储,也就说退出数据库后会留下数据库文件,再次打开时数据不会丢失。

数据库中每个条目由 16 个字节组成:A(int) + B(12字节字符串)。

按照 B 为关键字建立索引。

要求支持如下操作:

  • select: 列出所有的元组
  • select s: 列出所有 B 为 s 的元组
  • insert i s: 插入一个 A = iB = s 的条目
  • delete s: 删除所有 B 为 s 的条目
  • .exit: 退出数据库

测试集:大约有 6e6 级别的条目。

时间限制:180 s

内存限制:6 MB

硬盘空间限制:无

注意:不能把索引存在内存里,否则一定超出内存空间限制。


文件的结构设计

B+树的条目存在叶子结点上,因此文件里可以只存放一棵B+树。可以按照结点为单位在文件中存储B+树。只要知道每个结点在文件中的首位置,就可以用fseek函数在文件中定位结点并且读取,因此我们用一个整数类型来表示一个结点在文件中的首位置,把这个整数称为node_ptr类。

除了结点,为了方便管理文件信息,还需要一些元数据。

元数据:

typedef unsigned int node_ptr;
struct FILE_INFO{
  node_ptr root;
  unsigned int lenth; 
  unsigned int entry_num;
};

root:存下B+树的根结点

lenth:B+树文件的总长度,可以据此在文件内分配新的结点

entry_num:树内条目总数

新建结点并且返回起始位置的代码:

node_ptr New_NODE()
{
  node_ptr ret = finfo.lenth;
  finfo.lenth += sizeof(struct NODE);
  return ret;
}

结点:

struct NODE{
  node_ptr lsibling,rsibling;
  int is_leaf;
  int key_number;
  char data[IDX_SIZE];
};

lsibling,rsibling:左右邻居结点,用来进行合并、迁移,对于叶子结点可以用来有序遍历。

is_leaf:标记该结点是否是叶子结点。

key_number:用来表示结点现存的键值的数量。

data:char数组,用来存放键值和指针(或对于叶子来说,条目的A值)。4 个字节放一个整数,接着 12 字节放一个字符串,这样无论是叶子结点上条目的A值+B值,还是非叶子结点上的子节点号+键值,都可以用 16个连续的char类型来表示。

结点的大小

因为一个页空间的大小是 4096 字节,因此我将一个结点直接设置为一个页的大小,并且根据计算,一个B+树的结点大概最多能存放252个条目(或键值与子结点号),也就是说这棵树的度(Degree)大概在 126 ,也就是说除了根结点,所有结点应该是 126~252 叉的,应当具有至少50%的占有率。

定义了一些宏来表示这些大小信息:

#define PAGE_SIZE 4096
#define NODE_SIZE 4096
#define DEGREE ((NODE_SIZE-16-16-4)/32)
#define IDX_SIZE (NODE_SIZE-16)

文件存储的实现

myjql 需要能够读取识别上面构造的数据库文件,并且在增删元素的时候修改文件,退出的时候留下文件。

程序开始阶段读文件:

void open_file(const char* filename) {
  /* open file */
  if(!(fp = fopen(filename,"r+")))//如果文件不存在,新建一个文件,初始化元信息和B+树
  {
    fp = fopen(filename,"w+");
    finfo.entry_num=0;
    finfo.lenth=sizeof(struct FILE_INFO);
    finfo.root=New_NODE();

    struct NODE node;//建立一个结点,并且插入一条哨兵条目
    memset(&node,0,sizeof(node));
    node.is_leaf=1;
    node.key_number=1;
    np2char(0,node.data);
    memcpy(node.data+4,"\0\0\0\0\0\0\0\0\0\0\0",12);
    write_node(finfo.root,&node);
  }
  else//如果存在,直接把元数据读出来
  {
    fseek(fp,0,SEEK_SET);
    fread(&finfo,sizeof(struct FILE_INFO),1,fp);
  }
}

程序结束阶段改写元数据:

void exit_nicely(int code) {
  /* do clean work */
  fseek(fp,0,SEEK_SET);
  fwrite(&finfo,sizeof(struct FILE_INFO),1,fp);
  exit(code);
}

程序执行中读一个结点:

struct NODE node;
fseek(fp,n,SEEK_SET);
fread(&node,sizeof(struct NODE),1,fp);
//从文件把以n为起始位置的结点读到node

程序执行中写一个结点:

void write_node(node_ptr n,struct NODE* node)
{
  fseek(fp,n,SEEK_SET);
  fwrite(node,sizeof(struct NODE),1,fp);
}

基本操作

分析:

在这个项目中,要求键值相同的条目按照时间后先排序,也就是说相同键值,后到者排在前面。这个看似需要用时间戳维护,但其实不然。当我们插入一个条目 (A,B) 的时候,我们认为此时B+树中所有小于 B 的值都比 B 小,而B+树中所有大于等于 B 的值都比当前插入的这个条目大。这样自然而然,插入条目虽然没有存时间戳的信息,但天然按照时间戳的顺序进行了排序,不需要维护额外的时间戳信息,可谓大道至简。

B+树插入:

递归到叶子进行插入,每递归一层,从文件里把对应结点的信息读入到内存。

如果当前结点满出,则分裂结点,回溯的时候在父亲结点插入新分裂出来的子节点。

结束当前层的时候要注意写回内存。

注意各种细节即可。

node_ptr tree_insert(node_ptr n,char * retkey)
{
  
  assert(n);
  struct NODE node;
  char newkey[12];

  fseek(fp,n,SEEK_SET);
  fread(&node,sizeof(struct NODE),1,fp);

  //printf("\n node:%d key_number:%d\n",n,node.key_number);
  
  int i;
  for(i = 0;i < node.key_number;i++ )
  {
    if(strlessequal(statement.row.b,node.data+(i*16+4)))break;
  }
  node_ptr newchildentry;

  if(!node.is_leaf)newchildentry = tree_insert(char2np(node.data+(i*16)),newkey);
  else {newchildentry = statement.row.a;memcpy(newkey,statement.row.b,12);}
  
  //printf("\nchildentry:%d\n",newchildentry);
  if(!node.is_leaf && !newchildentry)return 0;

  if(!node.is_leaf)i++;
  for(int j=node.key_number;j>=i;j--)
    memcpy(node.data+((j+1)*16),node.data+(j*16),16);
  if(!node.is_leaf)
  {
    memcpy(node.data+(i*16+4),node.data+(i*16-12),12);
    memcpy(node.data+(i*16-12),newkey,12);
    np2char(newchildentry,node.data+(i*16));
  }
  else
  {
    np2char(newchildentry,node.data+(i*16));
    memcpy(node.data+(i*16+4),newkey,12);
  }
  node.key_number++;

  if(node.key_number <= 2*DEGREE)
  {
    write_node(n,&node);
    return 0;
  }
  else
  {
    struct NODE new_node;
    memset(&new_node,0,sizeof(new_node));
    node_ptr new_id = New_NODE();
    if(node.rsibling)
    {
      struct NODE rnode;
      memset(&rnode,0,sizeof(rnode));
      fseek(fp,node.rsibling,SEEK_SET);
      fread(&rnode,sizeof(struct NODE),1,fp);
      rnode.lsibling = new_id;
      write_node(node.rsibling,&rnode);
    }
    new_node.rsibling = node.rsibling;
    new_node.is_leaf = node.is_leaf;
    new_node.lsibling = n;
    node.rsibling = new_id;
    if(!node.is_leaf)
    {
      memcpy(new_node.data,node.data+((DEGREE+1)*16),DEGREE*16+4);
      memset(node.data+((DEGREE+1)*16),0,DEGREE*16+4);
      new_node.key_number = node.key_number = DEGREE;
      memcpy(retkey,node.data+(DEGREE*16+4),12);//UNCLEAR
      //printf("\nretkey:%s\n",retkey);
    }
    else
    {
      memcpy(new_node.data,node.data+((DEGREE)*16),(DEGREE+1)*16);
      new_node.key_number = DEGREE + 1;
      node.key_number = DEGREE;
      memcpy(retkey,new_node.data+4,12);//UNCLEAR
    }
    
    write_node(n,&node);
    write_node(new_id,&new_node);

    if(n==finfo.root)
    {
      struct NODE new_root;
      memset(&new_root,0,sizeof(new_root));
      int root_id = New_NODE();
      finfo.root = root_id;
      new_root.is_leaf = 0;
      new_root.key_number = 1;
      new_root.lsibling = new_root.rsibling = 0;
      np2char(n,new_root.data);
      memcpy(new_root.data+4,retkey,12);
      np2char(new_id,new_root.data+16);
      write_node(root_id,&new_root);
    }
    return new_id;
  }
}

B+树删除:

递归到叶子进行删除,每递归一层,从文件里把对应结点的信息读入到内存。

如果删除之后当前结点占有率低于50%,从左右兄弟选一个平均分配一下所存条目,如果两者条目之和能够用一个结点存储,就合并结点。回溯的时候在父亲结点删除合并消去的子节点。

返回是否找到了要删的目标结点。

注意各种细节即可。(长码警告)

int upd;
char updkey[12];
int tree_delete(node_ptr fa,struct NODE *fa_node,node_ptr n,uint32_t loc,node_ptr *childentry)
{
  struct NODE node;
  node_ptr oldchildenty=0;

  fseek(fp,n,SEEK_SET);
  fread(&node,sizeof(struct NODE),1,fp);

  int i;
  for(i = 0;i < node.key_number;i++ )
  {
    if(strless(statement.row.b,node.data+(i*16+4)))break;
  }
  int exist=0;

  if(!node.is_leaf)exist = tree_delete(n,&node,char2np(node.data+(i*16)),i,&oldchildenty);
  else exist = i && !strcmp(statement.row.b,node.data+(i*16-12));
  //if(statement.row.b[0]=='z')printf("i:%d key_num:%d exist:%d node:%d\n",i,node.key_number,exist,n);
  if(upd)
  {
    if(fa && loc)
    {
      memcpy(fa_node->data+(loc*16-12),updkey,12);
      upd=0;
      write_node(fa,fa_node);
    }
  }
  if(!node.is_leaf && !oldchildenty){*childentry = 0;return exist;}
  if(node.is_leaf && !exist){*childentry = 0;return exist;}

  node.key_number--;
  if(oldchildenty)i=oldchildenty;
  if(!i)
  {
    if(fa && loc)
    {
      //printf("George node:%d key:%s fakey:%s\n",n,statement.row.b,fa_node->data+(loc*16-12));
        
      memcpy(fa_node->data+(loc*16-12),node.data+4,12);
      write_node(fa,fa_node);
    }
    for(int j=i;j<=node.key_number;j++)
      memcpy(node.data+(j*16),node.data+((j+1)*16),16);
  }
  else
  {
    for(int j=i;j<=node.key_number;j++)
      if(!node.is_leaf) memcpy(node.data+(j*16-12),node.data+((j+1)*16-12),16);
      else memcpy(node.data+(j*16-16),node.data+((j+1)*16-16),16);
    if(node.is_leaf && i==1)
    {
      if(fa && loc)
      {
        //printf("George node:%d key:%s fakey:%s\n",n,statement.row.b,fa_node->data+(loc*16-12));
        memcpy(fa_node->data+(loc*16-12),node.data+4,12);
        //printf("George node:%d key:%s fakey:%s\n",n,statement.row.b,fa_node->data+(loc*16-12));
        write_node(fa,fa_node);
      }
      if(fa && !loc)
      {
        upd=1;
        memcpy(updkey,node.data+4,12);
      }
    }
  }
  
  //不想写左sibling的合并了,太麻烦了QAQ
  //算了还是写吧……
  if(fa && fa_node->key_number!=loc)//有右兄弟
  {
    if(node.key_number < DEGREE)
    {
      //printf("move!\n");
      struct NODE sb_node;
      memset(&sb_node,0,sizeof(sb_node));
      fseek(fp,node.rsibling,SEEK_SET);
      fread(&sb_node,sizeof(struct NODE),1,fp);

      if(node.key_number+sb_node.key_number < 2*DEGREE)//合并
      {
        *childentry = loc + 1;
        if(!node.is_leaf)
        {
          memcpy(node.data+(node.key_number*16+4),fa_node->data+(loc*16+4),12);
          node.key_number++;
        }
        for(int j=0;j<=sb_node.key_number;j++)
          memcpy(node.data+((node.key_number+j)*16),sb_node.data+(j*16),16);
        node.key_number += sb_node.key_number;
        if(sb_node.rsibling)//维护链表
        {
          struct NODE sbsb_node;
          memset(&sbsb_node,0,sizeof(sbsb_node));
          fseek(fp,sb_node.rsibling,SEEK_SET);
          fread(&sbsb_node,sizeof(struct NODE),1,fp);
          sbsb_node.lsibling = n;
          write_node(sb_node.rsibling,&sbsb_node);
        }//懒得处理被删掉的空间了
        node.rsibling = sb_node.rsibling;
        write_node(n,&node);
        return exist;
      }
      else //均分
      {
        if(!node.is_leaf)
        {
          int move_num=0;
          while(sb_node.key_number - node.key_number > 1)
          {
            memcpy(node.data+(node.key_number*16+4),fa_node->data+(loc*16+4),12);
            node.key_number++;
            memcpy(node.data+(node.key_number*16),sb_node.data+(move_num*16),4);
            memcpy(fa_node->data+(loc*16+4),sb_node.data+(move_num*16+4),12);
            sb_node.key_number--;
            move_num++;
          }
          for(int j=0;j<=sb_node.key_number;j++)
            memcpy(sb_node.data+(j*16),sb_node.data+((move_num+j)*16),16);
        }
        else
        {
          int move_num=0;
          while(sb_node.key_number - node.key_number > 1)
          {
            memcpy(node.data+(node.key_number*16),sb_node.data+(move_num*16),16);
            node.key_number++;
            sb_node.key_number--;
            move_num++;
          }
          for(int j=0;j<=sb_node.key_number;j++)
            memcpy(sb_node.data+(j*16),sb_node.data+((move_num+j)*16),16);
          memcpy(fa_node->data+(loc*16+4),sb_node.data+4,12);
        }
        write_node(n,&node);
        write_node(node.rsibling,&sb_node);
        write_node(fa,fa_node);
        return exist;
      }
    }    
  }
  else if(fa && loc)//有左兄弟
  {
    if(node.key_number < DEGREE)
    {
      //printf("move!\n");
      struct NODE sb_node;
      memset(&sb_node,0,sizeof(sb_node));
      fseek(fp,node.lsibling,SEEK_SET);
      fread(&sb_node,sizeof(struct NODE),1,fp);

      if(node.key_number+sb_node.key_number < 2*DEGREE)//合并
      {
        *childentry = loc;
        if(!node.is_leaf)
        {
          memcpy(sb_node.data+(sb_node.key_number*16+4),fa_node->data+(loc*16-12),12);
          sb_node.key_number++;
        }
        for(int j=0;j<=node.key_number;j++)
          memcpy(sb_node.data+((sb_node.key_number+j)*16),node.data+(j*16),16);
        sb_node.key_number += node.key_number;
        //printf("node.l:%d node:%d node.r:%d ll:%d\n",node.lsibling,n,node.rsibling,sb_node.lsibling);
        if(node.rsibling)//维护链表
        {
          struct NODE r_node;
          memset(&r_node,0,sizeof(r_node));
          fseek(fp,node.rsibling,SEEK_SET);
          fread(&r_node,sizeof(struct NODE),1,fp);
          r_node.lsibling = node.lsibling;
          write_node(node.rsibling,&r_node);
        }//懒得处理被删掉的空间了
        sb_node.rsibling = node.rsibling;
        write_node(node.lsibling,&sb_node);
        return exist;
      }
      else //均分
      {
        if(!node.is_leaf)
        {
          int move_num=(sb_node.key_number - node.key_number)/2;
          for(int j=node.key_number;j>=0;j--)
            memcpy(node.data+((j+move_num)*16),node.data+(j*16),16);
          while(move_num)
          {
            move_num--;
            memcpy(node.data+(move_num*16+4),fa_node->data+(loc*16-12),12);
            memcpy(node.data+(move_num*16),sb_node.data+(sb_node.key_number*16),4);
            node.key_number++;
            sb_node.key_number--;
            memcpy(fa_node->data+(loc*16-12),sb_node.data+(sb_node.key_number*16+4),12);
          }
        }
        else
        {
          int move_num=(sb_node.key_number - node.key_number)/2;
          for(int j=node.key_number;j>=0;j--)
            memcpy(node.data+((j+move_num)*16),node.data+(j*16),16);
          while(move_num)
          {
            move_num--;
            sb_node.key_number--;
            memcpy(node.data+(move_num*16),sb_node.data+(sb_node.key_number*16),16);
            node.key_number++;
          }
          memcpy(fa_node->data+(loc*16-12),node.data+4,12);
        }
        write_node(n,&node);
        write_node(node.lsibling,&sb_node);
        write_node(fa,fa_node);
        return exist;
      }
    }
  }
  if(node.key_number == 0)
  {
    finfo.root=char2np(node.data);
    return exist;
  }//可以用哨兵避免整个树空的发生。
  write_node(n,&node);
  return exist;
}

void tree_search(node_ptr n,node_ptr *nid,unsigned int *offset)
{
  struct NODE node;

  fseek(fp,n,SEEK_SET);
  fread(&node,sizeof(struct NODE),1,fp);
  //printf("sn:%d\n",n);
  int i;
  for(i = 0;i < node.key_number;i++ )
  {
    if(strlessequal(statement.row.b,node.data+(i*16+4)))break;
  }
  //printf(" i=%d\n",i);
  if(!node.is_leaf)
  {
    tree_search(char2np(node.data+(i*16)),nid,offset);
  }
  else
  {
    if(i==node.key_number)
    {
      *nid=node.rsibling;
      *offset=0;
    }
    else
    {
      *nid=n;
      *offset=i;
    }
  }
}

myjql删除:

因为要删除所有键值Bs的条目,可以多次去删除知道找不到键值为B的条目为止。一次操作的复杂度可能很大,但是总复杂度等价于插入操作的总复杂度。

void b_tree_delete() {
  /* delete row(s) */
  struct NODE tmp;
  node_ptr tmp2;
  while(tree_delete(0,&tmp,finfo.root,0,&tmp2))
  {
    finfo.entry_num--;
  }
}

myjql检索:

先在B+树找到键值 B 的起点,然后在叶子上遍历输出即可。

void b_tree_search() {
  /* print selected rows */
  node_ptr n;
  unsigned int offset;
  tree_search(finfo.root,&n,&offset);

  int empty=1;
  struct NODE node;
  fseek(fp,n,SEEK_SET);
  if(n)fread(&node,sizeof(struct NODE),1,fp);
  
  while(n && !strcmp(statement.row.b,node.data+(offset*16+4)))
  {
    empty=0;
    printf("(%d, %s)\n",char2np(node.data+(offset*16)),node.data+(offset*16+4));
    offset++;
    if(offset==node.key_number)
    {
      n=node.rsibling;
      if(!n)break;
      offset=0;
      fseek(fp,n,SEEK_SET);
      fread(&node,sizeof(struct NODE),1,fp);
    }
  }
  if(empty)
  {
    printf("(Empty)\n");
  }
}

小结

本次项目实现了B+树为基础的数据库,在本次项目中我加深了对B+树结构的认识,了解到了B+树的应用场景,亲手编写并调试来实现了一棵B+树的基本操作。

深刻认识到B+树不同于我们在内存运算中常用的数据结构,这是专门针对IO的低速性质和IO一次读取一个页的特性而设计的数据结构。


最后附上完整代码:

(才不是因为是ACM选手所以喜欢把所有代码写在一个文件里)

(这么做不是方便助教老师编译验收吗?  )

/* You may refer to: https://cstack.github.io/db_tutorial/ */
/* Compile: gcc -o myjql myjql.c -O3 */
/* Test: /usr/bin/time -v ./myjql myjql.db < in.txt > out.txt */
/* Compare: diff out.txt ans.txt */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <assert.h>

#define PAGE_SIZE 4096
#define NODE_SIZE 4096
#define DEGREE ((NODE_SIZE-16-16-4)/32)
#define IDX_SIZE (NODE_SIZE-16)

typedef unsigned int node_ptr;

struct FILE_INFO{
  node_ptr root;
  unsigned int lenth; 
  unsigned int entry_num;
}finfo;

struct NODE{
  node_ptr lsibling,rsibling;
  int is_leaf;
  int key_number;
  char data[IDX_SIZE];
};

/* shell IO */

#define INPUT_BUFFER_SIZE 31
struct {
  char buffer[INPUT_BUFFER_SIZE + 1];
  size_t length;
} input_buffer;

typedef enum {
  INPUT_SUCCESS,
  INPUT_TOO_LONG
} InputResult;

void print_prompt() { printf("myjql> "); }

InputResult read_input() {
  /* we read the entire line as the input */
  input_buffer.length = 0;
  while (input_buffer.length <= INPUT_BUFFER_SIZE
    && (input_buffer.buffer[input_buffer.length++] = getchar()) != '\n'
    && input_buffer.buffer[input_buffer.length - 1] != EOF);
  if (input_buffer.buffer[input_buffer.length - 1] == EOF)
    exit(EXIT_SUCCESS);
  input_buffer.length--;
  /* if the last character is not new-line, the input is considered too long,
     the remaining characters are discarded */
  if (input_buffer.length == INPUT_BUFFER_SIZE
    && input_buffer.buffer[input_buffer.length] != '\n') {
    while (getchar() != '\n');
    return INPUT_TOO_LONG;
  }
  input_buffer.buffer[input_buffer.length] = 0;
  return INPUT_SUCCESS;
}

FILE *fp;
node_ptr New_NODE();
void np2char(node_ptr,char*);
void write_node(unsigned int,struct NODE *);
void open_file(const char* filename) {
  /* open file */
  if(!(fp = fopen(filename,"r+")))
  {
    fp = fopen(filename,"w+");
    finfo.entry_num=0;
    finfo.lenth=sizeof(struct FILE_INFO);
    finfo.root=New_NODE();

    struct NODE node;
    memset(&node,0,sizeof(node));
    node.is_leaf=1;
    node.key_number=1;
    np2char(0,node.data);
    memcpy(node.data+4,"\0\0\0\0\0\0\0\0\0\0\0",12);
    write_node(finfo.root,&node);
    
  }
  else
  {
    fseek(fp,0,SEEK_SET);
    fread(&finfo,sizeof(struct FILE_INFO),1,fp);
  }
}

void exit_nicely(int code) {
  /* do clean work */
  fseek(fp,0,SEEK_SET);
  fwrite(&finfo,sizeof(struct FILE_INFO),1,fp);
  exit(code);
}

void exit_success() {
  printf("bye~\n");
  exit_nicely(EXIT_SUCCESS);
}

/* specialization of data structure */

#define COLUMN_B_SIZE 11

typedef struct {
  uint32_t a;
  char b[COLUMN_B_SIZE + 1];
} Row;

void print_row(Row* row) {
  printf("(%d, %s)\n", row->a, row->b);
}

/* statement */

typedef enum {
  STATEMENT_INSERT,
  STATEMENT_SELECT,
  STATEMENT_DELETE
} StatementType;

struct {
  StatementType type;
  Row row;
  uint8_t flag; /* whether row.a, row.b have valid values */
} statement;

/* B-Tree operations */

/* B+Tree*/


int strless(char *a,char *b)
{
  return strcmp(a,b)<0;
}
int strlessequal(char *a,char *b)
{
  return strcmp(a,b)<=0;
}
node_ptr char2np(char *s)
{
  node_ptr ret=0;
  ret |= 0xff & s[3];ret <<= 8;
  ret |= 0xff & s[2];ret <<= 8;
  ret |= 0xff & s[1];ret <<= 8;
  ret |= 0xff & s[0];
  return ret;
}
void np2char(node_ptr x,char *s)
{
  s[0] = x & 0xff; x >>= 8;
  s[1] = x & 0xff; x >>= 8;
  s[2] = x & 0xff; x >>= 8;
  s[3] = x & 0xff;
}


node_ptr New_NODE()
{
  node_ptr ret = finfo.lenth;
  finfo.lenth += sizeof(struct NODE);
  return ret;
}

void write_node(node_ptr n,struct NODE* node)
{
  fseek(fp,n,SEEK_SET);
  fwrite(node,sizeof(struct NODE),1,fp);
}

node_ptr tree_insert(node_ptr n,char * retkey)
{
  
  assert(n);
  struct NODE node;
  char newkey[12];

  fseek(fp,n,SEEK_SET);
  fread(&node,sizeof(struct NODE),1,fp);

  //printf("\n node:%d key_number:%d\n",n,node.key_number);
  
  int i;
  for(i = 0;i < node.key_number;i++ )
  {
    if(strlessequal(statement.row.b,node.data+(i*16+4)))break;
  }
  node_ptr newchildentry;

  if(!node.is_leaf)newchildentry = tree_insert(char2np(node.data+(i*16)),newkey);
  else {newchildentry = statement.row.a;memcpy(newkey,statement.row.b,12);}
  
  //printf("\nchildentry:%d\n",newchildentry);
  if(!node.is_leaf && !newchildentry)return 0;

  if(!node.is_leaf)i++;
  for(int j=node.key_number;j>=i;j--)
    memcpy(node.data+((j+1)*16),node.data+(j*16),16);
  if(!node.is_leaf)
  {
    memcpy(node.data+(i*16+4),node.data+(i*16-12),12);
    memcpy(node.data+(i*16-12),newkey,12);
    np2char(newchildentry,node.data+(i*16));
  }
  else
  {
    np2char(newchildentry,node.data+(i*16));
    memcpy(node.data+(i*16+4),newkey,12);
  }
  node.key_number++;

  if(node.key_number <= 2*DEGREE)
  {
    write_node(n,&node);
    return 0;
  }
  else
  {
    struct NODE new_node;
    memset(&new_node,0,sizeof(new_node));
    node_ptr new_id = New_NODE();
    if(node.rsibling)
    {
      struct NODE rnode;
      memset(&rnode,0,sizeof(rnode));
      fseek(fp,node.rsibling,SEEK_SET);
      fread(&rnode,sizeof(struct NODE),1,fp);
      rnode.lsibling = new_id;
      write_node(node.rsibling,&rnode);
    }
    new_node.rsibling = node.rsibling;
    new_node.is_leaf = node.is_leaf;
    new_node.lsibling = n;
    node.rsibling = new_id;
    if(!node.is_leaf)
    {
      memcpy(new_node.data,node.data+((DEGREE+1)*16),DEGREE*16+4);
      memset(node.data+((DEGREE+1)*16),0,DEGREE*16+4);
      new_node.key_number = node.key_number = DEGREE;
      memcpy(retkey,node.data+(DEGREE*16+4),12);//UNCLEAR
      //printf("\nretkey:%s\n",retkey);
    }
    else
    {
      memcpy(new_node.data,node.data+((DEGREE)*16),(DEGREE+1)*16);
      new_node.key_number = DEGREE + 1;
      node.key_number = DEGREE;
      memcpy(retkey,new_node.data+4,12);//UNCLEAR
    }
    
    write_node(n,&node);
    write_node(new_id,&new_node);

    if(n==finfo.root)
    {
      struct NODE new_root;
      memset(&new_root,0,sizeof(new_root));
      int root_id = New_NODE();
      finfo.root = root_id;
      new_root.is_leaf = 0;
      new_root.key_number = 1;
      new_root.lsibling = new_root.rsibling = 0;
      np2char(n,new_root.data);
      memcpy(new_root.data+4,retkey,12);
      np2char(new_id,new_root.data+16);
      write_node(root_id,&new_root);
    }
    return new_id;
  }
}
int upd;
char updkey[12];
int tree_delete(node_ptr fa,struct NODE *fa_node,node_ptr n,uint32_t loc,node_ptr *childentry)
{
  struct NODE node;
  node_ptr oldchildenty=0;

  fseek(fp,n,SEEK_SET);
  fread(&node,sizeof(struct NODE),1,fp);

  int i;
  for(i = 0;i < node.key_number;i++ )
  {
    if(strless(statement.row.b,node.data+(i*16+4)))break;
  }
  int exist=0;

  if(!node.is_leaf)exist = tree_delete(n,&node,char2np(node.data+(i*16)),i,&oldchildenty);
  else exist = i && !strcmp(statement.row.b,node.data+(i*16-12));
  //if(statement.row.b[0]=='z')printf("i:%d key_num:%d exist:%d node:%d\n",i,node.key_number,exist,n);
  if(upd)
  {
    if(fa && loc)
    {
      memcpy(fa_node->data+(loc*16-12),updkey,12);
      upd=0;
      write_node(fa,fa_node);
    }
  }
  if(!node.is_leaf && !oldchildenty){*childentry = 0;return exist;}
  if(node.is_leaf && !exist){*childentry = 0;return exist;}

  node.key_number--;
  if(oldchildenty)i=oldchildenty;
  if(!i)
  {
    if(fa && loc)
    {
      //printf("George node:%d key:%s fakey:%s\n",n,statement.row.b,fa_node->data+(loc*16-12));
        
      memcpy(fa_node->data+(loc*16-12),node.data+4,12);
      write_node(fa,fa_node);
    }
    for(int j=i;j<=node.key_number;j++)
      memcpy(node.data+(j*16),node.data+((j+1)*16),16);
  }
  else
  {
    for(int j=i;j<=node.key_number;j++)
      if(!node.is_leaf) memcpy(node.data+(j*16-12),node.data+((j+1)*16-12),16);
      else memcpy(node.data+(j*16-16),node.data+((j+1)*16-16),16);
    if(node.is_leaf && i==1)
    {
      if(fa && loc)
      {
        //printf("George node:%d key:%s fakey:%s\n",n,statement.row.b,fa_node->data+(loc*16-12));
        memcpy(fa_node->data+(loc*16-12),node.data+4,12);
        //printf("George node:%d key:%s fakey:%s\n",n,statement.row.b,fa_node->data+(loc*16-12));
        write_node(fa,fa_node);
      }
      if(fa && !loc)
      {
        upd=1;
        memcpy(updkey,node.data+4,12);
      }
    }
  }
  
  //不想写左sibling的合并了,太麻烦了QAQ
  //算了还是写吧……
  if(fa && fa_node->key_number!=loc)//有右兄弟
  {
    if(node.key_number < DEGREE)
    {
      //printf("move!\n");
      struct NODE sb_node;
      memset(&sb_node,0,sizeof(sb_node));
      fseek(fp,node.rsibling,SEEK_SET);
      fread(&sb_node,sizeof(struct NODE),1,fp);

      if(node.key_number+sb_node.key_number < 2*DEGREE)//合并
      {
        *childentry = loc + 1;
        if(!node.is_leaf)
        {
          memcpy(node.data+(node.key_number*16+4),fa_node->data+(loc*16+4),12);
          node.key_number++;
        }
        for(int j=0;j<=sb_node.key_number;j++)
          memcpy(node.data+((node.key_number+j)*16),sb_node.data+(j*16),16);
        node.key_number += sb_node.key_number;
        if(sb_node.rsibling)//维护链表
        {
          struct NODE sbsb_node;
          memset(&sbsb_node,0,sizeof(sbsb_node));
          fseek(fp,sb_node.rsibling,SEEK_SET);
          fread(&sbsb_node,sizeof(struct NODE),1,fp);
          sbsb_node.lsibling = n;
          write_node(sb_node.rsibling,&sbsb_node);
        }//懒得处理被删掉的空间了
        node.rsibling = sb_node.rsibling;
        write_node(n,&node);
        return exist;
      }
      else //均分
      {
        if(!node.is_leaf)
        {
          int move_num=0;
          while(sb_node.key_number - node.key_number > 1)
          {
            memcpy(node.data+(node.key_number*16+4),fa_node->data+(loc*16+4),12);
            node.key_number++;
            memcpy(node.data+(node.key_number*16),sb_node.data+(move_num*16),4);
            memcpy(fa_node->data+(loc*16+4),sb_node.data+(move_num*16+4),12);
            sb_node.key_number--;
            move_num++;
          }
          for(int j=0;j<=sb_node.key_number;j++)
            memcpy(sb_node.data+(j*16),sb_node.data+((move_num+j)*16),16);
        }
        else
        {
          int move_num=0;
          while(sb_node.key_number - node.key_number > 1)
          {
            memcpy(node.data+(node.key_number*16),sb_node.data+(move_num*16),16);
            node.key_number++;
            sb_node.key_number--;
            move_num++;
          }
          for(int j=0;j<=sb_node.key_number;j++)
            memcpy(sb_node.data+(j*16),sb_node.data+((move_num+j)*16),16);
          memcpy(fa_node->data+(loc*16+4),sb_node.data+4,12);
        }
        write_node(n,&node);
        write_node(node.rsibling,&sb_node);
        write_node(fa,fa_node);
        return exist;
      }
    }    
  }
  else if(fa && loc)//有左兄弟
  {
    if(node.key_number < DEGREE)
    {
      //printf("move!\n");
      struct NODE sb_node;
      memset(&sb_node,0,sizeof(sb_node));
      fseek(fp,node.lsibling,SEEK_SET);
      fread(&sb_node,sizeof(struct NODE),1,fp);

      if(node.key_number+sb_node.key_number < 2*DEGREE)//合并
      {
        *childentry = loc;
        if(!node.is_leaf)
        {
          memcpy(sb_node.data+(sb_node.key_number*16+4),fa_node->data+(loc*16-12),12);
          sb_node.key_number++;
        }
        for(int j=0;j<=node.key_number;j++)
          memcpy(sb_node.data+((sb_node.key_number+j)*16),node.data+(j*16),16);
        sb_node.key_number += node.key_number;
        //printf("node.l:%d node:%d node.r:%d ll:%d\n",node.lsibling,n,node.rsibling,sb_node.lsibling);
        if(node.rsibling)//维护链表
        {
          struct NODE r_node;
          memset(&r_node,0,sizeof(r_node));
          fseek(fp,node.rsibling,SEEK_SET);
          fread(&r_node,sizeof(struct NODE),1,fp);
          r_node.lsibling = node.lsibling;
          write_node(node.rsibling,&r_node);
        }//懒得处理被删掉的空间了
        sb_node.rsibling = node.rsibling;
        write_node(node.lsibling,&sb_node);
        return exist;
      }
      else //均分
      {
        if(!node.is_leaf)
        {
          int move_num=(sb_node.key_number - node.key_number)/2;
          for(int j=node.key_number;j>=0;j--)
            memcpy(node.data+((j+move_num)*16),node.data+(j*16),16);
          while(move_num)
          {
            move_num--;
            memcpy(node.data+(move_num*16+4),fa_node->data+(loc*16-12),12);
            memcpy(node.data+(move_num*16),sb_node.data+(sb_node.key_number*16),4);
            node.key_number++;
            sb_node.key_number--;
            memcpy(fa_node->data+(loc*16-12),sb_node.data+(sb_node.key_number*16+4),12);
          }
        }
        else
        {
          int move_num=(sb_node.key_number - node.key_number)/2;
          for(int j=node.key_number;j>=0;j--)
            memcpy(node.data+((j+move_num)*16),node.data+(j*16),16);
          while(move_num)
          {
            move_num--;
            sb_node.key_number--;
            memcpy(node.data+(move_num*16),sb_node.data+(sb_node.key_number*16),16);
            node.key_number++;
          }
          memcpy(fa_node->data+(loc*16-12),node.data+4,12);
        }
        write_node(n,&node);
        write_node(node.lsibling,&sb_node);
        write_node(fa,fa_node);
        return exist;
      }
    }
  }
  if(node.key_number == 0)
  {
    finfo.root=char2np(node.data);
    return exist;
  }//可以用哨兵避免整个树空的发生。
  write_node(n,&node);
  return exist;
}

void tree_search(node_ptr n,node_ptr *nid,unsigned int *offset)
{
  struct NODE node;

  fseek(fp,n,SEEK_SET);
  fread(&node,sizeof(struct NODE),1,fp);
  //printf("sn:%d\n",n);
  int i;
  for(i = 0;i < node.key_number;i++ )
  {
    if(strlessequal(statement.row.b,node.data+(i*16+4)))break;
  }
  //printf(" i=%d\n",i);
  if(!node.is_leaf)
  {
    tree_search(char2np(node.data+(i*16)),nid,offset);
  }
  else
  {
    if(i==node.key_number)
    {
      *nid=node.rsibling;
      *offset=0;
    }
    else
    {
      *nid=n;
      *offset=i;
    }
  }
}


/* the key to select is stored in `statement.row.b` */
void b_tree_search() {
  /* print selected rows */
  node_ptr n;
  unsigned int offset;
  tree_search(finfo.root,&n,&offset);

  int empty=1;
  struct NODE node;
  fseek(fp,n,SEEK_SET);
  if(n)fread(&node,sizeof(struct NODE),1,fp);

  while(n && !strcmp(statement.row.b,node.data+(offset*16+4)))
  {
    empty=0;
    printf("(%d, %s)\n",char2np(node.data+(offset*16)),node.data+(offset*16+4));
    offset++;
    if(offset==node.key_number)
    {
      n=node.rsibling;
      if(!n)break;
      offset=0;
      fseek(fp,n,SEEK_SET);
      fread(&node,sizeof(struct NODE),1,fp);
    }
  }
  if(empty)
  {
    printf("(Empty)\n");
  }
  //printf("[INFO] select: %s\n", statement.row.b);
}

/* the row to insert is stored in `statement.row` */
void b_tree_insert() {
  /* insert a row */
  char tmp[12];
  tree_insert(finfo.root,tmp);
  //printf("[INFO] insert: ");
  //print_row(&statement.row);
  finfo.entry_num++;
}

/* the key to delete is stored in `statement.row.b` */
void b_tree_delete() {
  /* delete row(s) */
  struct NODE tmp;
  node_ptr tmp2;
  while(tree_delete(0,&tmp,finfo.root,0,&tmp2))
  {
    //if(statement.row.b[0]=='z')printf("delete_ok\n");
    finfo.entry_num--;
  }
  //("[INFO] delete: %s\n", statement.row.b);
}

void b_tree_traverse() {
  /* print all rows */
  memcpy(statement.row.b,"0\0\0\0\0\0\0\0\0\0\0",12);
  node_ptr n;
  unsigned int offset;
  tree_search(finfo.root,&n,&offset);

  int empty=1;
  struct NODE node;
  fseek(fp,n,SEEK_SET);
  if(n)fread(&node,sizeof(struct NODE),1,fp);
  
  //printf("George::: root:%d id:%d off:%d keynum:%d ls:%d\n",finfo.root,n,offset,node.key_number,node.lsibling);

  
  while(n && offset!=node.key_number)
  {
    empty=0;
    //printf("%d\n",n);
    printf("(%d, %s)\n",char2np(node.data+(offset*16)),node.data+(offset*16+4));
    offset++;
    if(offset==node.key_number)
    {
      n=node.rsibling;
      if(!n)break;
      offset=0;
      fseek(fp,n,SEEK_SET);
      fread(&node,sizeof(struct NODE),1,fp);
    }
  }
  //printf("num:%d\n",finfo.entry_num);
  if(empty)
  {
    printf("(Empty)\n");
  }

  //printf("[INFO] traverse\n");
}

/* logic starts */

typedef enum {
  EXECUTE_SUCCESS
} ExecuteResult;

typedef enum {
  META_COMMAND_SUCCESS,
  META_COMMAND_UNRECOGNIZED_COMMAND
} MetaCommandResult;

typedef enum {
  PREPARE_SUCCESS,
  PREPARE_NEGATIVE_VALUE,
  PREPARE_STRING_TOO_LONG,
  PREPARE_SYNTAX_ERROR,
  PREPARE_UNRECOGNIZED_STATEMENT,
  PREPARE_EMPTY_STATEMENT
} PrepareResult;

MetaCommandResult do_meta_command() {
  if (strcmp(input_buffer.buffer, ".exit") == 0) {
    exit(EXIT_SUCCESS);
  } else {
    return META_COMMAND_UNRECOGNIZED_COMMAND;
  }
}

PrepareResult prepare_insert() {
  statement.type = STATEMENT_INSERT;

  char* keyword = strtok(input_buffer.buffer, " ");
  char* a = strtok(NULL, " ");
  char* b = strtok(NULL, " ");
  int x;

  if (a == NULL || b == NULL)
    return PREPARE_SYNTAX_ERROR;

  x = atoi(a);
  if (x < 0)
    return PREPARE_NEGATIVE_VALUE;
  if (strlen(b) > COLUMN_B_SIZE)
    return PREPARE_STRING_TOO_LONG;

  statement.row.a = x;
  strcpy(statement.row.b, b);

  return PREPARE_SUCCESS;
}

PrepareResult prepare_condition() {
  statement.flag = 0;

  char* keyword = strtok(input_buffer.buffer, " ");
  char* b = strtok(NULL, " ");
  char* c = strtok(NULL, " ");

  if (b == NULL) return PREPARE_SUCCESS;
  if (c != NULL) return PREPARE_SYNTAX_ERROR;

  if (strlen(b) > COLUMN_B_SIZE)
    return PREPARE_STRING_TOO_LONG;

  strcpy(statement.row.b, b);
  statement.flag |= 2;

  return PREPARE_SUCCESS;
}

PrepareResult prepare_select() {
  statement.type = STATEMENT_SELECT;
  return prepare_condition();
}

PrepareResult prepare_delete() {
  statement.type = STATEMENT_DELETE;
  PrepareResult result = prepare_condition();
  if (result == PREPARE_SUCCESS && statement.flag == 0)
    return PREPARE_SYNTAX_ERROR;
  return result;
}

PrepareResult prepare_statement() {
  if (strlen(input_buffer.buffer) == 0) {
    return PREPARE_EMPTY_STATEMENT;
  } else if (strncmp(input_buffer.buffer, "insert", 6) == 0) {
    return prepare_insert();
  } else if (strncmp(input_buffer.buffer, "select", 6) == 0) {
    return prepare_select();
  } else if (strncmp(input_buffer.buffer, "delete", 6) == 0) {
    return prepare_delete();
  }
  return PREPARE_UNRECOGNIZED_STATEMENT;
}

ExecuteResult execute_select() {
  printf("\n");
  if (statement.flag == 0) {
    b_tree_traverse();
  } else {
    b_tree_search();
  }
  return EXECUTE_SUCCESS;
}

ExecuteResult execute_statement() {
  switch (statement.type) {
    case STATEMENT_INSERT:
      b_tree_insert();
      return EXECUTE_SUCCESS;
    case STATEMENT_SELECT:
      return execute_select();
    case STATEMENT_DELETE:
      b_tree_delete();
      return EXECUTE_SUCCESS;
  }
}

void sigint_handler(int signum) {
  printf("\n");
  exit(EXIT_SUCCESS);
}

int main(int argc, char* argv[]) {
  if (argc < 2) {
    printf("Must supply a database filename.\n");
    exit(EXIT_FAILURE);
  }

  atexit(&exit_success);
  signal(SIGINT, &sigint_handler);

  open_file(argv[1]);

  while (1) {
    print_prompt();
    switch (read_input()) {
      case INPUT_SUCCESS:
        break;
      case INPUT_TOO_LONG:
        printf("Input is too long.\n");
        continue;
    }

    if (input_buffer.buffer[0] == '.') {
      switch (do_meta_command()) {
        case META_COMMAND_SUCCESS:
          continue;
        case META_COMMAND_UNRECOGNIZED_COMMAND:
          printf("Unrecognized command '%s'.\n", input_buffer.buffer);
          continue;
      }
    }

    switch (prepare_statement()) {
      case PREPARE_SUCCESS:
        break;
      case PREPARE_EMPTY_STATEMENT:
        continue;
      case PREPARE_NEGATIVE_VALUE:
        printf("Column `a` must be positive.\n");
        continue;
      case PREPARE_STRING_TOO_LONG:
        printf("String for column `b` is too long.\n");
        continue;
      case PREPARE_SYNTAX_ERROR:
        printf("Syntax error. Could not parse statement.\n");
        continue;
      case PREPARE_UNRECOGNIZED_STATEMENT:
        printf("Unrecognized keyword at start of '%s'.\n",
               input_buffer.buffer);
        continue;
    }

    switch (execute_statement()) {
      case EXECUTE_SUCCESS:
        printf("\nExecuted.\n\n");
        break;
    }
    //printf("%d\n",finfo.entry_num);
  }
}
 类似资料: