存储

内存

  • 共享内存

  • 本地内存

  • 缓存

  • 内存上下文

  • 缓存空间管理
    数据块的缓存,减少磁盘IO,有共享缓存和进程缓存

  • Cache
    数据块之外的缓存,例如系统表

    • 系统表缓存不会缓存整个表,是以block为单位缓存?
  • 虚拟文件描述符
    系统中文件有打开的上限,使用VFD可以突破这种限制,本质上是一个LRU缓存

  • 空闲空间定位
    快速定位磁盘中的空闲空间以插入数据

  • 进程间通信 使用共享内存或者信号量通信

读取过程

  1. 从系统表中读取表的元数据信息构造元组信息
  2. 尝试从缓存读取数据
  3. 使用SMGR从磁盘读取数据到缓存中,SMGR是一个抽象层,用于实现不同存储介质的管理
  4. SMGR和存储介质之间使用VFD来管理文件描述符,以突破系统的FD限制
  • 标记删除,vacuum清理数据
  • FSM记录空闲空间

磁盘

  • 表文件

  • SMGR

  • VFD

  • FSM

select * from pg_relation_filepath(‘idx’);

Page 结构

工具汇总说明

create extension pageinspect;

get_raw_page(relname text, fork text, blkno int)
返回执行表的pagetext指定类型,默认是main,代表普通page,使用fsm或者vm查看其他类型,可以省略

page_header(page bytea)
查看page头,输入是page数组,使用上面的函数的输出作为参数

fsm_page_contents(page bytea) returns text
查看fsm页面结构
SELECT fsm_page_contents(get_raw_page('t1', 'fsm', 0));

=== HEAP相关
heap_page_items(page bytea)
查看page的具体信息

heap_tuple_infomask_flags(t_infomask integer, t_infomask2 integer) returns record
查看mask的具体含义,具体的使用方法为
SELECT * FROM heap_page_items(get_raw_page('a', 0)), LATERAL heap_tuple_infomask_flags(t_infomask, t_infomask2) WHERE t_infomask IS NOT NULL OR t_infomask2 IS NOT NULL order by lp desc;
输出为
 t_infomask2 | t_infomask | t_hoff |  t_bits  | t_oid |   t_data   |                                raw_flags                                 | combined_flags 
-------------+------------+--------+----------+-------+------------+--------------------------------------------------------------------------+----------------
           3 |       2305 |     24 | 10000000 |       | \x02000000 | {HEAP_HASNULL,HEAP_XMIN_COMMITTED,HEAP_XMAX_INVALID}                     | {}
        8195 |       1281 |     24 | 10000000 |       | \x01000000 | {HEAP_HASNULL,HEAP_XMIN_COMMITTED,HEAP_XMAX_COMMITTED,HEAP_KEYS_UPDATED} | {}
主要目的是查看数据的mask的信息


=== Btree相关
bt_metap(relname text)
查看btree元数据信息

bt_page_stats(relname text, blkno int)
查看btree page的统计信息

bt_page_items(relname text, blkno int) 或者 bt_page_items(page bytea) returns setof record
查看具体的信息,可以指定index或者直接使用page byte

typedef struct PageHeaderData
{
  /* XXX LSN is member of *any* block, not only page-organized ones */
  PageXLogRecPtr pd_lsn;        /* LSN: next byte after last byte of xlog record for last change to this page */
  uint16    pd_checksum;        /* checksum */
  uint16    pd_flags;           /* flag bits, see below */
  LocationIndex pd_lower;       /* offset to start of free space */
  LocationIndex pd_upper;       /* offset to end of free space */
  LocationIndex pd_special;     /* offset to start of special space */
  uint16 pd_pagesize_version;
  TransactionId pd_prune_xid;   /* oldest prunable XID, or zero if none */
  ItemIdData  pd_linp[FLEXIBLE_ARRAY_MEMBER]; /* line pointer array */
} PageHeaderData;

void
PageInit(Page page, Size pageSize, Size specialSize)
{
  PageHeader  p = (PageHeader) page;

  specialSize = MAXALIGN(specialSize);

  Assert(pageSize == BLCKSZ);
  Assert(pageSize > specialSize + SizeOfPageHeaderData);

  /* Make sure all fields of page are zero, as well as unused space */
  MemSet(p, 0, pageSize);

  p->pd_flags = 0;
  p->pd_lower = SizeOfPageHeaderData;
  p->pd_upper = pageSize - specialSize;
  p->pd_special = pageSize - specialSize;
  PageSetPageSizeAndVersion(page, pageSize, PG_PAGE_LAYOUT_VERSION);
  /* p->pd_prune_xid = InvalidTransactionId;    done by above MemSet */
}

大小为 pageSize,默认为8k,最开始是PageHeader,

  • 数据的变动以page为单位,不直接和存储交互,先把数据块读到缓存,然后在进行insert或者update或者delete,具体的函数有
    • heap_update
    • heap_multi_insert
    • heap_insert

tuple结构如下

struct HeapTupleHeaderData
{
  union
  {
    HeapTupleFields t_heap;
    DatumTupleFields t_datum;
  }      t_choice;

  ItemPointerData t_ctid;    /* current TID of this or newer tuple (or a * speculative insertion token) */

  /* Fields below here must match MinimalTupleData! */

#define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK2 2
  // hot相关信息,
  //   hot指的是数据在更新的时候,如果有index,会同事更新index,即使没有修改到index属性的数据,此时index中也会有一条数据的链表,为了节约空间,在满足
  //   1. 没有修改到index属性数据
  //   2. 数组修改限于在同一个page内,此时index不会有额外的数据,查找的时候从index找到最老的数据,按照数据链查找到最新数据即可
  uint16    t_infomask2;  /* number of attributes + various flags */

#define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK 3
  uint16    t_infomask;    /* various flag bits, see below */

#define FIELDNO_HEAPTUPLEHEADERDATA_HOFF 4
  uint8    t_hoff;      /* sizeof header incl. bitmap, padding */

  /* ^ - 23 bytes - ^ */

#define FIELDNO_HEAPTUPLEHEADERDATA_BITS 5
  bits8    t_bits[FLEXIBLE_ARRAY_MEMBER];  /* bitmap of NULLs */

  /* MORE DATA FOLLOWS AT END OF STRUCT */
};

page具体的操作代码在storage/page目录下,建议进行 15445 的实验,可以更好的理解page相关的操作,这里page的操作总体类似,主要是具体的数据结构和某些特定的方法需要时间进行记忆,但是大的方向上似曾相识。

RelationPutHeapTuple insert tuple到page中,根据head中的信息,计算insert的位置,其中page中的数据从首位向中心靠齐,尾部为数据,前面为offset,使用pd_lower和pd_upper进行管理,最后为柔性数组,数据在最后, 在内存总操作的时候,使用偏移可以直接定位到数据,具体使用pageinspect插件查看

调试heap_page_items函数,结合官网和源码熟悉数据的组织结构

esoye=# table a ;
 a | b 
---+---
 1 | 1
 5 | 2
 2 | 2
 7 | 7
 1 | 9
(5 rows)
esoye=# select * from page_header(get_raw_page('a', 0));
    lsn    | checksum | flags | lower | upper | special | pagesize | version | prune_xid 
-----------+----------+-------+-------+-------+---------+----------+---------+-----------
 0/5BD92B8 |        0 |     0 |    64 |  7872 |    8192 |     8192 |       4 |       788
(1 row)
select * from heap_page_items(get_raw_page('a', 0));
 lp | lp_off | lp_flags | lp_len | t_xmin | t_xmax | t_field3 | t_ctid | t_infomask2 | t_infomask | t_hoff | t_bits | t_oid |       t_data       
----+--------+----------+--------+--------+--------+----------+--------+-------------+------------+--------+--------+-------+--------------------
  1 |   8160 |        1 |     32 |    787 |      0 |        0 | (0,1)  |           2 |       2304 |     24 |        |       | \x0100000001000000
  2 |   8128 |        1 |     32 |    788 |    788 |        0 | (0,3)  |           2 |       1312 |     24 |        |       | \x0200000002000000
  3 |   8096 |        1 |     32 |    788 |    788 |        1 | (0,4)  |           2 |       9504 |     24 |        |       | \x0300000002000000
  4 |   8064 |        1 |     32 |    788 |    788 |        2 | (0,5)  |           2 |       9504 |     24 |        |       | \x0400000002000000
  5 |   8032 |        1 |     32 |    788 |      0 |        3 | (0,5)  |           2 |      10496 |     24 |        |       | \x0500000002000000
  6 |   8000 |        1 |     32 |    791 |      0 |        0 | (0,6)  |           2 |       2304 |     24 |        |       | \x0200000002000000
  7 |   7968 |        1 |     32 |    792 |    792 |        0 | (0,8)  |           2 |        544 |     24 |        |       | \x0400000004000000
  8 |   7936 |        1 |     32 |    792 |      0 |        1 | (0,8)  |           2 |      10752 |     24 |        |       | \x0600000004000000
  9 |   7904 |        1 |     32 |    793 |      0 |        0 | (0,9)  |           2 |       2304 |     24 |        |       | \x0700000007000000
 10 |   7872 |        1 |     32 |    794 |      0 |        0 | (0,10) |           2 |       2048 |     24 |        |       | \x0100000009000000
(10 rows)

pageinspect插件get_raw_page直接获得指定page的数据,然后在heap_page_items函数中解读page
page结构如图
tuple结构如图

首先是PageHeader,大小为24,在函数page_header中直接获取到page之后,进行强转,然后读取header中的数据即可,其中lsn和wal有关,后面需要调研机制,prune_xid和数据剪枝有关,后面调研。再然后在page得最后由部分数据和index有关,这里不关注
然后是具体的数据组织,在header之后,开始数据的存储,数据使用index定位偏移,实际的数据从尾部进行增长。定位数据偏移的数据结构为ItemId

typedef struct ItemIdData
{
    unsigned    lp_off:15,        /* offset to tuple (from start of page) */
                lp_flags:2,       /* state of line pointer, see below */
                lp_len:15;        /* byte length of tuple */
} ItemIdData;

lp_off是实际的数据的偏移的位置,lp_flags表示数据状态,例如死元组,或者重定位的数据,一般是1,表示LP_NORMAL,lp_len是数据的长度,具体案例见上面的例子

Datum heap_page_items(PG_FUNCTION_ARGS) {
  bytea *raw_page = PG_GETARG_BYTEA_P(0);
  heap_page_items_state *inter_call_data = NULL;
  FuncCallContext *fctx;
  int raw_page_size;

  if (!superuser())
    ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to use raw page functions")));

  raw_page_size = VARSIZE(raw_page) - VARHDRSZ;

  if (SRF_IS_FIRSTCALL()) {
    TupleDesc tupdesc;
    MemoryContext mctx;

    if (raw_page_size < SizeOfPageHeaderData)
      ereport(ERROR,
              (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("input page too small (%d bytes)", raw_page_size)));

    fctx = SRF_FIRSTCALL_INIT();
    mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);

    inter_call_data = palloc(sizeof(heap_page_items_state));

    /* Build a tuple descriptor for our result type */
    if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
      elog(ERROR, "return type must be a row type");

    inter_call_data->tupd = tupdesc;

    inter_call_data->offset = FirstOffsetNumber;
    inter_call_data->page = VARDATA(raw_page);

    fctx->max_calls = PageGetMaxOffsetNumber(inter_call_data->page);
    fctx->user_fctx = inter_call_data;

    MemoryContextSwitchTo(mctx);
  }

  fctx = SRF_PERCALL_SETUP();
  inter_call_data = fctx->user_fctx;

  if (fctx->call_cntr < fctx->max_calls) {
    Page page = inter_call_data->page;
    HeapTuple resultTuple;
    Datum result;
    ItemId id;
    Datum values[14];
    bool nulls[14];
    uint16 lp_offset;
    uint16 lp_flags;
    uint16 lp_len;

    memset(nulls, 0, sizeof(nulls));

    /* Extract information from the line pointer */

    id = PageGetItemId(page, inter_call_data->offset);

    lp_offset = ItemIdGetOffset(id);
    lp_flags = ItemIdGetFlags(id);
    lp_len = ItemIdGetLength(id);

    values[0] = UInt16GetDatum(inter_call_data->offset);
    values[1] = UInt16GetDatum(lp_offset);
    values[2] = UInt16GetDatum(lp_flags);
    values[3] = UInt16GetDatum(lp_len);

    /*
     * We do just enough validity checking to make sure we don't reference
     * data outside the page passed to us. The page could be corrupt in
     * many other ways, but at least we won't crash.
     */
    if (ItemIdHasStorage(id) && lp_len >= MinHeapTupleSize && lp_offset == MAXALIGN(lp_offset) &&
        lp_offset + lp_len <= raw_page_size) {
      HeapTupleHeader tuphdr;
      bytea *tuple_data_bytea;
      int tuple_data_len;

      /* Extract information from the tuple header */

      // 由id定位到具体的tuple,然后进行读取,这里需要对照tuple的数据结构
      tuphdr = (HeapTupleHeader)PageGetItem(page, id);

      values[4] = UInt32GetDatum(HeapTupleHeaderGetRawXmin(tuphdr));
      values[5] = UInt32GetDatum(HeapTupleHeaderGetRawXmax(tuphdr));
      /* shared with xvac */
      values[6] = UInt32GetDatum(HeapTupleHeaderGetRawCommandId(tuphdr));
      // 数据的具体位置,使用此字段可以直接定位一个tuple的位置
      values[7] = PointerGetDatum(&tuphdr->t_ctid);
      values[8] = UInt32GetDatum(tuphdr->t_infomask2);
      values[9] = UInt32GetDatum(tuphdr->t_infomask);
      values[10] = UInt8GetDatum(tuphdr->t_hoff);

      /* Copy raw tuple data into bytea attribute */
      tuple_data_len = lp_len - tuphdr->t_hoff;
      tuple_data_bytea = (bytea *)palloc(tuple_data_len + VARHDRSZ);
      SET_VARSIZE(tuple_data_bytea, tuple_data_len + VARHDRSZ);
      memcpy(VARDATA(tuple_data_bytea), (char *)tuphdr + tuphdr->t_hoff, tuple_data_len);
      values[13] = PointerGetDatum(tuple_data_bytea);

      /*
       * We already checked that the item is completely within the raw
       * page passed to us, with the length given in the line pointer.
       * Let's check that t_hoff doesn't point over lp_len, before using
       * it to access t_bits and oid.
       */
      if (tuphdr->t_hoff >= SizeofHeapTupleHeader && tuphdr->t_hoff <= lp_len &&
          tuphdr->t_hoff == MAXALIGN(tuphdr->t_hoff)) {
        if (tuphdr->t_infomask & HEAP_HASNULL) {
          int bits_len;

          bits_len = BITMAPLEN(HeapTupleHeaderGetNatts(tuphdr)) * BITS_PER_BYTE;
          values[11] = CStringGetTextDatum(bits_to_text(tuphdr->t_bits, bits_len));
        } else
          nulls[11] = true;

        if (tuphdr->t_infomask & HEAP_HASOID_OLD)
          values[12] = HeapTupleHeaderGetOidOld(tuphdr);
        else
          nulls[12] = true;
      } else {
        nulls[11] = true;
        nulls[12] = true;
      }
    } else {
      /*
       * The line pointer is not used, or it's invalid. Set the rest of
       * the fields to NULL
       */
      int i;

      for (i = 4; i <= 13; i++) nulls[i] = true;
    }

    /* Build and return the result tuple. */
    resultTuple = heap_form_tuple(inter_call_data->tupd, values, nulls);
    result = HeapTupleGetDatum(resultTuple);

    inter_call_data->offset++;

    SRF_RETURN_NEXT(fctx, result);
  } else
    SRF_RETURN_DONE(fctx);
}

index

btree

select * from bt_metap('idx');
select * from bt_page_stats('idx',3);
SELECT * FROM bt_page_items('idx', 3);
  • Implement an algorithm for building the index and map the data into pages (for the buffer cache manager to uniformly process each index).
  • Search information in the index by a predicate in the form “indexed-field operator expression”.
  • Evaluate the index usage cost.
  • Manipulate the locks required for correct parallel processing.
  • Generate write-ahead log (WAL) records.

btree

  1. 树节点默认8k,按照page的方式进行组织,几乎和heap的组织格式类似,尾部有一个特殊区域,常规page中也有,但是并未启用
typedef struct BTPageOpaqueData
{
    BlockNumber btpo_prev;        /* left sibling, or P_NONE if leftmost */
    BlockNumber btpo_next;        /* right sibling, or P_NONE if rightmost */
    uint32      btpo_level;       /* tree level --- zero for leaf pages */
    uint16      btpo_flags;       /* flag bits, see below */
    BTCycleId   btpo_cycleid;     /* vacuum cycle ID of latest split */
} BTPageOpaqueData;
#define BTP_LEAF              (1 << 0)    /* leaf page, i.e. not internal page */
#define BTP_ROOT              (1 << 1)    /* root page (has no parent) */
#define BTP_DELETED           (1 << 2)    /* page has been deleted from tree */
#define BTP_META              (1 << 3)    /* meta-page */
#define BTP_HALF_DEAD         (1 << 4)    /* empty, but still in tree */
#define BTP_SPLIT_END         (1 << 5)    /* rightmost page of split group */
#define BTP_HAS_GARBAGE       (1 << 6)    /* page has LP_DEAD tuples (deprecated) */
#define BTP_INCOMPLETE_SPLIT  (1 << 7)    /* right sibling's downlink is missing */
#define BTP_HAS_FULLXID       (1 << 8)    /* contains BTDeletedPageData */

主要保存当前page的prev和next,以及level和flag,flag状态如上,此项可以使用bt_page_stats进行观察

  1. 实现来自于一篇古老的论文,添加额外的特性使得他获得更高的并发度。和常规Btree区别是

  2. 每层兄弟节点之间使用指针相连

  3. 每层非最右节点保存当前节点的最大值
    1. 在查找得时候,如果已经查找到子节点,此时表示数据一定在当前的page中,但是假如在查找的过程中page分裂了,则此时可以快速使用high key 进行判断,此时在无需对整个树加锁

  4. 假设一个indextuple大小为16,则完整表示一个item的空间大小为(16 + 4),再加上header和special计算得一个8k的page大约可以保存407条记录,可以简单的测试一下,最终page的free space 的大小为8

create table a (a int, b int);
create index idx on a(b);
insert into a select g, random() * 100000, g%999 from generate_series(1, 400) g;
esoye=# select * from bt_metap('idx');
 magic  | version | root | level | fastroot | fastlevel | last_cleanup_num_delpages | last_cleanup_num_tuples | allequalimage 
--------+---------+------+-------+----------+-----------+---------------------------+-------------------------+---------------
 340322 |       4 |    3 |     1 |        1 |         0 |                         0 |                      -1 | t
(1 row)

esoye=# SELECT * FROM bt_page_items('idx', 3);
 itemoffset | ctid  | itemlen | nulls | vars | data | dead | htid | tids 
------------+-------+---------+-------+------+------+------+------+------
          1 | (1,0) |       8 | f     | f    |      |      |      | 
(1 row)
esoye=# select * from bt_page_stats('idx',1);
 blkno | type | live_items | dead_items | avg_item_size | page_size | free_size | btpo_prev | btpo_next | btpo_level | btpo_flags 
-------+------+------------+------------+---------------+-----------+-----------+-----------+-----------+------------+------------
     1 | l    |        400 |          0 |            16 |      8192 |       148 |         0 |         0 |          0 |          1
(1 row)

index function

pg中所有的数据类型,或者函数,索引等都可以使用元数据管理,感觉他在底层是搭了一个架子,其他东西都是架子上的工具

所有的index都有一个对应的handle,btree对应的是bthandler,内部把btree的关键的操作函数指针赋给IndexAmRoutine,

Datum
bthandler(PG_FUNCTION_ARGS)
{
    IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

    amroutine->amstrategies = BTMaxStrategyNumber;
    amroutine->amsupport = BTNProcs;
    amroutine->amoptsprocnum = BTOPTIONS_PROC;
    amroutine->amcanorder = true;
    amroutine->amcanorderbyop = false;
    amroutine->amcanbackward = true;
    amroutine->amcanunique = true;
    amroutine->amcanmulticol = true;
    amroutine->amoptionalkey = true;
    amroutine->amsearcharray = true;
    amroutine->amsearchnulls = true;
    amroutine->amstorage = false;
    amroutine->amclusterable = true;
    amroutine->ampredlocks = true;
    amroutine->amcanparallel = true;
    amroutine->amcaninclude = true;
    amroutine->amusemaintenanceworkmem = false;
    amroutine->amparallelvacuumoptions =
        VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
    amroutine->amkeytype = InvalidOid;

    amroutine->ambuild = btbuild;
    amroutine->ambuildempty = btbuildempty;
    amroutine->aminsert = btinsert;
    amroutine->ambulkdelete = btbulkdelete;
    amroutine->amvacuumcleanup = btvacuumcleanup;
    amroutine->amcanreturn = btcanreturn;
    amroutine->amcostestimate = btcostestimate;
    amroutine->amoptions = btoptions;
    amroutine->amproperty = btproperty;
    amroutine->ambuildphasename = btbuildphasename;
    amroutine->amvalidate = btvalidate;
    amroutine->amadjustmembers = btadjustmembers;
    amroutine->ambeginscan = btbeginscan;
    amroutine->amrescan = btrescan;
    amroutine->amgettuple = btgettuple;
    amroutine->amgetbitmap = btgetbitmap;
    amroutine->amendscan = btendscan;
    amroutine->ammarkpos = btmarkpos;
    amroutine->amrestrpos = btrestrpos;
    amroutine->amestimateparallelscan = btestimateparallelscan;
    amroutine->aminitparallelscan = btinitparallelscan;
    amroutine->amparallelrescan = btparallelrescan;

    PG_RETURN_POINTER(amroutine);
}

其他index和表类似,都有自己的接口

  1. create

建立index

   a    |   b    |  c   |   ctid    
--------+--------+------+-----------
  19928 |      0 | 947  | (107,133)
    883 |      1 | 883  | (4,143)
  18064 |      1 | 82   | (97,119)
  89238 |      2 | 327  | (482,68)
  96552 |      3 | 648  | (521,167)
  96831 |      3 | 927  | (523,76)
  22937 |      6 | 959  | (123,182)
  59675 |      6 | 734  | (322,105)
  76742 |     10 | 818  | (414,152)
  42563 |     11 | 605  | (230,13)
  53421 |     11 | 474  | (288,141)
  41441 |     12 | 482  | (224,1)
  88531 |     12 | 619  | (478,101)
  14010 |     17 | 24   | (75,135)
  57127 |     17 | 184  | (308,147)
  31710 |     19 | 741  | (171,75)
  58259 |     19 | 317  | (314,169)
  61877 |     19 | 938  | (334,87)
  20741 |     22 | 761  | (112,21)
  83730 |     22 | 813  | (452,110)
  92344 |     22 | 436  | (499,29)
  18195 |     24 | 213  | (98,65)
  54456 |     26 | 510  | (294,66)
  58764 |     26 | 822  | (317,119)
   1742 |     27 | 743  | (9,77)
  98925 |     27 | 24   | (534,135)
  29363 |     29 | 392  | (158,133)
   6564 |     30 | 570  | (35,89)
  63464 |     32 | 527  | (343,9)
  65040 |     33 | 105  | (351,105)
   1598 |     37 | 599  | (8,118)
  53945 |     39 | 998  | (291,110)

https://www.interdb.jp/pg/pgsql01.html

  • TOAST
    大字段数据,在字段数据大于2k的时候,会触发相应的机制,把数据按2k切分,存储到TOAST表中,原表使用专门的指针指向数据

brin

块级索引,空间占用小且有效

SMGR


/*
 * This struct of function pointers defines the API between smgr.c and
 * any individual storage manager module.  Note that smgr subfunctions are
 * generally expected to report problems via elog(ERROR).  An exception is
 * that smgr_unlink should use elog(WARNING), rather than erroring out,
 * because we normally unlink relations during post-commit/abort cleanup,
 * and so it's too late to raise an error.  Also, various conditions that
 * would normally be errors should be allowed during bootstrap and/or WAL
 * recovery --- see comments in md.c for details.
 */
typedef struct f_smgr
{
  void    (*smgr_init) (void);  /* may be NULL */
  void    (*smgr_shutdown) (void);  /* may be NULL */
  void    (*smgr_open) (SMgrRelation reln);
  void    (*smgr_close) (SMgrRelation reln, ForkNumber forknum);
  void    (*smgr_create) (SMgrRelation reln, ForkNumber forknum,
                bool isRedo);
  bool    (*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
  void    (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum,
                bool isRedo);
  void    (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
                BlockNumber blocknum, char *buffer, bool skipFsync);
  bool    (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
                  BlockNumber blocknum);
  void    (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
                BlockNumber blocknum, char *buffer);
  void    (*smgr_write) (SMgrRelation reln, ForkNumber forknum,
                 BlockNumber blocknum, char *buffer, bool skipFsync);
  void    (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
                   BlockNumber blocknum, BlockNumber nblocks);
  BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
  void    (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
                  BlockNumber nblocks);
  void    (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
} f_smgr;

static const f_smgr smgrsw[] = {
  /* magnetic disk */
  {
    .smgr_init = mdinit,
    .smgr_shutdown = NULL,
    .smgr_open = mdopen,
    .smgr_close = mdclose,
    .smgr_create = mdcreate,
    .smgr_exists = mdexists,
    .smgr_unlink = mdunlink,
    .smgr_extend = mdextend,
    .smgr_prefetch = mdprefetch,
    .smgr_read = mdread,
    .smgr_write = mdwrite,
    .smgr_writeback = mdwriteback,
    .smgr_nblocks = mdnblocks,
    .smgr_truncate = mdtruncate,
    .smgr_immedsync = mdimmedsync,
  }
};

操作不同介质的文件抽象层,在f_smgr中定义了操作的接口,当前默认实现为smgrsw中的对磁盘操作的函数,理论上支持其他存储介质,只需要实现对应的接口即可,当前只是作为一个简单的中转,具体的文件操作在smgr/md.c

VFD

使用LRU缓存维护的fd,管理打开的文件描述符,主要代码在file/fd.c中,主要结构为

typedef struct vfd
{
  int      fd;        /* current FD, or VFD_CLOSED if none */
  unsigned short fdstate;    /* bitflags for VFD's state */
  ResourceOwner resowner;    /* owner, for automatic cleanup */
  File    nextFree;    /* link to next free VFD, if in freelist */
  File    lruMoreRecently;  /* doubly linked recency-of-use list */
  File    lruLessRecently;
  off_t    fileSize;    /* current size of file (0 if not temporary) */
  char     *fileName;    /* name of file, or NULL for unused VFD */
  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
  int      fileFlags;    /* open(2) flags for (re)opening the file */
  mode_t    fileMode;    /* mode to pass to open(2) */
} Vfd;

FSM

free space map,page中的数据在删除且vacuum之后,会有数据空洞,此时为了节约空间,在后续insert的时候,会尝试把数据insert到之前的page中,但是遍历page查找空间类似全表扫,所以为了加快这个过程,使用额外的数据结构记录空闲page的大小,insert的时候直接定位page

  1. 空闲空间不是精确统计,page默认大小为8k,把8k划分为255份,一份大小为32字节,此时使用一个字节就可以记录page中大约空闲的空间,
  2. 为了快速定位,fsm使用二叉树来维护,一个page大小为8192,除了header剩下的空间约为8000字节,使用叶子节点标记page大小,此时一个page大约可以记录4000个page,
  3. 一般使用三层fsm,此时前两层为辅助块,最后层page数目为4000^3,完全可以记录所有的数据块的使用情况,所以最初的时候fsm文件大小为 8192* 3,此时只使用到3个块记录大小,后期数据扩展的时候,此时fsm文件大小也会进行扩展

It is important to keep the map small so that it can be searched rapidly. Therefore, we don’t attempt to record the exact free space on a page. We allocate one map byte to each page, allowing us to record free space at a granularity of 1/256th of a page. Another way to say it is that the stored value is the free space divided by BLCKSZ/256 (rounding down). We assume that the free space must always be less than BLCKSZ, since all pages have some overhead; so the maximum map value is 255.

The binary tree is stored on each FSM page as an array. Because the page header takes some space on a page, the binary tree isn’t perfect. That is, a few right-most leaf nodes are missing, and there are some useless non-leaf nodes at the right. So the tree looks something like this:

       0
   1       2
 3   4   5   6
7 8 9 A B
/*
 * Structure of a FSM page. See src/backend/storage/freespace/README for
 * details.
 */
typedef struct
{
  /*
   * fsm_search_avail() tries to spread the load of multiple backends by
   * returning different pages to different backends in a round-robin
   * fashion. fp_next_slot points to the next slot to be returned (assuming
   * there's enough space on it for the request). It's defined as an int,
   * because it's updated without an exclusive lock. uint16 would be more
   * appropriate, but int is more likely to be atomically
   * fetchable/storable.
   */
  int      fp_next_slot;

  /*
   * fp_nodes contains the binary tree, stored in array. The first
   * NonLeafNodesPerPage elements are upper nodes, and the following
   * LeafNodesPerPage elements are leaf nodes. Unused nodes are zero.
   */
  uint8    fp_nodes[FLEXIBLE_ARRAY_MEMBER];
} FSMPageData;

fp_next_slot记录下一此使用的节点,目的是

  1. 在多进程写同一个表时候,避免对同一个page的竞争
  2. 记录下一个page,在顺序读写的时候可以有更高的性能

具体的查找算法为


  /*----------
   * Start the search from the target slot.  At every step, move one
   * node to the right, then climb up to the parent.  Stop when we reach
   * a node with enough free space (as we must, since the root has enough
   * space).
   *
   * The idea is to gradually expand our "search triangle", that is, all
   * nodes covered by the current node, and to be sure we search to the
   * right from the start point.  At the first step, only the target slot
   * is examined.  When we move up from a left child to its parent, we are
   * adding the right-hand subtree of that parent to the search triangle.
   * When we move right then up from a right child, we are dropping the
   * current search triangle (which we know doesn't contain any suitable
   * page) and instead looking at the next-larger-size triangle to its
   * right.  So we never look left from our original start point, and at
   * each step the size of the search triangle doubles, ensuring it takes
   * only log2(N) work to search N pages.
   *
   * The "move right" operation will wrap around if it hits the right edge
   * of the tree, so the behavior is still good if we start near the right.
   * Note also that the move-and-climb behavior ensures that we can't end
   * up on one of the missing nodes at the right of the leaf level.
   *
   * For example, consider this tree:
   *
   *       7
   *     7     6
   *   5   7   6   5
   *  4 5 5 7 2 6 5 2
   *        T
   *
   * Assume that the target node is the node indicated by the letter T,
   * and we're searching for a node with value of 6 or higher. The search
   * begins at T. At the first iteration, we move to the right, then to the
   * parent, arriving at the rightmost 5. At the second iteration, we move
   * to the right, wrapping around, then climb up, arriving at the 7 on the
   * third level.  7 satisfies our search, so we descend down to the bottom,
   * following the path of sevens.  This is in fact the first suitable page
   * to the right of (allowing for wraparound) our start point.
   *----------
   */

结合代码理解

  1. 先判断root节点
  2. 从fp_next_slot或者midel node开始,向右上查找,直到找到复合的node
  3. 从这里开始向下查找

代码就是简单的对二叉树的操作,所有代码在freespace/fsmpage.c中,vacuum的时候会触发sfm的操作,具体的代码在freespce/freespace.c中。

VM

可见性映射表,记录数据变动的page,pg支持多版本,在数据变动的时候不会立即清除数据,而指挥打上tag,等待后续的vacuum进程进行数据的清理,vm记录数据的变动,让vacuum可以快速的清理数据,vacuum有两种模式,一种是lazy vacuum,一种是full vacuum,lazy 的时候不会跨page清理,此时可以使用vm文件,但是full vacuum的时候一般需要全表扫描,基本不会有太大的最用, vm是简单的bit位,0代表有数据变动,

内存

内存上下文,简单的理解就是一定范围内的内存池?

https://smartkeyerror.com/PostgreSQL-MemoryContext

insert

堆表insert操作

  1. 对应函数为heap_insert 1. heap_prepare_insert 设置 tuple header 2. RelationGetBufferForTuple 获取可用buffer 3. RelationPutHeapTuple 进行 insert 4. 设置 xlog 信息

insert 中可能会有 page 看见不足等问题,需要进行扩展,主要在 RelationGetBufferForTuple 函数中

  RelationGetBufferForTuple:
    1. insert 会优先使用上次 insert 操作的 page
      如果空间足够,则
        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        pageFreeSpace = PageGetHeapFreeSpace(page);
        if (targetFreeSpace <= pageFreeSpace)
          RelationSetTargetBlock(relation, targetBlock);
          return buffer;
      否则从 FSM 中寻找一个空间足够的页面
        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        pageFreeSpace = PageGetHeapFreeSpace(page);
        if (targetFreeSpace <= pageFreeSpace) // false
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        targetBlock = RecordAndGetPageWithFreeSpace()
      如果 FSM 中没有,则进行扩展,扩展时区分并发和非并发,没有并发的时候
        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        pageFreeSpace = PageGetHeapFreeSpace(page);
        if (targetFreeSpace <= pageFreeSpace) // false
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        targetBlock = RecordAndGetPageWithFreeSpace()
        if (!ConditionalLockRelationForExtension(relation, ExclusiveLock)) // 表加排他锁成功
        buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);// 扩展
        page = BufferGetPage(buffer);
        PageInit(page, BufferGetPageSize(buffer), 0);
        MarkBufferDirty(buffer);
        RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer));
      如果有并发,则会在表加锁的时候尝试失败,然后进行bulk 扩展过程
        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        pageFreeSpace = PageGetHeapFreeSpace(page);
        if (targetFreeSpace <= pageFreeSpace) // false
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        targetBlock = RecordAndGetPageWithFreeSpace()
        if (!ConditionalLockRelationForExtension(relation, ExclusiveLock)) // 表加排他锁 失败
          LockRelationForExtension(relation, ExclusiveLock);
          targetBlock = GetPageWithFreeSpace(relation, targetFreeSpace);
          if (targetBlock != InvalidBlockNumber)
            UnlockRelationForExtension(relation, ExclusiveLock);
            return
          RelationAddExtraBlocks(relation, bistate);
        buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);// 扩展
        page = BufferGetPage(buffer);
        PageInit(page, BufferGetPageSize(buffer), 0);
        MarkBufferDirty(buffer);
        RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer));

  RelationAddExtraBlocks 批量扩展,大小为 min(512, lockWaiters * 20)
  ```