PostgreSQL 源码解读(200)- PG 12 Pluggable storage for tables接口浅析

赵兴朝
2023-12-01

PostgreSQL 12 beta 1 已于近期发布,此版本包含了众多新特性,其中可插拔表存储接口允许创建和使用不同的表存储方式,该特性的详细描述如下:
Pluggable storage for tables
PostgreSQL 12 引入了可插入表存储接口,允许创建和使用不同的表存储方法。可以使用 CREATE ACCESS METHOD 命令将新的访问方法添加到 PostgreSQL 集群,然后将其添加到 CREATE TABLE 上具有新 USING 子句的表中。可以通过创建新的表访问方法来定义表存储接口。在 PostgreSQL 12 中,默认使用的存储接口是堆访问方法,它目前是唯一的内置方法。

本节简单探究了“Pluggable storage for tables”的实现。

一、源码解读

在创建/初始化relation时,指定access method,相应的源文件为relcache.c
relcache.c
1.在RelationBuildLocalRelation方法中,调用RelationInitTableAccessMethod初始化Table Access Method


Relation
RelationBuildLocalRelation(const char *relname,
                           Oid relnamespace,
                           TupleDesc tupDesc,
                           Oid relid,
                           Oid accessmtd,
                           Oid relfilenode,
                           Oid reltablespace,
                           bool shared_relation,
                           bool mapped_relation,
                           char relpersistence,
                           char relkind)
{
    ...
    if (relkind == RELKIND_RELATION ||
        relkind == RELKIND_SEQUENCE ||
        relkind == RELKIND_TOASTVALUE ||
        relkind == RELKIND_MATVIEW)
        RelationInitTableAccessMethod(rel);
    ...
}
/*
 * Initialize table access method support for a table like relation
 * 初始化表访问方法
 */
void
RelationInitTableAccessMethod(Relation relation)
{
    HeapTuple   tuple;
    Form_pg_am  aform;
    if (relation->rd_rel->relkind == RELKIND_SEQUENCE)//序列号
    {
        /*
         * Sequences are currently accessed like heap tables, but it doesn't
         * seem prudent to show that in the catalog. So just overwrite it
         * here.
         */
        //设置access method handler
        relation->rd_amhandler = HEAP_TABLE_AM_HANDLER_OID;
    }
    else if (IsCatalogRelation(relation))//系统表
    {
        /*
         * Avoid doing a syscache lookup for catalog tables.
         */
        Assert(relation->rd_rel->relam == HEAP_TABLE_AM_OID);
        relation->rd_amhandler = HEAP_TABLE_AM_HANDLER_OID;
    }
    else//其他
    {
        /*
         * Look up the table access method, save the OID of its handler
         * function.
         */
        Assert(relation->rd_rel->relam != InvalidOid);
        tuple = SearchSysCache1(AMOID,
                                ObjectIdGetDatum(relation->rd_rel->relam));
        if (!HeapTupleIsValid(tuple))
            elog(ERROR, "cache lookup failed for access method %u",
                 relation->rd_rel->relam);
        aform = (Form_pg_am) GETSTRUCT(tuple);
        relation->rd_amhandler = aform->amhandler;
        ReleaseSysCache(tuple);
    }
    /*
     * Now we can fetch the table AM's API struct
     */
    InitTableAmRoutine(relation);
}
/*
 * Fill in the TableAmRoutine for a relation
 *
 * relation's rd_amhandler must be valid already.
 */
static void
InitTableAmRoutine(Relation relation)
{
    relation->rd_tableam = GetTableAmRoutine(relation->rd_amhandler);
}

2.在formrdesc方法中,设置relation的访问方法(relation->rd_tableam)


static void
formrdesc(const char *relationName, Oid relationReltype,
          bool isshared,
          int natts, const FormData_pg_attribute *attrs)
{
    ...
    /*
     * initialize the table am handler
     */
    relation->rd_rel->relam = HEAP_TABLE_AM_OID;
    relation->rd_tableam = GetHeapamTableAmRoutine();
    ...
}

GetHeapamTableAmRoutine方法在源文件heapam_handler.c中
heapam_handler.c
方法定义如下


...
const TableAmRoutine *
GetHeapamTableAmRoutine(void)
{
    return &heapam_methods;
}
Datum
heap_tableam_handler(PG_FUNCTION_ARGS)
{
    PG_RETURN_POINTER(&heapam_methods);
}

heapam_methods是TableAmRoutine结构体,定义了一套heap access method,函数指针指向heap_XXX函数.


/* ------------------------------------------------------------------------
 * Definition of the heap table access method.
 * ------------------------------------------------------------------------
 */
static const TableAmRoutine heapam_methods = {
    .type = T_TableAmRoutine,
    .slot_callbacks = heapam_slot_callbacks,
    .scan_begin = heap_beginscan,
    .scan_end = heap_endscan,
    .scan_rescan = heap_rescan,
    .scan_getnextslot = heap_getnextslot,
    .parallelscan_estimate = table_block_parallelscan_estimate,
    .parallelscan_initialize = table_block_parallelscan_initialize,
    .parallelscan_reinitialize = table_block_parallelscan_reinitialize,
    .index_fetch_begin = heapam_index_fetch_begin,
    .index_fetch_reset = heapam_index_fetch_reset,
    .index_fetch_end = heapam_index_fetch_end,
    .index_fetch_tuple = heapam_index_fetch_tuple,
    .tuple_insert = heapam_tuple_insert,
    .tuple_insert_speculative = heapam_tuple_insert_speculative,
    .tuple_complete_speculative = heapam_tuple_complete_speculative,
    .multi_insert = heap_multi_insert,
    .tuple_delete = heapam_tuple_delete,
    .tuple_update = heapam_tuple_update,
    .tuple_lock = heapam_tuple_lock,
    .finish_bulk_insert = heapam_finish_bulk_insert,
    .tuple_fetch_row_version = heapam_fetch_row_version,
    .tuple_get_latest_tid = heap_get_latest_tid,
    .tuple_tid_valid = heapam_tuple_tid_valid,
    .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot,
    .compute_xid_horizon_for_tuples = heap_compute_xid_horizon_for_tuples,
    .relation_set_new_filenode = heapam_relation_set_new_filenode,
    .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate,
    .relation_copy_data = heapam_relation_copy_data,
    .relation_copy_for_cluster = heapam_relation_copy_for_cluster,
    .relation_vacuum = heap_vacuum_rel,
    .scan_analyze_next_block = heapam_scan_analyze_next_block,
    .scan_analyze_next_tuple = heapam_scan_analyze_next_tuple,
    .index_build_range_scan = heapam_index_build_range_scan,
    .index_validate_scan = heapam_index_validate_scan,
    .relation_size = heapam_relation_size,
    .relation_estimate_size = heapam_estimate_rel_size,
    .scan_bitmap_next_block = heapam_scan_bitmap_next_block,
    .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple,
    .scan_sample_next_block = heapam_scan_sample_next_block,
    .scan_sample_next_tuple = heapam_scan_sample_next_tuple
};

TableAmRoutine在源文件tableam.h中定义
tableam.h
TableAmRoutine结构体封装了table access method,如需自定义storage接口,则需实现(部分)该结构体中定义的函数,在创建表时指定自定义的存储引擎,指向自定义的access method.


/*
 * API struct for a table AM.  Note this must be allocated in a
 * server-lifetime manner, typically as a static const struct, which then gets
 * returned by FormData_pg_am.amhandler.
 *
 * In most cases it's not appropriate to call the callbacks directly, use the
 * table_* wrapper functions instead.
 *
 * GetTableAmRoutine() asserts that required callbacks are filled in, remember
 * to update when adding a callback.
 */
typedef struct TableAmRoutine
{
    /* this must be set to T_TableAmRoutine */
    NodeTag     type;
    /* ------------------------------------------------------------------------
     * Slot related callbacks.
     * ------------------------------------------------------------------------
     */
    /*
     * Return slot implementation suitable for storing a tuple of this AM.
     */
    const TupleTableSlotOps *(*slot_callbacks) (Relation rel);
    /* ------------------------------------------------------------------------
     * Table scan callbacks.
     * ------------------------------------------------------------------------
     */
    /*
     * Start a scan of `rel`.  The callback has to return a TableScanDesc,
     * which will typically be embedded in a larger, AM specific, struct.
     *
     * If nkeys != 0, the results need to be filtered by those scan keys.
     *
     * pscan, if not NULL, will have already been initialized with
     * parallelscan_initialize(), and has to be for the same relation. Will
     * only be set coming from table_beginscan_parallel().
     *
     * `flags` is a bitmask indicating the type of scan (ScanOptions's
     * SO_TYPE_*, currently only one may be specified), options controlling
     * the scan's behaviour (ScanOptions's SO_ALLOW_*, several may be
     * specified, an AM may ignore unsupported ones) and whether the snapshot
     * needs to be deallocated at scan_end (ScanOptions's SO_TEMP_SNAPSHOT).
     */
    TableScanDesc (*scan_begin) (Relation rel,
                                 Snapshot snapshot,
                                 int nkeys, struct ScanKeyData *key,
                                 ParallelTableScanDesc pscan,
                                 uint32 flags);
    /*
     * Release resources and deallocate scan. If TableScanDesc.temp_snap,
     * TableScanDesc.rs_snapshot needs to be unregistered.
     */
    void        (*scan_end) (TableScanDesc scan);
    /*
     * Restart relation scan.  If set_params is set to true, allow_{strat,
     * sync, pagemode} (see scan_begin) changes should be taken into account.
     */
    void        (*scan_rescan) (TableScanDesc scan, struct ScanKeyData *key,
                                bool set_params, bool allow_strat,
                                bool allow_sync, bool allow_pagemode);
    /*
     * Return next tuple from `scan`, store in slot.
     */
    bool        (*scan_getnextslot) (TableScanDesc scan,
                                     ScanDirection direction,
                                     TupleTableSlot *slot);
    /* ------------------------------------------------------------------------
     * Parallel table scan related functions.
     * ------------------------------------------------------------------------
     */
    /*
     * Estimate the size of shared memory needed for a parallel scan of this
     * relation. The snapshot does not need to be accounted for.
     */
    Size        (*parallelscan_estimate) (Relation rel);
    /*
     * Initialize ParallelTableScanDesc for a parallel scan of this relation.
     * `pscan` will be sized according to parallelscan_estimate() for the same
     * relation.
     */
    Size        (*parallelscan_initialize) (Relation rel,
                                            ParallelTableScanDesc pscan);
    /*
     * Reinitialize `pscan` for a new scan. `rel` will be the same relation as
     * when `pscan` was initialized by parallelscan_initialize.
     */
    void        (*parallelscan_reinitialize) (Relation rel,
                                              ParallelTableScanDesc pscan);
    /* ------------------------------------------------------------------------
     * Index Scan Callbacks
     * ------------------------------------------------------------------------
     */
    /*
     * Prepare to fetch tuples from the relation, as needed when fetching
     * tuples for an index scan.  The callback has to return an
     * IndexFetchTableData, which the AM will typically embed in a larger
     * structure with additional information.
     *
     * Tuples for an index scan can then be fetched via index_fetch_tuple.
     */
    struct IndexFetchTableData *(*index_fetch_begin) (Relation rel);
    /*
     * Reset index fetch. Typically this will release cross index fetch
     * resources held in IndexFetchTableData.
     */
    void        (*index_fetch_reset) (struct IndexFetchTableData *data);
    /*
     * Release resources and deallocate index fetch.
     */
    void        (*index_fetch_end) (struct IndexFetchTableData *data);
    /*
     * Fetch tuple at `tid` into `slot`, after doing a visibility test
     * according to `snapshot`. If a tuple was found and passed the visibility
     * test, return true, false otherwise.
     *
     * Note that AMs that do not necessarily update indexes when indexed
     * columns do not change, need to return the current/correct version of
     * the tuple that is visible to the snapshot, even if the tid points to an
     * older version of the tuple.
     *
     * *call_again is false on the first call to index_fetch_tuple for a tid.
     * If there potentially is another tuple matching the tid, *call_again
     * needs be set to true by index_fetch_tuple, signalling to the caller
     * that index_fetch_tuple should be called again for the same tid.
     *
     * *all_dead, if all_dead is not NULL, should be set to true by
     * index_fetch_tuple iff it is guaranteed that no backend needs to see
     * that tuple. Index AMs can use that do avoid returning that tid in
     * future searches.
     */
    bool        (*index_fetch_tuple) (struct IndexFetchTableData *scan,
                                      ItemPointer tid,
                                      Snapshot snapshot,
                                      TupleTableSlot *slot,
                                      bool *call_again, bool *all_dead);
    /* ------------------------------------------------------------------------
     * Callbacks for non-modifying operations on individual tuples
     * ------------------------------------------------------------------------
     */
    /*
     * Fetch tuple at `tid` into `slot`, after doing a visibility test
     * according to `snapshot`. If a tuple was found and passed the visibility
     * test, returns true, false otherwise.
     */
    bool        (*tuple_fetch_row_version) (Relation rel,
                                            ItemPointer tid,
                                            Snapshot snapshot,
                                            TupleTableSlot *slot);
    /*
     * Is tid valid for a scan of this relation.
     */
    bool        (*tuple_tid_valid) (TableScanDesc scan,
                                    ItemPointer tid);
    /*
     * Return the latest version of the tuple at `tid`, by updating `tid` to
     * point at the newest version.
     */
    void        (*tuple_get_latest_tid) (TableScanDesc scan,
                                         ItemPointer tid);
    /*
     * Does the tuple in `slot` satisfy `snapshot`?  The slot needs to be of
     * the appropriate type for the AM.
     */
    bool        (*tuple_satisfies_snapshot) (Relation rel,
                                             TupleTableSlot *slot,
                                             Snapshot snapshot);
    /* see table_compute_xid_horizon_for_tuples() */
    TransactionId (*compute_xid_horizon_for_tuples) (Relation rel,
                                                     ItemPointerData *items,
                                                     int nitems);
    /* ------------------------------------------------------------------------
     * Manipulations of physical tuples.
     * ------------------------------------------------------------------------
     */
    /* see table_insert() for reference about parameters */
    void        (*tuple_insert) (Relation rel, TupleTableSlot *slot,
                                 CommandId cid, int options,
                                 struct BulkInsertStateData *bistate);
    /* see table_insert_speculative() for reference about parameters */
    void        (*tuple_insert_speculative) (Relation rel,
                                             TupleTableSlot *slot,
                                             CommandId cid,
                                             int options,
                                             struct BulkInsertStateData *bistate,
                                             uint32 specToken);
    /* see table_complete_speculative() for reference about parameters */
    void        (*tuple_complete_speculative) (Relation rel,
                                               TupleTableSlot *slot,
                                               uint32 specToken,
                                               bool succeeded);
    /* see table_multi_insert() for reference about parameters */
    void        (*multi_insert) (Relation rel, TupleTableSlot **slots, int nslots,
                                 CommandId cid, int options, struct BulkInsertStateData *bistate);
    /* see table_delete() for reference about parameters */
    TM_Result   (*tuple_delete) (Relation rel,
                                 ItemPointer tid,
                                 CommandId cid,
                                 Snapshot snapshot,
                                 Snapshot crosscheck,
                                 bool wait,
                                 TM_FailureData *tmfd,
                                 bool changingPart);
    /* see table_update() for reference about parameters */
    TM_Result   (*tuple_update) (Relation rel,
                                 ItemPointer otid,
                                 TupleTableSlot *slot,
                                 CommandId cid,
                                 Snapshot snapshot,
                                 Snapshot crosscheck,
                                 bool wait,
                                 TM_FailureData *tmfd,
                                 LockTupleMode *lockmode,
                                 bool *update_indexes);
    /* see table_lock_tuple() for reference about parameters */
    TM_Result   (*tuple_lock) (Relation rel,
                               ItemPointer tid,
                               Snapshot snapshot,
                               TupleTableSlot *slot,
                               CommandId cid,
                               LockTupleMode mode,
                               LockWaitPolicy wait_policy,
                               uint8 flags,
                               TM_FailureData *tmfd);
    /*
     * Perform operations necessary to complete insertions made via
     * tuple_insert and multi_insert with a BulkInsertState specified. This
     * may for example be used to flush the relation, when the
     * TABLE_INSERT_SKIP_WAL option was used.
     *
     * Typically callers of tuple_insert and multi_insert will just pass all
     * the flags that apply to them, and each AM has to decide which of them
     * make sense for it, and then only take actions in finish_bulk_insert for
     * those flags, and ignore others.
     *
     * Optional callback.
     */
    void        (*finish_bulk_insert) (Relation rel, int options);
    /* ------------------------------------------------------------------------
     * DDL related functionality.
     * ------------------------------------------------------------------------
     */
    /*
     * This callback needs to create a new relation filenode for `rel`, with
     * appropriate durability behaviour for `persistence`.
     *
     * Note that only the subset of the relcache filled by
     * RelationBuildLocalRelation() can be relied upon and that the relation's
     * catalog entries either will either not yet exist (new relation), or
     * will still reference the old relfilenode.
     *
     * As output *freezeXid, *minmulti must be set to the values appropriate
     * for pg_class.{relfrozenxid, relminmxid}. For AMs that don't need those
     * fields to be filled they can be set to InvalidTransactionId and
     * InvalidMultiXactId, respectively.
     *
     * See also table_relation_set_new_filenode().
     */
    void        (*relation_set_new_filenode) (Relation rel,
                                              const RelFileNode *newrnode,
                                              char persistence,
                                              TransactionId *freezeXid,
                                              MultiXactId *minmulti);
    /*
     * This callback needs to remove all contents from `rel`'s current
     * relfilenode. No provisions for transactional behaviour need to be made.
     * Often this can be implemented by truncating the underlying storage to
     * its minimal size.
     *
     * See also table_relation_nontransactional_truncate().
     */
    void        (*relation_nontransactional_truncate) (Relation rel);
    /*
     * See table_relation_copy_data().
     *
     * This can typically be implemented by directly copying the underlying
     * storage, unless it contains references to the tablespace internally.
     */
    void        (*relation_copy_data) (Relation rel,
                                       const RelFileNode *newrnode);
    /* See table_relation_copy_for_cluster() */
    void        (*relation_copy_for_cluster) (Relation NewHeap,
                                              Relation OldHeap,
                                              Relation OldIndex,
                                              bool use_sort,
                                              TransactionId OldestXmin,
                                              TransactionId *xid_cutoff,
                                              MultiXactId *multi_cutoff,
                                              double *num_tuples,
                                              double *tups_vacuumed,
                                              double *tups_recently_dead);
    /*
     * React to VACUUM command on the relation. The VACUUM might be user
     * triggered or by autovacuum. The specific actions performed by the AM
     * will depend heavily on the individual AM.
     *
     * On entry a transaction is already established, and the relation is
     * locked with a ShareUpdateExclusive lock.
     *
     * Note that neither VACUUM FULL (and CLUSTER), nor ANALYZE go through
     * this routine, even if (for ANALYZE) it is part of the same VACUUM
     * command.
     *
     * There probably, in the future, needs to be a separate callback to
     * integrate with autovacuum's scheduling.
     */
    void        (*relation_vacuum) (Relation onerel,
                                    struct VacuumParams *params,
                                    BufferAccessStrategy bstrategy);
    /*
     * Prepare to analyze block `blockno` of `scan`. The scan has been started
     * with table_beginscan_analyze().  See also
     * table_scan_analyze_next_block().
     *
     * The callback may acquire resources like locks that are held until
     * table_scan_analyze_next_tuple() returns false. It e.g. can make sense
     * to hold a lock until all tuples on a block have been analyzed by
     * scan_analyze_next_tuple.
     *
     * The callback can return false if the block is not suitable for
     * sampling, e.g. because it's a metapage that could never contain tuples.
     *
     * XXX: This obviously is primarily suited for block-based AMs. It's not
     * clear what a good interface for non block based AMs would be, so there
     * isn't one yet.
     */
    bool        (*scan_analyze_next_block) (TableScanDesc scan,
                                            BlockNumber blockno,
                                            BufferAccessStrategy bstrategy);
    /*
     * See table_scan_analyze_next_tuple().
     *
     * Not every AM might have a meaningful concept of dead rows, in which
     * case it's OK to not increment *deadrows - but note that that may
     * influence autovacuum scheduling (see comment for relation_vacuum
     * callback).
     */
    bool        (*scan_analyze_next_tuple) (TableScanDesc scan,
                                            TransactionId OldestXmin,
                                            double *liverows,
                                            double *deadrows,
                                            TupleTableSlot *slot);
    /* see table_index_build_range_scan for reference about parameters */
    double      (*index_build_range_scan) (Relation heap_rel,
                                           Relation index_rel,
                                           struct IndexInfo *index_nfo,
                                           bool allow_sync,
                                           bool anyvisible,
                                           bool progress,
                                           BlockNumber start_blockno,
                                           BlockNumber end_blockno,
                                           IndexBuildCallback callback,
                                           void *callback_state,
                                           TableScanDesc scan);
    /* see table_index_validate_scan for reference about parameters */
    void        (*index_validate_scan) (Relation heap_rel,
                                        Relation index_rel,
                                        struct IndexInfo *index_info,
                                        Snapshot snapshot,
                                        struct ValidateIndexState *state);
    /* ------------------------------------------------------------------------
     * Miscellaneous functions.
     * ------------------------------------------------------------------------
     */
    /*
     * See table_relation_size().
     *
     * Note that currently a few callers use the MAIN_FORKNUM size to figure
     * out the range of potentially interesting blocks (brin, analyze). It's
     * probable that we'll need to revise the interface for those at some
     * point.
     */
    uint64      (*relation_size) (Relation rel, ForkNumber forkNumber);
    /* ------------------------------------------------------------------------
     * Planner related functions.
     * ------------------------------------------------------------------------
     */
    /*
     * See table_relation_estimate_size().
     *
     * While block oriented, it shouldn't be too hard for an AM that doesn't
     * doesn't internally use blocks to convert into a usable representation.
     *
     * This differs from the relation_size callback by returning size
     * estimates (both relation size and tuple count) for planning purposes,
     * rather than returning a currently correct estimate.
     */
    void        (*relation_estimate_size) (Relation rel, int32 *attr_widths,
                                           BlockNumber *pages, double *tuples,
                                           double *allvisfrac);
    /* ------------------------------------------------------------------------
     * Executor related functions.
     * ------------------------------------------------------------------------
     */
    /*
     * Prepare to fetch / check / return tuples from `tbmres->blockno` as part
     * of a bitmap table scan. `scan` was started via table_beginscan_bm().
     * Return false if there are no tuples to be found on the page, true
     * otherwise.
     *
     * This will typically read and pin the target block, and do the necessary
     * work to allow scan_bitmap_next_tuple() to return tuples (e.g. it might
     * make sense to perform tuple visibility checks at this time). For some
     * AMs it will make more sense to do all the work referencing `tbmres`
     * contents here, for others it might be better to defer more work to
     * scan_bitmap_next_tuple.
     *
     * If `tbmres->blockno` is -1, this is a lossy scan and all visible tuples
     * on the page have to be returned, otherwise the tuples at offsets in
     * `tbmres->offsets` need to be returned.
     *
     * XXX: Currently this may only be implemented if the AM uses md.c as its
     * storage manager, and uses ItemPointer->ip_blkid in a manner that maps
     * blockids directly to the underlying storage. nodeBitmapHeapscan.c
     * performs prefetching directly using that interface.  This probably
     * needs to be rectified at a later point.
     *
     * XXX: Currently this may only be implemented if the AM uses the
     * visibilitymap, as nodeBitmapHeapscan.c unconditionally accesses it to
     * perform prefetching.  This probably needs to be rectified at a later
     * point.
     *
     * Optional callback, but either both scan_bitmap_next_block and
     * scan_bitmap_next_tuple need to exist, or neither.
     */
    bool        (*scan_bitmap_next_block) (TableScanDesc scan,
                                           struct TBMIterateResult *tbmres);
    /*
     * Fetch the next tuple of a bitmap table scan into `slot` and return true
     * if a visible tuple was found, false otherwise.
     *
     * For some AMs it will make more sense to do all the work referencing
     * `tbmres` contents in scan_bitmap_next_block, for others it might be
     * better to defer more work to this callback.
     *
     * Optional callback, but either both scan_bitmap_next_block and
     * scan_bitmap_next_tuple need to exist, or neither.
     */
    bool        (*scan_bitmap_next_tuple) (TableScanDesc scan,
                                           struct TBMIterateResult *tbmres,
                                           TupleTableSlot *slot);
    /*
     * Prepare to fetch tuples from the next block in a sample scan. Return
     * false if the sample scan is finished, true otherwise. `scan` was
     * started via table_beginscan_sampling().
     *
     * Typically this will first determine the target block by call the
     * TsmRoutine's NextSampleBlock() callback if not NULL, or alternatively
     * perform a sequential scan over all blocks.  The determined block is
     * then typically read and pinned.
     *
     * As the TsmRoutine interface is block based, a block needs to be passed
     * to NextSampleBlock(). If that's not appropriate for an AM, it
     * internally needs to perform mapping between the internal and a block
     * based representation.
     *
     * Note that it's not acceptable to hold deadlock prone resources such as
     * lwlocks until scan_sample_next_tuple() has exhausted the tuples on the
     * block - the tuple is likely to be returned to an upper query node, and
     * the next call could be off a long while. Holding buffer pins and such
     * is obviously OK.
     *
     * Currently it is required to implement this interface, as there's no
     * alternative way (contrary e.g. to bitmap scans) to implement sample
     * scans. If infeasible to implement the AM may raise an error.
     */
    bool        (*scan_sample_next_block) (TableScanDesc scan,
                                           struct SampleScanState *scanstate);
    /*
     * This callback, only called after scan_sample_next_block has returned
     * true, should determine the next tuple to be returned from the selected
     * block using the TsmRoutine's NextSampleTuple() callback.
     *
     * The callback needs to perform visibility checks, and only return
     * visible tuples. That obviously can mean calling NextSampletuple()
     * multiple times.
     *
     * The TsmRoutine interface assumes that there's a maximum offset on a
     * given page, so if that doesn't apply to an AM, it needs to emulate that
     * assumption somehow.
     */
    bool        (*scan_sample_next_tuple) (TableScanDesc scan,
                                           struct SampleScanState *scanstate,
                                           TupleTableSlot *slot);
} TableAmRoutine;

二、参考资料

Future of storage
What is the Future of PostgreSQL? - PGConf India, 2019

来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/6906/viewspace-2646264/,如需转载,请注明出处,否则将追究法律责任。

转载于:http://blog.itpub.net/6906/viewspace-2646264/

 类似资料: