pgstrom如何工作

汪才英

2023-12-01

pgstrom啥时候被启动

当开启服务器的时候
src\backend\postmaster\postmaster.c:PostmasterMain

	/*
	 * process any libraries that should be preloaded at postmaster start
	 */
	process_shared_preload_libraries();

执行到这里时，pgstrom 的动态库被加载（它必须配置在 shared_preload_libraries 中，见下面 _PG_init 开头的检查）
- 动态库加载之后，就会执行它的入口函数 void _PG_init(void)

/*
 * _PG_init
 *
 * Main entrypoint of PG-Strom. It shall be invoked only once when postmaster
 * process is starting up, then it calls other sub-systems to initialize for each ones.
 *
 * Steps, in the order they are executed below:
 *   1. raise an ERROR unless the library is being loaded through
 *      shared_preload_libraries;
 *   2. call pgstrom_init_nvrtc() and log the build version;
 *   3. cache system page-size figures in PAGE_SIZE / PAGE_MASK / PAGE_SHIFT /
 *      PHYS_PAGES;
 *   4. run the pgstrom_init_*() routines of each sub-system;
 *   5. call check_heterodb_license();
 *   6. fill in the "Dummy" custom path/plan method tables;
 *   7. chain pgstrom_post_planner in front of any existing planner_hook.
 */
void
_PG_init(void)
{
	/*
	 * PG-Strom has to be loaded using shared_preload_libraries option
	 */
	if (!process_shared_preload_libraries_in_progress)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
		errmsg("PG-Strom must be loaded via shared_preload_libraries")));
	/* link nvrtc library according to the current CUDA version */

	pgstrom_init_nvrtc();
	/* dump version number */
#ifdef PGSTROM_VERSION
	elog(LOG, "PG-Strom version %s built for PostgreSQL %s",
		 PGSTROM_VERSION, PG_MAJORVERSION);
#else
	elog(LOG, "PG-Strom built for PostgreSQL %s", PG_MAJORVERSION);
#endif
	/*
	 * init misc variables
	 *
	 * PAGE_SIZE and PHYS_PAGES come straight from sysconf().
	 * NOTE(review): PAGE_MASK = PAGE_SIZE - 1 is only a valid bit mask if the
	 * page size is a power of two, and PAGE_SHIFT presumably holds log2 of
	 * the page size -- confirm against get_next_log2().
	 */
	PAGE_SIZE = sysconf(_SC_PAGESIZE);
	PAGE_MASK = PAGE_SIZE - 1;
	PAGE_SHIFT = get_next_log2(PAGE_SIZE);
	PHYS_PAGES = sysconf(_SC_PHYS_PAGES);


	/* init GPU/CUDA infrastructure */
	pgstrom_init_common_guc();
	pgstrom_init_shmbuf();
	pgstrom_init_gpu_device();
	pgstrom_init_gpu_mmgr();
	pgstrom_init_gpu_context();
	pgstrom_init_cuda_program();
	pgstrom_init_nvme_strom();
	pgstrom_init_codegen();


	/* init custom-scan providers/FDWs */
	pgstrom_init_gputasks();
	pgstrom_init_gpuscan();
	pgstrom_init_gpujoin();
	pgstrom_init_inners();
	pgstrom_init_gpupreagg();
	pgstrom_init_relscan();
	pgstrom_init_arrow_fdw();


	/* check commercial license, if any */
	check_heterodb_license();
	/*
	 * dummy custom-scan node
	 *
	 * Both method tables are zero-filled first, so every callback other than
	 * the one assigned below stays NULL.
	 */

	memset(&pgstrom_dummy_path_methods, 0, sizeof(CustomPathMethods));
	pgstrom_dummy_path_methods.CustomName	= "Dummy";
	pgstrom_dummy_path_methods.PlanCustomPath
		= pgstrom_dummy_create_plan;
	memset(&pgstrom_dummy_plan_methods, 0, sizeof(CustomScanMethods));
	pgstrom_dummy_plan_methods.CustomName	= "Dummy";
	pgstrom_dummy_plan_methods.CreateCustomScanState
		= pgstrom_dummy_create_scan_state;


	/*
	 * planner hook registration
	 *
	 * The previous hook value is saved in planner_hook_next before
	 * pgstrom_post_planner is installed, so an earlier hook is not lost.
	 */
	planner_hook_next = planner_hook;
	planner_hook = pgstrom_post_planner;
}

举例：如何添加GPUscan结点到生成计划里面？

/*
 * set_base_rel_pathlists
 *	  Build access paths for every plain base relation of the query.
 *
 * Walks root->simple_rel_array starting at index 1.  Empty (NULL) slots and
 * entries whose reloptkind is not RELOPT_BASEREL are passed over; each
 * remaining entry is handed to set_rel_pathlist() together with the
 * range-table entry found at the same index in root->simple_rte_array.
 */
static void
set_base_rel_pathlists(PlannerInfo *root)
{
	Index		rti = 1;

	while (rti < root->simple_rel_array_size)
	{
		RelOptInfo *rel = root->simple_rel_array[rti];

		/* slots of non-baserel RTEs may be empty */
		if (rel != NULL)
		{
			/* the array index and the stored relid must agree */
			Assert(rel->relid == rti);

			/* "other rels" are not handled here */
			if (rel->reloptkind == RELOPT_BASEREL)
				set_rel_pathlist(root, rel, rti, root->simple_rte_array[rti]);
		}

		rti++;
	}
}

set_rel_pathlist


/*
 * set_rel_pathlist
 *	  Build access paths for a base relation
 *
 * Work is done in four stages:
 *	 1. unless the relation is a dummy rel, dispatch to the path builder that
 *		matches the RTE (append relation, or one routine per rtekind);
 *	 2. call set_rel_pathlist_hook, if one is installed;
 *	 3. for a base relation that is not the only baserel of the query,
 *		generate gather paths;
 *	 4. call set_cheapest() on the relation.
 */
static void
set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
				 Index rti, RangeTblEntry *rte)
{
	/* A relation already proven empty needs no paths of its own. */
	if (!IS_DUMMY_REL(rel))
	{
		if (rte->inh)
		{
			/* It's an "append relation", process accordingly */
			set_append_rel_pathlist(root, rel, rti, rte);
		}
		else
		{
			switch (rel->rtekind)
			{
				case RTE_RELATION:
					/* foreign table, sampled relation, or plain relation */
					if (rte->relkind == RELKIND_FOREIGN_TABLE)
						set_foreign_pathlist(root, rel, rte);
					else if (rte->tablesample != NULL)
						set_tablesample_rel_pathlist(root, rel, rte);
					else
						set_plain_rel_pathlist(root, rel, rte);
					break;

				case RTE_FUNCTION:
					/* RangeFunction */
					set_function_pathlist(root, rel, rte);
					break;

				case RTE_TABLEFUNC:
					/* Table Function */
					set_tablefunc_pathlist(root, rel, rte);
					break;

				case RTE_VALUES:
					/* Values list */
					set_values_pathlist(root, rel, rte);
					break;

				case RTE_SUBQUERY:
				case RTE_CTE:
				case RTE_NAMEDTUPLESTORE:
					/*
					 * Subqueries, CTE references and tuplestore references
					 * are fully handled during set_rel_size.
					 */
					break;

				default:
					elog(ERROR, "unexpected rtekind: %d", (int) rel->rtekind);
					break;
			}
		}
	}

	/*
	 * Give a plugin the chance to editorialize on the set of Paths for this
	 * base relation: it may add new paths (such as CustomPaths) through
	 * add_path(), or add_partial_path() if parallel aware, and it may delete
	 * or modify paths added by the core code.
	 */
	if (set_rel_pathlist_hook != NULL)
		set_rel_pathlist_hook(root, rel, rti, rte);

	/*
	 * Gathering of partial paths happens only after the hook has run, since
	 * otherwise partial paths added by the hook could not be included.
	 *
	 * Inheritance children are skipped (their reloptkind is not
	 * RELOPT_BASEREL): gathering each child separately could produce a very
	 * large number of gather nodes, each trying to grab its own pool of
	 * workers; gathering is considered for the parent appendrel instead.
	 *
	 * When this rel is the only baserel, i.e. the topmost scan/join rel,
	 * gathering is postponed until the final scan/join targetlist is
	 * available (see grouping_planner).
	 */
	if (rel->reloptkind == RELOPT_BASEREL)
	{
		if (bms_membership(root->all_baserels) != BMS_SINGLETON)
			generate_gather_paths(root, rel, false);
	}

	/* Now find the cheapest of the paths for this rel */
	set_cheapest(rel);

#ifdef OPTIMIZER_DEBUG
	debug_print_rel(root, rel);
#endif
}

通过 set_rel_pathlist_hook，以CustomPath的形式把GpuScan路径加入候选路径

pgstrom_init_gpuscan();

/*
 * pgstrom_init_gpuscan
 *
 * One-time setup of the GpuScan custom-scan provider:
 *   - defines the boolean GUCs pg_strom.enable_gpuscan and
 *     pg_strom.pullup_outer_scan (both default to true, PGC_USERSET);
 *   - fills in the path / plan / exec callback tables named "GpuScan" and
 *     passes the plan table to RegisterCustomScanMethods();
 *   - chains gpuscan_add_scan_path in front of any existing
 *     set_rel_pathlist_hook.
 */
void
pgstrom_init_gpuscan(void)
{
	/* pg_strom.enable_gpuscan */
	DefineCustomBoolVariable("pg_strom.enable_gpuscan",
							 "Enables the use of GPU accelerated full-scan",
							 NULL,
							 &enable_gpuscan,
							 true,
							 PGC_USERSET,
							 GUC_NOT_IN_SAMPLE,
							 NULL, NULL, NULL);
	/* pg_strom.pullup_outer_scan */
	DefineCustomBoolVariable("pg_strom.pullup_outer_scan",
							 "Enables to pull up simple outer scan",
							 NULL,
							 &enable_pullup_outer_scan,
							 true,
							 PGC_USERSET,
                             GUC_NOT_IN_SAMPLE,
                             NULL, NULL, NULL);

	/* setup path methods (table is zero-filled first; unset callbacks stay NULL) */
	memset(&gpuscan_path_methods, 0, sizeof(gpuscan_path_methods));
	gpuscan_path_methods.CustomName			= "GpuScan";
	gpuscan_path_methods.PlanCustomPath		= PlanGpuScanPath;

	/* setup plan methods, then register the table with the core */
	memset(&gpuscan_plan_methods, 0, sizeof(gpuscan_plan_methods));
	gpuscan_plan_methods.CustomName			= "GpuScan";
	gpuscan_plan_methods.CreateCustomScanState = gpuscan_create_scan_state;
	RegisterCustomScanMethods(&gpuscan_plan_methods);

	/* setup exec methods: executor callbacks plus the DSM/worker callbacks */
	memset(&gpuscan_exec_methods, 0, sizeof(gpuscan_exec_methods));
	gpuscan_exec_methods.CustomName         = "GpuScan";
	gpuscan_exec_methods.BeginCustomScan    = ExecInitGpuScan;
	gpuscan_exec_methods.ExecCustomScan     = ExecGpuScan;
	gpuscan_exec_methods.EndCustomScan      = ExecEndGpuScan;
	gpuscan_exec_methods.ReScanCustomScan   = ExecReScanGpuScan;
	gpuscan_exec_methods.EstimateDSMCustomScan = ExecGpuScanEstimateDSM;
	gpuscan_exec_methods.InitializeDSMCustomScan = ExecGpuScanInitDSM;
	gpuscan_exec_methods.InitializeWorkerCustomScan = ExecGpuScanInitWorker;
	gpuscan_exec_methods.ReInitializeDSMCustomScan = ExecGpuScanReInitializeDSM;
	gpuscan_exec_methods.ShutdownCustomScan	= ExecShutdownGpuScan;
	gpuscan_exec_methods.ExplainCustomScan  = ExplainGpuScan;

	/*
	 * hook registration
	 *
	 * The previous hook value is kept in set_rel_pathlist_next before
	 * gpuscan_add_scan_path is installed as the new hook.
	 */
	set_rel_pathlist_next = set_rel_pathlist_hook;
	set_rel_pathlist_hook = gpuscan_add_scan_path;
}

gpuscan_add_scan_path;

/*
 * gpuscan_add_scan_path - entrypoint of the set_rel_pathlist_hook
 *
 * Called by the planner for each base relation.  After giving the previously
 * installed hook (if any) its turn, it decides whether a GpuScan path can be
 * offered for 'baserel' and, if so, adds:
 *   - one non-parallel GpuScan path via add_path(), and
 *   - when the relation allows parallelism and workers are available, one
 *     partial GpuScan path via add_partial_path().
 *
 * No path is added when PG-Strom or GpuScan is disabled, when the relation
 * is a dummy rel or an inheritance parent, when the relation kind is not
 * supported, or when none of the restriction clauses can run on the device.
 *
 * root    - planner state of the current query
 * baserel - relation for which paths are being collected
 * rtindex - range-table index of the relation
 * rte     - range-table entry of the relation
 */
static void
gpuscan_add_scan_path(PlannerInfo *root,
					  RelOptInfo *baserel,
					  Index rtindex,
					  RangeTblEntry *rte)
{
	Path	   *pathnode;
	List	   *dev_quals = NIL;
	List	   *host_quals = NIL;
	IndexOptInfo *indexOpt;

	/*
	 * Output arguments of pgstrom_tryfind_brinindex().  They are passed on to
	 * create_gpuscan_path() unconditionally, so give them defined values in
	 * case the helper returns without storing anything (e.g. when no usable
	 * index is found); otherwise indeterminate values would be read.
	 */
	List	   *indexConds = NIL;
	List	   *indexQuals = NIL;
	cl_long		indexNBlocks = 0;
	ListCell   *lc;

	/* call the secondary hook */
	if (set_rel_pathlist_next)
		set_rel_pathlist_next(root, baserel, rtindex, rte);

	/* nothing to do, if either PG-Strom or GpuScan is not enabled */
	if (!pgstrom_enabled || !enable_gpuscan)
		return;

	/* We already proved the relation empty, so nothing more to do */
	if (is_dummy_rel(baserel))
		return;

	/* It is the role of built-in Append node */
	if (rte->inh)
		return;

	/*
	 * GpuScan can run on only base relations or foreign table managed
	 * by arrow_fdw.
	 */
	if (rte->relkind == RELKIND_FOREIGN_TABLE)
	{
		if (!baseRelIsArrowFdw(baserel))
			return;
	}
	else if (rte->relkind != RELKIND_RELATION &&
			 rte->relkind != RELKIND_MATVIEW)
		return;

	/*
	 * Check whether the qualifier can run on GPU device.  Each restriction
	 * clause goes to exactly one of the two lists.
	 */
	foreach (lc, baserel->baserestrictinfo)
	{
		RestrictInfo   *rinfo = lfirst(lc);

		if (pgstrom_device_expression(root, baserel, rinfo->clause))
			dev_quals = lappend(dev_quals, rinfo);
		else
			host_quals = lappend(host_quals, rinfo);
	}
	/* without any device-executable qualifier, no GpuScan path is offered */
	if (dev_quals == NIL)
		return;

	/* Check availability of GpuScan+BRIN Index */
	indexOpt = pgstrom_tryfind_brinindex(root, baserel,
										 &indexConds,
										 &indexQuals,
										 &indexNBlocks);

	/* add GpuScan path in single process (0 parallel workers) */
	pathnode = create_gpuscan_path(root, baserel,
								   dev_quals,
								   host_quals,
								   0,
								   indexOpt,
								   indexConds,
								   indexQuals,
								   indexNBlocks);
	add_path(baserel, pathnode);

	/* If appropriate, consider parallel GpuScan */
	if (baserel->consider_parallel && baserel->lateral_relids == NULL)
	{
		int		parallel_nworkers
			= compute_parallel_worker(baserel,
									  baserel->pages, -1.0,
									  max_parallel_workers_per_gather);
		/*
		 * XXX - Do we need a something specific logic for GpuScan to adjust
		 * parallel_workers.
		 */
		if (parallel_nworkers <= 0)
			return;

		/* add GpuScan path performing on parallel workers */
		pathnode = create_gpuscan_path(root, baserel,
									   dev_quals,
									   host_quals,
									   parallel_nworkers,
									   indexOpt,
									   indexConds,
									   indexQuals,
									   indexNBlocks);
		add_partial_path(baserel, pathnode);
	}
}

pg提供了一些钩子（hook）变量，例如 planner_hook 和 set_rel_pathlist_hook；扩展把自己的函数赋给这些变量，就可以把自己的逻辑接入优化器的规划流程

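_PG_init 把 planner_hook 设成 pgstrom_post_planner 之后，优化函数的入口就不再直接是standard_planner啦！pgstrom_post_planner 会先调用上一个钩子（没有的话才调用 standard_planner），再对得到的计划树做后处理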

/*
 * pgstrom_post_planner
 *
 * Planner hook of PG-Strom.  Planning itself is delegated to the hook that
 * was installed before PG-Strom (planner_hook_next) or, if there is none, to
 * standard_planner().  The resulting main plan tree and every sub-plan are
 * then passed through pgstrom_post_planner_recurse().
 */
static PlannedStmt *
pgstrom_post_planner(Query *parse,int cursorOptions,ParamListInfo boundParams)
{
	PlannedStmt	   *result;
	ListCell	   *cell;

	result = (planner_hook_next
			  ? planner_hook_next(parse, cursorOptions, boundParams)
			  : standard_planner(parse, cursorOptions, boundParams));

	/* post-process the main plan tree first, then each sub-plan */
	pgstrom_post_planner_recurse(result, &result->planTree);
	foreach (cell, result->subplans)
	{
		pgstrom_post_planner_recurse(result, (Plan **)&lfirst(cell));
	}

	return result;
}

/*
 * pgstrom_post_planner_recurse
 *
 * Walks the plan tree rooted at *p_plan and post-processes the CustomScan
 * nodes found in it:
 *   - a CustomScan whose methods table is pgstrom_dummy_plan_methods is
 *     replaced, through p_plan, by the result of pgstrom_dummy_remove_plan();
 *     the replacement node is then walked again;
 *   - a CustomScan for which pgstrom_plan_is_gpupreagg() is true is handed to
 *     gpupreagg_post_planner().
 *
 * Child plans kept in node-specific lists (ModifyTable, Append, MergeAppend,
 * BitmapAnd, BitmapOr, SubqueryScan, CustomScan) are visited first, followed
 * by the generic lefttree / righttree links.
 */
static void
pgstrom_post_planner_recurse(PlannedStmt *pstmt, Plan **p_plan)
{
	Plan	   *plan = *p_plan;
	ListCell   *cell;

	Assert(plan != NULL);

	switch (nodeTag(plan))
	{
		case T_ModifyTable:
			{
				ModifyTable *mtable = (ModifyTable *) plan;

				foreach (cell, mtable->plans)
				{
					pgstrom_post_planner_recurse(pstmt, (Plan **)&lfirst(cell));
				}
			}
			break;

		case T_Append:
			{
				Append	   *append = (Append *) plan;

				foreach (cell, append->appendplans)
				{
					pgstrom_post_planner_recurse(pstmt, (Plan **)&lfirst(cell));
				}
			}
			break;

		case T_MergeAppend:
			{
				MergeAppend *mappend = (MergeAppend *) plan;

				foreach (cell, mappend->mergeplans)
				{
					pgstrom_post_planner_recurse(pstmt, (Plan **)&lfirst(cell));
				}
			}
			break;

		case T_BitmapAnd:
			{
				BitmapAnd  *bmand = (BitmapAnd *) plan;

				foreach (cell, bmand->bitmapplans)
				{
					pgstrom_post_planner_recurse(pstmt, (Plan **)&lfirst(cell));
				}
			}
			break;

		case T_BitmapOr:
			{
				BitmapOr   *bmor = (BitmapOr *) plan;

				foreach (cell, bmor->bitmapplans)
				{
					pgstrom_post_planner_recurse(pstmt, (Plan **)&lfirst(cell));
				}
			}
			break;

		case T_SubqueryScan:
			{
				SubqueryScan *subq = (SubqueryScan *) plan;

				pgstrom_post_planner_recurse(pstmt, &subq->subplan);
			}
			break;

		case T_CustomScan:
			{
				CustomScan *cscan = (CustomScan *) plan;

				if (cscan->methods == &pgstrom_dummy_plan_methods)
				{
					/*
					 * Swap the dummy node for its replacement and restart on
					 * the replacement; 'plan' still points at the removed
					 * node, so the code below must not run for it.
					 */
					*p_plan = pgstrom_dummy_remove_plan(pstmt, cscan);
					pgstrom_post_planner_recurse(pstmt, p_plan);
					return;
				}

				if (pgstrom_plan_is_gpupreagg(&cscan->scan.plan))
					gpupreagg_post_planner(pstmt, cscan);

				foreach (cell, cscan->custom_plans)
				{
					pgstrom_post_planner_recurse(pstmt, (Plan **)&lfirst(cell));
				}
			}
			break;

		default:
			/* no node-specific child list to visit */
			break;
	}

	/* generic children, common to every plan node */
	if (plan->lefttree)
	{
		pgstrom_post_planner_recurse(pstmt, &plan->lefttree);
	}
	if (plan->righttree)
	{
		pgstrom_post_planner_recurse(pstmt, &plan->righttree);
	}
}

重要链接

https://blog.csdn.net/weixin_40730091/article/details/102734600?depth_1-utm_source=distribute.pc_relevant.none-task&utm_source=distribute.pc_relevant.none-task

重要链接的例子

SELECT cat, count(*), avg(ax) FROM t0 NATURAL JOIN t1 NATURAL JOIN t2 NATURAL JOIN t3 NATURAL JOIN t4 NATURAL JOIN t5 NATURAL JOIN t6 NATURAL JOIN t7 NATURAL JOIN t8 GROUP BY cat;


HashAggregate  (cost=3529132.25..3529132.57 rows=26 width=12)
  Output: t0.cat, pgstrom.count((pgstrom.nrows())), pgstrom.avg((pgstrom.nrows((t1.ax IS NOT NULL))), (pgstrom.psum(t1.ax)))
  Group Key: t0.cat
  ->  Custom Scan (GpuPreAgg)  (cost=29099.24..2859511.64 rows=234 width=48)
        Output: t0.cat, pgstrom.nrows(), pgstrom.nrows((t1.ax IS NOT NULL)), pgstrom.psum(t1.ax)
        Reduction: Local + Global
        GPU Projection: t0.cat, t1.ax
        Logic Parameter: SafetyLimit: 2147483647, KeyDistSalt: 9
        Extra: outer-bulk-exec
        ->  Custom Scan (GpuJoin) on public.t0  (cost=25099.24..2826221.34 rows=93721454 width=12)
              Output: t0.cat, t1.ax
              GPU Projection: t0.cat::text, t1.ax::double precision
              Depth 1: GpuHashJoin, HashKeys: (t0.fid)
                       JoinQuals: (t0.fid = t6.fid)
                       Nrows (in/out: 97.75%), KDS-Hash (size: 13.47MB, nbatches: 1)
              Depth 2: GpuHashJoin, HashKeys: (t0.cid)
                       JoinQuals: (t0.cid = t3.cid)
                       Nrows (in/out: 98.34%), KDS-Hash (size: 13.47MB, nbatches: 1)
              Depth 3: GpuHashJoin, HashKeys: (t0.bid)
                       JoinQuals: (t0.bid = t2.bid)
                       Nrows (in/out: 98.76%), KDS-Hash (size: 13.47MB, nbatches: 1)
              Depth 4: GpuHashJoin, HashKeys: (t0.eid)
                       JoinQuals: (t0.eid = t5.eid)
                       Nrows (in/out: 98.97%), KDS-Hash (size: 13.47MB, nbatches: 1)
              Depth 5: GpuHashJoin, HashKeys: (t0.aid)
                       JoinQuals: (t0.aid = t1.aid)
                       Nrows (in/out: 100.00%), KDS-Hash (size: 13.47MB, nbatches: 1)
              Depth 6: GpuHashJoin, HashKeys: (t0.did)
                       JoinQuals: (t0.did = t4.did)
                       Nrows (in/out: 100.00%), KDS-Hash (size: 13.47MB, nbatches: 1)
              Depth 7: GpuHashJoin, HashKeys: (t0.gid)
                       JoinQuals: (t0.gid = t7.gid)
                       Nrows (in/out: 100.00%), KDS-Hash (size: 13.47MB, nbatches: 1)
              Depth 8: GpuHashJoin, HashKeys: (t0.hid)
                       JoinQuals: (t0.hid = t8.hid)
                       Nrows (in/out: 99.74%), KDS-Hash (size: 13.47MB, nbatches: 1)
              Extra: bulk-exec-support, row-format
              ->  Seq Scan on public.t6  (cost=0.00..1935.00 rows=100000 width=4)
                    Output: t6.fid, t6.ftext, t6.fx
              ->  Seq Scan on public.t3  (cost=0.00..1935.00 rows=100000 width=4)
                    Output: t3.cid, t3.ctext, t3.cx
              ->  Seq Scan on public.t2  (cost=0.00..1935.00 rows=100000 width=4)
                    Output: t2.bid, t2.btext, t2.bx
              ->  Seq Scan on public.t5  (cost=0.00..1935.00 rows=100000 width=4)
                    Output: t5.eid, t5.etext, t5.ex
              ->  Seq Scan on public.t1  (cost=0.00..1935.00 rows=100000 width=12)
                    Output: t1.aid, t1.atext, t1.ax
              ->  Seq Scan on public.t4  (cost=0.00..1935.00 rows=100000 width=4)
                    Output: t4.did, t4.dtext, t4.dx
              ->  Seq Scan on public.t7  (cost=0.00..1935.00 rows=100000 width=4)
                    Output: t7.gid, t7.gtext, t7.gx
              ->  Seq Scan on public.t8  (cost=0.00..1935.00 rows=100000 width=4)
                    Output: t8.hid, t8.htext, t8.hx

pgstrom如何工作

pgstrom啥时候被启动

举例：如何添加GPUscan结点到生成计划里面？

set_rel_pathlist

pgstrom_init_gpuscan();

gpuscan_add_scan_path;

pg提供了一些变量可以实现把

重要链接

重要链接的例子

相关阅读

相关文章

相关问答

相关文档