greenplumn planagg 源码

2022-08-18
浏览 (247)

greenplumn planagg 代码

文件路径：/src/backend/optimizer/plan/planagg.c

/*-------------------------------------------------------------------------
 *
 * planagg.c
 *	  Special planning for aggregate queries.
 *
 * This module tries to replace MIN/MAX aggregate functions by subqueries
 * of the form
 *		(SELECT col FROM tab
 *		 WHERE col IS NOT NULL AND existing-quals
 *		 ORDER BY col ASC/DESC
 *		 LIMIT 1)
 * Given a suitable index on tab.col, this can be much faster than the
 * generic scan-all-the-rows aggregation plan.  We can handle multiple
 * MIN/MAX aggregates by generating multiple subqueries, and their
 * orderings can be different.  However, if the query contains any
 * non-optimizable aggregates, there's no point since we'll have to
 * scan all the rows anyway.
 *
 *
 * Portions Copyright (c) 2006-2008, Greenplum inc
 * Portions Copyright (c) 2012-Present VMware, Inc. or its affiliates.
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/optimizer/plan/planagg.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/htup_details.h"
#include "catalog/pg_aggregate.h"
#include "catalog/pg_type.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/clauses.h"
#include "optimizer/cost.h"
#include "optimizer/optimizer.h"
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
#include "optimizer/planmain.h"
#include "optimizer/subselect.h"
#include "optimizer/tlist.h"
#include "parser/parsetree.h"
#include "parser/parse_clause.h"
#include "rewrite/rewriteManip.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"

#include "cdb/cdbpath.h"
#include "cdb/cdbsetop.h"
#include "cdb/cdbutil.h"
#include "cdb/cdbvars.h"


static bool find_minmax_aggs_walker(Node *node, List **context);
static bool build_minmax_path(PlannerInfo *root, MinMaxAggInfo *mminfo,
							  Oid eqop, Oid sortop, bool nulls_first);
static void minmax_qp_callback(PlannerInfo *root, void *extra);
static Oid	fetch_agg_sort_op(Oid aggfnoid);


/*
 * preprocess_minmax_aggregates - preprocess MIN/MAX aggregates
 *
 * Check to see whether the query contains MIN/MAX aggregate functions that
 * might be optimizable via indexscans.  If it does, and all the aggregates
 * are potentially optimizable, then create a MinMaxAggPath and add it to
 * the (UPPERREL_GROUP_AGG, NULL) upperrel.
 *
 * This should be called by grouping_planner() just before it's ready to call
 * query_planner(), because we generate indexscan paths by cloning the
 * planner's state and invoking query_planner() on a modified version of
 * the query parsetree.  Thus, all preprocessing needed before query_planner()
 * must already be done.
 */
void
preprocess_minmax_aggregates(PlannerInfo *root)
{
	Query	   *parse = root->parse;
	FromExpr   *jtnode;
	RangeTblRef *rtr;
	RangeTblEntry *rte;
	List	   *aggs_list;
	RelOptInfo *grouped_rel;
	ListCell   *lc;

	/* minmax_aggs list should be empty at this point */
	Assert(root->minmax_aggs == NIL);

	/* Nothing to do if query has no aggregates */
	if (!parse->hasAggs)
		return;

	Assert(!parse->setOperations);	/* shouldn't get here if a setop */
	Assert(parse->rowMarks == NIL); /* nor if FOR UPDATE */

	/*
	 * Reject unoptimizable cases.
	 *
	 * We don't handle GROUP BY or windowing, because our current
	 * implementations of grouping require looking at all the rows anyway, and
	 * so there's not much point in optimizing MIN/MAX.
	 */
	if (parse->groupClause || list_length(parse->groupingSets) > 1 ||
		parse->hasWindowFuncs)
		return;

	/*
	 * Reject if disabled by caller.
	 */
	if (!root->config->gp_enable_minmax_optimization)
		return;

	/*
	 * Reject if query contains any CTEs; there's no way to build an indexscan
	 * on one so we couldn't succeed here.  (If the CTEs are unreferenced,
	 * that's not true, but it doesn't seem worth expending cycles to check.)
	 */
	if (parse->cteList)
		return;

	/*
	 * We also restrict the query to reference exactly one table, since join
	 * conditions can't be handled reasonably.  (We could perhaps handle a
	 * query containing cartesian-product joins, but it hardly seems worth the
	 * trouble.)  However, the single table could be buried in several levels
	 * of FromExpr due to subqueries.  Note the "single" table could be an
	 * inheritance parent, too, including the case of a UNION ALL subquery
	 * that's been flattened to an appendrel.
	 */
	jtnode = parse->jointree;
	while (IsA(jtnode, FromExpr))
	{
		if (list_length(jtnode->fromlist) != 1)
			return;
		jtnode = linitial(jtnode->fromlist);
	}
	if (!IsA(jtnode, RangeTblRef))
		return;
	rtr = (RangeTblRef *) jtnode;
	rte = planner_rt_fetch(rtr->rtindex, root);
	if (rte->rtekind == RTE_RELATION)
		 /* ordinary relation, ok */ ;
	else if (rte->rtekind == RTE_SUBQUERY && rte->inh)
		 /* flattened UNION ALL subquery, ok */ ;
	else
		return;

	/*
	 * Scan the tlist and HAVING qual to find all the aggregates and verify
	 * all are MIN/MAX aggregates.  Stop as soon as we find one that isn't.
	 */
	aggs_list = NIL;
	if (find_minmax_aggs_walker((Node *) root->processed_tlist, &aggs_list))
		return;
	if (find_minmax_aggs_walker(parse->havingQual, &aggs_list))
		return;

	/*
	 * OK, there is at least the possibility of performing the optimization.
	 * Build an access path for each aggregate.  If any of the aggregates
	 * prove to be non-indexable, give up; there is no point in optimizing
	 * just some of them.
	 */
	foreach(lc, aggs_list)
	{
		MinMaxAggInfo *mminfo = (MinMaxAggInfo *) lfirst(lc);
		Oid			eqop;
		bool		reverse;

		/*
		 * We'll need the equality operator that goes with the aggregate's
		 * ordering operator.
		 */
		eqop = get_equality_op_for_ordering_op(mminfo->aggsortop, &reverse);
		if (!OidIsValid(eqop))	/* shouldn't happen */
			elog(ERROR, "could not find equality operator for ordering operator %u",
				 mminfo->aggsortop);

		/*
		 * We can use either an ordering that gives NULLS FIRST or one that
		 * gives NULLS LAST; furthermore there's unlikely to be much
		 * performance difference between them, so it doesn't seem worth
		 * costing out both ways if we get a hit on the first one.  NULLS
		 * FIRST is more likely to be available if the operator is a
		 * reverse-sort operator, so try that first if reverse.
		 */
		if (build_minmax_path(root, mminfo, eqop, mminfo->aggsortop, reverse))
			continue;
		if (build_minmax_path(root, mminfo, eqop, mminfo->aggsortop, !reverse))
			continue;

		/* No indexable path for this aggregate, so fail */
		return;
	}

	/*
	 * OK, we can do the query this way.  Prepare to create a MinMaxAggPath
	 * node.
	 *
	 * First, create an output Param node for each agg.  (If we end up not
	 * using the MinMaxAggPath, we'll waste a PARAM_EXEC slot for each agg,
	 * which is not worth worrying about.  We can't wait till create_plan time
	 * to decide whether to make the Param, unfortunately.)
	 */
	foreach(lc, aggs_list)
	{
		MinMaxAggInfo *mminfo = (MinMaxAggInfo *) lfirst(lc);

		mminfo->param =
			SS_make_initplan_output_param(root,
										  exprType((Node *) mminfo->target),
										  -1,
										  exprCollation((Node *) mminfo->target));
	}

	/*
	 * Create a MinMaxAggPath node with the appropriate estimated costs and
	 * other needed data, and add it to the UPPERREL_GROUP_AGG upperrel, where
	 * it will compete against the standard aggregate implementation.  (It
	 * will likely always win, but we need not assume that here.)
	 *
	 * Note: grouping_planner won't have created this upperrel yet, but it's
	 * fine for us to create it first.  We will not have inserted the correct
	 * consider_parallel value in it, but MinMaxAggPath paths are currently
	 * never parallel-safe anyway, so that doesn't matter.  Likewise, it
	 * doesn't matter that we haven't filled FDW-related fields in the rel.
	 * Also, because there are no rowmarks, we know that the processed_tlist
	 * doesn't need to change anymore, so making the pathtarget now is safe.
	 */
	grouped_rel = fetch_upper_rel(root, UPPERREL_GROUP_AGG, NULL);
	add_path(grouped_rel, (Path *)
			 create_minmaxagg_path(root, grouped_rel,
								   create_pathtarget(root,
													 root->processed_tlist),
								   aggs_list,
								   (List *) parse->havingQual));
}

/*
 * find_minmax_aggs_walker
 *		Recursively scan the Aggref nodes in an expression tree, and check
 *		that each one is a MIN/MAX aggregate.  If so, build a list of the
 *		distinct aggregate calls in the tree.
 *
 * Returns true if a non-MIN/MAX aggregate is found, false otherwise.
 * (This seemingly-backward definition is used because expression_tree_walker
 * aborts the scan on true return, which is what we want.)
 *
 * Found aggregates are added to the list at *context; it's up to the caller
 * to initialize the list to NIL.
 *
 * This does not descend into subqueries, and so should be used only after
 * reduction of sublinks to subplans.  There mustn't be outer-aggregate
 * references either.
 */
static bool
find_minmax_aggs_walker(Node *node, List **context)
{
	if (node == NULL)
		return false;
	if (IsA(node, Aggref))
	{
		Aggref	   *aggref = (Aggref *) node;
		Oid			aggsortop;
		TargetEntry *curTarget;
		MinMaxAggInfo *mminfo;
		ListCell   *l;

		Assert(aggref->agglevelsup == 0);
		if (list_length(aggref->args) != 1)
			return true;		/* it couldn't be MIN/MAX */

		/*
		 * ORDER BY is usually irrelevant for MIN/MAX, but it can change the
		 * outcome if the aggsortop's operator class recognizes non-identical
		 * values as equal.  For example, 4.0 and 4.00 are equal according to
		 * numeric_ops, yet distinguishable.  If MIN() receives more than one
		 * value equal to 4.0 and no value less than 4.0, it is unspecified
		 * which of those equal values MIN() returns.  An ORDER BY expression
		 * that differs for each of those equal values of the argument
		 * expression makes the result predictable once again.  This is a
		 * niche requirement, and we do not implement it with subquery paths.
		 * In any case, this test lets us reject ordered-set aggregates
		 * quickly.
		 */
		if (aggref->aggorder != NIL)
			return true;
		/* note: we do not care if DISTINCT is mentioned ... */

		/*
		 * We might implement the optimization when a FILTER clause is present
		 * by adding the filter to the quals of the generated subquery.  For
		 * now, just punt.
		 */
		if (aggref->aggfilter != NULL)
			return true;

		aggsortop = fetch_agg_sort_op(aggref->aggfnoid);
		if (!OidIsValid(aggsortop))
			return true;		/* not a MIN/MAX aggregate */

		curTarget = (TargetEntry *) linitial(aggref->args);

		if (contain_mutable_functions((Node *) curTarget->expr))
			return true;		/* not potentially indexable */

		if (type_is_rowtype(exprType((Node *) curTarget->expr)))
			return true;		/* IS NOT NULL would have weird semantics */

		/*
		 * Check whether it's already in the list, and add it if not.
		 */
		foreach(l, *context)
		{
			mminfo = (MinMaxAggInfo *) lfirst(l);
			if (mminfo->aggfnoid == aggref->aggfnoid &&
				equal(mminfo->target, curTarget->expr))
				return false;
		}

		mminfo = makeNode(MinMaxAggInfo);
		mminfo->aggfnoid = aggref->aggfnoid;
		mminfo->aggsortop = aggsortop;
		mminfo->target = curTarget->expr;
		mminfo->subroot = NULL; /* don't compute path yet */
		mminfo->path = NULL;
		mminfo->pathcost = 0;
		mminfo->param = NULL;

		*context = lappend(*context, mminfo);

		/*
		 * We need not recurse into the argument, since it can't contain any
		 * aggregates.
		 */
		return false;
	}
	Assert(!IsA(node, SubLink));
	return expression_tree_walker(node, find_minmax_aggs_walker,
								  (void *) context);
}

/*
 * build_minmax_path
 *		Given a MIN/MAX aggregate, try to build an indexscan Path it can be
 *		optimized with.
 *
 * If successful, stash the best path in *mminfo and return true.
 * Otherwise, return false.
 */
static bool
build_minmax_path(PlannerInfo *root, MinMaxAggInfo *mminfo,
				  Oid eqop, Oid sortop, bool nulls_first)
{
	PlannerInfo *subroot;
	Query	   *parse;
	TargetEntry *tle;
	List	   *tlist;
	NullTest   *ntest;
	SortGroupClause *sortcl;
	RelOptInfo *final_rel;
	Path	   *sorted_path;
	Cost		path_cost;
	double		path_fraction;

	/*
	 * We are going to construct what is effectively a sub-SELECT query, so
	 * clone the current query level's state and adjust it to make it look
	 * like a subquery.  Any outer references will now be one level higher
	 * than before.  (This means that when we are done, there will be no Vars
	 * of level 1, which is why the subquery can become an initplan.)
	 */
	subroot = (PlannerInfo *) palloc(sizeof(PlannerInfo));
	memcpy(subroot, root, sizeof(PlannerInfo));
	subroot->query_level++;
	subroot->parent_root = root;
	/* reset subplan-related stuff */
	subroot->plan_params = NIL;
	subroot->outer_params = NULL;
	subroot->init_plans = NIL;

	subroot->parse = parse = copyObject(root->parse);
	IncrementVarSublevelsUp((Node *) parse, 1, 1);

	/* append_rel_list might contain outer Vars? */
	subroot->append_rel_list = copyObject(root->append_rel_list);
	IncrementVarSublevelsUp((Node *) subroot->append_rel_list, 1, 1);
	/* There shouldn't be any OJ info to translate, as yet */
	Assert(subroot->join_info_list == NIL);
	/* and we haven't made equivalence classes, either */
	Assert(subroot->eq_classes == NIL);
	/* and we haven't created PlaceHolderInfos, either */
	Assert(subroot->placeholder_list == NIL);

	/*----------
	 * Generate modified query of the form
	 *		(SELECT col FROM tab
	 *		 WHERE col IS NOT NULL AND existing-quals
	 *		 ORDER BY col ASC/DESC
	 *		 LIMIT 1)
	 *----------
	 */
	/* single tlist entry that is the aggregate target */
	tle = makeTargetEntry(copyObject(mminfo->target),
						  (AttrNumber) 1,
						  pstrdup("agg_target"),
						  false);
	tlist = list_make1(tle);
	subroot->processed_tlist = parse->targetList = tlist;

	/* No HAVING, no DISTINCT, no aggregates anymore */
	parse->havingQual = NULL;
	subroot->hasHavingQual = false;
	parse->distinctClause = NIL;
	parse->hasDistinctOn = false;
	parse->hasAggs = false;

	/* Build "target IS NOT NULL" expression */
	ntest = makeNode(NullTest);
	ntest->nulltesttype = IS_NOT_NULL;
	ntest->arg = copyObject(mminfo->target);
	/* we checked it wasn't a rowtype in find_minmax_aggs_walker */
	ntest->argisrow = false;
	ntest->location = -1;

	/* User might have had that in WHERE already */
	if (!list_member((List *) parse->jointree->quals, ntest))
		parse->jointree->quals = (Node *)
			lcons(ntest, (List *) parse->jointree->quals);

	/* Build suitable ORDER BY clause */
	sortcl = makeNode(SortGroupClause);
	sortcl->tleSortGroupRef = assignSortGroupRef(tle, subroot->processed_tlist);
	sortcl->eqop = eqop;
	sortcl->sortop = sortop;
	sortcl->nulls_first = nulls_first;
	sortcl->hashable = false;	/* no need to make this accurate */
	parse->sortClause = list_make1(sortcl);

	/* set up expressions for LIMIT 1 */
	parse->limitOffset = NULL;
	parse->limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
										   sizeof(int64),
										   Int64GetDatum(1), false,
										   FLOAT8PASSBYVAL);

	/*
	 * Generate the best paths for this query, telling query_planner that we
	 * have LIMIT 1.
	 */
	subroot->tuple_fraction = 1.0;
	subroot->limit_tuples = 1.0;

	final_rel = query_planner(subroot, minmax_qp_callback, NULL);

	/*
	 * Since we didn't go through subquery_planner() to handle the subquery,
	 * we have to do some of the same cleanup it would do, in particular cope
	 * with params and initplans used within this subquery.  (This won't
	 * matter if we end up not using the subplan.)
	 */
	SS_identify_outer_params(subroot);
	SS_charge_for_initplans(subroot, final_rel);

	/*
	 * Get the best presorted path, that being the one that's cheapest for
	 * fetching just one row.  If there's no such path, fail.
	 */
	if (final_rel->rows > 1.0)
		path_fraction = 1.0 / final_rel->rows;
	else
		path_fraction = 1.0;

	sorted_path =
		get_cheapest_fractional_path_for_pathkeys(final_rel->pathlist,
												  subroot->query_pathkeys,
												  NULL,
												  path_fraction);
	if (!sorted_path)
		return false;

	/*
	 * The path might not return exactly what we want, so fix that.  (We
	 * assume that this won't change any conclusions about which was the
	 * cheapest path.)
	 */
	sorted_path = apply_projection_to_path(subroot, final_rel, sorted_path,
										   create_pathtarget(subroot,
															 subroot->processed_tlist));

	/*
	 * Need to gather the results to a single node, if it's not already
	 * like that. Otherwise, we won't get the single min/max row, but
	 * one min/max row from every segment.
	 */
	if (Gp_role == GP_ROLE_DISPATCH && CdbPathLocus_IsPartitioned(sorted_path->locus))
	{
		CdbPathLocus singleQE;
		List	   *pathkeys;

		CdbPathLocus_MakeSingleQE(&singleQE, getgpsegmentCount());

		pathkeys = sorted_path->pathkeys;
		sorted_path = cdbpath_create_motion_path(root, sorted_path, sorted_path->pathkeys,
												 false, singleQE);
		/*
		 * Sanity check that order was preserved. (Given how cdbpath_create_motion_path() is
		 * implemented, pointer equality is enough here, but in principle we should be
		 * using something more sophisticated for the comparison.)
		 */
		if (sorted_path->pathkeys != pathkeys)
			elog(ERROR, "could not create an order-preserving gather motion for min/max path");
	}

	/*
	 * Determine cost to get just the first row of the presorted path.
	 *
	 * Note: cost calculation here should match
	 * compare_fractional_path_costs().
	 */
	path_cost = sorted_path->startup_cost +
		path_fraction * (sorted_path->total_cost - sorted_path->startup_cost);

	/* Save state for further processing */
	mminfo->subroot = subroot;
	mminfo->path = sorted_path;
	mminfo->pathcost = path_cost;

	return true;
}

/*
 * Compute query_pathkeys and other pathkeys during query_planner()
 */
static void
minmax_qp_callback(PlannerInfo *root, void *extra)
{
	root->group_pathkeys = NIL;
	root->window_pathkeys = NIL;
	root->distinct_pathkeys = NIL;

	root->sort_pathkeys =
		make_pathkeys_for_sortclauses(root,
									  root->parse->sortClause,
									  root->parse->targetList);

	root->query_pathkeys = root->sort_pathkeys;
}

/*
 * Get the OID of the sort operator, if any, associated with an aggregate.
 * Returns InvalidOid if there is no such operator.
 */
static Oid
fetch_agg_sort_op(Oid aggfnoid)
{
	HeapTuple	aggTuple;
	Form_pg_aggregate aggform;
	Oid			aggsortop;

	/* fetch aggregate entry from pg_aggregate */
	aggTuple = SearchSysCache1(AGGFNOID, ObjectIdGetDatum(aggfnoid));
	if (!HeapTupleIsValid(aggTuple))
		return InvalidOid;
	aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);
	aggsortop = aggform->aggsortop;
	ReleaseSysCache(aggTuple);

	return aggsortop;
}

相关信息

greenplumn 源码目录

相关文章

greenplumn analyzejoins 源码

greenplumn createplan 源码

greenplumn initsplan 源码

greenplumn joinpartprune 源码

greenplumn orca 源码

greenplumn planmain 源码

greenplumn planner 源码

greenplumn planshare 源码

greenplumn setrefs 源码

greenplumn subselect 源码

0 赞

所属分类： 大数据
本文标签： greenplum
版权声明： 原创文章如转载，请注明本文链接: https://www.seaxiang.com/blog/0ac3a12d6fd04920877df8f44b856066

热门推荐

1、直接访问google.com
2、优质文章
3、 gt
4、 harmony 鸿蒙hdc使用指导
5、 harmony 鸿蒙ArkUI组件（ArkTS）开发常见问题
6、 harmony 鸿蒙初识ArkTS语言
7、 flink kafka connector scan.startup.mode 的几个选项
8、 openharmony
9、 golang
10、 Vue中input框自动聚焦

Loading...