greenplumn CJoinStatsProcessor 源码

  • 2022-08-18
  • 浏览 (62)

greenplumn CJoinStatsProcessor 代码


//	Greenplum Database
//	Copyright 2018 VMware, Inc. or its affiliates.
//	@filename:
//		CJoinStatsProcessor.cpp
//	@doc:
//		Statistics helper routines for processing all join types

#include "naucrates/statistics/CJoinStatsProcessor.h"

#include "gpopt/base/COptCtxt.h"
#include "gpopt/operators/CLogicalIndexApply.h"
#include "gpopt/operators/CLogicalNAryJoin.h"
#include "gpopt/operators/CPredicateUtils.h"
#include "gpopt/operators/CScalarNAryJoinPredList.h"
#include "gpopt/optimizer/COptimizerConfig.h"
#include "naucrates/statistics/CFilterStatsProcessor.h"
#include "naucrates/statistics/CLeftAntiSemiJoinStatsProcessor.h"
#include "naucrates/statistics/CScaleFactorUtils.h"
#include "naucrates/statistics/CStatisticsUtils.h"

using namespace gpopt;

BOOL CJoinStatsProcessor::m_compute_scale_factor_from_histogram_buckets = false;

// helper for joining histograms
	CMemoryPool *mp, const CHistogram *histogram1, const CHistogram *histogram2,
	CStatsPredJoin *join_pred_stats, CDouble num_rows1, CDouble num_rows2,
	CHistogram **result_hist1,	// output: histogram 1 after join
	CHistogram **result_hist2,	// output: histogram 2 after join
	CDouble *scale_factor,		// output: scale factor based on the join
	BOOL is_input_empty, IStatistics::EStatsJoinType join_type,
	BOOL DoIgnoreLASJHistComputation)
	GPOS_ASSERT(nullptr != histogram1);
	GPOS_ASSERT(nullptr != histogram2);
	GPOS_ASSERT(nullptr != join_pred_stats);
	GPOS_ASSERT(nullptr != result_hist1);
	GPOS_ASSERT(nullptr != result_hist2);
	GPOS_ASSERT(nullptr != scale_factor);

	if (IStatistics::EsjtLeftAntiSemiJoin == join_type)
			histogram1, histogram2, join_pred_stats, num_rows1, num_rows2,
			result_hist1, result_hist2, scale_factor, is_input_empty, join_type,


	if (is_input_empty)
		// use Cartesian product as scale factor
		*scale_factor = num_rows1 * num_rows2;
		*result_hist1 = GPOS_NEW(mp) CHistogram(mp);
		*result_hist2 = GPOS_NEW(mp) CHistogram(mp);


	*scale_factor = CScaleFactorUtils::DefaultJoinPredScaleFactor;

	CStatsPred::EStatsCmpType stats_cmp_type = join_pred_stats->GetCmpType();
	BOOL empty_histograms = histogram1->IsEmpty() || histogram2->IsEmpty();

	if (empty_histograms)
		// if one more input has no histograms (due to lack of statistics
		// for table columns or computed columns), we estimate
		// the join cardinality to be the max of the two rows.
		// In other words, the scale factor is equivalent to the
		// min of the two rows.
		*scale_factor = std::min(num_rows1, num_rows2);
	else if (CHistogram::JoinPredCmpTypeIsSupported(stats_cmp_type))
		CHistogram *join_histogram = histogram1->MakeJoinHistogramNormalize(
			stats_cmp_type, num_rows1, histogram2, num_rows2, scale_factor);

		if (CStatsPred::EstatscmptEq == stats_cmp_type ||
			CStatsPred::EstatscmptINDF == stats_cmp_type ||
			if (histogram1->WereNDVsScaled())
			*result_hist1 = join_histogram;
			*result_hist2 = (*result_hist1)->CopyHistogram();
			if (histogram2->WereNDVsScaled())

		// note that for IDF and Not Equality predicate, we do not generate histograms but
		// just the scale factors.


		// TODO:  Feb 21 2014, for all join condition except for "=" join predicate
		// we currently do not compute new histograms for the join columns

	// for an unsupported join predicate operator or in the case of
	// missing histograms, copy input histograms and use default scale factor
	*result_hist1 = histogram1->CopyHistogram();
	*result_hist2 = histogram2->CopyHistogram();

//	derive statistics for the given join's predicate(s)
IStatistics *
CJoinStatsProcessor::CalcAllJoinStats(CMemoryPool *mp,
									  IStatisticsArray *statistics_array,
									  CExpression *expr, COperator *pop)
	GPOS_ASSERT(nullptr != expr);
	GPOS_ASSERT(nullptr != statistics_array);
	GPOS_ASSERT(0 < statistics_array->Size());
	// Is the operator passed in a 2-way LOJ? We will later refine this to find whether
	// an individual predicate is for an LOJ or not.
	BOOL left_outer_2_way_join = false;

	// create an empty set of outer references for statistics derivation
	CColRefSet *outer_refs = GPOS_NEW(mp) CColRefSet(mp);

	// join statistics objects one by one using relevant predicates in given scalar expression
	const ULONG num_stats = statistics_array->Size();
	IStatistics *stats = (*statistics_array)[0]->CopyStats(mp);
	CDouble num_rows_outer = stats->Rows();
	// predicate indexes, if we have a mix of inner and LOJs
	ULongPtrArray *predIndexes = nullptr;
	CExpression *inner_or_simple_2_way_loj_preds = expr;

	switch (pop->Eopid())
		case COperator::EopLogicalIndexApply:
			left_outer_2_way_join =

		case COperator::EopLogicalLeftOuterJoin:
			left_outer_2_way_join = true;

		case COperator::EopLogicalNAryJoin:
			predIndexes =
			if (nullptr != predIndexes)
				GPOS_ASSERT(COperator::EopScalarNAryJoinPredList ==
				inner_or_simple_2_way_loj_preds =


	for (ULONG i = 1; i < num_stats; i++)
		IStatistics *current_stats = (*statistics_array)[i];

		CColRefSetArray *output_colrefsets = GPOS_NEW(mp) CColRefSetArray(mp);

		CStatsPred *unsupported_pred_stats = nullptr;
		BOOL is_a_left_join = left_outer_2_way_join;
		CExpression *join_preds_available = nullptr;

		if (nullptr == predIndexes ||
			GPOPT_ZERO_INNER_JOIN_PRED_INDEX == *(*predIndexes)[i])
			join_preds_available = inner_or_simple_2_way_loj_preds;
			// this is an LOJ that is part of an NAry join, get the corresponding ON predicate
			is_a_left_join = true;
			join_preds_available = (*expr)[*(*predIndexes)[i]];

		CStatsPredJoinArray *join_preds_stats =
				mp, join_preds_available, output_colrefsets, outer_refs,
				is_a_left_join,	 // left joins use an anti-semijoin internally

		IStatistics *new_stats = nullptr;

		if (is_a_left_join)
			new_stats =
				stats->CalcLOJoinStats(mp, current_stats, join_preds_stats);
			new_stats =
				stats->CalcInnerJoinStats(mp, current_stats, join_preds_stats);
		stats = new_stats;

		if (nullptr != unsupported_pred_stats)
			// apply the unsupported join filters as a filter on top of the join results.
			// TODO,  June 13 2014 we currently only cap NDVs for filters
			// immediately on top of tables.
			IStatistics *stats_after_join_filter =
					mp, dynamic_cast<CStatistics *>(stats),
					unsupported_pred_stats, false /* do_cap_NDVs */);

			// If it is outer join and the cardinality after applying the unsupported join
			// filters is less than the cardinality of outer child, we don't use this stats.
			// Because we need to make sure that Card(LOJ) >= Card(Outer child of LOJ).
			if (is_a_left_join &&
				stats_after_join_filter->Rows() < num_rows_outer)
				stats = stats_after_join_filter;


		num_rows_outer = stats->Rows();


	// clean up

	return stats;

// main driver to generate join stats
CStatistics *
	CMemoryPool *mp, CStatisticsConfig *stats_config,
	const IStatistics *outer_stats_input, const IStatistics *inner_stats_input,
	CStatsPredJoinArray *join_pred_stats_info,
	IStatistics::EStatsJoinType join_type, BOOL DoIgnoreLASJHistComputation)
	GPOS_ASSERT(nullptr != mp);
	GPOS_ASSERT(nullptr != inner_stats_input);
	GPOS_ASSERT(nullptr != outer_stats_input);

	GPOS_ASSERT(nullptr != join_pred_stats_info);

	BOOL IsLASJ = (IStatistics::EsjtLeftAntiSemiJoin == join_type);
	BOOL semi_join = IStatistics::IsSemiJoin(join_type);

	// Extract stat objects for inner and outer child.
	// Historically, IStatistics was meant to have multiple derived classes
	// However, currently only CStatistics implements IStatistics
	// Until this changes, the interfaces have been designed to take IStatistics as parameters
	// In the future, IStatistics should be removed, as it is not being utilized as designed
	const CStatistics *outer_stats =
		dynamic_cast<const CStatistics *>(outer_stats_input);
	const CStatistics *inner_side_stats =
		dynamic_cast<const CStatistics *>(inner_stats_input);

	// create hash map from colid -> histogram for resultant structure
	UlongToHistogramMap *result_col_hist_mapping =
		GPOS_NEW(mp) UlongToHistogramMap(mp);

	// build a bitset with all join columns
	CBitSet *join_colids = GPOS_NEW(mp) CBitSet(mp);
	for (ULONG i = 0; i < join_pred_stats_info->Size(); i++)
		CStatsPredJoin *join_stats = (*join_pred_stats_info)[i];

		if (join_stats->HasValidColIdOuter())
			(void) join_colids->ExchangeSet(join_stats->ColIdOuter());
		if (!semi_join && join_stats->HasValidColIdInner())
			(void) join_colids->ExchangeSet(join_stats->ColIdInner());

	// histograms on columns that do not appear in join condition will
	// be copied over to the result structure
	outer_stats->AddNotExcludedHistograms(mp, join_colids,
	if (!semi_join)
		inner_side_stats->AddNotExcludedHistograms(mp, join_colids,

	CScaleFactorUtils::SJoinConditionArray *join_conds_scale_factors =
		GPOS_NEW(mp) CScaleFactorUtils::SJoinConditionArray(mp);
	const ULONG num_join_conds = join_pred_stats_info->Size();

	BOOL output_is_empty = false;
	CDouble num_join_rows = 0;
	// iterate over join's predicate(s)
	for (ULONG i = 0; i < num_join_conds; i++)
		CStatsPredJoin *pred_info = (*join_pred_stats_info)[i];
		ULONG colid1 = pred_info->ColIdOuter();
		ULONG colid2 = pred_info->ColIdInner();
		GPOS_ASSERT(colid1 != colid2);
		const CHistogram *outer_histogram = nullptr;
		const CHistogram *inner_histogram = nullptr;
		BOOL is_input_empty =
			CStatistics::IsEmptyJoin(outer_stats, inner_side_stats, IsLASJ);
		CDouble local_scale_factor(1.0);
		CHistogram *outer_histogram_after = nullptr;
		CHistogram *inner_histogram_after = nullptr;

		// find the histograms corresponding to the two columns
		// are column id1 and 2 always in the order of outer inner?
		if (pred_info->HasValidColIdOuter())
			outer_histogram = outer_stats->GetHistogram(colid1);
			GPOS_ASSERT(nullptr != outer_histogram);
		if (pred_info->HasValidColIdInner())
			inner_histogram = inner_side_stats->GetHistogram(colid2);
			GPOS_ASSERT(nullptr != inner_histogram);

		// When we have any form of equi join with join condition of type f(a)=b,
		// we calculate the NDV of such a join as NDV(b) ( from Selinger et al.)
		if (nullptr == outer_histogram)
			GPOS_ASSERT(CStatsPred::EstatscmptEqNDV == pred_info->GetCmpType());
			outer_histogram = inner_histogram;
			colid1 = colid2;
		else if (nullptr == inner_histogram)
			GPOS_ASSERT(CStatsPred::EstatscmptEqNDV == pred_info->GetCmpType());
			inner_histogram = outer_histogram;
			colid2 = colid1;

		JoinHistograms(mp, outer_histogram, inner_histogram, pred_info,
					   outer_stats->Rows(), inner_side_stats->Rows(),
					   &outer_histogram_after, &inner_histogram_after,
					   &local_scale_factor, is_input_empty, join_type,

		output_is_empty = JoinStatsAreEmpty(
			outer_stats->IsEmpty(), output_is_empty, outer_histogram,
			inner_histogram, outer_histogram_after, join_type);

		CStatisticsUtils::AddHistogram(mp, colid1, outer_histogram_after,
		if (!semi_join && colid1 != colid2)
			CStatisticsUtils::AddHistogram(mp, colid2, inner_histogram_after,


		// remember which tables the columns came from, this info is used to combine scale factors
		CColumnFactory *col_factory = COptCtxt::PoctxtFromTLS()->Pcf();

		CColRef *colref_outer = col_factory->LookupColRef(colid1);
		CColRef *colref_inner = col_factory->LookupColRef(colid2);

		GPOS_ASSERT(colref_outer != nullptr);
		GPOS_ASSERT(colref_inner != nullptr);

		IMDId *mdid_outer = colref_outer->GetMdidTable();
		IMDId *mdid_inner = colref_inner->GetMdidTable();
		IMdIdArray *mdid_pair = nullptr;
		BOOL both_dist_keys = false;
		if ((mdid_outer != nullptr) && (mdid_inner != nullptr))
			// there should only be two tables involved in a join condition
			// if the predicate is more complex (i.e. more than 2 tables involved in the predicate such as t1.a=t2.a+t3.a),
			// the mdid of the base table will be NULL:
			// Note that we hash on the pointer to the Mdid, not the value of the Mdid,
			// but we know that CColRef::GetMdidTable() will always return the same
			// pointer for a given table.
			mdid_pair = GPOS_NEW(mp) IMdIdArray(mp, 2);

			if (colref_outer->IsDistCol() && colref_inner->IsDistCol())
				both_dist_keys = true;

			GPOS_NEW(mp) CScaleFactorUtils::SJoinCondition(
				local_scale_factor, mdid_pair, both_dist_keys));

	num_join_rows = CStatistics::MinRows;
	if (!output_is_empty)
		num_join_rows = CalcJoinCardinality(
			mp, stats_config, outer_stats->Rows(), inner_side_stats->Rows(),
			join_conds_scale_factors, join_type);

	// clean up

	UlongToDoubleMap *col_width_mapping_result = outer_stats->CopyWidths(mp);
	if (!semi_join)
		inner_side_stats->CopyWidthsInto(mp, col_width_mapping_result);

	// create an output stats object
	CStatistics *join_stats = GPOS_NEW(mp) CStatistics(
		mp, result_col_hist_mapping, col_width_mapping_result, num_join_rows,
		output_is_empty, outer_stats->GetNumberOfPredicates());

	// In the output statistics object, the upper bound source cardinality of the join column
	// cannot be greater than the upper bound source cardinality information maintained in the input
	// statistics object. Therefore we choose CStatistics::EcbmMin the bounding method which takes
	// the minimum of the cardinality upper bound of the source column (in the input hash map)
	// and estimated join cardinality.

		mp, outer_stats, join_stats, num_join_rows,
		CStatistics::EcbmMin /* card_bounding_method */);
	if (!semi_join)
			mp, inner_side_stats, join_stats, num_join_rows,
			CStatistics::EcbmMin /* card_bounding_method */);

	return join_stats;

// return join cardinality based on scaling factor and join type
	CMemoryPool *mp, CStatisticsConfig *stats_config, CDouble left_num_rows,
	CDouble right_num_rows,
	CScaleFactorUtils::SJoinConditionArray *join_conds_scale_factors,
	IStatistics::EStatsJoinType join_type)
	GPOS_ASSERT(nullptr != stats_config);
	GPOS_ASSERT(nullptr != join_conds_scale_factors);
	CDouble limit_for_result_scale_factor(
		std::max(left_num_rows.Get(), right_num_rows.Get()));

	CDouble scale_factor = CScaleFactorUtils::CumulativeJoinScaleFactor(
		mp, stats_config, join_conds_scale_factors,
	CDouble cartesian_product_num_rows = left_num_rows * right_num_rows;

	if (IStatistics::EsjtLeftAntiSemiJoin == join_type ||
		IStatistics::EsjtLeftSemiJoin == join_type)
		CDouble rows = left_num_rows;

		if (IStatistics::EsjtLeftAntiSemiJoin == join_type)
			rows = left_num_rows / scale_factor;
			// semi join results cannot exceed size of outer side
			rows = std::min(left_num_rows.Get(),
							(cartesian_product_num_rows / scale_factor).Get());

		return std::max(DOUBLE(1.0), rows.Get());

	GPOS_ASSERT(CStatistics::MinRows <= scale_factor);

	return std::max(CStatistics::MinRows.Get(),
					(cartesian_product_num_rows / scale_factor).Get());

// check if the join statistics object is empty output based on the input
// histograms and the join histograms
CJoinStatsProcessor::JoinStatsAreEmpty(BOOL outer_is_empty,
									   BOOL output_is_empty,
									   const CHistogram *outer_histogram,
									   const CHistogram *inner_histogram,
									   CHistogram *join_histogram,
									   IStatistics::EStatsJoinType join_type)
	GPOS_ASSERT(nullptr != outer_histogram);
	GPOS_ASSERT(nullptr != inner_histogram);
	GPOS_ASSERT(nullptr != join_histogram);
	BOOL IsLASJ = IStatistics::EsjtLeftAntiSemiJoin == join_type;
	return output_is_empty || (!IsLASJ && outer_is_empty) ||
		   (!outer_histogram->IsEmpty() && !inner_histogram->IsEmpty() &&

// Derive statistics for join operation given array of statistics object
IStatistics *
CJoinStatsProcessor::DeriveJoinStats(CMemoryPool *mp,
									 CExpressionHandle &exprhdl,
									 IStatisticsArray *stats_ctxt)
	GPOS_ASSERT(CLogical::EspNone <

	IStatisticsArray *statistics_array = GPOS_NEW(mp) IStatisticsArray(mp);
	const ULONG arity = exprhdl.Arity();
	for (ULONG i = 0; i < arity - 1; i++)
		IStatistics *child_stats = exprhdl.Pstats(i);

	CExpression *join_pred_expr = exprhdl.PexprScalarRepChild(arity - 1);

	join_pred_expr = CPredicateUtils::PexprRemoveImpliedConjuncts(
		mp, join_pred_expr, exprhdl);

	// split join predicate into local predicate and predicate involving outer references
	CExpression *local_expr = nullptr;
	CExpression *expr_with_outer_refs = nullptr;

	// get outer references from expression handle
	CColRefSet *outer_refs = exprhdl.DeriveOuterReferences();

	CPredicateUtils::SeparateOuterRefs(mp, join_pred_expr, outer_refs,
									   &local_expr, &expr_with_outer_refs);

	COperator::EOperatorId op_id = exprhdl.Pop()->Eopid();
	GPOS_ASSERT(COperator::EopLogicalLeftOuterJoin == op_id ||
				COperator::EopLogicalInnerJoin == op_id ||
				COperator::EopLogicalNAryJoin == op_id ||
				COperator::EopLogicalFullOuterJoin == op_id ||
				COperator::EopLogicalRightOuterJoin == op_id);

	// derive stats based on local join condition
	IStatistics *join_stats = CJoinStatsProcessor::CalcAllJoinStats(
		mp, statistics_array, local_expr, exprhdl.Pop());

	if (exprhdl.HasOuterRefs() && 0 < stats_ctxt->Size())
		// derive stats based on outer references
		IStatistics *stats = DeriveStatsWithOuterRefs(
			mp, exprhdl, expr_with_outer_refs, join_stats, stats_ctxt);
		join_stats = stats;



	return join_stats;

// Derives statistics when the scalar expression contains one or more outer references.
// This stats derivation mechanism passes around a context array onto which
// operators append their stats objects as they get derived. The context array is
// filled as we derive stats on the children of a given operator. This gives each
// operator access to the stats objects of its previous siblings as well as to the outer
// operators in higher levels.
// For example, in this expression:
//   |--Get(R)
//   +--Select(R.r=S.s)
//       +-- Get(S)
// We start by deriving stats on JOIN's left child (Get(R)) and append its
// stats to the context. Then, we call stats derivation on JOIN's right child
// (SELECT), passing it the current context.  This gives SELECT access to the
// histogram on column R.r--which is an outer reference in this example. After
// JOIN's children's stats are computed, JOIN combines them into a parent stats
// object, which is passed upwards to JOIN's parent. This mechanism gives any
// operator access to the histograms of outer references defined anywhere in
// the logical tree. For example, we also support the case where outer
// reference R.r is defined two levels upwards:
//    JOIN
//      |---Get(R)
//      +--JOIN
//         |--Get(T)
//         +--Select(R.r=S.s)
//               +--Get(S)
// The next step is to combine the statistics objects of the outer references
// with those of the local columns. You can think of this as a correlated
// expression, where for each outer tuple, we need to extract the outer ref
// value and re-execute the inner expression using the current outer ref value.
// This has the same semantics as a Join from a statistics perspective.
// We pull statistics for outer references from the passed statistics context,
// using Join statistics derivation in this case.
// For example:
// 			Join
// 			 |--Get(R)
// 			 +--Join
// 				|--Get(S)
// 				+--Select(T.t=R.r)
// 					+--Get(T)
// when deriving statistics on 'Select(T.t=R.r)', we join T with the cross
// product (R x S) based on the condition (T.t=R.r)
IStatistics *
	CMemoryPool *mp,
	CExpressionHandle &
		exprhdl,  // handle attached to the logical expression we want to derive stats for
	CExpression *expr,	 // scalar condition to be used for stats derivation
	IStatistics *stats,	 // statistics object of the attached expression
	IStatisticsArray *
		all_outer_stats	 // array of stats objects where outer references are defined
	GPOS_ASSERT(exprhdl.HasOuterRefs() &&
				"attached expression does not have outer references");
	GPOS_ASSERT(nullptr != expr);
	GPOS_ASSERT(nullptr != stats);
	GPOS_ASSERT(nullptr != all_outer_stats);
	GPOS_ASSERT(0 < all_outer_stats->Size());

	// join outer stats object based on given scalar expression,
	// we use inner join semantics here to consider all relevant combinations of outer tuples
	IStatistics *outer_stats = CJoinStatsProcessor::CalcAllJoinStats(
		mp, all_outer_stats, expr, exprhdl.Pop());
	CDouble num_rows_outer = outer_stats->Rows();

	// join passed stats object and outer stats based on the passed join type
	IStatisticsArray *statistics_array = GPOS_NEW(mp) IStatisticsArray(mp);
	IStatistics *result_join_stats = CJoinStatsProcessor::CalcAllJoinStats(
		mp, statistics_array, expr, exprhdl.Pop());

	// scale result using cardinality of outer stats and set number of rebinds of returned stats
	IStatistics *result_stats =
		result_join_stats->ScaleStats(mp, CDouble(1.0 / num_rows_outer));

	return result_stats;

// EOF


greenplumn 源码目录


greenplumn CBucket 源码

greenplumn CFilterStatsProcessor 源码

greenplumn CGroupByStatsProcessor 源码

greenplumn CHistogram 源码

greenplumn CInnerJoinStatsProcessor 源码

greenplumn CLeftAntiSemiJoinStatsProcessor 源码

greenplumn CLeftOuterJoinStatsProcessor 源码

greenplumn CLeftSemiJoinStatsProcessor 源码

greenplumn CLimitStatsProcessor 源码

greenplumn CPoint 源码

0  赞