greenplumn CFilterStatsProcessor 源码
greenplumn CFilterStatsProcessor 代码
文件路径:/src/backend/gporca/libnaucrates/src/statistics/CFilterStatsProcessor.cpp
//---------------------------------------------------------------------------
//	Greenplum Database
//	Copyright 2018 VMware, Inc. or its affiliates.
//
//	@filename:
//		CFilterStatsProcessor.cpp
//
//	@doc:
//		Statistics helper routines for processing limit operations
//---------------------------------------------------------------------------
#include "naucrates/statistics/CFilterStatsProcessor.h"
#include "gpopt/base/COptCtxt.h"
#include "gpopt/operators/CExpressionHandle.h"
#include "gpopt/operators/CPredicateUtils.h"
#include "gpopt/operators/CScalarCmp.h"
#include "gpopt/optimizer/COptimizerConfig.h"
#include "naucrates/statistics/CBucket.h"
#include "naucrates/statistics/CJoinStatsProcessor.h"
#include "naucrates/statistics/CScaleFactorUtils.h"
#include "naucrates/statistics/CStatistics.h"
#include "naucrates/statistics/CStatisticsUtils.h"
using namespace gpopt;
// derive statistics for filter operation based on given scalar expression
IStatistics *
CFilterStatsProcessor::MakeStatsFilterForScalarExpr(
	CMemoryPool *mp, CExpressionHandle &exprhdl, IStatistics *child_stats,
	CExpression *local_scalar_expr,	 // filter expression on local columns only
	CExpression *
		outer_refs_scalar_expr,	 // filter expression involving outer references
	IStatisticsArray *all_outer_stats)
{
	GPOS_ASSERT(nullptr != child_stats);
	GPOS_ASSERT(nullptr != local_scalar_expr);
	GPOS_ASSERT(nullptr != outer_refs_scalar_expr);
	GPOS_ASSERT(nullptr != all_outer_stats);
	CColRefSet *outer_refs = exprhdl.DeriveOuterReferences();
	// TODO  June 13 2014, we currently only cap ndvs when we have a filter
	// immediately on top of tables
	BOOL do_cap_NDVs = (1 == exprhdl.DeriveJoinDepth());
	// extract local filter
	CStatsPred *pred_stats =
		CStatsPredUtils::ExtractPredStats(mp, local_scalar_expr, outer_refs);
	// derive stats based on local filter
	IStatistics *result_stats = CFilterStatsProcessor::MakeStatsFilter(
		mp, dynamic_cast<CStatistics *>(child_stats), pred_stats, do_cap_NDVs);
	pred_stats->Release();
	if (exprhdl.HasOuterRefs() && 0 < all_outer_stats->Size())
	{
		// derive stats based on outer references
		IStatistics *stats = CJoinStatsProcessor::DeriveStatsWithOuterRefs(
			mp, exprhdl, outer_refs_scalar_expr, result_stats, all_outer_stats);
		result_stats->Release();
		result_stats = stats;
	}
	return result_stats;
}
// compute the selectivity of a predicate, applied to a base table,
// making some simple default assumptions about outer refs
CDouble
CFilterStatsProcessor::SelectivityOfPredicate(CMemoryPool *mp,
											  CExpression *pred,
											  CTableDescriptor *ptabdesc,
											  CColRefSet *outer_refs)
{
	// separate the outer refs
	CExpression *local_expr = nullptr;
	CExpression *expr_with_outer_refs = nullptr;
	CColRefSet *used_col_refs = pred->DeriveUsedColumns();
	CColRefSet *used_local_col_refs =
		GPOS_NEW(mp) CColRefSet(mp, *used_col_refs);
	ULONG num_outer_ref_preds = 0;
	if (nullptr != outer_refs)
	{
		used_local_col_refs->Exclude(outer_refs);
		CPredicateUtils::SeparateOuterRefs(mp, pred, outer_refs, &local_expr,
										   &expr_with_outer_refs);
	}
	else
	{
		pred->AddRef();
		local_expr = pred;
	}
	// extract local filter
	CStatsPred *pred_stats =
		CStatsPredUtils::ExtractPredStats(mp, local_expr, outer_refs);
	const COptCtxt *poctxt = COptCtxt::PoctxtFromTLS();
	CMDAccessor *md_accessor = poctxt->Pmda();
	// grab default stats config
	CStatisticsConfig *stats_config =
		poctxt->GetOptimizerConfig()->GetStatsConf();
	// we don't care about the width of the columns, just the row count
	CColRefSet *dummy_width_set = GPOS_NEW(mp) CColRefSet(mp);
	IStatistics *base_table_stats =
		md_accessor->Pstats(mp, ptabdesc->MDId(), used_local_col_refs,
							dummy_width_set, stats_config);
	// derive stats based on local filter
	IStatistics *result_stats = CFilterStatsProcessor::MakeStatsFilter(
		mp, dynamic_cast<CStatistics *>(base_table_stats), pred_stats, false);
	CDouble result = result_stats->Rows() / base_table_stats->Rows();
	BOOL have_local_preds = (result < 1.0);
	pred_stats->Release();
	used_local_col_refs->Release();
	base_table_stats->Release();
	dummy_width_set->Release();
	// handle outer_refs
	if (nullptr != expr_with_outer_refs)
	{
		CExpressionArray *outer_ref_exprs =
			CPredicateUtils::PdrgpexprConjuncts(mp, expr_with_outer_refs);
		const ULONG size = outer_ref_exprs->Size();
		for (ULONG ul = 0; ul < size; ul++)
		{
			CExpression *pexpr = (*outer_ref_exprs)[ul];
			CColRef *local_col_ref = nullptr;
			if (CPredicateUtils::FIdentCompareOuterRefExprIgnoreCast(
					pexpr, outer_refs, &local_col_ref))
			{
				CScalarCmp *sc_cmp = CScalarCmp::PopConvert(pexpr->Pop());
				// Use more accurate NDV calculation if the comparison is an equality type
				if (IMDType::EcmptEq == sc_cmp->ParseCmpType())
				{
					GPOS_ASSERT(nullptr != local_col_ref);
					CDouble ndv = result_stats->GetNDVs(local_col_ref);
					if (ndv < 1.0)
					{
						// An NDV of less than 1 means that we have no stats on this column
						result = result * CHistogram::DefaultSelectivity;
					}
					else
					{
						result = result * (1 / ndv);
					}
				}
				else
				{
					// a comparison col op <outer ref> other than an equals
					result = result * CHistogram::DefaultSelectivity;
				}
				num_outer_ref_preds++;
			}
			else
			{
				// if it is a true filter, then we had no expressions with outer refs
				if (!CUtils::FScalarConstTrue(pexpr))
				{
					// some other expression, not of the form col op <outer ref>,
					// e.g. an OR expression
					result = result * CHistogram::DefaultSelectivity;
					num_outer_ref_preds++;
				}
			}
		}
		expr_with_outer_refs->Release();
		outer_ref_exprs->Release();
	}
	// apply damping factor to the outer ref predicates whose selectivities we multiplied above
	if (have_local_preds)
	{
		// add one for the combined non-outer refs which were dampened internally,
		// but not in combination with the preds on outer refs
		num_outer_ref_preds++;
	}
	if (1 < num_outer_ref_preds)
	{
		CStatisticsConfig *stats_config =
			CStatisticsConfig::PstatsconfDefault(mp);
		result =
			std::min(result.Get() / CScaleFactorUtils::DampedFilterScaleFactor(
										stats_config, num_outer_ref_preds)
										.Get(),
					 1.0);
		stats_config->Release();
	}
	result_stats->Release();
	local_expr->Release();
	return result;
}
// create new structure from a list of statistics filters
CStatistics *
CFilterStatsProcessor::MakeStatsFilter(CMemoryPool *mp,
									   const CStatistics *input_stats,
									   CStatsPred *base_pred_stats,
									   BOOL do_cap_NDVs)
{
	GPOS_ASSERT(nullptr != base_pred_stats);
	CDouble input_rows =
		std::max(CStatistics::MinRows.Get(), input_stats->Rows().Get());
	CDouble scale_factor(1.0);
	ULONG num_predicates = 1;
	CDouble rows_filter = CStatistics::MinRows;
	UlongToHistogramMap *histograms_new = nullptr;
	UlongToHistogramMap *histograms_copy = input_stats->CopyHistograms(mp);
	CStatisticsConfig *stats_config = input_stats->GetStatsConfig();
	if (input_stats->IsEmpty())
	{
		histograms_new = GPOS_NEW(mp) UlongToHistogramMap(mp);
		CHistogram::AddEmptyHistogram(mp, histograms_new, histograms_copy);
	}
	else
	{
		histograms_new = MakeHistHashMapConjOrDisjFilter(
			mp, stats_config, histograms_copy, input_rows, base_pred_stats,
			&scale_factor);
		GPOS_ASSERT(CStatistics::MinRows.Get() <= scale_factor.Get());
		rows_filter = input_rows / scale_factor;
		rows_filter = std::max(CStatistics::MinRows.Get(), rows_filter.Get());
	}
	histograms_copy->Release();
	GPOS_ASSERT(rows_filter.Get() <= input_rows.Get());
	if (do_cap_NDVs)
	{
		CStatistics::CapNDVs(rows_filter, histograms_new);
	}
	CStatistics *filter_stats = GPOS_NEW(mp)
		CStatistics(mp, histograms_new, input_stats->CopyWidths(mp),
					rows_filter, input_stats->IsEmpty(),
					input_stats->GetNumberOfPredicates() + num_predicates);
	// since the filter operation is reductive, we choose the bounding method that takes
	// the minimum of the cardinality upper bound of the source column (in the input hash map)
	// and estimated output cardinality
	CStatisticsUtils::ComputeCardUpperBounds(
		mp, input_stats, filter_stats, rows_filter,
		CStatistics::EcbmMin /* card_bounding_method */);
	return filter_stats;
}
// create a new hash map of histograms after applying a conjunctive
// or a disjunctive filter
UlongToHistogramMap *
CFilterStatsProcessor::MakeHistHashMapConjOrDisjFilter(
	CMemoryPool *mp, const CStatisticsConfig *stats_config,
	UlongToHistogramMap *input_histograms, CDouble input_rows,
	CStatsPred *pred_stats, CDouble *scale_factor)
{
	GPOS_ASSERT(nullptr != pred_stats);
	GPOS_ASSERT(nullptr != stats_config);
	GPOS_ASSERT(nullptr != input_histograms);
	UlongToHistogramMap *result_histograms = nullptr;
	if (CStatsPred::EsptConj == pred_stats->GetPredStatsType())
	{
		CStatsPredConj *conjunctive_pred_stats =
			CStatsPredConj::ConvertPredStats(pred_stats);
		return MakeHistHashMapConjFilter(mp, stats_config, input_histograms,
										 input_rows, conjunctive_pred_stats,
										 scale_factor);
	}
	CStatsPredDisj *disjunctive_pred_stats =
		CStatsPredDisj::ConvertPredStats(pred_stats);
	result_histograms = MakeHistHashMapDisjFilter(
		mp, stats_config, input_histograms, input_rows, disjunctive_pred_stats,
		scale_factor);
	GPOS_ASSERT(nullptr != result_histograms);
	return result_histograms;
}
// create new hash map of histograms after applying conjunctive predicates
UlongToHistogramMap *
CFilterStatsProcessor::MakeHistHashMapConjFilter(
	CMemoryPool *mp, const CStatisticsConfig *stats_config,
	UlongToHistogramMap *input_histograms, CDouble input_rows,
	CStatsPredConj *conjunctive_pred_stats, CDouble *scale_factor)
{
	GPOS_ASSERT(nullptr != stats_config);
	GPOS_ASSERT(nullptr != input_histograms);
	GPOS_ASSERT(nullptr != conjunctive_pred_stats);
	conjunctive_pred_stats->Sort();
	CBitSet *filter_colids = GPOS_NEW(mp) CBitSet(mp);
	CDoubleArray *scale_factors = GPOS_NEW(mp) CDoubleArray(mp);
	// create copy of the original hash map of colid -> histogram
	UlongToHistogramMap *result_histograms =
		CStatisticsUtils::CopyHistHashMap(mp, input_histograms);
	// properties of last seen column
	CDouble last_scale_factor(1.0);
	ULONG last_colid = gpos::ulong_max;
	// iterate over filters and update corresponding histograms
	const ULONG filters = conjunctive_pred_stats->GetNumPreds();
	for (ULONG ul = 0; ul < filters; ul++)
	{
		CStatsPred *child_pred_stats = conjunctive_pred_stats->GetPredStats(ul);
		GPOS_ASSERT(CStatsPred::EsptConj !=
					child_pred_stats->GetPredStatsType());
		// get the components of the statistics filter
		ULONG colid = child_pred_stats->GetColId();
		if (CStatsPredUtils::IsUnsupportedPredOnDefinedCol(child_pred_stats))
		{
			// for example, (expression OP const) where expression is a defined column like (a+b)
			CStatsPredUnsupported *unsupported_pred_stats =
				CStatsPredUnsupported::ConvertPredStats(child_pred_stats);
			scale_factors->Append(
				GPOS_NEW(mp) CDouble(unsupported_pred_stats->ScaleFactor()));
			continue;
		}
		// the histogram to apply filter on
		CHistogram *hist_before = nullptr;
		if (IsNewStatsColumn(colid, last_colid))
		{
			scale_factors->Append(GPOS_NEW(mp) CDouble(last_scale_factor));
			last_scale_factor = CDouble(1.0);
		}
		if (CStatsPred::EsptDisj != child_pred_stats->GetPredStatsType())
		{
			GPOS_ASSERT(gpos::ulong_max != colid);
			hist_before = result_histograms->Find(&colid)->CopyHistogram();
			GPOS_ASSERT(nullptr != hist_before);
			CHistogram *result_histogram = nullptr;
			result_histogram = MakeHistSimpleFilter(
				mp, child_pred_stats, filter_colids, hist_before,
				&last_scale_factor, &last_colid);
			GPOS_DELETE(hist_before);
			GPOS_ASSERT(nullptr != result_histogram);
			CHistogram *input_histogram = input_histograms->Find(&colid);
			GPOS_ASSERT(nullptr != input_histogram);
			if (input_histogram->IsEmpty())
			{
				// input histogram is empty so scaling factor does not make sense.
				// if the input itself is empty, then scaling factor is of no effect
				last_scale_factor = 1 / CHistogram::DefaultSelectivity;
			}
			CStatisticsUtils::AddHistogram(mp, colid, result_histogram,
										   result_histograms,
										   true /* fReplaceOld */);
			GPOS_DELETE(result_histogram);
		}
		else
		{
			CStatsPredDisj *disjunctive_pred_stats =
				CStatsPredDisj::ConvertPredStats(child_pred_stats);
			result_histograms->AddRef();
			UlongToHistogramMap *disjunctive_input_histograms =
				result_histograms;
			CDouble disjunctive_scale_factor(1.0);
			CDouble num_disj_input_rows(CStatistics::MinRows.Get());
			if (gpos::ulong_max != colid)
			{
				// The disjunction predicate uses a single column. The input rows to the disjunction
				// is obtained by scaling attained so far on that column
				num_disj_input_rows =
					std::max(CStatistics::MinRows.Get(),
							 (input_rows / last_scale_factor).Get());
			}
			else
			{
				// the disjunction uses multiple columns therefore cannot reason about the number of input rows
				// to the disjunction
				num_disj_input_rows = input_rows.Get();
			}
			UlongToHistogramMap *disjunctive_histograms_after =
				MakeHistHashMapDisjFilter(
					mp, stats_config, result_histograms, num_disj_input_rows,
					disjunctive_pred_stats, &disjunctive_scale_factor);
			// replace intermediate result with the newly generated result from the disjunction
			if (gpos::ulong_max != colid)
			{
				CHistogram *result_histogram =
					disjunctive_histograms_after->Find(&colid);
				CStatisticsUtils::AddHistogram(mp, colid, result_histogram,
											   result_histograms,
											   true /* fReplaceOld */);
				disjunctive_histograms_after->Release();
				last_scale_factor =
					last_scale_factor * disjunctive_scale_factor;
			}
			else
			{
				last_scale_factor = disjunctive_scale_factor.Get();
				result_histograms->Release();
				result_histograms = disjunctive_histograms_after;
			}
			last_colid = colid;
			disjunctive_input_histograms->Release();
		}
	}
	// scaling factor of the last predicate
	scale_factors->Append(GPOS_NEW(mp) CDouble(last_scale_factor));
	GPOS_ASSERT(nullptr != scale_factors);
	CScaleFactorUtils::SortScalingFactor(scale_factors, true /* fDescending */);
	*scale_factor = CScaleFactorUtils::CalcScaleFactorCumulativeConj(
		stats_config, scale_factors);
	// clean up
	scale_factors->Release();
	filter_colids->Release();
	return result_histograms;
}
// create new hash map of histograms after applying disjunctive predicates
UlongToHistogramMap *
CFilterStatsProcessor::MakeHistHashMapDisjFilter(
	CMemoryPool *mp, const CStatisticsConfig *stats_config,
	UlongToHistogramMap *input_histograms, CDouble input_rows,
	CStatsPredDisj *disjunctive_pred_stats, CDouble *scale_factor)
{
	GPOS_ASSERT(nullptr != stats_config);
	GPOS_ASSERT(nullptr != input_histograms);
	GPOS_ASSERT(nullptr != disjunctive_pred_stats);
	CBitSet *non_updatable_cols =
		CStatisticsUtils::GetColsNonUpdatableHistForDisj(
			mp, disjunctive_pred_stats);
	disjunctive_pred_stats->Sort();
	CBitSet *filter_colids = GPOS_NEW(mp) CBitSet(mp);
	CDoubleArray *scale_factors = GPOS_NEW(mp) CDoubleArray(mp);
	UlongToHistogramMap *disjunctive_result_histograms =
		GPOS_NEW(mp) UlongToHistogramMap(mp);
	CHistogram *previous_histogram = nullptr;
	ULONG previous_colid = gpos::ulong_max;
	// This is set to input_rows since SF = 1 / selectivity. So if SF is large, then we are less selective.
	// We will then get selectivity = 1 / input_rows => gives 1 expected row. It is the min # of rows we can select
	CDouble previous_scale_factor(input_rows);
	CDouble cumulative_rows(CStatistics::MinRows.Get());
	// iterate over filters and update corresponding histograms
	const ULONG filters = disjunctive_pred_stats->GetNumPreds();
	for (ULONG ul = 0; ul < filters; ul++)
	{
		CStatsPred *child_pred_stats = disjunctive_pred_stats->GetPredStats(ul);
		// get the components of the statistics filter
		ULONG colid = child_pred_stats->GetColId();
		if (CStatsPredUtils::IsUnsupportedPredOnDefinedCol(child_pred_stats))
		{
			CStatsPredUnsupported *unsupported_pred_stats =
				CStatsPredUnsupported::ConvertPredStats(child_pred_stats);
			scale_factors->Append(
				GPOS_NEW(mp) CDouble(unsupported_pred_stats->ScaleFactor()));
			continue;
		}
		if (IsNewStatsColumn(colid, previous_colid))
		{
			scale_factors->Append(GPOS_NEW(mp)
									  CDouble(previous_scale_factor.Get()));
			CStatisticsUtils::UpdateDisjStatistics(
				mp, non_updatable_cols, input_rows, cumulative_rows,
				previous_histogram, disjunctive_result_histograms,
				previous_colid);
			previous_histogram = nullptr;
		}
		CHistogram *histogram = input_histograms->Find(&colid);
		CHistogram *disjunctive_child_col_histogram = nullptr;
		BOOL is_pred_simple =
			!CStatsPredUtils::IsConjOrDisjPred(child_pred_stats);
		BOOL is_colid_present = (gpos::ulong_max != colid);
		UlongToHistogramMap *child_histograms = nullptr;
		CDouble child_scale_factor(1.0);
		if (is_pred_simple)
		{
			GPOS_ASSERT(nullptr != histogram);
			disjunctive_child_col_histogram = MakeHistSimpleFilter(
				mp, child_pred_stats, filter_colids, histogram,
				&child_scale_factor, &previous_colid);
			CHistogram *input_histogram = input_histograms->Find(&colid);
			GPOS_ASSERT(nullptr != input_histogram);
			if (input_histogram->IsEmpty())
			{
				// input histogram is empty so scaling factor does not make sense.
				// if the input itself is empty, then scaling factor is of no effect
				child_scale_factor = 1 / CHistogram::DefaultSelectivity;
			}
		}
		else
		{
			child_histograms = MakeHistHashMapConjOrDisjFilter(
				mp, stats_config, input_histograms, input_rows,
				child_pred_stats, &child_scale_factor);
			GPOS_ASSERT_IMP(
				CStatsPred::EsptDisj == child_pred_stats->GetPredStatsType(),
				gpos::ulong_max != colid);
			if (is_colid_present)
			{
				// conjunction or disjunction uses only a single column
				disjunctive_child_col_histogram =
					child_histograms->Find(&colid)->CopyHistogram();
			}
		}
		CDouble num_rows_disj_child = input_rows / child_scale_factor;
		if (is_colid_present)
		{
			// 1. a simple predicate (a == 5), (b LIKE "%%GOOD%%")
			// 2. conjunctive / disjunctive predicate where each of its component are predicates on the same column
			// e.g. (a <= 5 AND a >= 1), a in (5, 1)
			GPOS_ASSERT(nullptr != disjunctive_child_col_histogram);
			if (nullptr == previous_histogram)
			{
				previous_histogram = disjunctive_child_col_histogram;
				cumulative_rows = num_rows_disj_child;
			}
			else
			{
				CHistogram *new_histogram = nullptr;
				CDouble output_rows(0.0);
				new_histogram = previous_histogram->MakeUnionHistogramNormalize(
					cumulative_rows, disjunctive_child_col_histogram,
					num_rows_disj_child, &output_rows);
				cumulative_rows = output_rows;
				GPOS_DELETE(previous_histogram);
				GPOS_DELETE(disjunctive_child_col_histogram);
				previous_histogram = new_histogram;
			}
			previous_scale_factor =
				input_rows /
				std::max(CStatistics::MinRows.Get(), cumulative_rows.Get());
			previous_colid = colid;
		}
		else
		{
			// conjunctive predicate where each of it component are predicates on different columns
			// e.g. ((a <= 5) AND (b LIKE "%%GOOD%%"))
			GPOS_ASSERT(nullptr != child_histograms);
			GPOS_ASSERT(nullptr == disjunctive_child_col_histogram);
			CDouble current_rows_estimate =
				input_rows / CScaleFactorUtils::CalcScaleFactorCumulativeDisj(
								 stats_config, scale_factors, input_rows);
			UlongToHistogramMap *merged_histograms =
				CStatisticsUtils::MergeHistogramMapsForDisjPreds(
					mp, non_updatable_cols, disjunctive_result_histograms,
					child_histograms, current_rows_estimate,
					num_rows_disj_child);
			disjunctive_result_histograms->Release();
			disjunctive_result_histograms = merged_histograms;
			previous_histogram = nullptr;
			previous_scale_factor = child_scale_factor;
			previous_colid = colid;
		}
		CRefCount::SafeRelease(child_histograms);
	}
	// process the result and scaling factor of the last predicate
	CStatisticsUtils::UpdateDisjStatistics(
		mp, non_updatable_cols, input_rows, cumulative_rows, previous_histogram,
		disjunctive_result_histograms, previous_colid);
	previous_histogram = nullptr;
	scale_factors->Append(GPOS_NEW(mp) CDouble(
		std::max(CStatistics::MinRows.Get(), previous_scale_factor.Get())));
	*scale_factor = CScaleFactorUtils::CalcScaleFactorCumulativeDisj(
		stats_config, scale_factors, input_rows);
	CHistogram::AddHistograms(mp, input_histograms,
							  disjunctive_result_histograms);
	non_updatable_cols->Release();
	// clean up
	scale_factors->Release();
	filter_colids->Release();
	return disjunctive_result_histograms;
}
//	create a new histograms after applying the filter that is not
//	an AND/OR predicate
CHistogram *
CFilterStatsProcessor::MakeHistSimpleFilter(CMemoryPool *mp,
											CStatsPred *pred_stats,
											CBitSet *filter_colids,
											CHistogram *hist_before,
											CDouble *last_scale_factor,
											ULONG *target_last_colid)
{
	if (CStatsPred::EsptPoint == pred_stats->GetPredStatsType())
	{
		CStatsPredPoint *point_pred_stats =
			CStatsPredPoint::ConvertPredStats(pred_stats);
		return MakeHistPointFilter(point_pred_stats, filter_colids, hist_before,
								   last_scale_factor, target_last_colid);
	}
	if (CStatsPred::EsptLike == pred_stats->GetPredStatsType())
	{
		CStatsPredLike *like_pred_stats =
			CStatsPredLike::ConvertPredStats(pred_stats);
		return MakeHistLikeFilter(like_pred_stats, filter_colids, hist_before,
								  last_scale_factor, target_last_colid);
	}
	if (CStatsPred::EsptArrayCmp == pred_stats->GetPredStatsType())
	{
		CStatsPredArrayCmp *arraycmp_pred_stats =
			CStatsPredArrayCmp::ConvertPredStats(pred_stats);
		return MakeHistArrayCmpAnyFilter(mp, arraycmp_pred_stats, filter_colids,
										 hist_before, last_scale_factor,
										 target_last_colid);
	}
	CStatsPredUnsupported *unsupported_pred_stats =
		CStatsPredUnsupported::ConvertPredStats(pred_stats);
	return MakeHistUnsupportedPred(unsupported_pred_stats, filter_colids,
								   hist_before, last_scale_factor,
								   target_last_colid);
}
// create a new histograms after applying the point filter
CHistogram *
CFilterStatsProcessor::MakeHistPointFilter(CStatsPredPoint *pred_stats,
										   CBitSet *filter_colids,
										   CHistogram *hist_before,
										   CDouble *last_scale_factor,
										   ULONG *target_last_colid)
{
	GPOS_ASSERT(nullptr != pred_stats);
	GPOS_ASSERT(nullptr != filter_colids);
	GPOS_ASSERT(nullptr != hist_before);
	const ULONG colid = pred_stats->GetColId();
	GPOS_ASSERT(CHistogram::IsOpSupportedForFilter(pred_stats->GetCmpType()));
	CPoint *point = pred_stats->GetPredPoint();
	// note column id
	(void) filter_colids->ExchangeSet(colid);
	CDouble local_scale_factor(1.0);
	CHistogram *result_histogram = hist_before->MakeHistogramFilterNormalize(
		pred_stats->GetCmpType(), point, &local_scale_factor);
	GPOS_ASSERT(DOUBLE(1.0) <= local_scale_factor.Get());
	*last_scale_factor = *last_scale_factor * local_scale_factor;
	*target_last_colid = colid;
	return result_histogram;
}
//	create a new histograms for an unsupported predicate
CHistogram *
CFilterStatsProcessor::MakeHistUnsupportedPred(
	CStatsPredUnsupported *pred_stats, CBitSet *filter_colids,
	CHistogram *hist_before, CDouble *last_scale_factor,
	ULONG *target_last_colid)
{
	GPOS_ASSERT(nullptr != pred_stats);
	GPOS_ASSERT(nullptr != filter_colids);
	GPOS_ASSERT(nullptr != hist_before);
	const ULONG colid = pred_stats->GetColId();
	// note column id
	(void) filter_colids->ExchangeSet(colid);
	// generate after histogram
	CHistogram *result_histogram = hist_before->CopyHistogram();
	GPOS_ASSERT(nullptr != result_histogram);
	*last_scale_factor = *last_scale_factor * pred_stats->ScaleFactor();
	*target_last_colid = colid;
	return result_histogram;
}
//	create a new histograms after applying the LIKE filter
CHistogram *
CFilterStatsProcessor::MakeHistLikeFilter(CStatsPredLike *pred_stats,
										  CBitSet *filter_colids,
										  CHistogram *hist_before,
										  CDouble *last_scale_factor,
										  ULONG *target_last_colid)
{
	GPOS_ASSERT(nullptr != pred_stats);
	GPOS_ASSERT(nullptr != filter_colids);
	GPOS_ASSERT(nullptr != hist_before);
	const ULONG colid = pred_stats->GetColId();
	// note column id
	(void) filter_colids->ExchangeSet(colid);
	CHistogram *result_histogram = hist_before->CopyHistogram();
	*last_scale_factor = *last_scale_factor * pred_stats->DefaultScaleFactor();
	*target_last_colid = colid;
	return result_histogram;
}
// create a new histograms after applying the ArrayCmp filter
CHistogram *
CFilterStatsProcessor::MakeHistArrayCmpAnyFilter(CMemoryPool *mp,
												 CStatsPredArrayCmp *pred_stats,
												 CBitSet *filter_colids,
												 CHistogram *base_histogram,
												 CDouble *last_scale_factor,
												 ULONG *target_last_colid)
{
	GPOS_ASSERT(nullptr != pred_stats);
	GPOS_ASSERT(nullptr != filter_colids);
	GPOS_ASSERT(nullptr != base_histogram);
	GPOS_ASSERT(pred_stats->GetCmpType() == CStatsPred::EstatscmptEq);
	// Evaluate statistics for "select * from foo where a in (...)" as
	// "select * from foo join (values (...)) x(a) on foo.a=x.a"
	// as long as the list is deduplicated
	//
	// General algorithm:
	// 1. Construct a histogram with the same bucket boundaries as present in the
	//    base_histogram.
	//    This is better than using a singleton bucket per point, because it that
	//    case, the frequency of each bucket is so small, it is often less than
	//    CStatistics::Epsilon, and may be considered as 0, leading to
	//    cardinality misestimation. Using the same buckets as base_histogram
	//    also aids in joining histogram later.
	// 2. Compute the normalized frequency for each bucket based on the number of points (NDV)
	//    present within each bucket boundary. NB: the points must be de-duplicated
	//    beforehand to prevent double counting.
	// 3. Join this "dummy_histogram" with the base_histogram to determine the buckets
	//    from base_histogram that should be selected.
	// 4. Compute and adjust the resultant scale factor for the filter.
	// First, de-duplicate the constants in the array list
	CPointArray *points = pred_stats->GetPoints();
	if (points->Size() > 1)
	{
		points->Sort(&CUtils::CPointCmp);
	}
	CPointArray *deduped_points = GPOS_NEW(mp) CPointArray(mp);
	IDatum *prev_datum = nullptr;
	for (ULONG ul = 0; ul < points->Size(); ++ul)
	{
		CPoint *point = (*points)[ul];
		IDatum *datum = point->GetDatum();
		GPOS_ASSERT(datum->StatsAreComparable(datum));
		if (datum->IsNull())
		{
			continue;
		}
		if (prev_datum != nullptr && prev_datum->StatsAreEqual(datum))
		{
			continue;
		}
		point->AddRef();
		deduped_points->Append(point);
		prev_datum = datum;
	}
	CDouble dummy_rows(deduped_points->Size());
	// Create buckets for the result histogram using the same bucket boundaries
	// as in the base histogram.
	CBucketArray *dummy_histogram_buckets =
		CHistogram::DeepCopyHistogramBuckets(mp, base_histogram->GetBuckets());
	ULONG point_iter = 0;
	ULONG ndv_remain = 0;
	for (ULONG bucket_iter = 0; bucket_iter < dummy_histogram_buckets->Size();
		 ++bucket_iter)
	{
		CBucket *bucket = (*dummy_histogram_buckets)[bucket_iter];
		bucket->SetFrequency(CDouble(0.0));
		bucket->SetDistinct(CDouble(0.0));
		ULONG ndv = 0;
		// ignore datums that are before the bucket, add it to ndv_remain
		while (point_iter < deduped_points->Size() &&
			   bucket->IsBefore((*deduped_points)[point_iter]))
		{
			ndv_remain++;
			point_iter++;
		}
		// if the point is after the bucket, move to the next bucket
		if (point_iter >= deduped_points->Size() ||
			bucket->IsAfter((*deduped_points)[point_iter]))
		{
			continue;
		}
		// count the number of points that map to the current bucket
		while (point_iter < deduped_points->Size() &&
			   bucket->Contains((*deduped_points)[point_iter]))
		{
			ndv++;
			point_iter++;
		}
		// set frequency based on matched points
		bucket->SetFrequency(CDouble(ndv) / dummy_rows);
		bucket->SetDistinct(CDouble(ndv));
	}
	// if we have gone through all the buckets, and there are still points, add them to ndv_remain
	if (point_iter < deduped_points->Size())
	{
		ndv_remain += deduped_points->Size() - point_iter;
	}
	CDouble freq_remain(0.0);
	if (ndv_remain != 0)
	{
		freq_remain = CDouble(ndv_remain) / dummy_rows;
	}
	CHistogram *dummy_histogram = GPOS_NEW(mp)
		CHistogram(mp, dummy_histogram_buckets, true /* is_well_defined */,
				   CDouble(0.0) /* null_freq */,
				   CDouble(ndv_remain) /* distinct_remain */, freq_remain);
	// dummy histogram should already be normalized since each bucket's frequency
	// is already adjusted by a scale factor of 1/dummy_rows to avoid unnecessarily
	// deep-copying the histogram buckets
	GPOS_ASSERT(dummy_histogram->IsValid() &&
				dummy_histogram->GetNumDistinct() - dummy_rows <
					CStatistics::Epsilon);
	// Compute the join'ed histogram
	CHistogram *result_histogram = base_histogram->MakeJoinHistogram(
		pred_stats->GetCmpType(), dummy_histogram);
	CDouble local_scale_factor = result_histogram->NormalizeHistogram();
	// Adjust the local scale factor by the scale factor of dummy histogram
	local_scale_factor = local_scale_factor / dummy_rows;
	local_scale_factor = CDouble(std::max(local_scale_factor.Get(), 1.0));
	GPOS_ASSERT(DOUBLE(1.0) <= local_scale_factor.Get());
	// update scale factor
	*last_scale_factor = *last_scale_factor * local_scale_factor;
	// note column id
	const ULONG colid = pred_stats->GetColId();
	(void) filter_colids->ExchangeSet(colid);
	*target_last_colid = colid;
	// clean up
	GPOS_DELETE(dummy_histogram);
	deduped_points->Release();
	return result_histogram;
}
// check if the column is a new column for statistic calculation
BOOL
CFilterStatsProcessor::IsNewStatsColumn(ULONG colid, ULONG last_colid)
{
	return (gpos::ulong_max == colid || colid != last_colid);
}
// EOF
相关信息
相关文章
greenplumn CGroupByStatsProcessor 源码
greenplumn CInnerJoinStatsProcessor 源码
greenplumn CJoinStatsProcessor 源码
greenplumn CLeftAntiSemiJoinStatsProcessor 源码
greenplumn CLeftOuterJoinStatsProcessor 源码
greenplumn CLeftSemiJoinStatsProcessor 源码
                        
                            0
                        
                        
                             赞
                        
                    
                    
                热门推荐
- 
                        2、 - 优质文章
 - 
                        3、 gate.io
 - 
                        7、 openharmony
 - 
                        9、 golang