greenplum CStatistics 源码

  • 2022-08-18
  • 浏览 (42)

greenplum CStatistics 代码

文件路径:/src/backend/gporca/libnaucrates/src/statistics/CStatistics.cpp

//---------------------------------------------------------------------------
//	Greenplum Database
//	Copyright (C) 2018 VMware, Inc. or its affiliates.
//
//	@filename:
//		CStatistics.cpp
//
//	@doc:
//		Histograms based statistics
//---------------------------------------------------------------------------

#include "naucrates/statistics/CStatistics.h"

#include "gpos/common/CBitSet.h"
#include "gpos/error/CAutoTrace.h"
#include "gpos/memory/CAutoMemoryPool.h"

#include "gpopt/base/CColRefSet.h"
#include "gpopt/base/CColRefSetIter.h"
#include "gpopt/base/CColumnFactory.h"
#include "gpopt/base/COptCtxt.h"
#include "gpopt/base/CReqdPropRelational.h"
#include "gpopt/engine/CStatisticsConfig.h"
#include "gpopt/mdcache/CMDAccessor.h"
#include "gpopt/optimizer/COptimizerConfig.h"
#include "naucrates/dxl/CDXLUtils.h"
#include "naucrates/statistics/CInnerJoinStatsProcessor.h"
#include "naucrates/statistics/CJoinStatsProcessor.h"
#include "naucrates/statistics/CLeftAntiSemiJoinStatsProcessor.h"
#include "naucrates/statistics/CLeftOuterJoinStatsProcessor.h"
#include "naucrates/statistics/CLeftSemiJoinStatsProcessor.h"
#include "naucrates/statistics/CScaleFactorUtils.h"
#include "naucrates/statistics/CStatisticsUtils.h"

using namespace gpmd;
using namespace gpdxl;
using namespace gpopt;

FORCE_GENERATE_DBGSTR(CStatistics);

// default number of rows assumed for a relation
const CDouble CStatistics::DefaultRelationRows(1000.0);

// epsilon to be used for various computations; in this file a row count at
// or below this value makes MakeDummyStats() treat the input as empty
const CDouble CStatistics::Epsilon(0.00001);

// minimum number of rows in relation
const CDouble CStatistics::MinRows(1.0);

// default column width
const CDouble CStatistics::DefaultColumnWidth(8.0);

// default number of distinct values; GetColUpperBoundNDVs() returns it when
// no upper bound entry contains the requested column
const CDouble CStatistics::DefaultDistinctValues(1000.0);

// the default value for operators that have no cardinality estimation risk;
// both constructors initialize m_stats_estimation_risk to it
const ULONG CStatistics::no_card_est_risk_default_val = 1;

// ctor: short form without relation page information; it forwards to the
// full constructor with zero pages and a single rebind, so both constructors
// share one initialization path
CStatistics::CStatistics(CMemoryPool *mp,
						 UlongToHistogramMap *col_histogram_mapping,
						 UlongToDoubleMap *colid_width_mapping, CDouble rows,
						 BOOL is_empty, ULONG num_predicates)
	: CStatistics(
		  mp, col_histogram_mapping, colid_width_mapping, rows, is_empty,
		  0 /* relpages */, 0 /* relallvisible */,
		  // by default, a stats object is rebound to parameters only once
		  CDouble(1.0) /* rebinds */, num_predicates)
{
}

// ctor: full form that additionally records the relation page counts
// (relpages, relallvisible) and the number of rebinds
CStatistics::CStatistics(CMemoryPool *mp,
						 UlongToHistogramMap *col_histogram_mapping,
						 UlongToDoubleMap *colid_width_mapping, CDouble rows,
						 BOOL is_empty, ULONG relpages, ULONG relallvisible,
						 CDouble rebinds, ULONG num_predicates)
	: m_colid_histogram_mapping(col_histogram_mapping),
	  m_colid_width_mapping(colid_width_mapping),
	  m_rows(rows),
	  m_stats_estimation_risk(no_card_est_risk_default_val),
	  m_empty(is_empty),
	  m_relpages(relpages),
	  m_relallvisible(relallvisible),
	  m_num_rebinds(rebinds),
	  m_num_predicates(num_predicates),
	  m_src_upper_bound_NDVs(nullptr)
{
	// both maps are mandatory and the row count cannot be negative
	GPOS_ASSERT(nullptr != m_colid_histogram_mapping);
	GPOS_ASSERT(nullptr != m_colid_width_mapping);
	GPOS_ASSERT(m_rows >= CDouble(0.0));

	// statistics configuration comes from the optimizer context of this thread
	COptimizerConfig *optimizer_config =
		COptCtxt::PoctxtFromTLS()->GetOptimizerConfig();
	m_stats_conf = optimizer_config->GetStatsConf();

	// array holding the upper bound NDV entries of the source columns
	m_src_upper_bound_NDVs = GPOS_NEW(mp) CUpperBoundNDVPtrArray(mp);
}

// Dtor: calls Release() on the histogram map, the width map and the array of
// upper bound NDVs held by this object
CStatistics::~CStatistics()
{
	m_colid_histogram_mapping->Release();
	m_colid_width_mapping->Release();
	m_src_upper_bound_NDVs->Release();
}

// look up the width recorded for the given column id; the result is whatever
// the width map's Find() yields for that key
const CDouble *
CStatistics::GetWidth(ULONG colid) const
{
	const CDouble *col_width = m_colid_width_mapping->Find(&colid);

	return col_width;
}


//	cap the number of distinct values (NDVs) of every histogram in the given
//	map to the given number of rows
void
CStatistics::CapNDVs(CDouble rows, UlongToHistogramMap *col_histogram_mapping)
{
	GPOS_ASSERT(nullptr != col_histogram_mapping);

	for (UlongToHistogramMapIter hist_iter(col_histogram_mapping);
		 hist_iter.Advance();)
	{
		// the iterator hands out const values, but capping modifies the
		// histogram in place, hence the cast
		const CHistogram *current = hist_iter.Value();
		const_cast<CHistogram *>(current)->CapNDVs(rows);
	}
}

// helper print function: writes rows, rebinds, every column histogram, every
// column width, the upper bound NDV entries and the estimation risk to the
// given stream, enclosed in braces
IOstream &
CStatistics::OsPrint(IOstream &os) const
{
	os << "{" << std::endl;
	os << "Rows = " << Rows() << std::endl;
	os << "Rebinds = " << NumRebinds() << std::endl;

	// one section per column histogram
	for (UlongToHistogramMapIter hist_iter(m_colid_histogram_mapping);
		 hist_iter.Advance();)
	{
		os << "Col" << *(hist_iter.Key()) << ":" << std::endl;
		hist_iter.Value()->OsPrint(os);
		os << std::endl;
	}

	// one section per column width
	for (UlongToDoubleMapIter width_iter(m_colid_width_mapping);
		 width_iter.Advance();)
	{
		os << "Col" << *(width_iter.Key()) << ":" << std::endl;
		os << " width " << *(width_iter.Value()) << std::endl;
	}

	// upper bound NDV entries, in array order
	const ULONG num_bounds = m_src_upper_bound_NDVs->Size();
	for (ULONG idx = 0; idx < num_bounds; idx++)
	{
		(*m_src_upper_bound_NDVs)[idx]->OsPrint(os);
	}

	os << "StatsEstimationRisk = " << StatsEstimationRisk() << std::endl;
	os << "}" << std::endl;

	return os;
}


//	return the total number of rows for this statistics object, i.e. the
//	stored m_rows value
CDouble
CStatistics::Rows() const
{
	return m_rows;
}

// return the estimated skew of the given column; a column without a
// histogram is reported with a skew of 1.0
CDouble
CStatistics::GetSkew(ULONG colid) const
{
	CHistogram *col_hist = m_colid_histogram_mapping->Find(&colid);
	if (nullptr != col_hist)
	{
		return col_hist->GetSkew();
	}

	return CDouble(1.0);
}

// return total width in bytes: the sum of all entries of the width map,
// rounded up via Ceil()
CDouble
CStatistics::Width() const
{
	CDouble sum_of_widths(0.0);

	for (UlongToDoubleMapIter width_iter(m_colid_width_mapping);
		 width_iter.Advance();)
	{
		sum_of_widths = sum_of_widths + *(width_iter.Value());
	}

	return sum_of_widths.Ceil();
}

// return the width in bytes of a set of columns; columns without an entry in
// the width map contribute the default width of their type. The sum is
// rounded up via Ceil().
CDouble
CStatistics::Width(ULongPtrArray *colids) const
{
	GPOS_ASSERT(nullptr != colids);

	CColumnFactory *col_factory = COptCtxt::PoctxtFromTLS()->Pcf();
	CDouble sum_of_widths(0.0);

	const ULONG num_cols = colids->Size();
	for (ULONG pos = 0; pos < num_cols; pos++)
	{
		ULONG colid = *((*colids)[pos]);

		const CDouble *known_width = m_colid_width_mapping->Find(&colid);
		if (nullptr != known_width)
		{
			sum_of_widths = sum_of_widths + (*known_width);
			continue;
		}

		// no recorded width: fall back to the default width of the column type
		CColRef *colref = col_factory->LookupColRef(colid);
		GPOS_ASSERT(nullptr != colref);

		sum_of_widths =
			sum_of_widths +
			CStatisticsUtils::DefaultColumnWidth(colref->RetrieveType());
	}

	return sum_of_widths.Ceil();
}

// return width in bytes of a set of columns given as a column reference set;
// the column ids are extracted into a temporary array that is released
// before returning
CDouble
CStatistics::Width(CMemoryPool *mp, CColRefSet *colrefs) const
{
	GPOS_ASSERT(nullptr != colrefs);

	ULongPtrArray *extracted_colids = GPOS_NEW(mp) ULongPtrArray(mp);
	colrefs->ExtractColIds(mp, extracted_colids);

	const CDouble result = Width(extracted_colids);

	extracted_colids->Release();

	return result;
}

// return dummy statistics object for the given columns: dummy histograms and
// width info are generated for every column id, and one upper bound NDV entry
// is added for the columns
CStatistics *
CStatistics::MakeDummyStats(CMemoryPool *mp, ULongPtrArray *colids,
							CDouble rows)
{
	GPOS_ASSERT(nullptr != colids);

	// a row count at or below epsilon is treated as an empty input
	const BOOL is_empty = (CStatistics::Epsilon >= rows);

	CColumnFactory *col_factory = COptCtxt::PoctxtFromTLS()->Pcf();

	// colid -> histogram and colid -> width (double) maps of the result
	UlongToHistogramMap *dummy_histograms =
		GPOS_NEW(mp) UlongToHistogramMap(mp);
	UlongToDoubleMap *dummy_widths = GPOS_NEW(mp) UlongToDoubleMap(mp);

	CHistogram::AddDummyHistogramAndWidthInfo(
		mp, col_factory, dummy_histograms, dummy_widths, colids, is_empty);

	CStatistics *dummy_stats = GPOS_NEW(mp)
		CStatistics(mp, dummy_histograms, dummy_widths, rows, is_empty);
	CreateAndInsertUpperBoundNDVs(mp, dummy_stats, colids, rows);

	return dummy_stats;
}

// add upper bound ndvs information for a given set of columns: the column
// ids that resolve to a column reference are collected into one set, and a
// single upper bound entry with the given row count is added to the stats
void
CStatistics::CreateAndInsertUpperBoundNDVs(CMemoryPool *mp, CStatistics *stats,
										   ULongPtrArray *colids, CDouble rows)
{
	GPOS_ASSERT(nullptr != stats);
	GPOS_ASSERT(nullptr != colids);

	CColumnFactory *col_factory = COptCtxt::PoctxtFromTLS()->Pcf();
	CColRefSet *resolved_cols = GPOS_NEW(mp) CColRefSet(mp);

	const ULONG total = colids->Size();
	for (ULONG pos = 0; pos < total; pos++)
	{
		// column ids unknown to the column factory are silently skipped
		const CColRef *colref = col_factory->LookupColRef(*(*colids)[pos]);
		if (nullptr == colref)
		{
			continue;
		}
		resolved_cols->Include(colref);
	}

	if (0 == resolved_cols->Size())
	{
		// nothing resolved: no entry is created, so drop the empty set
		resolved_cols->Release();
		return;
	}

	stats->AddCardUpperBound(GPOS_NEW(mp)
								 CUpperBoundNDVs(resolved_cols, rows));
}

//	return dummy statistics object built from two column id lists: one list
//	names the columns that get a default histogram, the other the columns that
//	get a default width derived from their type
CStatistics *
CStatistics::MakeDummyStats(CMemoryPool *mp,
							ULongPtrArray *col_histogram_mapping,
							ULongPtrArray *col_width_mapping, CDouble rows)
{
	GPOS_ASSERT(nullptr != col_histogram_mapping);
	GPOS_ASSERT(nullptr != col_width_mapping);

	CColumnFactory *col_factory = COptCtxt::PoctxtFromTLS()->Pcf();

	// a row count at or below epsilon yields empty default histograms
	const BOOL is_empty = (CStatistics::Epsilon >= rows);

	// colid -> histogram map of the result
	UlongToHistogramMap *dummy_histograms =
		GPOS_NEW(mp) UlongToHistogramMap(mp);

	const ULONG hist_cols = col_histogram_mapping->Size();
	for (ULONG pos = 0; pos < hist_cols; pos++)
	{
		const ULONG colid = *(*col_histogram_mapping)[pos];

		CColRef *colref = col_factory->LookupColRef(colid);
		GPOS_ASSERT(nullptr != colref);

		// default histogram for this column
		dummy_histograms->Insert(
			GPOS_NEW(mp) ULONG(colid),
			CHistogram::MakeDefaultHistogram(mp, colref, is_empty));
	}

	// colid -> width (double) map of the result
	UlongToDoubleMap *dummy_widths = GPOS_NEW(mp) UlongToDoubleMap(mp);

	const ULONG width_cols = col_width_mapping->Size();
	for (ULONG pos = 0; pos < width_cols; pos++)
	{
		const ULONG colid = *(*col_width_mapping)[pos];

		CColRef *colref = col_factory->LookupColRef(colid);
		GPOS_ASSERT(nullptr != colref);

		const CDouble default_width =
			CStatisticsUtils::DefaultColumnWidth(colref->RetrieveType());
		dummy_widths->Insert(GPOS_NEW(mp) ULONG(colid),
							 GPOS_NEW(mp) CDouble(default_width));
	}

	// NOTE(review): the object is always flagged as non-empty here even when
	// is_empty was computed as true above, unlike the single-list overload
	// which forwards is_empty - confirm whether this is intentional
	CStatistics *dummy_stats = GPOS_NEW(mp) CStatistics(
		mp, dummy_histograms, dummy_widths, rows, false /* is_empty */);
	CreateAndInsertUpperBoundNDVs(mp, dummy_stats, col_histogram_mapping, rows);

	return dummy_stats;
}


//	check if the input statistics of a join statistics computation are empty:
//	for a left anti semi join only the outer side decides; for any other join
//	an empty outer or an empty inner side is sufficient
BOOL
CStatistics::IsEmptyJoin(const CStatistics *outer_stats,
						 const CStatistics *inner_side_stats, BOOL IsLASJ)
{
	GPOS_ASSERT(nullptr != outer_stats);
	GPOS_ASSERT(nullptr != inner_side_stats);

	// an empty outer side makes every join type empty
	if (outer_stats->IsEmpty())
	{
		return true;
	}

	// outer side has rows: LASJ ignores the inner side
	if (IsLASJ)
	{
		return false;
	}

	return inner_side_stats->IsEmpty();
}

// Currently, the Calc[Join type]Stats methods are thin wrappers around the
// corresponding C[Join type]StatsProcessor class's static method for deriving
// the stats object of that join operator

//	return statistics object after performing a left outer join of this object
//	(outer side) with another statistics structure; the computation is
//	delegated to CLeftOuterJoinStatsProcessor
IStatistics *
CStatistics::CalcLOJoinStats(CMemoryPool *mp, const IStatistics *other_stats,
							 CStatsPredJoinArray *join_preds_stats) const
{
	IStatistics *loj_stats =
		CLeftOuterJoinStatsProcessor::CalcLOJoinStatsStatic(
			mp, this, other_stats, join_preds_stats);

	return loj_stats;
}



//	return statistics object after performing a left semi join of this object
//	(outer side) with another statistics structure; the computation is
//	delegated to CLeftSemiJoinStatsProcessor
IStatistics *
CStatistics::CalcLSJoinStats(CMemoryPool *mp,
							 const IStatistics *inner_side_stats,
							 CStatsPredJoinArray *join_preds_stats) const
{
	IStatistics *lsj_stats =
		CLeftSemiJoinStatsProcessor::CalcLSJoinStatsStatic(
			mp, this, inner_side_stats, join_preds_stats);

	return lsj_stats;
}



// return statistics object after performing an inner join of this object
// with another statistics structure; the computation is delegated to
// CInnerJoinStatsProcessor
IStatistics *
CStatistics::CalcInnerJoinStats(CMemoryPool *mp, const IStatistics *other_stats,
								CStatsPredJoinArray *join_preds_stats) const
{
	IStatistics *inner_join_stats =
		CInnerJoinStatsProcessor::CalcInnerJoinStatsStatic(
			mp, this, other_stats, join_preds_stats);

	return inner_join_stats;
}

// return statistics object after performing a left anti semi join (LASJ) of
// this object (outer side) with another statistics structure; the
// computation, including the DoIgnoreLASJHistComputation flag, is delegated
// to CLeftAntiSemiJoinStatsProcessor
IStatistics *
CStatistics::CalcLASJoinStats(CMemoryPool *mp, const IStatistics *other_stats,
							  CStatsPredJoinArray *join_preds_stats,
							  BOOL DoIgnoreLASJHistComputation) const
{
	IStatistics *lasj_stats =
		CLeftAntiSemiJoinStatsProcessor::CalcLASJoinStatsStatic(
			mp, this, other_stats, join_preds_stats,
			DoIgnoreLASJHistComputation);

	return lasj_stats;
}

//	helper method to copy the histograms of all columns whose column id is not
//	set in the exclusion bitset into the given output map
void
CStatistics::AddNotExcludedHistograms(
	CMemoryPool *mp, CBitSet *excluded_cols,
	UlongToHistogramMap *col_histogram_mapping) const
{
	GPOS_ASSERT(nullptr != excluded_cols);
	GPOS_ASSERT(nullptr != col_histogram_mapping);

	for (UlongToHistogramMapIter hist_iter(m_colid_histogram_mapping);
		 hist_iter.Advance();)
	{
		const ULONG colid = *(hist_iter.Key());
		const BOOL is_excluded = excluded_cols->Get(colid);
		if (!is_excluded)
		{
			CStatisticsUtils::AddHistogram(mp, colid, hist_iter.Value(),
										   col_histogram_mapping);
		}

		// abort check runs once per visited column, excluded or not
		GPOS_CHECK_ABORT;
	}
}

// return a newly allocated colid -> width map filled with this object's
// width entries
UlongToDoubleMap *
CStatistics::CopyWidths(CMemoryPool *mp) const
{
	UlongToDoubleMap *fresh_widths = GPOS_NEW(mp) UlongToDoubleMap(mp);

	// same underlying AddWidthInfo call, reached through the sibling helper
	CopyWidthsInto(mp, fresh_widths);

	return fresh_widths;
}

// add this object's column width entries to the caller-provided map by
// delegating to CStatisticsUtils::AddWidthInfo
void
CStatistics::CopyWidthsInto(CMemoryPool *mp,
							UlongToDoubleMap *colid_width_mapping) const
{
	CStatisticsUtils::AddWidthInfo(mp, m_colid_width_mapping,
								   colid_width_mapping);
}

// return a newly allocated colid -> histogram map with one entry per column
// of this object; when this object is empty every entry is a fresh histogram
// that is not well defined, otherwise it is a copy of the stored histogram
UlongToHistogramMap *
CStatistics::CopyHistograms(CMemoryPool *mp) const
{
	UlongToHistogramMap *result = GPOS_NEW(mp) UlongToHistogramMap(mp);

	const BOOL stats_empty = IsEmpty();

	for (UlongToHistogramMapIter hist_iter(m_colid_histogram_mapping);
		 hist_iter.Advance();)
	{
		const ULONG colid = *(hist_iter.Key());
		const CHistogram *source_hist = hist_iter.Value();

		CHistogram *new_hist =
			stats_empty
				? GPOS_NEW(mp) CHistogram(mp, false /* is_well_defined */)
				: source_hist->CopyHistogram();

		result->Insert(GPOS_NEW(mp) ULONG(colid), new_hist);
	}

	return result;
}



//	return required props associated with statistics object: a newly allocated
//	CReqdPropRelational over the column references of all columns that have a
//	histogram
CReqdPropRelational *
CStatistics::GetReqdRelationalProps(CMemoryPool *mp) const
{
	CColumnFactory *col_factory = COptCtxt::PoctxtFromTLS()->Pcf();
	GPOS_ASSERT(nullptr != col_factory);

	CColRefSet *required_cols = GPOS_NEW(mp) CColRefSet(mp);

	// every key of the histogram map contributes its column reference
	for (UlongToHistogramMapIter hist_iter(m_colid_histogram_mapping);
		 hist_iter.Advance();)
	{
		CColRef *colref = col_factory->LookupColRef(*(hist_iter.Key()));
		GPOS_ASSERT(nullptr != colref);

		required_cols->Include(colref);
	}

	return GPOS_NEW(mp) CReqdPropRelational(required_cols);
}

// append given statistics to current object: the histograms and the width
// info of the input are added to this object's own maps, with an abort check
// after each step
void
CStatistics::AppendStats(CMemoryPool *mp, IStatistics *input_stats)
{
	CStatistics *other = CStatistics::CastStats(input_stats);

	UlongToHistogramMap *other_histograms = other->m_colid_histogram_mapping;
	CHistogram::AddHistograms(mp, other_histograms, m_colid_histogram_mapping);
	GPOS_CHECK_ABORT;

	UlongToDoubleMap *other_widths = other->m_colid_width_mapping;
	CStatisticsUtils::AddWidthInfo(mp, other_widths, m_colid_width_mapping);
	GPOS_CHECK_ABORT;
}

// copy statistics object; implemented as scaling by a factor of one
IStatistics *
CStatistics::CopyStats(CMemoryPool *mp) const
{
	const CDouble identity_factor(1.0);

	return ScaleStats(mp, identity_factor);
}

// return a copy of this statistics object scaled by a given factor: the
// histograms and widths are copied, the row count is multiplied by the
// factor, and the cardinality upper bounds are recomputed for the new count
IStatistics *
CStatistics::ScaleStats(CMemoryPool *mp, CDouble factor) const
{
	UlongToHistogramMap *copied_histograms =
		GPOS_NEW(mp) UlongToHistogramMap(mp);
	UlongToDoubleMap *copied_widths = GPOS_NEW(mp) UlongToDoubleMap(mp);

	CHistogram::AddHistograms(mp, m_colid_histogram_mapping,
							  copied_histograms);
	GPOS_CHECK_ABORT;

	CStatisticsUtils::AddWidthInfo(mp, m_colid_width_mapping, copied_widths);
	GPOS_CHECK_ABORT;

	const CDouble rows_after_scaling = m_rows * factor;

	// the scaled object keeps every other property of this object
	CStatistics *result = GPOS_NEW(mp) CStatistics(
		mp, copied_histograms, copied_widths, rows_after_scaling, IsEmpty(),
		RelPages(), RelAllVisible(), NumRebinds(), m_num_predicates);

	// In the output statistics object, the upper bound source cardinality of
	// the scaled column cannot be greater than the upper bound source
	// cardinality information maintained in the input statistics object.
	// Therefore we choose CStatistics::EcbmMin as the bounding method, which
	// takes the minimum of the cardinality upper bound of the source column
	// (in the input hash map) and the estimated output cardinality.

	// modify source id to upper bound card information
	CStatisticsUtils::ComputeCardUpperBounds(
		mp, this, result, rows_after_scaling,
		CStatistics::EcbmMin /* card_bounding_method */);

	return result;
}

//	copy statistics object with re-mapped column ids: histograms, widths and
//	upper bound NDV entries are copied under the column ids given by the
//	mapping; row count and the remaining properties are carried over unchanged
IStatistics *
CStatistics::CopyStatsWithRemap(CMemoryPool *mp,
								UlongToColRefMap *colref_mapping,
								BOOL must_exist) const
{
	GPOS_ASSERT(nullptr != colref_mapping);

	UlongToHistogramMap *remapped_histograms =
		GPOS_NEW(mp) UlongToHistogramMap(mp);
	UlongToDoubleMap *remapped_widths = GPOS_NEW(mp) UlongToDoubleMap(mp);

	AddHistogramsWithRemap(mp, m_colid_histogram_mapping, remapped_histograms,
						   colref_mapping, must_exist);
	AddWidthInfoWithRemap(mp, m_colid_width_mapping, remapped_widths,
						  colref_mapping, must_exist);

	// create a copy of the stats object
	CStatistics *result = GPOS_NEW(mp) CStatistics(
		mp, remapped_histograms, remapped_widths, m_rows, IsEmpty(),
		RelPages(), RelAllVisible(), NumRebinds(), m_num_predicates);

	// In the output statistics object, the upper bound source cardinality of
	// the join column cannot be greater than the upper bound source
	// cardinality information maintained in the input statistics object.

	// copy the upper bound ndv information; entries whose remapped copy comes
	// back as null are left out
	const ULONG num_bounds = m_src_upper_bound_NDVs->Size();
	for (ULONG idx = 0; idx < num_bounds; idx++)
	{
		CUpperBoundNDVs *remapped_bound =
			(*m_src_upper_bound_NDVs)[idx]->CopyUpperBoundNDVWithRemap(
				mp, colref_mapping);
		if (nullptr == remapped_bound)
		{
			continue;
		}

		result->AddCardUpperBound(remapped_bound);
	}

	return result;
}

//	return a newly allocated array with the column identifiers of all columns
//	that have an entry in the histogram map of this statistics object
ULongPtrArray *
CStatistics::GetColIdsWithStats(CMemoryPool *mp) const
{
	ULongPtrArray *result = GPOS_NEW(mp) ULongPtrArray(mp);

	for (UlongToHistogramMapIter hist_iter(m_colid_histogram_mapping);
		 hist_iter.Advance();)
	{
		// each id is copied so that the array holds its own ULONG objects
		result->Append(GPOS_NEW(mp) ULONG(*(hist_iter.Key())));
	}

	return result;
}

// return the set of column references we have statistics for, i.e. the
// column references of all keys of the histogram map
CColRefSet *
CStatistics::GetColRefSet(CMemoryPool *mp) const
{
	CColumnFactory *col_factory = COptCtxt::PoctxtFromTLS()->Pcf();
	CColRefSet *result = GPOS_NEW(mp) CColRefSet(mp);

	for (UlongToHistogramMapIter hist_iter(m_colid_histogram_mapping);
		 hist_iter.Advance();)
	{
		CColRef *colref = col_factory->LookupColRef(*(hist_iter.Key()));
		GPOS_ASSERT(nullptr != colref);

		result->Include(colref);
	}

	return result;
}

//	append given histograms to the destination map where the column ids have
//	been re-mapped
//
//	For every (source column id -> destination column reference) entry of
//	colref_mapping, the source histogram (if one exists for that id) is added
//	to dest_histograms under the destination column's id.
//
//	must_exist is only consulted by a debug assertion: when set, a null
//	destination column reference is reported as an assertion failure. In all
//	builds a null destination is skipped instead of being dereferenced.
void
CStatistics::AddHistogramsWithRemap(CMemoryPool *mp,
									UlongToHistogramMap *src_histograms,
									UlongToHistogramMap *dest_histograms,
									UlongToColRefMap *colref_mapping,
									BOOL
#ifdef GPOS_DEBUG
										must_exist
#endif	//GPOS_DEBUG
)
{
	GPOS_ASSERT(nullptr != src_histograms);
	GPOS_ASSERT(nullptr != dest_histograms);
	GPOS_ASSERT(nullptr != colref_mapping);

	UlongToColRefMapIter colref_iterator(colref_mapping);
	while (colref_iterator.Advance())
	{
		ULONG src_colid = *(colref_iterator.Key());
		const CColRef *dest_colref = colref_iterator.Value();
		GPOS_ASSERT_IMP(must_exist, nullptr != dest_colref);

		if (nullptr == dest_colref)
		{
			// the assertion above is compiled out in release builds and does
			// not fire when must_exist is false; without a destination column
			// there is nothing to remap to
			continue;
		}

		ULONG dest_colid = dest_colref->Id();

		// source columns without a histogram are simply not copied
		const CHistogram *src_histogram = src_histograms->Find(&src_colid);
		if (nullptr != src_histogram)
		{
			CStatisticsUtils::AddHistogram(mp, dest_colid, src_histogram,
										   dest_histograms);
		}
	}
}

// add width information where the column ids have been re-mapped: every
// source width is copied into the destination map under the id of its mapped
// column reference. A source column without a mapping is skipped when
// must_exist is set and otherwise keeps its original id. Ids already present
// in the destination map are not overwritten.
void
CStatistics::AddWidthInfoWithRemap(CMemoryPool *mp, UlongToDoubleMap *src_width,
								   UlongToDoubleMap *dest_width,
								   UlongToColRefMap *colref_mapping,
								   BOOL must_exist)
{
	for (UlongToDoubleMapIter width_iter(src_width); width_iter.Advance();)
	{
		ULONG src_colid = *(width_iter.Key());
		ULONG dest_colid = src_colid;

		CColRef *mapped_colref = colref_mapping->Find(&src_colid);
		if (nullptr != mapped_colref)
		{
			dest_colid = mapped_colref->Id();
		}
		else if (must_exist)
		{
			// unmapped columns are dropped when a mapping is mandatory
			continue;
		}

		if (nullptr != dest_width->Find(&dest_colid))
		{
			// destination already has a width for this column
			continue;
		}

		CDouble *width_copy = GPOS_NEW(mp) CDouble(*(width_iter.Value()));
		BOOL result GPOS_ASSERTS_ONLY =
			dest_width->Insert(GPOS_NEW(mp) ULONG(dest_colid), width_copy);
		GPOS_ASSERT(result);
	}
}

// return the index of the first entry in the array of upper bound ndvs that
// contains the column reference, or gpos::ulong_max when no entry does
ULONG
CStatistics::GetIndexUpperBoundNDVs(const CColRef *colref)
{
	GPOS_ASSERT(nullptr != colref);

	const ULONG num_bounds = m_src_upper_bound_NDVs->Size();
	ULONG idx = 0;
	while (idx < num_bounds)
	{
		if ((*m_src_upper_bound_NDVs)[idx]->IsPresent(colref))
		{
			return idx;
		}
		idx++;
	}

	return gpos::ulong_max;
}

// add upper bound of source cardinality: appends the given entry to this
// object's array of upper bound NDVs (the entry must not be null)
void
CStatistics::AddCardUpperBound(CUpperBoundNDVs *upper_bound_NDVs)
{
	GPOS_ASSERT(nullptr != upper_bound_NDVs);

	m_src_upper_bound_NDVs->Append(upper_bound_NDVs);
}

// return the dxl representation of the statistics object: one derived column
// stats entry per histogram (translated together with the column's width),
// wrapped in a derived relation stats object carrying row count and emptiness
CDXLStatsDerivedRelation *
CStatistics::GetDxlStatsDrvdRelation(CMemoryPool *mp,
									 CMDAccessor *md_accessor) const
{
	CDXLStatsDerivedColumnArray *derived_cols =
		GPOS_NEW(mp) CDXLStatsDerivedColumnArray(mp);

	for (UlongToHistogramMapIter hist_iter(m_colid_histogram_mapping);
		 hist_iter.Advance();)
	{
		ULONG colid = *(hist_iter.Key());

		// every column with a histogram is expected to have a width as well
		CDouble *col_width = m_colid_width_mapping->Find(&colid);
		GPOS_ASSERT(col_width);

		derived_cols->Append(
			hist_iter.Value()->TranslateToDXLDerivedColumnStats(
				md_accessor, colid, *col_width));
	}

	return GPOS_NEW(mp)
		CDXLStatsDerivedRelation(m_rows, IsEmpty(), derived_cols);
}

// return the upper bound of ndvs for a column reference: the bound of the
// first upper bound entry that contains the column, or DefaultDistinctValues
// when no entry does
CDouble
CStatistics::GetColUpperBoundNDVs(const CColRef *colref)
{
	GPOS_ASSERT(nullptr != colref);

	const ULONG num_bounds = m_src_upper_bound_NDVs->Size();
	for (ULONG idx = 0; idx < num_bounds; idx++)
	{
		const CUpperBoundNDVs *candidate = (*m_src_upper_bound_NDVs)[idx];
		if (!candidate->IsPresent(colref))
		{
			continue;
		}

		return candidate->UpperBoundNDVs();
	}

	return DefaultDistinctValues;
}


// look up the number of distinct values of a particular column
//
// The result is capped by the column's upper bound NDVs. When the column has
// a histogram, the histogram's distinct count is used; otherwise the number
// of rows serves as the estimate (and a trace message is emitted in debug
// builds, since a missing histogram points to a problem in the computation
// of required statistics columns).
CDouble
CStatistics::GetNDVs(const CColRef *colref)
{
	// colref is dereferenced below; check it like the sibling lookups do
	GPOS_ASSERT(nullptr != colref);

	// the cap applies to both branches, so look it up once
	const CDouble upper_bound_ndvs = GetColUpperBoundNDVs(colref);

	ULONG colid = colref->Id();
	CHistogram *col_histogram = m_colid_histogram_mapping->Find(&colid);
	if (nullptr != col_histogram)
	{
		return std::min(col_histogram->GetNumDistinct(), upper_bound_ndvs);
	}

#ifdef GPOS_DEBUG
	{
		// the case of no histogram available for requested column signals
		// something wrong with computation of required statistics columns,
		// we print a debug message to log this case

		CAutoMemoryPool amp;
		CAutoTrace at(amp.Pmp());

		at.Os() << "\nREQUESTED NDVs FOR COL (" << colref->Id()
				<< ") WITH A MISSING HISTOGRAM";
	}
#endif	//GPOS_DEBUG

	// if no histogram is available for required column, we use
	// the number of rows as NDVs estimate
	return std::min(m_rows, upper_bound_ndvs);
}


// EOF

相关信息

greenplumn 源码目录

相关文章

greenplumn CBucket 源码

greenplumn CFilterStatsProcessor 源码

greenplumn CGroupByStatsProcessor 源码

greenplumn CHistogram 源码

greenplumn CInnerJoinStatsProcessor 源码

greenplumn CJoinStatsProcessor 源码

greenplumn CLeftAntiSemiJoinStatsProcessor 源码

greenplumn CLeftOuterJoinStatsProcessor 源码

greenplumn CLeftSemiJoinStatsProcessor 源码

greenplumn CLimitStatsProcessor 源码

0  赞