greenplumn CHistogram 源码
greenplumn CHistogram 代码
文件路径:/src/backend/gporca/libnaucrates/src/statistics/CHistogram.cpp
//---------------------------------------------------------------------------
//	Greenplum Database
//	Copyright (C) 2018 VMware, Inc. or its affiliates.
//
//	@filename:
//		CHistogram.cpp
//
//	@doc:
//		Implementation of single-dimensional histogram
//---------------------------------------------------------------------------
#include "naucrates/statistics/CHistogram.h"
#include "gpos/common/syslibwrapper.h"
#include "gpos/io/COstreamString.h"
#include "gpos/string/CWStringDynamic.h"
#include "gpopt/base/CColRef.h"
#include "gpopt/base/COptCtxt.h"
#include "naucrates/dxl/CDXLUtils.h"
#include "naucrates/dxl/operators/CDXLScalarConstValue.h"
#include "naucrates/statistics/CLeftAntiSemiJoinStatsProcessor.h"
#include "naucrates/statistics/CScaleFactorUtils.h"
#include "naucrates/statistics/CStatistics.h"
#include "naucrates/statistics/CStatisticsUtils.h"
using namespace gpnaucrates;
using namespace gpopt;
using namespace gpmd;
// default histogram selectivity
const CDouble CHistogram::DefaultSelectivity(0.4);
// default scale factor when there is no filtering of input
const CDouble CHistogram::NeutralScaleFactor(1.0);
// the minimum number of distinct values in a column
const CDouble CHistogram::MinDistinct(1.0);
// default Null frequency
const CDouble CHistogram::DefaultNullFreq(0.0);
// default NDV remain
const CDouble CHistogram::DefaultNDVRemain(0.0);
// default frequency of NDV remain
const CDouble CHistogram::DefaultNDVFreqRemain(0.0);
// sample size used to estimate skew
#define GPOPT_SKEW_SAMPLE_SIZE 1000
// ctor
CHistogram::CHistogram(CMemoryPool *mp, CBucketArray *histogram_buckets,
					   BOOL is_well_defined)
	: m_mp(mp),
	  m_histogram_buckets(histogram_buckets),
	  m_is_well_defined(is_well_defined),
	  m_null_freq(CHistogram::DefaultNullFreq),
	  m_distinct_remaining(DefaultNDVRemain),
	  m_freq_remaining(DefaultNDVFreqRemain),
	  m_skew_was_measured(false),
	  m_skew(1.0),
	  m_NDVs_were_scaled(false),
	  m_is_col_stats_missing(false)
{
	GPOS_ASSERT(nullptr != histogram_buckets);
}
// ctor
CHistogram::CHistogram(CMemoryPool *mp, BOOL is_well_defined)
	: m_mp(mp),
	  m_histogram_buckets(nullptr),
	  m_is_well_defined(is_well_defined),
	  m_null_freq(CHistogram::DefaultNullFreq),
	  m_distinct_remaining(DefaultNDVRemain),
	  m_freq_remaining(DefaultNDVFreqRemain),
	  m_skew_was_measured(false),
	  m_skew(1.0),
	  m_NDVs_were_scaled(false),
	  m_is_col_stats_missing(false)
{
	m_histogram_buckets = GPOS_NEW(m_mp) CBucketArray(m_mp);
}
// ctor
CHistogram::CHistogram(CMemoryPool *mp, CBucketArray *histogram_buckets,
					   BOOL is_well_defined, CDouble null_freq,
					   CDouble distinct_remaining, CDouble freq_remaining,
					   BOOL is_col_stats_missing)
	: m_mp(mp),
	  m_histogram_buckets(histogram_buckets),
	  m_is_well_defined(is_well_defined),
	  m_null_freq(null_freq),
	  m_distinct_remaining(distinct_remaining),
	  m_freq_remaining(freq_remaining),
	  m_skew_was_measured(false),
	  m_skew(1.0),
	  m_NDVs_were_scaled(false),
	  m_is_col_stats_missing(is_col_stats_missing)
{
	GPOS_ASSERT(m_histogram_buckets);
	GPOS_ASSERT(CDouble(0.0) <= null_freq);
	GPOS_ASSERT(CDouble(1.0) >= null_freq);
	GPOS_ASSERT(CDouble(0.0) <= distinct_remaining);
	GPOS_ASSERT(CDouble(0.0) <= freq_remaining);
	GPOS_ASSERT(CDouble(1.0) >= freq_remaining);
	// if distinct_remaining is 0, freq_remaining must be 0 too
	GPOS_ASSERT_IMP(distinct_remaining < CStatistics::Epsilon,
					freq_remaining < CStatistics::Epsilon);
}
// set histograms null frequency
void
CHistogram::SetNullFrequency(CDouble null_freq)
{
	GPOS_ASSERT(CDouble(0.0) <= null_freq && CDouble(1.0) >= null_freq);
	m_null_freq = null_freq;
}
FORCE_GENERATE_DBGSTR(gpnaucrates::CHistogram);
//	print function
IOstream &
CHistogram::OsPrint(IOstream &os) const
{
	os << std::endl << "[" << std::endl;
	ULONG num_buckets = m_histogram_buckets->Size();
	for (ULONG bucket_index = 0; bucket_index < num_buckets; bucket_index++)
	{
		os << "b" << bucket_index << " = ";
		(*m_histogram_buckets)[bucket_index]->OsPrint(os);
		os << std::endl;
	}
	os << "]" << std::endl;
	os << "Null fraction      : " << GetNullFreq() << std::endl;
	os << "Remaining NDV      : " << GetDistinctRemain() << std::endl;
	os << "Total NDV          : " << GetNumDistinct() << std::endl;
	os << "Remaining frequency: " << GetFreqRemain() << std::endl;
	os << "Total frequency    : " << GetFrequency() << std::endl;
	if (m_skew_was_measured)
	{
		os << "Skew: " << m_skew << std::endl;
	}
	else
	{
		os << "Skew: not measured" << std::endl;
	}
	os << "Was NDVs re-scaled Based on Row Estimate: " << m_NDVs_were_scaled
	   << std::endl;
	return os;
}
// check if histogram is empty
BOOL
CHistogram::IsEmpty() const
{
	return (0 == m_histogram_buckets->Size() &&
			CStatistics::Epsilon > m_null_freq &&
			CStatistics::Epsilon > m_distinct_remaining);
}
// construct new histogram with less than or less than equal to filter
CHistogram *
CHistogram::MakeHistogramLessThanOrLessThanEqualFilter(
	CStatsPred::EStatsCmpType stats_cmp_type, CPoint *point) const
{
	GPOS_ASSERT(CStatsPred::EstatscmptL == stats_cmp_type ||
				CStatsPred::EstatscmptLEq == stats_cmp_type);
	CBucketArray *new_buckets = GPOS_NEW(m_mp) CBucketArray(m_mp);
	const ULONG num_buckets = m_histogram_buckets->Size();
	for (ULONG bucket_index = 0; bucket_index < num_buckets; bucket_index++)
	{
		CBucket *bucket = (*m_histogram_buckets)[bucket_index];
		if (bucket->IsBefore(point))
		{
			break;
		}
		else if (bucket->IsAfter(point))
		{
			new_buckets->Append(bucket->MakeBucketCopy(m_mp));
		}
		else
		{
			GPOS_ASSERT(bucket->Contains(point));
			CBucket *last_bucket = bucket->MakeBucketScaleUpper(
				m_mp, point,
				CStatsPred::EstatscmptLEq == stats_cmp_type /*include_upper*/);
			if (nullptr != last_bucket)
			{
				new_buckets->Append(last_bucket);
			}
			break;
		}
	}
	CDouble distinct_remaining = 0.0;
	CDouble freq_remaining = 0.0;
	if (CStatistics::Epsilon < m_distinct_remaining * DefaultSelectivity)
	{
		distinct_remaining = m_distinct_remaining * DefaultSelectivity;
		freq_remaining = m_freq_remaining * DefaultSelectivity;
	}
	return GPOS_NEW(m_mp) CHistogram(m_mp, new_buckets,
									 true,			// is_well_defined
									 CDouble(0.0),	// fNullFreq
									 distinct_remaining, freq_remaining);
}
// return an array buckets after applying non equality filter on the histogram buckets
CBucketArray *
CHistogram::MakeBucketsWithInequalityFilter(CPoint *point) const
{
	GPOS_ASSERT(nullptr != point);
	CBucketArray *new_buckets = GPOS_NEW(m_mp) CBucketArray(m_mp);
	const ULONG num_buckets = m_histogram_buckets->Size();
	bool point_is_null = point->GetDatum()->IsNull();
	for (ULONG bucket_index = 0; bucket_index < num_buckets; bucket_index++)
	{
		CBucket *bucket = (*m_histogram_buckets)[bucket_index];
		if (bucket->Contains(point) && !point_is_null)
		{
			CBucket *less_than_bucket = bucket->MakeBucketScaleUpper(
				m_mp, point, false /*include_upper */);
			if (nullptr != less_than_bucket)
			{
				new_buckets->Append(less_than_bucket);
			}
			CBucket *greater_than_bucket =
				bucket->MakeBucketGreaterThan(m_mp, point);
			if (nullptr != greater_than_bucket)
			{
				new_buckets->Append(greater_than_bucket);
			}
		}
		else
		{
			new_buckets->Append(bucket->MakeBucketCopy(m_mp));
		}
	}
	return new_buckets;
}
// construct new histogram with not equal filter
CHistogram *
CHistogram::MakeHistogramInequalityFilter(CPoint *point) const
{
	GPOS_ASSERT(nullptr != point);
	CBucketArray *histogram_buckets = MakeBucketsWithInequalityFilter(point);
	CDouble null_freq(0.0);
	return GPOS_NEW(m_mp)
		CHistogram(m_mp, histogram_buckets, true /*is_well_defined*/, null_freq,
				   m_distinct_remaining, m_freq_remaining);
}
// construct new histogram with IDF filter
CHistogram *
CHistogram::MakeHistogramIDFFilter(CPoint *point) const
{
	GPOS_ASSERT(nullptr != point);
	CBucketArray *histogram_buckets = MakeBucketsWithInequalityFilter(point);
	CDouble null_freq(0.0);
	if (!point->GetDatum()->IsNull())
	{
		// (col IDF NOT-NULL-CONSTANT) means that null values will also be returned
		null_freq = m_null_freq;
	}
	return GPOS_NEW(m_mp)
		CHistogram(m_mp, histogram_buckets, true /*is_well_defined*/, null_freq,
				   m_distinct_remaining, m_freq_remaining);
}
// return an array buckets after applying equality filter on the histogram buckets
CBucketArray *
CHistogram::MakeBucketsWithEqualityFilter(CPoint *point) const
{
	GPOS_ASSERT(nullptr != point);
	CBucketArray *histogram_buckets = GPOS_NEW(m_mp) CBucketArray(m_mp);
	if (point->GetDatum()->IsNull())
	{
		return histogram_buckets;
	}
	const ULONG num_buckets = m_histogram_buckets->Size();
	ULONG bucket_index = 0;
	for (bucket_index = 0; bucket_index < num_buckets; bucket_index++)
	{
		CBucket *bucket = (*m_histogram_buckets)[bucket_index];
		if (bucket->Contains(point))
		{
			if (bucket->IsSingleton())
			{
				// reuse existing bucket
				histogram_buckets->Append(bucket->MakeBucketCopy(m_mp));
			}
			else
			{
				// scale containing bucket
				CBucket *last_bucket = bucket->MakeBucketSingleton(m_mp, point);
				histogram_buckets->Append(last_bucket);
			}
			break;	// only one bucket can contain point
		}
	}
	return histogram_buckets;
}
// construct new histogram with equality filter
CHistogram *
CHistogram::MakeHistogramEqualFilter(CPoint *point) const
{
	GPOS_ASSERT(nullptr != point);
	if (point->GetDatum()->IsNull())
	{
		return GPOS_NEW(m_mp) CHistogram(
			m_mp, GPOS_NEW(m_mp) CBucketArray(m_mp), true /* is_well_defined */,
			m_null_freq, DefaultNDVRemain, DefaultNDVFreqRemain);
	}
	CBucketArray *histogram_buckets = MakeBucketsWithEqualityFilter(point);
	if (CStatistics::Epsilon < m_distinct_remaining &&
		0 == histogram_buckets->Size())	 // no match is found in the buckets
	{
		return GPOS_NEW(m_mp) CHistogram(
			m_mp, histogram_buckets,
			true,  // is_well_defined
			0.0,   // null_freq
			1.0,   // distinct_remaining
			std::min(CDouble(1.0),
					 m_freq_remaining / m_distinct_remaining)  // freq_remaining
		);
	}
	return GPOS_NEW(m_mp) CHistogram(m_mp, histogram_buckets);
}
// construct new histogram with INDF filter
CHistogram *
CHistogram::MakeHistogramINDFFilter(CPoint *point) const
{
	GPOS_ASSERT(nullptr != point);
	CBucketArray *histogram_buckets = MakeBucketsWithEqualityFilter(point);
	const ULONG num_of_buckets = histogram_buckets->Size();
	CDouble null_freq(0.0);
	if (point->GetDatum()->IsNull())
	{
		GPOS_ASSERT(0 == num_of_buckets);
		// (col INDF NULL) means that only the null values will be returned
		null_freq = m_null_freq;
	}
	if (CStatistics::Epsilon < m_distinct_remaining &&
		0 == num_of_buckets)  // no match is found in the buckets
	{
		return GPOS_NEW(m_mp) CHistogram(
			m_mp, histogram_buckets,
			true,  // is_well_defined
			null_freq,
			1.0,  // distinct_remaining
			std::min(CDouble(1.0),
					 m_freq_remaining / m_distinct_remaining)  // freq_remaining
		);
	}
	return GPOS_NEW(m_mp) CHistogram(
		m_mp, histogram_buckets, true /* is_well_defined */, null_freq,
		CHistogram::DefaultNDVRemain, CHistogram::DefaultNDVFreqRemain);
}
// construct new histogram with greater than or greater than equal filter
CHistogram *
CHistogram::MakeHistogramGreaterThanOrGreaterThanEqualFilter(
	CStatsPred::EStatsCmpType stats_cmp_type, CPoint *point) const
{
	GPOS_ASSERT(CStatsPred::EstatscmptGEq == stats_cmp_type ||
				CStatsPred::EstatscmptG == stats_cmp_type);
	CBucketArray *new_buckets = GPOS_NEW(m_mp) CBucketArray(m_mp);
	const ULONG num_buckets = m_histogram_buckets->Size();
	// find first bucket that contains point
	ULONG bucket_index = 0;
	for (bucket_index = 0; bucket_index < num_buckets; bucket_index++)
	{
		CBucket *bucket = (*m_histogram_buckets)[bucket_index];
		if (bucket->IsBefore(point))
		{
			break;
		}
		if (bucket->Contains(point))
		{
			if (CStatsPred::EstatscmptGEq == stats_cmp_type)
			{
				// first bucket needs to be scaled down
				CBucket *first_bucket = bucket->MakeBucketScaleLower(
					m_mp, point, true /* include_lower */);
				new_buckets->Append(first_bucket);
			}
			else
			{
				CBucket *greater_than_bucket =
					bucket->MakeBucketGreaterThan(m_mp, point);
				if (nullptr != greater_than_bucket)
				{
					new_buckets->Append(greater_than_bucket);
				}
			}
			bucket_index++;
			break;
		}
	}
	// add rest of the buckets
	for (; bucket_index < num_buckets; bucket_index++)
	{
		CBucket *bucket = (*m_histogram_buckets)[bucket_index];
		new_buckets->Append(bucket->MakeBucketCopy(m_mp));
	}
	CDouble distinct_remaining = 0.0;
	CDouble freq_remaining = 0.0;
	if (CStatistics::Epsilon < m_distinct_remaining * DefaultSelectivity)
	{
		distinct_remaining = m_distinct_remaining * DefaultSelectivity;
		freq_remaining = m_freq_remaining * DefaultSelectivity;
	}
	return GPOS_NEW(m_mp) CHistogram(m_mp, new_buckets,
									 true,			// is_well_defined
									 CDouble(0.0),	// fNullFreq
									 distinct_remaining, freq_remaining);
}
// sum of frequencies from buckets.
CDouble
CHistogram::GetFrequency() const
{
	CDouble frequency(0.0);
	const ULONG num_of_buckets = m_histogram_buckets->Size();
	for (ULONG bucket_index = 0; bucket_index < num_of_buckets; bucket_index++)
	{
		CBucket *bucket = (*m_histogram_buckets)[bucket_index];
		frequency = frequency + bucket->GetFrequency();
	}
	if (CStatistics::Epsilon < m_null_freq)
	{
		frequency = frequency + m_null_freq;
	}
	return frequency + m_freq_remaining;
}
// sum of number of distinct values from buckets
CDouble
CHistogram::GetNumDistinct() const
{
	CDouble distinct(0.0);
	const ULONG num_of_buckets = m_histogram_buckets->Size();
	for (ULONG bucket_index = 0; bucket_index < num_of_buckets; bucket_index++)
	{
		CBucket *bucket = (*m_histogram_buckets)[bucket_index];
		distinct = distinct + bucket->GetNumDistinct();
	}
	CDouble distinct_null(0.0);
	if (CStatistics::Epsilon < m_null_freq)
	{
		distinct_null = 1.0;
	}
	return distinct + distinct_null + m_distinct_remaining;
}
// cap the total number of distinct values (NDVs) in buckets to the number of rows
// creates new histogram of buckets, as this modifies individual buckets in the array
void
CHistogram::CapNDVs(CDouble rows)
{
	const ULONG num_of_buckets = m_histogram_buckets->Size();
	CDouble distinct = GetNumDistinct();
	if (rows >= distinct)
	{
		// no need for capping
		return;
	}
	m_NDVs_were_scaled = true;
	CDouble scale_ratio = (rows / distinct).Get();
	// since we want to modify individual buckets for this and only this histogram,
	// we must first make a deep copy of the existing m_histogram_buckets as these buckets
	// may be shared among histograms. We can then overwrite m_histogram_buckets with the copy
	// and modify individual buckets.
	CBucketArray *histogram_buckets =
		DeepCopyHistogramBuckets(m_mp, m_histogram_buckets);
	for (ULONG ul = 0; ul < num_of_buckets; ul++)
	{
		CBucket *bucket = (*histogram_buckets)[ul];
		CDouble distinct_bucket = bucket->GetNumDistinct();
		bucket->SetDistinct(std::max(CHistogram::MinDistinct.Get(),
									 (distinct_bucket * scale_ratio).Get()));
	}
	m_histogram_buckets->Release();
	m_histogram_buckets = histogram_buckets;
	m_distinct_remaining = m_distinct_remaining * scale_ratio;
}
// create a deep copy of the bucket array.
// this should be used if a bucket needs to be modified
CBucketArray *
CHistogram::DeepCopyHistogramBuckets(CMemoryPool *mp,
									 const CBucketArray *buckets)
{
	CBucketArray *histogram_buckets =
		GPOS_NEW(mp) CBucketArray(mp, buckets->Size());
	for (ULONG ul = 0; ul < buckets->Size(); ul++)
	{
		CBucket *newBucket = (*buckets)[ul]->MakeBucketCopy(mp);
		histogram_buckets->Append(newBucket);
	}
	return histogram_buckets;
}
// sum of frequencies is approx 1.0
BOOL
CHistogram::IsNormalized() const
{
	CDouble frequency = GetFrequency();
	return (frequency > CDouble(1.0) - CStatistics::Epsilon &&
			frequency < CDouble(1.0) + CStatistics::Epsilon);
}
//	check if histogram is well formed? Checks are:
//		1. Buckets should be in order (if applicable)
//		2. Buckets should not overlap
//		3. Frequencies should add up to less than 1.0
BOOL
CHistogram::IsValid() const
{
	// frequencies should not add up to more than 1.0
	// since we round up for column statistics, it's possible for histograms to be slightly over 1.
	// ex: 26 buckets evenly split gives freq of 0.03846153846. ORCA receives this as 0.038462
	// giving a total frequency of 1.000012.
	// At most, we would be overestimating by 0.5 freq per bucket, so give (num_buckets / 2)
	// We also give an additional bit for internal rounding within ORCA calculations.
	if (GetFrequency() >
		CDouble(1.0) + (CStatistics::Epsilon * GetNumBuckets() / 2 + 1))
	{
		return false;
	}
	if (IsHistogramForTextRelatedTypes())
	{
		return m_histogram_buckets->Size() == 0 ||
			   this->ContainsOnlySingletonBuckets();
	}
	else
	{
		for (ULONG bucket_index = 1; bucket_index < m_histogram_buckets->Size();
			 bucket_index++)
		{
			CBucket *bucket = (*m_histogram_buckets)[bucket_index];
			CBucket *previous_bucket = (*m_histogram_buckets)[bucket_index - 1];
			// the later bucket's lower point must be greater than or equal to
			// earlier bucket's upper point. Even if the underlying datum does not
			// support ordering, the check is safe.
			if (bucket->GetLowerBound()->IsLessThan(
					previous_bucket->GetUpperBound()))
			{
				return false;
			}
		}
	}
	return true;
}
// construct new histogram with filter and normalize output histogram
CHistogram *
CHistogram::MakeHistogramFilterNormalize(
	CStatsPred::EStatsCmpType stats_cmp_type, CPoint *point,
	CDouble *scale_factor) const
{
	// if histogram is not well-defined, then result is not well defined
	if (!IsWellDefined())
	{
		CHistogram *result_histogram =
			GPOS_NEW(m_mp) CHistogram(m_mp, false /* is_well_defined */);
		*scale_factor = CDouble(1.0) / CHistogram::DefaultSelectivity;
		return result_histogram;
	}
	CHistogram *result_histogram = MakeHistogramFilter(stats_cmp_type, point);
	*scale_factor = result_histogram->NormalizeHistogram();
	GPOS_ASSERT(result_histogram->IsValid());
	return result_histogram;
}
// construct new histogram by joining with another and normalize
// output histogram. If the join is not an equality join the function
// returns an empty histogram
CHistogram *
CHistogram::MakeJoinHistogramNormalize(CStatsPred::EStatsCmpType stats_cmp_type,
									   CDouble rows,
									   const CHistogram *other_histogram,
									   CDouble rows_other,
									   CDouble *scale_factor) const
{
	GPOS_ASSERT(nullptr != other_histogram);
	if (CStatisticsUtils::IsStatsCmpTypeNdvEq(stats_cmp_type))
	{
		*scale_factor = std::max(
			std::max(CHistogram::MinDistinct.Get(), GetNumDistinct().Get()),
			std::max(CHistogram::MinDistinct.Get(),
					 other_histogram->GetNumDistinct().Get()));
		return MakeNDVBasedJoinHistogramEqualityFilter(other_histogram);
	}
	BOOL fEqOrINDF = (CStatsPred::EstatscmptEq == stats_cmp_type ||
					  CStatsPred::EstatscmptINDF == stats_cmp_type);
	if (!fEqOrINDF)
	{
		*scale_factor = CScaleFactorUtils::DefaultInequalityJoinPredScaleFactor;
		if (CStatsPred::EstatscmptNEq == stats_cmp_type ||
			CStatsPred::EstatscmptIDF == stats_cmp_type)
		{
			*scale_factor =
				GetInequalityJoinScaleFactor(rows, other_histogram, rows_other);
		}
		return MakeJoinHistogram(stats_cmp_type, other_histogram);
	}
	// if either histogram is not well-defined, the result is not well defined
	if (!IsWellDefined() || !other_histogram->IsWellDefined())
	{
		CHistogram *result_histogram =
			GPOS_NEW(m_mp) CHistogram(m_mp, false /* is_well_defined */);
		(*scale_factor) = CDouble(std::min(rows.Get(), rows_other.Get()));
		return result_histogram;
	}
	CHistogram *result_histogram =
		MakeJoinHistogram(stats_cmp_type, other_histogram);
	// The returned frequencies are based on the cartesian product, now normalize the histogram.
	// The returned scale factor will give us the ratio of the cartesian product's cardinality
	// and the actual join cardinality.
	*scale_factor = result_histogram->NormalizeHistogram();
	// TODO: legacy code, apply the Ramakrishnan and Gehrke method again on the entire table,
	// ignoring the computation we did on each histogram bucket in
	// CBucket::MakeBucketIntersect()
	if (!CJoinStatsProcessor::ComputeScaleFactorFromHistogramBuckets())
	{
		*scale_factor =
			std::max(std::max(MinDistinct.Get(), GetNumDistinct().Get()),
					 std::max(MinDistinct.Get(),
							  other_histogram->GetNumDistinct().Get()));
	}
	CDouble cartesian_product_num_rows = rows * rows_other;
	if (result_histogram->IsEmpty())
	{
		// if join histogram is empty for equality join condition
		// use Cartesian product size as scale factor
		*scale_factor = cartesian_product_num_rows;
	}
	if (CStatsPred::EstatscmptINDF == stats_cmp_type)
	{
		// if the predicate is INDF then we must count for the cartesian
		// product of NULL values in both the histograms
		CDouble expected_num_rows_eq_join =
			cartesian_product_num_rows / *scale_factor;
		CDouble num_null_rows = rows * m_null_freq;
		CDouble num_null_rows_other =
			rows_other * other_histogram->GetNullFreq();
		CDouble expected_num_rows_INDF =
			expected_num_rows_eq_join + (num_null_rows * num_null_rows_other);
		*scale_factor = std::max(
			CDouble(1.0), cartesian_product_num_rows / expected_num_rows_INDF);
	}
	// bound scale factor by cross product
	*scale_factor =
		std::min((*scale_factor).Get(), cartesian_product_num_rows.Get());
	GPOS_ASSERT(result_histogram->IsValid());
	return result_histogram;
}
// scalar factor of inequality (!=) join condition
CDouble
CHistogram::GetInequalityJoinScaleFactor(CDouble rows,
										 const CHistogram *other_histogram,
										 CDouble rows_other) const
{
	GPOS_ASSERT(nullptr != other_histogram);
	CDouble scale_factor(1.0);
	// we compute the scale factor of the inequality join (!= aka <>)
	// from the scale factor of equi-join
	CHistogram *equijoin_histogram =
		MakeJoinHistogramNormalize(CStatsPred::EstatscmptEq, rows,
								   other_histogram, rows_other, &scale_factor);
	GPOS_DELETE(equijoin_histogram);
	CDouble cartesian_product_num_rows = rows * rows_other;
	GPOS_ASSERT(CDouble(1.0) <= scale_factor);
	CDouble equality_selectivity = 1 / scale_factor;
	CDouble inequality_selectivity = (1 - equality_selectivity);
	scale_factor = cartesian_product_num_rows;
	if (CStatistics::Epsilon < inequality_selectivity)
	{
		scale_factor = 1 / inequality_selectivity;
	}
	return scale_factor;
}
// construct new histogram by left anti semi join with another and normalize
// output histogram
CHistogram *
CHistogram::MakeLASJHistogramNormalize(CStatsPred::EStatsCmpType stats_cmp_type,
									   CDouble rows,
									   const CHistogram *other_histogram,
									   CDouble *scale_factor,
									   BOOL DoIgnoreLASJHistComputation) const
{
	// if either histogram is not well-defined, the result is not well defined
	if (!IsWellDefined() || !other_histogram->IsWellDefined())
	{
		CHistogram *result_histogram =
			GPOS_NEW(m_mp) CHistogram(m_mp, false /* is_well_defined */);
		(*scale_factor) = CDouble(1.0);
		return result_histogram;
	}
	if (DoIgnoreLASJHistComputation)
	{
		// TODO:  04/14/2012 : LASJ derivation is pretty aggressive.
		// simply return a copy of the histogram with a scale factor corresponding to default selectivity.
		CHistogram *result_histogram = CopyHistogram();
		*scale_factor = CDouble(1.0) / CHistogram::DefaultSelectivity;
		GPOS_ASSERT(result_histogram->IsValid());
		return result_histogram;
	}
	CHistogram *result_histogram =
		MakeLASJHistogram(stats_cmp_type, other_histogram);
	*scale_factor = result_histogram->NormalizeHistogram();
	if (CStatsPred::EstatscmptEq != stats_cmp_type &&
		CStatsPred::EstatscmptINDF != stats_cmp_type)
	{
		// TODO: , June 6 2014, we currently only support join histogram computation
		// for equality and INDF predicates, we cannot accurately estimate
		// the scale factor of the other predicates
		*scale_factor = CDouble(1.0) / CHistogram::DefaultSelectivity;
	}
	*scale_factor = std::min((*scale_factor).Get(), rows.Get());
	GPOS_ASSERT(result_histogram->IsValid());
	return result_histogram;
}
// construct new histogram after applying the filter (no normalization)
CHistogram *
CHistogram::MakeHistogramFilter(CStatsPred::EStatsCmpType stats_cmp_type,
								CPoint *point) const
{
	CHistogram *result_histogram = nullptr;
	GPOS_ASSERT(IsOpSupportedForFilter(stats_cmp_type));
	switch (stats_cmp_type)
	{
		case CStatsPred::EstatscmptEq:
		{
			result_histogram = MakeHistogramEqualFilter(point);
			break;
		}
		case CStatsPred::EstatscmptINDF:
		{
			result_histogram = MakeHistogramINDFFilter(point);
			break;
		}
		case CStatsPred::EstatscmptL:
		case CStatsPred::EstatscmptLEq:
		{
			result_histogram = MakeHistogramLessThanOrLessThanEqualFilter(
				stats_cmp_type, point);
			break;
		}
		case CStatsPred::EstatscmptG:
		case CStatsPred::EstatscmptGEq:
		{
			result_histogram = MakeHistogramGreaterThanOrGreaterThanEqualFilter(
				stats_cmp_type, point);
			break;
		}
		case CStatsPred::EstatscmptNEq:
		{
			result_histogram = MakeHistogramInequalityFilter(point);
			break;
		}
		case CStatsPred::EstatscmptIDF:
		{
			result_histogram = MakeHistogramIDFFilter(point);
			break;
		}
		default:
		{
			GPOS_RTL_ASSERT_MSG(false, "Not supported. Must not be called.");
			break;
		}
	}
	return result_histogram;
}
// construct new histogram by joining with another histogram, no normalization
CHistogram *
CHistogram::MakeJoinHistogram(CStatsPred::EStatsCmpType stats_cmp_type,
							  const CHistogram *histogram) const
{
	GPOS_ASSERT(JoinPredCmpTypeIsSupported(stats_cmp_type));
	if (CStatsPred::EstatscmptEq == stats_cmp_type)
	{
		return MakeJoinHistogramEqualityFilter(histogram);
	}
	if (CStatsPred::EstatscmptINDF == stats_cmp_type)
	{
		return MakeJoinHistogramINDFFilter(histogram);
	}
	// TODO:  Feb 24 2014, We currently only support creation of histogram for equi join
	return GPOS_NEW(m_mp) CHistogram(m_mp, false /* is_well_defined */);
}
// construct new histogram by LASJ with another histogram, no normalization
CHistogram *
CHistogram::MakeLASJHistogram(CStatsPred::EStatsCmpType stats_cmp_type,
							  const CHistogram *histogram) const
{
	GPOS_ASSERT(nullptr != histogram);
	if (CStatsPred::EstatscmptEq != stats_cmp_type &&
		CStatsPred::EstatscmptINDF != stats_cmp_type)
	{
		// TODO: , June 6 2014, we currently only support join histogram computation
		// for equality and INDF predicates, we return the original histogram
		return CopyHistogram();
	}
	CBucketArray *new_buckets = GPOS_NEW(m_mp) CBucketArray(m_mp);
	CBucket *lower_split_bucket = nullptr;
	CBucket *upper_split_bucket = nullptr;
	ULONG idx1 = 0;	 // index into this histogram
	ULONG idx2 = 0;	 // index into other histogram
	CBucket *candidate_bucket = nullptr;
	const ULONG buckets1 = GetNumBuckets();
	const ULONG buckets2 = histogram->GetNumBuckets();
	while (idx1 < buckets1 && idx2 < buckets2)
	{
		// bucket from other histogram
		CBucket *bucket2 = (*histogram->m_histogram_buckets)[idx2];
		// yet to choose a candidate
		GPOS_ASSERT(nullptr == candidate_bucket);
		if (nullptr != upper_split_bucket)
		{
			candidate_bucket = upper_split_bucket;
		}
		else
		{
			candidate_bucket = (*m_histogram_buckets)[idx1]->MakeBucketCopy(
				m_mp);	// candidate bucket in result histogram
			idx1++;
		}
		lower_split_bucket = nullptr;
		upper_split_bucket = nullptr;
		candidate_bucket->Difference(m_mp, bucket2, &lower_split_bucket,
									 &upper_split_bucket);
		if (nullptr != lower_split_bucket)
		{
			new_buckets->Append(lower_split_bucket);
		}
		// need to find a new candidate
		GPOS_DELETE(candidate_bucket);
		candidate_bucket = nullptr;
		idx2++;
	}
	candidate_bucket = upper_split_bucket;
	// if we looked at all the buckets from the other histogram, then add remaining buckets from
	// this histogram
	if (idx2 == buckets2)
	{
		// candidate is part of the result
		if (nullptr != candidate_bucket)
		{
			new_buckets->Append(candidate_bucket);
		}
		CStatisticsUtils::AddRemainingBuckets(m_mp, m_histogram_buckets,
											  new_buckets, &idx1);
	}
	else
	{
		GPOS_DELETE(candidate_bucket);
	}
	CDouble null_freq = CLeftAntiSemiJoinStatsProcessor::NullFreqLASJ(
		stats_cmp_type, this, histogram);
	return GPOS_NEW(m_mp)
		CHistogram(m_mp, new_buckets, true /*is_well_defined*/, null_freq,
				   m_distinct_remaining, m_freq_remaining);
}
// scales frequencies on histogram so that they add up to 1.0.
// Returns the scaling factor that was employed. Should not be called on empty histogram.
CDouble
CHistogram::NormalizeHistogram()
{
	// trivially normalized
	if (GetNumBuckets() == 0 && CStatistics::Epsilon > m_null_freq &&
		CStatistics::Epsilon > m_distinct_remaining)
	{
		return CDouble(GPOS_FP_ABS_MAX);
	}
	CDouble scale_factor =
		std::max(DOUBLE(1.0), (CDouble(1.0) / GetFrequency()).Get());
	// if the scale factor is 1.0, we don't need to copy the buckets
	if (scale_factor != DOUBLE(1.0))
	{
		// since we want to modify individual buckets for this and only this histogram,
		// we must first make a deep copy of the existing m_histogram_buckets as these buckets
		// may be shared among histograms. We can then overwrite m_histogram_buckets with the copy
		// and modify individual buckets.
		CBucketArray *histogram_buckets =
			DeepCopyHistogramBuckets(m_mp, m_histogram_buckets);
		for (ULONG ul = 0; ul < histogram_buckets->Size(); ul++)
		{
			CBucket *bucket = (*histogram_buckets)[ul];
			bucket->SetFrequency(bucket->GetFrequency() * scale_factor);
		}
		m_histogram_buckets->Release();
		m_histogram_buckets = histogram_buckets;
	}
	m_null_freq = m_null_freq * scale_factor;
	CDouble freq_remaining =
		std::min((m_freq_remaining * scale_factor).Get(), DOUBLE(1.0));
	if (CStatistics::Epsilon > m_distinct_remaining)
	{
		freq_remaining = 0.0;
	}
	m_freq_remaining = freq_remaining;
	return scale_factor;
}
// returns shallow copy of histogram
CHistogram *
CHistogram::CopyHistogram() const
{
	m_histogram_buckets->AddRef();
	CHistogram *histogram_copy = GPOS_NEW(m_mp)
		CHistogram(m_mp, m_histogram_buckets, m_is_well_defined, m_null_freq,
				   m_distinct_remaining, m_freq_remaining);
	if (WereNDVsScaled())
	{
		histogram_copy->SetNDVScaled();
	}
	return histogram_copy;
}
BOOL
CHistogram::IsOpSupportedForTextFilter(CStatsPred::EStatsCmpType stats_cmp_type)
{
	// is the scalar comparison type one of =, <>
	switch (stats_cmp_type)
	{
		case CStatsPred::EstatscmptEq:
		case CStatsPred::EstatscmptNEq:
			return true;
		default:
			return false;
	}
}
// is statistics comparison type supported for filter?
BOOL
CHistogram::IsOpSupportedForFilter(CStatsPred::EStatsCmpType stats_cmp_type)
{
	// is the scalar comparison type one of <, <=, =, >=, >, <>, IDF, INDF
	switch (stats_cmp_type)
	{
		case CStatsPred::EstatscmptL:
		case CStatsPred::EstatscmptLEq:
		case CStatsPred::EstatscmptEq:
		case CStatsPred::EstatscmptGEq:
		case CStatsPred::EstatscmptG:
		case CStatsPred::EstatscmptNEq:
		case CStatsPred::EstatscmptIDF:
		case CStatsPred::EstatscmptINDF:
			return true;
		default:
			return false;
	}
}
BOOL
CHistogram::ContainsOnlySingletonBuckets() const
{
	for (ULONG ul = 0; ul < m_histogram_buckets->Size(); ++ul)
	{
		CBucket *bucket = (*m_histogram_buckets)[ul];
		if (!bucket->IsSingleton())
		{
			return false;
		}
	}
	return true;
}
// is comparison type supported for join?
BOOL
CHistogram::JoinPredCmpTypeIsSupported(CStatsPred::EStatsCmpType stats_cmp_type)
{
	return (CStatsPred::EstatscmptOther != stats_cmp_type);
}
// construct a new histogram with equality join
CHistogram *
CHistogram::MakeJoinHistogramEqualityFilter(const CHistogram *histogram) const
{
	ULONG idx1 = 0;	 // index on buckets from this histogram
	ULONG idx2 = 0;	 // index on buckets from other histogram
	const ULONG buckets1 = GetNumBuckets();
	const ULONG buckets2 = histogram->GetNumBuckets();
	CDouble hist1_buckets_freq(0.0);
	CDouble hist2_buckets_freq(0.0);
	CDouble distinct_remaining(0.0);
	CDouble freq_remaining(0.0);
	if (NeedsNDVBasedCardEstimationForEq(this) ||
		NeedsNDVBasedCardEstimationForEq(histogram))
	{
		return MakeNDVBasedJoinHistogramEqualityFilter(histogram);
	}
	CBucketArray *join_buckets = GPOS_NEW(m_mp) CBucketArray(m_mp);
	while (idx1 < buckets1 && idx2 < buckets2)
	{
		CBucket *bucket1 = (*m_histogram_buckets)[idx1];
		CBucket *bucket2 = (*histogram->m_histogram_buckets)[idx2];
		if (bucket1->Intersects(bucket2))
		{
			CDouble freq_intersect1(0.0);
			CDouble freq_intersect2(0.0);
			CBucket *new_bucket = bucket1->MakeBucketIntersect(
				m_mp, bucket2, &freq_intersect1, &freq_intersect2);
			join_buckets->Append(new_bucket);
			hist1_buckets_freq = hist1_buckets_freq + freq_intersect1;
			hist2_buckets_freq = hist2_buckets_freq + freq_intersect2;
			INT res = CBucket::CompareUpperBounds(bucket1, bucket2);
			if (0 == res)
			{
				// both ubs are equal
				idx1++;
				idx2++;
			}
			else if (1 > res)
			{
				// bucket1's ub is smaller than that of the ub of bucket2
				idx1++;
			}
			else
			{
				idx2++;
			}
		}
		else if (bucket1->IsBefore(bucket2))
		{
			// buckets do not intersect there one bucket is before the other
			idx1++;
		}
		else
		{
			GPOS_ASSERT(bucket2->IsBefore(bucket1));
			idx2++;
		}
	}
	ComputeJoinNDVRemainInfo(this, histogram, join_buckets, hist1_buckets_freq,
							 hist2_buckets_freq, &distinct_remaining,
							 &freq_remaining);
	return GPOS_NEW(m_mp)
		CHistogram(m_mp, join_buckets, true /*is_well_defined*/,
				   0.0 /*null_freq*/, distinct_remaining, freq_remaining);
}
// construct a new histogram for NDV based cardinality estimation
CHistogram *
CHistogram::MakeNDVBasedJoinHistogramEqualityFilter(
	const CHistogram *histogram) const
{
	CDouble distinct_remaining(0.0);
	CDouble freq_remaining(0.0);
	CBucketArray *join_buckets = GPOS_NEW(m_mp) CBucketArray(m_mp);
	// compute the number of non-null distinct values in the input histograms
	CDouble NDVs1 = this->GetNumDistinct();
	CDouble freq_remain1 = this->GetFrequency();
	CDouble null_freq1 = this->GetNullFreq();
	if (CStatistics::Epsilon < null_freq1)
	{
		NDVs1 = std::max(CDouble(0.0), (NDVs1 - 1.0));
		freq_remain1 = freq_remain1 - null_freq1;
	}
	CDouble NDVs2 = histogram->GetNumDistinct();
	CDouble freq_remain2 = histogram->GetFrequency();
	CDouble null_freq2 = histogram->GetNullFreq();
	if (CStatistics::Epsilon < histogram->GetNullFreq())
	{
		NDVs2 = std::max(CDouble(0.0), (NDVs2 - 1.0));
		freq_remain2 = freq_remain2 - null_freq2;
	}
	// the estimated number of distinct value is the minimum of the non-null
	// distinct values of the two inputs.
	distinct_remaining = std::min(NDVs1, NDVs2);
	// the frequency of a tuple in this histogram (with frequency freq_remain1) joining with
	// a tuple in another relation (with frequency freq_remain2) is a product of the two frequencies divided by
	// the maximum NDV of the two inputs
	// Example: consider two relations A and B with 10 tuples each. Let both relations have no nulls.
	// Let A have 2 distinct values, while B have 5 distinct values. Under uniform distribution of NDVs
	// for statistics purposes we can view A = (1,2,1,2,1,2,1,2,1,2) and B = (1,2,3,4,5,1,2,3,4,5)
	// Join Cardinality is 20, with frequency of the join tuple being 0.2 (since cartesian product is 100).
	// freq_remain1 = freq_remain2 = 1, and std::max(NDVs1, NDVs2) = 5. Therefore freq_remaining = 1/5 = 0.2
	if (CStatistics::Epsilon < distinct_remaining)
	{
		freq_remaining = freq_remain1 * freq_remain2 / std::max(NDVs1, NDVs2);
	}
	return GPOS_NEW(m_mp)
		CHistogram(m_mp, join_buckets, true /*is_well_defined*/,
				   0.0 /*null_freq*/, distinct_remaining, freq_remaining);
}
// construct a new histogram for an INDF join predicate
CHistogram *
CHistogram::MakeJoinHistogramINDFFilter(const CHistogram *histogram) const
{
	CHistogram *join_histogram = MakeJoinHistogramEqualityFilter(histogram);
	// compute the null frequency is the same means as how we perform equi-join
	// see CBuckets::MakeBucketIntersect for details
	CDouble null_freq = GetNullFreq() * histogram->GetNullFreq() * DOUBLE(1.0) /
						std::max(GetNumDistinct(), histogram->GetNumDistinct());
	join_histogram->SetNullFrequency(null_freq);
	return join_histogram;
}
// check if we can compute NDVRemain for JOIN histogram for the given input histograms
BOOL
CHistogram::CanComputeJoinNDVRemain(const CHistogram *histogram1,
									const CHistogram *histogram2)
{
	GPOS_ASSERT(nullptr != histogram1);
	GPOS_ASSERT(nullptr != histogram2);
	BOOL has_buckets1 = (0 != histogram1->GetNumBuckets());
	BOOL has_buckets2 = (0 != histogram2->GetNumBuckets());
	BOOL has_distinct_remain1 =
		CStatistics::Epsilon < histogram1->GetDistinctRemain();
	BOOL has_distinct_remain2 =
		CStatistics::Epsilon < histogram2->GetDistinctRemain();
	if (!has_distinct_remain1 && !has_distinct_remain2)
	{
		// no remaining tuples information hence no need compute NDVRemain for join histogram
		return false;
	}
	if ((has_distinct_remain1 || has_buckets1) &&
		(has_distinct_remain2 || has_buckets2))
	{
		//
		return true;
	}
	// insufficient information to compute JoinNDVRemain, we need:
	// 1. for each input either have a histograms or NDVRemain
	// 2. at least one inputs must have NDVRemain
	return false;
}
// compute the effects of the NDV and frequency of the tuples not captured by the histogram
void
CHistogram::ComputeJoinNDVRemainInfo(const CHistogram *histogram1,
									 const CHistogram *histogram2,
									 const CBucketArray *join_buckets,
									 CDouble hist1_buckets_freq,
									 CDouble hist2_buckets_freq,
									 CDouble *result_distinct_remain,
									 CDouble *result_freq_remain)
{
	GPOS_ASSERT(nullptr != histogram1);
	GPOS_ASSERT(nullptr != histogram2);
	GPOS_ASSERT(nullptr != join_buckets);
	CDouble join_NDVs = CStatisticsUtils::GetNumDistinct(join_buckets);
	CDouble join_freq = CStatisticsUtils::GetFrequency(join_buckets);
	*result_distinct_remain = 0.0;
	*result_freq_remain = 0.0;
	if (!CanComputeJoinNDVRemain(histogram1, histogram2))
	{
		return;
	}
	if (CStatistics::Epsilon >= (1 - join_freq))
	{
		return;
	}
	// we compute the following information for the resulting join histogram
	// 1. NDVRemain and 2. Frequency of the NDVRemain
	// compute the number of non-null distinct values in each input histogram
	CDouble num_distinct1 = histogram1->GetNumDistinct();
	CDouble num_distinct_non_null1 = num_distinct1;
	if (CStatistics::Epsilon < histogram1->GetNullFreq())
	{
		num_distinct_non_null1 = num_distinct_non_null1 - 1.0;
	}
	CDouble num_distinct2 = histogram2->GetNumDistinct();
	CDouble num_distinct_non_null2 = num_distinct2;
	if (CStatistics::Epsilon < histogram2->GetNullFreq())
	{
		num_distinct_non_null2 = num_distinct_non_null2 - 1.0;
	}
	// the estimated final number of distinct value for the join is the minimum of the non-null
	// distinct values of the two inputs. This follows the principle of used to estimate join
	// scaling factor -- defined as the maximum NDV of the two inputs
	CDouble final_join_NDVs =
		std::min(num_distinct_non_null1, num_distinct_non_null2);
	CDouble remaining_join_NDVs =
		std::max(final_join_NDVs - join_NDVs, CDouble(0.0));
	// compute the frequency of the non-joining buckets in each input histogram
	CDouble freq_buckets1 =
		CStatisticsUtils::GetFrequency(histogram1->GetBuckets());
	CDouble freq_buckets2 =
		CStatisticsUtils::GetFrequency(histogram2->GetBuckets());
	CDouble freq_non_join_buckets1 =
		std::max(CDouble(0), (freq_buckets1 - hist1_buckets_freq));
	CDouble freq_non_join_buckets2 =
		std::max(CDouble(0), (freq_buckets2 - hist2_buckets_freq));
	// compute the NDV of the non-joining buckets
	CDouble NDVs_non_join_buckets1 =
		CStatisticsUtils::GetNumDistinct(histogram1->GetBuckets()) - join_NDVs;
	CDouble NDVs_non_join_buckets2 =
		CStatisticsUtils::GetNumDistinct(histogram2->GetBuckets()) - join_NDVs;
	CDouble freq_remain1 = histogram1->GetFreqRemain();
	CDouble freq_remain2 = histogram2->GetFreqRemain();
	// the frequency of the NDVRemain of the join is made of:
	// 1. frequency of the NDV as a result of joining NDVRemain1 and NDVRemain2
	// 2. frequency of the NDV as a results of joining NDVRemain1 and NDVs_non_join_buckets2
	// 3. frequency of the NDV as a results of joining NDVRemain2 and NDVs_non_join_buckets1
	CDouble freq_remain_join = freq_remain1 * freq_remain2 /
							   std::max(histogram1->GetDistinctRemain(),
										histogram2->GetDistinctRemain());
	freq_remain_join =
		freq_remain_join +
		freq_remain1 * freq_non_join_buckets2 /
			std::max(num_distinct_non_null1, NDVs_non_join_buckets2);
	freq_remain_join =
		freq_remain_join +
		freq_remain2 * freq_non_join_buckets1 /
			std::max(num_distinct_non_null2, NDVs_non_join_buckets1);
	*result_distinct_remain = remaining_join_NDVs;
	*result_freq_remain = freq_remain_join;
}
// construct new histogram by removing duplicates and normalize output histogram
CHistogram *
CHistogram::MakeGroupByHistogramNormalize(CDouble,	// rows,
										  CDouble *result_distinct_values) const
{
	// if either histogram is not well-defined, the result is not well defined
	if (!IsWellDefined())
	{
		CHistogram *result_histogram =
			GPOS_NEW(m_mp) CHistogram(m_mp, false /* is_well_defined */);
		*result_distinct_values = MinDistinct / CHistogram::DefaultSelectivity;
		return result_histogram;
	}
	// total number of distinct values
	CDouble distinct = GetNumDistinct();
	CBucketArray *new_buckets = GPOS_NEW(m_mp) CBucketArray(m_mp);
	const ULONG num_of_buckets = m_histogram_buckets->Size();
	for (ULONG ul = 0; ul < num_of_buckets; ul++)
	{
		CBucket *bucket = (*m_histogram_buckets)[ul];
		CPoint *lower_bound = bucket->GetLowerBound();
		CPoint *upper_bound = bucket->GetUpperBound();
		lower_bound->AddRef();
		upper_bound->AddRef();
		BOOL is_upper_closed = false;
		if (lower_bound->Equals(upper_bound))
		{
			is_upper_closed = true;
		}
		CBucket *new_bucket = GPOS_NEW(m_mp) CBucket(
			lower_bound, upper_bound, true /* fClosedLower */, is_upper_closed,
			bucket->GetNumDistinct() / distinct, bucket->GetNumDistinct());
		new_buckets->Append(new_bucket);
	}
	CDouble new_null_freq = CDouble(0.0);
	if (CStatistics::Epsilon < m_null_freq)
	{
		new_null_freq = std::min(CDouble(1.0), CDouble(1.0) / distinct);
	}
	CDouble freq_remaining = 0.0;
	if (CStatistics::Epsilon < m_distinct_remaining)
	{
		// TODO:  11/22/2013 - currently the NDV of a histogram could be 0 or a decimal number.
		// We may not need the capping if later distinct is lower bounded at 1.0 for non-empty histogram
		freq_remaining =
			std::min(CDouble(1.0), m_distinct_remaining / distinct);
	}
	CHistogram *result_histogram = GPOS_NEW(m_mp)
		CHistogram(m_mp, new_buckets, true /*is_well_defined*/, new_null_freq,
				   m_distinct_remaining, freq_remaining);
	*result_distinct_values = result_histogram->GetNumDistinct();
	return result_histogram;
}
// construct new histogram based on union all operation
CHistogram *
CHistogram::MakeUnionAllHistogramNormalize(CDouble rows,
										   const CHistogram *histogram,
										   CDouble rows_other) const
{
	GPOS_ASSERT(nullptr != histogram);
	GPOS_ASSERT(this->IsValid());
	GPOS_ASSERT(histogram->IsValid());
	CBucketArray *new_buckets = GPOS_NEW(m_mp) CBucketArray(m_mp);
	ULONG idx1 = 0;	 // index on buckets from this histogram
	ULONG idx2 = 0;	 // index on buckets from other histogram
	CBucket *bucket1 = (*this)[idx1];
	CBucket *bucket2 = (*histogram)[idx2];
	CDouble rows_new = (rows_other + rows);
	// flags to determine if the buckets where residue of the bucket-merge operation
	BOOL bucket1_is_residual = false;
	BOOL bucket2_is_residual = false;
	while (nullptr != bucket1 && nullptr != bucket2)
	{
		if (bucket1->IsBefore(bucket2))
		{
			new_buckets->Append(
				bucket1->MakeBucketUpdateFrequency(m_mp, rows, rows_new));
			CleanupResidualBucket(bucket1, bucket1_is_residual);
			idx1++;
			bucket1 = (*this)[idx1];
			bucket1_is_residual = false;
		}
		else if (bucket2->IsBefore(bucket1))
		{
			new_buckets->Append(
				bucket2->MakeBucketUpdateFrequency(m_mp, rows_other, rows_new));
			CleanupResidualBucket(bucket2, bucket2_is_residual);
			idx2++;
			bucket2 = (*histogram)[idx2];
			bucket2_is_residual = false;
		}
		else
		{
			GPOS_ASSERT(bucket1->Intersects(bucket2));
			CBucket *bucket1_new = nullptr;
			CBucket *bucket2_new = nullptr;
			CDouble result_rows(0.0);
			CBucket *merge_bucket = bucket1->SplitAndMergeBuckets(
				m_mp, bucket2, rows, rows_other, &bucket1_new, &bucket2_new,
				&result_rows);
			new_buckets->Append(merge_bucket);
			CleanupResidualBucket(bucket1, bucket1_is_residual);
			CleanupResidualBucket(bucket2, bucket2_is_residual);
			bucket1 =
				GetNextBucket(this, bucket1_new, &bucket1_is_residual, &idx1);
			bucket2 = GetNextBucket(histogram, bucket2_new,
									&bucket2_is_residual, &idx2);
		}
	}
	const ULONG num_buckets1 = GetNumBuckets();
	const ULONG num_buckets2 = histogram->GetNumBuckets();
	GPOS_ASSERT_IFF(nullptr == bucket1, idx1 == num_buckets1);
	GPOS_ASSERT_IFF(nullptr == bucket2, idx2 == num_buckets2);
	idx1 = AddResidualUnionAllBucket(new_buckets, bucket1, rows, rows_new,
									 bucket1_is_residual, idx1);
	idx2 = AddResidualUnionAllBucket(new_buckets, bucket2, rows_other, rows_new,
									 bucket2_is_residual, idx2);
	CleanupResidualBucket(bucket1, bucket1_is_residual);
	CleanupResidualBucket(bucket2, bucket2_is_residual);
	// add any leftover buckets from other histogram
	AddBuckets(m_mp, histogram->m_histogram_buckets, new_buckets, rows_other,
			   rows_new, idx2, num_buckets2);
	// add any leftover buckets from this histogram
	AddBuckets(m_mp, m_histogram_buckets, new_buckets, rows, rows_new, idx1,
			   num_buckets1);
	CDouble new_null_freq =
		(m_null_freq * rows + histogram->m_null_freq * rows_other) / rows_new;
	CDouble distinct_remaining =
		std::max(m_distinct_remaining, histogram->GetDistinctRemain());
	CDouble freq_remaining =
		(m_freq_remaining * rows + histogram->GetFreqRemain() * rows_other) /
		rows_new;
	ULONG max_num_buckets = COptCtxt::PoctxtFromTLS()
								->GetOptimizerConfig()
								->GetStatsConf()
								->UlMaxStatsBuckets();
	ULONG desired_num_buckets =
		std::max((ULONG) max_num_buckets, std::max(num_buckets1, num_buckets2));
	CBucketArray *result_buckets =
		CombineBuckets(m_mp, new_buckets, desired_num_buckets);
	CHistogram *result_histogram = GPOS_NEW(m_mp)
		CHistogram(m_mp, result_buckets, true /*is_well_defined*/,
				   new_null_freq, distinct_remaining, freq_remaining);
	(void) result_histogram->NormalizeHistogram();
	GPOS_ASSERT(result_histogram->IsValid());
	new_buckets->Release();
	return result_histogram;
}
// add residual bucket in the union all operation to the array of buckets in the histogram
ULONG
CHistogram::AddResidualUnionAllBucket(CBucketArray *histogram_buckets,
									  CBucket *bucket, CDouble rows_old,
									  CDouble rows_new, BOOL bucket_is_residual,
									  ULONG index) const
{
	GPOS_ASSERT(nullptr != histogram_buckets);
	if (bucket_is_residual)
	{
		histogram_buckets->Append(
			bucket->MakeBucketUpdateFrequency(m_mp, rows_old, rows_new));
		return index + 1;
	}
	return index;
}
// add buckets from one array to another
void
CHistogram::AddBuckets(CMemoryPool *mp, const CBucketArray *src_buckets,
					   CBucketArray *dest_buckets, CDouble rows_old,
					   CDouble rows_new, ULONG begin, ULONG end)
{
	GPOS_ASSERT(nullptr != src_buckets);
	GPOS_ASSERT(nullptr != dest_buckets);
	GPOS_ASSERT(begin <= end);
	GPOS_ASSERT(end <= src_buckets->Size());
	for (ULONG ul = begin; ul < end; ul++)
	{
		dest_buckets->Append(
			((*src_buckets)[ul])
				->MakeBucketUpdateFrequency(mp, rows_old, rows_new));
	}
}
// Given an array of buckets, merge together buckets with similar information
// until the total number of buckets reaches the desired_num_buckets. It does
// this by using a combination of two ratios: freq/ndv and freq/bucket_width.
// These two ratios were decided based off the following examples:
//
// Assuming that we calculate row counts for selections like the following:
// - For a predicate col = const: rows * freq / NDVs
// - For a predicate col < const: rows * (sum of full or fractional frequencies)
//
// Example 1 (rows = 100), freq/width, ndvs/width and ndvs/freq are all the same:
// Bucket 1: [0, 4)   freq .2  NDVs 2  width 4  freq/width = .05 ndv/width = .5 freq/ndv = .1
// Bucket 2: [4, 12)  freq .4  NDVs 4  width 8  freq/width = .05 ndv/width = .5 freq/ndv = .1
// Combined: [0, 12)  freq .6  NDVs 6  width 12
//
// This should give the same estimates for various predicates, with separate or combined buckets:
// pred          separate buckets         combined bucket   result
// -------       ---------------------    ---------------   -----------
// col = 3  ==>  100 * .2 / 2           = 100 * .6 / 6    = 10 rows
// col = 5  ==>  100 * .4 / 4           = 100 * .6 / 6    = 10 rows
// col < 6  ==>  100 * (.2 + .25 * .4)  = 100 * .5 * .6   = 30 rows
//
// Example 2 (rows = 100), freq and ndvs are the same, but width is different:
// Bucket 1: [0, 4)   freq .4  NDVs 4  width 4  freq/width = .1 ndv/width = 1 freq/ndv = .1
// Bucket 2: [4, 12)  freq .4  NDVs 4  width 8  freq/width = .05 ndv/width = .5 freq/ndv = .1
// Combined: [0, 12)  freq .8  NDVs 8  width 12
//
// This will give different estimates with the combined bucket, but only for non-equal preds:
// pred          separate buckets         combined bucket   results
// -------       ---------------------    ---------------   --------------
// col = 3  ==>  100 * .4 / 4           = 100 * .8 / 8    = 10 rows
// col = 5  ==>  100 * .4 / 4           = 100 * .8 / 8    = 10 rows
// col < 6  ==>  100 * (.4 + .25 * .4) != 100 * .5 * .8     50 vs. 40 rows
//
// Example 3 (rows = 100), now NDVs / freq is different:
// Bucket 1: [0, 4)   freq .2  NDVs 4  width 4  freq/width = .05 ndv/width = 1 freq/ndv = .05
// Bucket 2: [4, 12)  freq .4  NDVs 4  width 8  freq/width = .05 ndv/width = .5 freq/ndv = .1
// Combined: [0, 12)  freq .6  NDVs 8  width 12
//
// This will give different estimates with the combined bucket, but only for equal preds:
// pred          separate buckets         combined bucket   results
// -------       ---------------------    ---------------   ---------------
// col = 3  ==>  100 * .2 / 4          != 100 * .6 / 8      5 vs. 7.5 rows
// col = 5  ==>  100 * .4 / 4          != 100 * .8 / 8      10 vs. 7.5 rows
// col < 6  ==>  100 * (.2 + .25 * .4)  = 100 * .5 * .6   = 30 rows
CBucketArray *
CHistogram::CombineBuckets(CMemoryPool *mp, CBucketArray *buckets,
						   ULONG desired_num_buckets)
{
	GPOS_ASSERT(desired_num_buckets >= 1);
	if (buckets->Size() <= desired_num_buckets)
	{
		buckets->AddRef();
		return buckets;
	}
#ifdef GPOS_DEBUG
	CDouble start_frequency(0.0);
	for (ULONG ul = 0; ul < buckets->Size(); ++ul)
	{
		CBucket *bucket = (*buckets)[ul];
		start_frequency = start_frequency + bucket->GetFrequency();
	}
	GPOS_ASSERT(start_frequency <= CDouble(1.0) + CStatistics::Epsilon);
#endif
	CBucketArray *result_buckets = GPOS_NEW(mp) CBucketArray(mp);
	ULONG bucketsToCombine = buckets->Size() - desired_num_buckets;
	CKHeap<SAdjBucketBoundaryArray, SAdjBucketBoundary> *boundary_factors =
		GPOS_NEW(mp) CKHeap<SAdjBucketBoundaryArray, SAdjBucketBoundary>(
			mp, bucketsToCombine);
	// loop over the bucket boundaries and determine how much information
	// we would lose by eliminating them
	for (ULONG ul = 0; ul < buckets->Size() - 1; ++ul)
	{
		// calculate the ratios for each value
		CBucket *bucket1 = (*buckets)[ul];
		CBucket *bucket2 = (*buckets)[ul + 1];
		// only consider buckets that have matching boundaries
		if (bucket1->GetUpperBound()->Equals(bucket2->GetLowerBound()) &&
			bucket1->IsUpperClosed() ^ bucket2->IsLowerClosed())
		{
			GPOS_ASSERT(bucket1->IsUpperClosed() ^ bucket2->IsLowerClosed());
			CDouble freq1 = bucket1->GetFrequency();
			CDouble ndv1 = bucket1->GetNumDistinct();
			CDouble width1 = bucket1->GetUpperBound()->Width(
				bucket1->GetLowerBound(), bucket1->IsLowerClosed(),
				bucket1->IsUpperClosed());
			CDouble freq2 = bucket2->GetFrequency();
			CDouble ndv2 = bucket2->GetNumDistinct();
			CDouble width2 = bucket2->GetUpperBound()->Width(
				bucket2->GetLowerBound(), bucket2->IsLowerClosed(),
				bucket2->IsUpperClosed());
			// a bucket boundary doesn't really add any new information
			// when eliminating it results in the same cardinality estimates.
			// This is true if both adjacent buckets have the same freq/NDV
			// and the same freq/width.
			CDouble freqNdv1 = freq1 / ndv1;
			CDouble freqNdv2 = freq2 / ndv2;
			CDouble freqWidth1 = freq1 / width1;
			CDouble freqWidth2 = freq2 / width2;
			// ideal merge is when the factors = 0
			CDouble factor1 = (freqNdv1 - freqNdv2).Absolute();
			CDouble factor2 = (freqWidth1 - freqWidth2).Absolute();
			// we give equal weight to the information contributed by
			// freq/ndv and freq/width
			SAdjBucketBoundary *elem =
				GPOS_NEW(mp) SAdjBucketBoundary(ul, factor1 + factor2);
			boundary_factors->Insert(elem);
		}
	}
	// find the boundaries that are least important and track the index locations
	// that need to be removed in a BitSet. This allows us to check each boundary
	// as we walk the bucket array sequentially.
	CBitSet *indexes_to_merge = GPOS_NEW(mp) CBitSet(mp);
	SAdjBucketBoundary *candidate_to_remove;
	while (nullptr !=
		   (candidate_to_remove = boundary_factors->RemoveBestElement()))
	{
		indexes_to_merge->ExchangeSet(candidate_to_remove->m_boundary_index);
		GPOS_DELETE(candidate_to_remove);
	}
	// go through the bucket array and combine buckets as necessary
	for (ULONG ul = 0; ul < buckets->Size(); ++ul)
	{
		CBucket *bucket = (*buckets)[ul];
		ULONG end_bucket_ix = ul;
		CDouble merged_frequencies = bucket->GetFrequency();
		CDouble merged_ndvs = bucket->GetNumDistinct();
		// 2 or more buckets will be combined into one
		while (indexes_to_merge->Get(end_bucket_ix))
		{
			end_bucket_ix++;
			merged_frequencies =
				merged_frequencies + (*buckets)[end_bucket_ix]->GetFrequency();
			merged_ndvs =
				merged_ndvs + (*buckets)[end_bucket_ix]->GetNumDistinct();
		}
		if (end_bucket_ix > ul)
		{
			CBucket *merged = nullptr;
			CBucket *bucket1 = (*buckets)[ul];
			CBucket *bucket2 = (*buckets)[end_bucket_ix];
			// merge the bucket
			bucket1->GetLowerBound()->AddRef();
			bucket2->GetUpperBound()->AddRef();
			merged = GPOS_NEW(mp)
				CBucket(bucket1->GetLowerBound(), bucket2->GetUpperBound(),
						bucket1->IsLowerClosed(), bucket2->IsUpperClosed(),
						merged_frequencies, merged_ndvs);
			result_buckets->Append(merged);
			ul = end_bucket_ix;
		}
		else
		{
			result_buckets->Append(bucket->MakeBucketCopy(mp));
		}
	}
#ifdef GPOS_DEBUG
	CDouble end_frequency(0.0);
	for (ULONG ul = 0; ul < result_buckets->Size(); ++ul)
	{
		CBucket *bucket = (*result_buckets)[ul];
		end_frequency = end_frequency + bucket->GetFrequency();
	}
	GPOS_ASSERT(start_frequency - end_frequency <=
				CDouble(0.0) + CStatistics::Epsilon);
#endif
	// GPDB_12_MERGE_FIXME: the desired_num_buckets handling is broken for singleton buckets
	// While it limits the number of buckets for non-singleton buckets, singleton buckets
	// are not merged, and thus we can get cases where the number of result buckets is
	// larger than the desired number of buckets
	GPOS_ASSERT(result_buckets->Size() == desired_num_buckets ||
				result_buckets->Size() == desired_num_buckets + 1);
	indexes_to_merge->Release();
	boundary_factors->Release();
	return result_buckets;
}
// cleanup residual buckets
void
CHistogram::CleanupResidualBucket(CBucket *bucket, BOOL bucket_is_residual)
{
	if (nullptr != bucket && bucket_is_residual)
	{
		GPOS_DELETE(bucket);
		bucket = nullptr;
	}
}
// get the next bucket for union / union all
CBucket *
CHistogram::GetNextBucket(const CHistogram *histogram, CBucket *new_bucket,
						  BOOL *result_bucket_is_residual,
						  ULONG *current_bucket_index)
{
	GPOS_ASSERT(nullptr != histogram);
	if (nullptr != new_bucket)
	{
		*result_bucket_is_residual = true;
		return new_bucket;
	}
	*current_bucket_index = *current_bucket_index + 1;
	*result_bucket_is_residual = false;
	return (*histogram)[*current_bucket_index];
}
//	construct new histogram based on union operation
CHistogram *
CHistogram::MakeUnionHistogramNormalize(CDouble rows,
										const CHistogram *other_histogram,
										CDouble rows_other,
										CDouble *num_output_rows) const
{
	GPOS_ASSERT(nullptr != other_histogram);
	GPOS_ASSERT(this->IsValid());
	GPOS_ASSERT(other_histogram->IsValid());
	if (!other_histogram->IsWellDefined() && !IsWellDefined())
	{
		*num_output_rows = CDouble(std::max(rows.Get(), rows_other.Get()));
		CHistogram *result_histogram =
			GPOS_NEW(m_mp) CHistogram(m_mp, false /* is_well_defined */);
		return result_histogram;
	}
	ULONG idx1 = 0;	 // index on buckets from this histogram
	ULONG idx2 = 0;	 // index on buckets from other histogram
	CBucket *bucket1 = (*this)[idx1];
	CBucket *bucket2 = (*other_histogram)[idx2];
	// flags to determine if the buckets where residue of the bucket-merge operation
	BOOL bucket1_is_residual = false;
	BOOL bucket2_is_residual = false;
	// array of buckets in the resulting histogram
	CBucketArray *histogram_buckets = GPOS_NEW(m_mp) CBucketArray(m_mp);
	// number of tuples in each bucket of the resulting histogram
	CDoubleArray *num_tuples_per_bucket = GPOS_NEW(m_mp) CDoubleArray(m_mp);
	CDouble cumulative_num_rows(0.0);
	while (nullptr != bucket1 && nullptr != bucket2)
	{
		if (bucket1->IsBefore(bucket2))
		{
			histogram_buckets->Append(bucket1->MakeBucketCopy(m_mp));
			num_tuples_per_bucket->Append(
				GPOS_NEW(m_mp) CDouble(bucket1->GetFrequency() * rows));
			CleanupResidualBucket(bucket1, bucket1_is_residual);
			idx1++;
			bucket1 = (*this)[idx1];
			bucket1_is_residual = false;
		}
		else if (bucket2->IsBefore(bucket1))
		{
			histogram_buckets->Append(bucket2->MakeBucketCopy(m_mp));
			num_tuples_per_bucket->Append(
				GPOS_NEW(m_mp) CDouble(bucket2->GetFrequency() * rows_other));
			CleanupResidualBucket(bucket2, bucket2_is_residual);
			idx2++;
			bucket2 = (*other_histogram)[idx2];
			bucket2_is_residual = false;
		}
		else
		{
			GPOS_ASSERT(bucket1->Intersects(bucket2));
			CBucket *bucket1_new = nullptr;
			CBucket *bucket2_new = nullptr;
			CBucket *merge_bucket = nullptr;
			CDouble result_rows(0.0);
			merge_bucket = bucket1->SplitAndMergeBuckets(
				m_mp, bucket2, rows, rows_other, &bucket1_new, &bucket2_new,
				&result_rows, false /* is_union_all */
			);
			// add the estimated number of rows in the merged bucket
			num_tuples_per_bucket->Append(GPOS_NEW(m_mp) CDouble(
				merge_bucket->GetFrequency() * result_rows));
			histogram_buckets->Append(merge_bucket);
			CleanupResidualBucket(bucket1, bucket1_is_residual);
			CleanupResidualBucket(bucket2, bucket2_is_residual);
			bucket1 =
				GetNextBucket(this, bucket1_new, &bucket1_is_residual, &idx1);
			bucket2 = GetNextBucket(other_histogram, bucket2_new,
									&bucket2_is_residual, &idx2);
		}
	}
	const ULONG num_buckets1 = GetNumBuckets();
	const ULONG num_buckets2 = other_histogram->GetNumBuckets();
	GPOS_ASSERT_IFF(nullptr == bucket1, idx1 == num_buckets1);
	GPOS_ASSERT_IFF(nullptr == bucket2, idx2 == num_buckets2);
	idx1 = AddResidualUnionBucket(histogram_buckets, bucket1, rows,
								  bucket1_is_residual, idx1,
								  num_tuples_per_bucket);
	idx2 = AddResidualUnionBucket(histogram_buckets, bucket2, rows_other,
								  bucket2_is_residual, idx2,
								  num_tuples_per_bucket);
	CleanupResidualBucket(bucket1, bucket1_is_residual);
	CleanupResidualBucket(bucket2, bucket2_is_residual);
	// add any leftover buckets from other histogram
	AddBuckets(m_mp, other_histogram->m_histogram_buckets, histogram_buckets,
			   rows_other, num_tuples_per_bucket, idx2, num_buckets2);
	// add any leftover buckets from this histogram
	AddBuckets(m_mp, m_histogram_buckets, histogram_buckets, rows,
			   num_tuples_per_bucket, idx1, num_buckets1);
	// compute the total number of null values from both histograms
	CDouble num_null_rows =
		std::max((this->GetNullFreq() * rows),
				 (other_histogram->GetNullFreq() * rows_other));
	// compute the total number of distinct values (NDV) that are not captured by the buckets in both the histograms
	CDouble num_NDV_remain =
		std::max(m_distinct_remaining, other_histogram->GetDistinctRemain());
	// compute the total number of rows having distinct values not captured by the buckets in both the histograms
	CDouble NDV_remain_num_rows =
		std::max((this->GetFreqRemain() * rows),
				 (other_histogram->GetFreqRemain() * rows_other));
	// finally, create a normalized histogram using num_tuples_per_bucket, num_null_rows & NDV_remain_num_rows
	GPOS_ASSERT(num_tuples_per_bucket->Size() == histogram_buckets->Size());
	// compute the total number of rows in the resultant histogram
	CDouble total_output_rows = num_null_rows + NDV_remain_num_rows;
	for (ULONG ul = 0; ul < num_tuples_per_bucket->Size(); ++ul)
	{
		total_output_rows = total_output_rows + *(*num_tuples_per_bucket)[ul];
	}
	*num_output_rows = std::max(CStatistics::MinRows, total_output_rows);
	// set the frequency for each row in the resultant histogram
	for (ULONG ul = 0; ul < histogram_buckets->Size(); ++ul)
	{
		CBucket *bucket = (*histogram_buckets)[ul];
		CDouble rows = *(*num_tuples_per_bucket)[ul];
		bucket->SetFrequency(rows / *num_output_rows);
	}
	CDouble null_freq = num_null_rows / *num_output_rows;
	CDouble NDV_remain_freq = NDV_remain_num_rows / *num_output_rows;
	ULONG max_num_buckets = COptCtxt::PoctxtFromTLS()
								->GetOptimizerConfig()
								->GetStatsConf()
								->UlMaxStatsBuckets();
	ULONG desired_num_buckets =
		std::max((ULONG) max_num_buckets, std::max(num_buckets1, num_buckets2));
	CBucketArray *result_buckets =
		CombineBuckets(m_mp, histogram_buckets, desired_num_buckets);
	CHistogram *result_histogram = GPOS_NEW(m_mp) CHistogram(
		m_mp, result_buckets, true /* is_well_defined */, null_freq,
		num_NDV_remain, NDV_remain_freq, false /* is_col_stats_missing */
	);
	// clean up
	num_tuples_per_bucket->Release();
	histogram_buckets->Release();
	GPOS_ASSERT(result_histogram->IsValid());
	return result_histogram;
}
// add residual bucket in an union operation to the array of buckets in the histogram
ULONG
CHistogram::AddResidualUnionBucket(CBucketArray *histogram_buckets,
								   CBucket *bucket, CDouble rows,
								   BOOL bucket_is_residual, ULONG index,
								   CDoubleArray *dest_bucket_freqs) const
{
	GPOS_ASSERT(nullptr != histogram_buckets);
	GPOS_ASSERT(nullptr != dest_bucket_freqs);
	if (!bucket_is_residual)
	{
		return index;
	}
	CBucket *new_bucket = bucket->MakeBucketCopy(m_mp);
	histogram_buckets->Append(new_bucket);
	dest_bucket_freqs->Append(GPOS_NEW(m_mp)
								  CDouble(new_bucket->GetFrequency() * rows));
	return index + 1;
}
// add buckets from one array to another
void
CHistogram::AddBuckets(CMemoryPool *mp, const CBucketArray *src_buckets,
					   CBucketArray *dest_buckets, CDouble rows,
					   CDoubleArray *dest_bucket_freqs, ULONG begin, ULONG end)
{
	GPOS_ASSERT(nullptr != src_buckets);
	GPOS_ASSERT(nullptr != dest_buckets);
	GPOS_ASSERT(begin <= end);
	GPOS_ASSERT(end <= src_buckets->Size());
	for (ULONG ul = begin; ul < end; ul++)
	{
		CBucket *new_bucket = ((*src_buckets)[ul])->MakeBucketCopy(mp);
		dest_buckets->Append(new_bucket);
		dest_bucket_freqs->Append(
			GPOS_NEW(mp) CDouble(new_bucket->GetFrequency() * rows));
	}
}
// accessor for n-th bucket. Returns NULL if outside bounds
CBucket *
CHistogram::operator[](ULONG pos) const
{
	if (pos < GetNumBuckets())
	{
		return (*m_histogram_buckets)[pos];
	}
	return nullptr;
}
// translate the histogram into a the dxl derived column statistics
CDXLStatsDerivedColumn *
CHistogram::TranslateToDXLDerivedColumnStats(CMDAccessor *md_accessor,
											 ULONG colid, CDouble width) const
{
	CDXLBucketArray *dxl_stats_bucket_array =
		GPOS_NEW(m_mp) CDXLBucketArray(m_mp);
	const ULONG num_of_buckets = m_histogram_buckets->Size();
	for (ULONG ul = 0; ul < num_of_buckets; ul++)
	{
		CBucket *bucket = (*m_histogram_buckets)[ul];
		CDouble freq = bucket->GetFrequency();
		CDouble distinct = bucket->GetNumDistinct();
		CDXLDatum *dxl_datum_lower =
			bucket->GetLowerBound()->GetDatumVal(m_mp, md_accessor);
		CDXLDatum *dxl_datum_upper =
			bucket->GetUpperBound()->GetDatumVal(m_mp, md_accessor);
		CDXLBucket *dxl_bucket = GPOS_NEW(m_mp) CDXLBucket(
			dxl_datum_lower, dxl_datum_upper, bucket->IsLowerClosed(),
			bucket->IsUpperClosed(), freq, distinct);
		dxl_stats_bucket_array->Append(dxl_bucket);
	}
	return GPOS_NEW(m_mp)
		CDXLStatsDerivedColumn(colid, width, m_null_freq, m_distinct_remaining,
							   m_freq_remaining, dxl_stats_bucket_array);
}
// randomly pick a bucket index based on bucket frequency values
ULONG
CHistogram::GetRandomBucketIndex(ULONG *seed) const
{
	const ULONG size = m_histogram_buckets->Size();
	GPOS_ASSERT(0 < size);
	DOUBLE rand_val = ((DOUBLE) clib::Rand(seed)) / RAND_MAX;
	CDouble accumulated_freq = 0;
	for (ULONG ul = 0; ul < size - 1; ul++)
	{
		CBucket *bucket = (*m_histogram_buckets)[ul];
		accumulated_freq = accumulated_freq + bucket->GetFrequency();
		// we compare generated random value with accumulated frequency,
		// this will result in picking a bucket based on its frequency,
		// example: bucket freq {0.1, 0.3, 0.6}
		//			random value in [0,0.1] --> pick bucket 1
		//			random value in [0.1,0.4] --> pick bucket 2
		//			random value in [0.4,1.0] --> pick bucket 3
		if (rand_val <= accumulated_freq)
		{
			return ul;
		}
	}
	return size - 1;
}
// estimate data skew by sampling histogram buckets,
// the estimate value is >= 1.0, where 1.0 indicates no skew
//
// skew is estimated by computing the second and third moments of
// sample distribution: for a sample of size n, where x_bar is sample mean,
// skew is estimated as (m_3/m_2^(1.5)), where m_2 = 1/n Sum ((x -x_bar)^2), and
// m_3 = 1/n Sum ((x -x_bar)^3)
//
// since we use skew as multiplicative factor in cost model, this function
// returns (1.0 + |skew estimate|)
void
CHistogram::ComputeSkew()
{
	m_skew_was_measured = true;
	if (!IsNormalized() || 0 == m_histogram_buckets->Size() ||
		!(*m_histogram_buckets)[0]->CanSample())
	{
		return;
	}
	// generate randomization seed from system time
	TIMEVAL tv;
	syslib::GetTimeOfDay(&tv, nullptr /*timezone*/);
	ULONG seed = CombineHashes((ULONG) tv.tv_sec, (ULONG) tv.tv_usec);
	// generate a sample from histogram data, and compute sample mean
	DOUBLE sample_mean = 0;
	DOUBLE samples[GPOPT_SKEW_SAMPLE_SIZE];
	for (ULONG ul = 0; ul < GPOPT_SKEW_SAMPLE_SIZE; ul++)
	{
		ULONG bucket_index = GetRandomBucketIndex(&seed);
		CBucket *bucket = (*m_histogram_buckets)[bucket_index];
		samples[ul] = bucket->GetSample(&seed).Get();
		sample_mean = sample_mean + samples[ul];
	}
	sample_mean = (DOUBLE) sample_mean / GPOPT_SKEW_SAMPLE_SIZE;
	// compute second and third moments of sample distribution
	DOUBLE num2 = 0;
	DOUBLE num3 = 0;
	for (ULONG ul = 0; ul < GPOPT_SKEW_SAMPLE_SIZE; ul++)
	{
		num2 = num2 + pow((samples[ul] - sample_mean), 2.0);
		num3 = num3 + pow((samples[ul] - sample_mean), 3.0);
	}
	DOUBLE moment2 = (DOUBLE)(num2 / GPOPT_SKEW_SAMPLE_SIZE);
	DOUBLE moment3 = (DOUBLE)(num3 / GPOPT_SKEW_SAMPLE_SIZE);
	// set skew measure
	m_skew = CDouble(1.0 + fabs(moment3 / pow(moment2, 1.5)));
}
// create the default histogram for a given column reference
CHistogram *
CHistogram::MakeDefaultHistogram(CMemoryPool *mp, CColRef *col_ref,
								 BOOL is_empty)
{
	GPOS_ASSERT(nullptr != col_ref);
	if (IMDType::EtiBool == col_ref->RetrieveType()->GetDatumType() &&
		!is_empty)
	{
		return CHistogram::MakeDefaultBoolHistogram(mp);
	}
	return GPOS_NEW(mp) CHistogram(mp, false /* is_well_defined */);
}
// create the default non-empty histogram for a boolean column
CHistogram *
CHistogram::MakeDefaultBoolHistogram(CMemoryPool *mp)
{
	CBucketArray *buckets = GPOS_NEW(mp) CBucketArray(mp);
	// a boolean column can at most have 3 values (true, false, and NULL).
	CDouble distinct_remaining = CDouble(3.0);
	CDouble freq_remaining = CDouble(1.0);
	CDouble null_freq = CDouble(0.0);
	return GPOS_NEW(mp) CHistogram(
		mp, buckets, true /* is_well_defined */, null_freq, distinct_remaining,
		freq_remaining, true /*is_col_stats_missing */
	);
}
// check if the join cardinality estimation can be done based on NDV alone
BOOL
CHistogram::NeedsNDVBasedCardEstimationForEq(const CHistogram *histogram)
{
	GPOS_ASSERT(nullptr != histogram);
	if (0 == histogram->GetNumBuckets())
	{
		// no buckets, so join cardinality estimation is based solely on NDV remain
		return true;
	}
	const IBucket *bucket = (*histogram->GetBuckets())[0];
	IDatum *datum = bucket->GetLowerBound()->GetDatum();
	IMDType::ETypeInfo type_info = datum->GetDatumType();
	if (IMDType::EtiInt2 == type_info || IMDType::EtiInt4 == type_info ||
		IMDType::EtiInt8 == type_info || IMDType::EtiBool == type_info ||
		IMDType::EtiOid == type_info)
	{
		return false;
	}
	if (datum->StatsMappable())
	{
		if (datum->IsDatumMappableToDouble())
		{
			// int like type such as numeric
			return false;
		}
		else if (histogram->ContainsOnlySingletonBuckets())
		{
			GPOS_ASSERT(datum->IsDatumMappableToLINT());
			// Types such as text should only produce histograms that contain only singleton buckets.
			// The histograms cannot be used for range predicates but it is ok for equality predicates.
			return false;
		}
	}
	// For other cases, (e.g. certain non int types with non-singleton buckets),
	// we are forced to use NDV based cardinality estimation.
	return true;
}
// append given histograms to current object
void
CHistogram::AddHistograms(CMemoryPool *mp, UlongToHistogramMap *src_histograms,
						  UlongToHistogramMap *dest_histograms)
{
	UlongToHistogramMapIter col_hist_mapping(src_histograms);
	while (col_hist_mapping.Advance())
	{
		ULONG colid = *(col_hist_mapping.Key());
		const CHistogram *histogram = col_hist_mapping.Value();
		CStatisticsUtils::AddHistogram(mp, colid, histogram, dest_histograms);
	}
}
// add dummy histogram buckets and column information for the array of columns
void
CHistogram::AddDummyHistogramAndWidthInfo(
	CMemoryPool *mp, CColumnFactory *col_factory,
	UlongToHistogramMap *output_histograms, UlongToDoubleMap *output_col_widths,
	const ULongPtrArray *columns, BOOL is_empty)
{
	GPOS_ASSERT(nullptr != col_factory);
	GPOS_ASSERT(nullptr != output_histograms);
	GPOS_ASSERT(nullptr != output_col_widths);
	GPOS_ASSERT(nullptr != columns);
	const ULONG count = columns->Size();
	// for computed aggregates, we're not going to be very smart right now
	for (ULONG ul = 0; ul < count; ul++)
	{
		ULONG colid = *(*columns)[ul];
		CColRef *col_ref = col_factory->LookupColRef(colid);
		GPOS_ASSERT(nullptr != col_ref);
		CHistogram *histogram =
			CHistogram::MakeDefaultHistogram(mp, col_ref, is_empty);
		output_histograms->Insert(GPOS_NEW(mp) ULONG(colid), histogram);
		CDouble width =
			CStatisticsUtils::DefaultColumnWidth(col_ref->RetrieveType());
		output_col_widths->Insert(GPOS_NEW(mp) ULONG(colid),
								  GPOS_NEW(mp) CDouble(width));
	}
}
//	add empty histogram for the columns in the input histogram
void
CHistogram::AddEmptyHistogram(CMemoryPool *mp,
							  UlongToHistogramMap *output_histograms,
							  UlongToHistogramMap *input_histograms)
{
	GPOS_ASSERT(nullptr != output_histograms);
	GPOS_ASSERT(nullptr != input_histograms);
	UlongToHistogramMapIter col_hist_mapping(input_histograms);
	while (col_hist_mapping.Advance())
	{
		ULONG colid = *(col_hist_mapping.Key());
		// empty histogram
		CHistogram *histogram =
			GPOS_NEW(mp) CHistogram(mp, false /* is_well_defined */);
		output_histograms->Insert(GPOS_NEW(mp) ULONG(colid), histogram);
	}
}
BOOL
CHistogram::IsHistogramForTextRelatedTypes() const
{
	if (m_histogram_buckets->Size() > 0)
	{
		IMDId *mdid =
			(*m_histogram_buckets)[0]->GetLowerBound()->GetDatum()->MDId();
		CMDAccessor *md_accessor = COptCtxt::PoctxtFromTLS()->Pmda();
		const IMDType *type = md_accessor->RetrieveType(mdid);
		return type->IsTextRelated();
	}
	return false;
}
// EOF
相关信息
相关文章
greenplumn CFilterStatsProcessor 源码
greenplumn CGroupByStatsProcessor 源码
greenplumn CInnerJoinStatsProcessor 源码
greenplumn CJoinStatsProcessor 源码
greenplumn CLeftAntiSemiJoinStatsProcessor 源码
greenplumn CLeftOuterJoinStatsProcessor 源码
greenplumn CLeftSemiJoinStatsProcessor 源码
                        
                            0
                        
                        
                             赞
                        
                    
                    
                热门推荐
- 
                        2、 - 优质文章
- 
                        3、 gate.io
- 
                        8、 openharmony
- 
                        9、 golang