greenplumn CHistogram 源码
greenplumn CHistogram 代码
文件路径:/src/backend/gporca/libnaucrates/include/naucrates/statistics/CHistogram.h
//---------------------------------------------------------------------------
// Greenplum Database
// Copyright (C) 2018 VMware, Inc. or its affiliates.
//
// @filename:
// CHistogram.h
//
// @doc:
// Histogram implementation
//---------------------------------------------------------------------------
#ifndef GPNAUCRATES_CHistogram_H
#define GPNAUCRATES_CHistogram_H
#include "gpos/base.h"
#include "gpos/common/DbgPrintMixin.h"
#include "gpopt/base/CKHeap.h"
#include "naucrates/statistics/CBucket.h"
#include "naucrates/statistics/CStatsPred.h"
namespace gpopt
{
class CColRef;
class CStatisticsConfig;
class CColumnFactory;
} // namespace gpopt
namespace gpnaucrates
{
// type definitions
// array of doubles
using CDoubleArray = CDynamicPtrArray<CDouble, CleanupDelete>;
//---------------------------------------------------------------------------
// @class:
// CHistogram
//
// @doc:
//
//---------------------------------------------------------------------------
class CHistogram : public gpos::DbgPrintMixin<CHistogram>
{
// hash map from column id to a histogram
using UlongToHistogramMap =
CHashMap<ULONG, CHistogram, gpos::HashValue<ULONG>, gpos::Equals<ULONG>,
CleanupDelete<ULONG>, CleanupDelete<CHistogram>>;
// iterator
using UlongToHistogramMapIter =
CHashMapIter<ULONG, CHistogram, gpos::HashValue<ULONG>,
gpos::Equals<ULONG>, CleanupDelete<ULONG>,
CleanupDelete<CHistogram>>;
// hash map from column ULONG to CDouble
using UlongToDoubleMap =
CHashMap<ULONG, CDouble, gpos::HashValue<ULONG>, gpos::Equals<ULONG>,
CleanupDelete<ULONG>, CleanupDelete<CDouble>>;
// iterator
using UlongToDoubleMapIter =
CHashMapIter<ULONG, CDouble, gpos::HashValue<ULONG>,
gpos::Equals<ULONG>, CleanupDelete<ULONG>,
CleanupDelete<CDouble>>;
private:
// shared memory pool
CMemoryPool *m_mp;
// all the buckets in the histogram. This is shared among histograms,
// so must not be modified unless we first make a new copy. We do not copy
// histograms unless required, as it is an expensive operation in memory and time.
CBucketArray *m_histogram_buckets;
// well-defined histogram. if false, then bounds are unknown
BOOL m_is_well_defined;
// null fraction
CDouble m_null_freq;
// ndistinct of tuples not covered in the buckets
CDouble m_distinct_remaining;
// frequency of tuples not covered in the buckets
CDouble m_freq_remaining;
// has histogram skew been measures
BOOL m_skew_was_measured;
// skew estimate
CDouble m_skew;
// was the NDVs in histogram scaled
BOOL m_NDVs_were_scaled;
// is column statistics missing in the database
BOOL m_is_col_stats_missing;
// return an array buckets after applying equality filter on the histogram buckets
CBucketArray *MakeBucketsWithEqualityFilter(CPoint *point) const;
// return an array buckets after applying non equality filter on the histogram buckets
CBucketArray *MakeBucketsWithInequalityFilter(CPoint *point) const;
// less than or less than equal filter
CHistogram *MakeHistogramLessThanOrLessThanEqualFilter(
CStatsPred::EStatsCmpType stats_cmp_type, CPoint *point) const;
// greater than or greater than equal filter
CHistogram *MakeHistogramGreaterThanOrGreaterThanEqualFilter(
CStatsPred::EStatsCmpType stats_cmp_type, CPoint *point) const;
// equal filter
CHistogram *MakeHistogramEqualFilter(CPoint *point) const;
// not equal filter
CHistogram *MakeHistogramInequalityFilter(CPoint *point) const;
// IDF filter
CHistogram *MakeHistogramIDFFilter(CPoint *point) const;
// INDF filter
CHistogram *MakeHistogramINDFFilter(CPoint *point) const;
// equality join
CHistogram *MakeJoinHistogramEqualityFilter(
const CHistogram *histogram) const;
// generate histogram based on NDV
CHistogram *MakeNDVBasedJoinHistogramEqualityFilter(
const CHistogram *histogram) const;
// construct a new histogram for an INDF join predicate
CHistogram *MakeJoinHistogramINDFFilter(const CHistogram *histogram) const;
// accessor for n-th bucket
CBucket *operator[](ULONG) const;
// compute skew estimate
void ComputeSkew();
// helper to add buckets from one histogram to another
static void AddBuckets(CMemoryPool *mp, const CBucketArray *src_buckets,
CBucketArray *dest_buckets, CDouble rows_old,
CDouble rows_new, ULONG begin, ULONG end);
static void AddBuckets(CMemoryPool *mp, const CBucketArray *src_buckets,
CBucketArray *dest_buckets, CDouble rows,
CDoubleArray *dest_bucket_freqs, ULONG begin,
ULONG end);
// helper to combine histogram buckets to reduce total buckets
static CBucketArray *CombineBuckets(CMemoryPool *mp, CBucketArray *buckets,
ULONG desired_num_buckets);
// check if we can compute NDVRemain for JOIN histogram for the given input histograms
static BOOL CanComputeJoinNDVRemain(const CHistogram *histogram1,
const CHistogram *histogram2);
// compute the effects of the NDV and frequency of the tuples not captured
// by the histogram
static void ComputeJoinNDVRemainInfo(
const CHistogram *histogram1, const CHistogram *histogram2,
const CBucketArray *join_buckets, // join buckets
CDouble
hist1_buckets_freq, // frequency of the buckets in input1 that contributed to the join
CDouble
hist2_buckets_freq, // frequency of the buckets in input2 that contributed to the join
CDouble *result_distinct_remain, CDouble *result_freq_remain);
// check if the cardinality estimation should be done only via NDVs
static BOOL NeedsNDVBasedCardEstimationForEq(const CHistogram *histogram);
BOOL IsHistogramForTextRelatedTypes() const;
// add residual union all buckets after the merge
ULONG AddResidualUnionAllBucket(CBucketArray *histogram_buckets,
CBucket *bucket, CDouble rows_old,
CDouble rows_new, BOOL bucket_is_residual,
ULONG index) const;
// add residual union buckets after the merge
ULONG AddResidualUnionBucket(CBucketArray *histogram_buckets,
CBucket *bucket, CDouble rows,
BOOL bucket_is_residual, ULONG index,
CDoubleArray *dest_bucket_freqs) const;
// used to keep track of adjacent stats buckets and how similar
// they are in terms of distribution
struct SAdjBucketBoundary
{
// boundary_index 0 refers to boundary between b[0] and b[1]
ULONG m_boundary_index;
// similarity factor between two adjacent buckets calculated as (freq0/ndv0 - freq1/ndv1) + (freq0/width0 - freq1/width1)
CDouble m_similarity_factor;
SAdjBucketBoundary(ULONG index, CDouble similarity_factor)
: m_boundary_index(index), m_similarity_factor(similarity_factor)
{
}
// used for sorting in the binary heap
CDouble
DCost() const
{
return m_similarity_factor;
}
CDouble
GetCostForHeap() const
{
return DCost();
}
};
using SAdjBucketBoundaryArray =
CDynamicPtrArray<SAdjBucketBoundary, CleanupDelete<SAdjBucketBoundary>>;
public:
CHistogram &operator=(const CHistogram &) = delete;
CHistogram(const CHistogram &) = delete;
// ctors
explicit CHistogram(CMemoryPool *mp, CBucketArray *histogram_buckets,
BOOL is_well_defined = true);
explicit CHistogram(CMemoryPool *mp, BOOL is_well_defined = true);
CHistogram(CMemoryPool *mp, CBucketArray *histogram_buckets,
BOOL is_well_defined, CDouble null_freq,
CDouble distinct_remaining, CDouble freq_remaining,
BOOL is_col_stats_missing = false);
// set null frequency
void SetNullFrequency(CDouble null_freq);
// set information about the scaling of NDVs
void
SetNDVScaled()
{
m_NDVs_were_scaled = true;
}
// have the NDVs been scaled
BOOL
WereNDVsScaled() const
{
return m_NDVs_were_scaled;
}
// filter by comparing with point
CHistogram *MakeHistogramFilter(CStatsPred::EStatsCmpType stats_cmp_type,
CPoint *point) const;
// filter by comparing with point and normalize
CHistogram *MakeHistogramFilterNormalize(
CStatsPred::EStatsCmpType stats_cmp_type, CPoint *point,
CDouble *scale_factor) const;
// join with another histogram
CHistogram *MakeJoinHistogram(CStatsPred::EStatsCmpType stats_cmp_type,
const CHistogram *histogram) const;
// LASJ with another histogram
CHistogram *MakeLASJHistogram(CStatsPred::EStatsCmpType stats_cmp_type,
const CHistogram *histogram) const;
// join with another histogram and normalize it.
// If the join is not an equality join the function returns an empty histogram
CHistogram *MakeJoinHistogramNormalize(
CStatsPred::EStatsCmpType stats_cmp_type, CDouble rows,
const CHistogram *other_histogram, CDouble rows_other,
CDouble *scale_factor) const;
// scale factor of inequality (!=) join
CDouble GetInequalityJoinScaleFactor(CDouble rows,
const CHistogram *other_histogram,
CDouble rows_other) const;
// left anti semi join with another histogram and normalize
CHistogram *MakeLASJHistogramNormalize(
CStatsPred::EStatsCmpType stats_cmp_type, CDouble rows,
const CHistogram *other_histogram, CDouble *scale_factor,
BOOL
DoIgnoreLASJHistComputation // except for the case of LOJ cardinality estimation this flag is always
// "true" since LASJ stats computation is very aggressive
) const;
// group by and normalize
CHistogram *MakeGroupByHistogramNormalize(CDouble rows,
CDouble *scale_factor) const;
// union all and normalize
CHistogram *MakeUnionAllHistogramNormalize(
CDouble rows, const CHistogram *other_histogram,
CDouble rows_other) const;
// union and normalize
CHistogram *MakeUnionHistogramNormalize(CDouble rows,
const CHistogram *other_histogram,
CDouble rows_other,
CDouble *num_output_rows) const;
// cleanup residual buckets
static void CleanupResidualBucket(CBucket *bucket, BOOL bucket_is_residual);
// get the next bucket for union / union all
static CBucket *GetNextBucket(const CHistogram *histogram,
CBucket *new_bucket,
BOOL *target_bucket_is_residual,
ULONG *current_bucket_index);
// number of buckets
ULONG
GetNumBuckets() const
{
GPOS_ASSERT(m_histogram_buckets != nullptr);
return m_histogram_buckets->Size();
}
// buckets accessor
const CBucketArray *
GetBuckets() const
{
return m_histogram_buckets;
}
// well defined
BOOL
IsWellDefined() const
{
return m_is_well_defined;
}
BOOL ContainsOnlySingletonBuckets() const;
// is the column statistics missing in the database
BOOL
IsColStatsMissing() const
{
return m_is_col_stats_missing;
}
// print function
IOstream &OsPrint(IOstream &os) const;
// total frequency from buckets and null fraction
CDouble GetFrequency() const;
// total number of distinct values
CDouble GetNumDistinct() const;
// is histogram well formed
BOOL IsValid() const;
// return copy of histogram
CHistogram *CopyHistogram() const;
// destructor
virtual ~CHistogram()
{
m_histogram_buckets->Release();
}
// normalize histogram and return scaling factor
CDouble NormalizeHistogram();
// is histogram normalized
BOOL IsNormalized() const;
// translate the histogram into a derived column stats
CDXLStatsDerivedColumn *TranslateToDXLDerivedColumnStats(
CMDAccessor *md_accessor, ULONG colid, CDouble width) const;
// randomly pick a bucket index
ULONG GetRandomBucketIndex(ULONG *seed) const;
// estimate of data skew
CDouble
GetSkew()
{
if (!m_skew_was_measured)
{
ComputeSkew();
}
return m_skew;
}
// accessor of null fraction
CDouble
GetNullFreq() const
{
return m_null_freq;
}
// accessor of remaining number of tuples
CDouble
GetDistinctRemain() const
{
return m_distinct_remaining;
}
// accessor of remaining frequency
CDouble
GetFreqRemain() const
{
return m_freq_remaining;
}
// check if histogram is empty
BOOL IsEmpty() const;
// cap the total number of distinct values (NDVs) in buckets to the number of rows
void CapNDVs(CDouble rows);
// is comparison type supported for filters for text columns
static BOOL IsOpSupportedForTextFilter(
CStatsPred::EStatsCmpType stats_cmp_type);
// is comparison type supported for filters
static BOOL IsOpSupportedForFilter(
CStatsPred::EStatsCmpType stats_cmp_type);
// is the join predicate's comparison type supported
static BOOL JoinPredCmpTypeIsSupported(
CStatsPred::EStatsCmpType stats_cmp_type);
// create the default histogram for a given column reference
static CHistogram *MakeDefaultHistogram(CMemoryPool *mp, CColRef *col_ref,
BOOL is_empty);
// create the default non empty histogram for a boolean column
static CHistogram *MakeDefaultBoolHistogram(CMemoryPool *mp);
// helper method to append histograms from one map to the other
static void AddHistograms(CMemoryPool *mp,
UlongToHistogramMap *src_histograms,
UlongToHistogramMap *dest_histograms);
// add dummy histogram buckets and column width for the array of columns
static void AddDummyHistogramAndWidthInfo(
CMemoryPool *mp, CColumnFactory *col_factory,
UlongToHistogramMap *output_histograms,
UlongToDoubleMap *output_col_widths, const ULongPtrArray *columns,
BOOL is_empty);
// add dummy histogram buckets for the columns in the input histogram
static void AddEmptyHistogram(CMemoryPool *mp,
UlongToHistogramMap *output_histograms,
UlongToHistogramMap *input_histograms);
// create a deep copy of m_histogram_buckets
static CBucketArray *DeepCopyHistogramBuckets(CMemoryPool *mp,
const CBucketArray *buckets);
// default histogram selectivity
static const CDouble DefaultSelectivity;
// minimum number of distinct values in a column
static const CDouble MinDistinct;
// default scale factor when there is no filtering of input
static const CDouble NeutralScaleFactor;
// default Null frequency
static const CDouble DefaultNullFreq;
// default NDV remain
static const CDouble DefaultNDVRemain;
// default frequency of NDV remain
static const CDouble DefaultNDVFreqRemain;
}; // class CHistogram
} // namespace gpnaucrates
#endif // !GPNAUCRATES_CHistogram_H
// EOF
相关信息
相关文章
greenplumn CFilterStatsProcessor 源码
greenplumn CGroupByStatsProcessor 源码
greenplumn CInnerJoinStatsProcessor 源码
greenplumn CJoinStatsProcessor 源码
greenplumn CLeftAntiSemiJoinStatsProcessor 源码
greenplumn CLeftOuterJoinStatsProcessor 源码
greenplumn CLeftSemiJoinStatsProcessor 源码
0
赞
热门推荐
-
2、 - 优质文章
-
3、 gate.io
-
8、 golang
-
9、 openharmony
-
10、 Vue中input框自动聚焦