1 /*------------------------------------------------------------------------- 2 * 3 * pg_statistic.h 4 * definition of the "statistics" system catalog (pg_statistic) 5 * 6 * 7 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group 8 * Portions Copyright (c) 1994, Regents of the University of California 9 * 10 * src/include/catalog/pg_statistic.h 11 * 12 * NOTES 13 * The Catalog.pm module reads this file and derives schema 14 * information. 15 * 16 *------------------------------------------------------------------------- 17 */ 18 #ifndef PG_STATISTIC_H 19 #define PG_STATISTIC_H 20 21 #include "catalog/genbki.h" 22 #include "catalog/pg_statistic_d.h" 23 24 /* ---------------- 25 * pg_statistic definition. cpp turns this into 26 * typedef struct FormData_pg_statistic 27 * ---------------- 28 */ 29 CATALOG(pg_statistic,2619,StatisticRelationId) 30 { 31 /* These fields form the unique key for the entry: */ 32 Oid starelid BKI_LOOKUP(pg_class); /* relation containing 33 * attribute */ 34 int16 staattnum; /* attribute (column) stats are for */ 35 bool stainherit; /* true if inheritance children are included */ 36 37 /* the fraction of the column's entries that are NULL: */ 38 float4 stanullfrac; 39 40 /* 41 * stawidth is the average width in bytes of non-null entries. For 42 * fixed-width datatypes this is of course the same as the typlen, but for 43 * var-width types it is more useful. Note that this is the average width 44 * of the data as actually stored, post-TOASTing (eg, for a 45 * moved-out-of-line value, only the size of the pointer object is 46 * counted). This is the appropriate definition for the primary use of 47 * the statistic, which is to estimate sizes of in-memory hash tables of 48 * tuples. 49 */ 50 int32 stawidth; 51 52 /* ---------------- 53 * stadistinct indicates the (approximate) number of distinct non-null 54 * data values in the column. The interpretation is: 55 * 0 unknown or not computed 56 * > 0 actual number of distinct values 57 * < 0 negative of multiplier for number of rows 58 * The special negative case allows us to cope with columns that are 59 * unique (stadistinct = -1) or nearly so (for example, a column in which 60 * non-null values appear about twice on the average could be represented 61 * by stadistinct = -0.5 if there are no nulls, or -0.4 if 20% of the 62 * column is nulls). Because the number-of-rows statistic in pg_class may 63 * be updated more frequently than pg_statistic is, it's important to be 64 * able to describe such situations as a multiple of the number of rows, 65 * rather than a fixed number of distinct values. But in other cases a 66 * fixed number is correct (eg, a boolean column). 67 * ---------------- 68 */ 69 float4 stadistinct; 70 71 /* ---------------- 72 * To allow keeping statistics on different kinds of datatypes, 73 * we do not hard-wire any particular meaning for the remaining 74 * statistical fields. Instead, we provide several "slots" in which 75 * statistical data can be placed. Each slot includes: 76 * kind integer code identifying kind of data (see below) 77 * op OID of associated operator, if needed 78 * coll OID of relevant collation, or 0 if none 79 * numbers float4 array (for statistical values) 80 * values anyarray (for representations of data values) 81 * The ID, operator, and collation fields are never NULL; they are zeroes 82 * in an unused slot. The numbers and values fields are NULL in an 83 * unused slot, and might also be NULL in a used slot if the slot kind 84 * has no need for one or the other. 85 * ---------------- 86 */ 87 88 int16 stakind1; 89 int16 stakind2; 90 int16 stakind3; 91 int16 stakind4; 92 int16 stakind5; 93 94 Oid staop1 BKI_LOOKUP_OPT(pg_operator); 95 Oid staop2 BKI_LOOKUP_OPT(pg_operator); 96 Oid staop3 BKI_LOOKUP_OPT(pg_operator); 97 Oid staop4 BKI_LOOKUP_OPT(pg_operator); 98 Oid staop5 BKI_LOOKUP_OPT(pg_operator); 99 100 Oid stacoll1 BKI_LOOKUP_OPT(pg_collation); 101 Oid stacoll2 BKI_LOOKUP_OPT(pg_collation); 102 Oid stacoll3 BKI_LOOKUP_OPT(pg_collation); 103 Oid stacoll4 BKI_LOOKUP_OPT(pg_collation); 104 Oid stacoll5 BKI_LOOKUP_OPT(pg_collation); 105 106 #ifdef CATALOG_VARLEN /* variable-length fields start here */ 107 float4 stanumbers1[1]; 108 float4 stanumbers2[1]; 109 float4 stanumbers3[1]; 110 float4 stanumbers4[1]; 111 float4 stanumbers5[1]; 112 113 /* 114 * Values in these arrays are values of the column's data type, or of some 115 * related type such as an array element type. We presently have to cheat 116 * quite a bit to allow polymorphic arrays of this kind, but perhaps 117 * someday it'll be a less bogus facility. 118 */ 119 anyarray stavalues1; 120 anyarray stavalues2; 121 anyarray stavalues3; 122 anyarray stavalues4; 123 anyarray stavalues5; 124 #endif 125 } FormData_pg_statistic; 126 127 #define STATISTIC_NUM_SLOTS 5 128 129 130 /* ---------------- 131 * Form_pg_statistic corresponds to a pointer to a tuple with 132 * the format of pg_statistic relation. 133 * ---------------- 134 */ 135 typedef FormData_pg_statistic *Form_pg_statistic; 136 137 DECLARE_TOAST(pg_statistic, 2840, 2841); 138 139 DECLARE_UNIQUE_INDEX_PKEY(pg_statistic_relid_att_inh_index, 2696, on pg_statistic using btree(starelid oid_ops, staattnum int2_ops, stainherit bool_ops)); 140 #define StatisticRelidAttnumInhIndexId 2696 141 142 DECLARE_FOREIGN_KEY((starelid, staattnum), pg_attribute, (attrelid, attnum)); 143 144 #ifdef EXPOSE_TO_CLIENT_CODE 145 146 /* 147 * Several statistical slot "kinds" are defined by core PostgreSQL, as 148 * documented below. Also, custom data types can define their own "kind" 149 * codes by mutual agreement between a custom typanalyze routine and the 150 * selectivity estimation functions of the type's operators. 151 * 152 * Code reading the pg_statistic relation should not assume that a particular 153 * data "kind" will appear in any particular slot. Instead, search the 154 * stakind fields to see if the desired data is available. (The standard 155 * function get_attstatsslot() may be used for this.) 156 */ 157 158 /* 159 * The present allocation of "kind" codes is: 160 * 161 * 1-99: reserved for assignment by the core PostgreSQL project 162 * (values in this range will be documented in this file) 163 * 100-199: reserved for assignment by the PostGIS project 164 * (values to be documented in PostGIS documentation) 165 * 200-299: reserved for assignment by the ESRI ST_Geometry project 166 * (values to be documented in ESRI ST_Geometry documentation) 167 * 300-9999: reserved for future public assignments 168 * 169 * For private use you may choose a "kind" code at random in the range 170 * 10000-30000. However, for code that is to be widely disseminated it is 171 * better to obtain a publicly defined "kind" code by request from the 172 * PostgreSQL Global Development Group. 173 */ 174 175 /* 176 * In a "most common values" slot, staop is the OID of the "=" operator 177 * used to decide whether values are the same or not, and stacoll is the 178 * collation used (same as column's collation). stavalues contains 179 * the K most common non-null values appearing in the column, and stanumbers 180 * contains their frequencies (fractions of total row count). The values 181 * shall be ordered in decreasing frequency. Note that since the arrays are 182 * variable-size, K may be chosen by the statistics collector. Values should 183 * not appear in MCV unless they have been observed to occur more than once; 184 * a unique column will have no MCV slot. 185 */ 186 #define STATISTIC_KIND_MCV 1 187 188 /* 189 * A "histogram" slot describes the distribution of scalar data. staop is 190 * the OID of the "<" operator that describes the sort ordering, and stacoll 191 * is the relevant collation. (In theory more than one histogram could appear, 192 * if a datatype has more than one useful sort operator or we care about more 193 * than one collation. Currently the collation will always be that of the 194 * underlying column.) stavalues contains M (>=2) non-null values that 195 * divide the non-null column data values into M-1 bins of approximately equal 196 * population. The first stavalues item is the MIN and the last is the MAX. 197 * stanumbers is not used and should be NULL. IMPORTANT POINT: if an MCV 198 * slot is also provided, then the histogram describes the data distribution 199 * *after removing the values listed in MCV* (thus, it's a "compressed 200 * histogram" in the technical parlance). This allows a more accurate 201 * representation of the distribution of a column with some very-common 202 * values. In a column with only a few distinct values, it's possible that 203 * the MCV list describes the entire data population; in this case the 204 * histogram reduces to empty and should be omitted. 205 */ 206 #define STATISTIC_KIND_HISTOGRAM 2 207 208 /* 209 * A "correlation" slot describes the correlation between the physical order 210 * of table tuples and the ordering of data values of this column, as seen 211 * by the "<" operator identified by staop with the collation identified by 212 * stacoll. (As with the histogram, more than one entry could theoretically 213 * appear.) stavalues is not used and should be NULL. stanumbers contains 214 * a single entry, the correlation coefficient between the sequence of data 215 * values and the sequence of their actual tuple positions. The coefficient 216 * ranges from +1 to -1. 217 */ 218 #define STATISTIC_KIND_CORRELATION 3 219 220 /* 221 * A "most common elements" slot is similar to a "most common values" slot, 222 * except that it stores the most common non-null *elements* of the column 223 * values. This is useful when the column datatype is an array or some other 224 * type with identifiable elements (for instance, tsvector). staop contains 225 * the equality operator appropriate to the element type, and stacoll 226 * contains the collation to use with it. stavalues contains 227 * the most common element values, and stanumbers their frequencies. Unlike 228 * MCV slots, frequencies are measured as the fraction of non-null rows the 229 * element value appears in, not the frequency of all rows. Also unlike 230 * MCV slots, the values are sorted into the element type's default order 231 * (to support binary search for a particular value). Since this puts the 232 * minimum and maximum frequencies at unpredictable spots in stanumbers, 233 * there are two extra members of stanumbers, holding copies of the minimum 234 * and maximum frequencies. Optionally, there can be a third extra member, 235 * which holds the frequency of null elements (expressed in the same terms: 236 * the fraction of non-null rows that contain at least one null element). If 237 * this member is omitted, the column is presumed to contain no null elements. 238 * 239 * Note: in current usage for tsvector columns, the stavalues elements are of 240 * type text, even though their representation within tsvector is not 241 * exactly text. 242 */ 243 #define STATISTIC_KIND_MCELEM 4 244 245 /* 246 * A "distinct elements count histogram" slot describes the distribution of 247 * the number of distinct element values present in each row of an array-type 248 * column. Only non-null rows are considered, and only non-null elements. 249 * staop contains the equality operator appropriate to the element type, 250 * and stacoll contains the collation to use with it. 251 * stavalues is not used and should be NULL. The last member of stanumbers is 252 * the average count of distinct element values over all non-null rows. The 253 * preceding M (>=2) members form a histogram that divides the population of 254 * distinct-elements counts into M-1 bins of approximately equal population. 255 * The first of these is the minimum observed count, and the last the maximum. 256 */ 257 #define STATISTIC_KIND_DECHIST 5 258 259 /* 260 * A "length histogram" slot describes the distribution of range lengths in 261 * rows of a range-type column. stanumbers contains a single entry, the 262 * fraction of empty ranges. stavalues is a histogram of non-empty lengths, in 263 * a format similar to STATISTIC_KIND_HISTOGRAM: it contains M (>=2) range 264 * values that divide the column data values into M-1 bins of approximately 265 * equal population. The lengths are stored as float8s, as measured by the 266 * range type's subdiff function. Only non-null rows are considered. 267 */ 268 #define STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM 6 269 270 /* 271 * A "bounds histogram" slot is similar to STATISTIC_KIND_HISTOGRAM, but for 272 * a range-type column. stavalues contains M (>=2) range values that divide 273 * the column data values into M-1 bins of approximately equal population. 274 * Unlike a regular scalar histogram, this is actually two histograms combined 275 * into a single array, with the lower bounds of each value forming a 276 * histogram of lower bounds, and the upper bounds a histogram of upper 277 * bounds. Only non-NULL, non-empty ranges are included. 278 */ 279 #define STATISTIC_KIND_BOUNDS_HISTOGRAM 7 280 281 #endif /* EXPOSE_TO_CLIENT_CODE */ 282 283 #endif /* PG_STATISTIC_H */ 284