/*-------------------------------------------------------------------------
 *
 * pg_statistic.h
 *	  definition of the "statistics" system catalog (pg_statistic)
 *
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/catalog/pg_statistic.h
 *
 * NOTES
 *	  The Catalog.pm module reads this file and derives schema
 *	  information.  Because the file is parsed textually, the order and
 *	  spelling of the field declarations below are significant.
 *
 *-------------------------------------------------------------------------
 */
#ifndef PG_STATISTIC_H
#define PG_STATISTIC_H

#include "catalog/genbki.h"
#include "catalog/pg_statistic_d.h"

/* ----------------
 *		pg_statistic definition.  cpp turns this into
 *		typedef struct FormData_pg_statistic
 * ----------------
 */
CATALOG(pg_statistic,2619,StatisticRelationId) BKI_WITHOUT_OIDS
{
	/* These fields form the unique key for the entry: */
	Oid			starelid;		/* relation containing attribute */
	int16		staattnum;		/* attribute (column) stats are for */
	bool		stainherit;		/* true if inheritance children are included */

	/* the fraction of the column's entries that are NULL: */
	float4		stanullfrac;

	/*
	 * stawidth is the average width in bytes of non-null entries.  For
	 * fixed-width datatypes this is of course the same as the typlen, but for
	 * var-width types it is more useful.  Note that this is the average width
	 * of the data as actually stored, post-TOASTing (eg, for a
	 * moved-out-of-line value, only the size of the pointer object is
	 * counted).  This is the appropriate definition for the primary use of
	 * the statistic, which is to estimate sizes of in-memory hash tables of
	 * tuples.
	 */
	int32		stawidth;

	/* ----------------
	 * stadistinct indicates the (approximate) number of distinct non-null
	 * data values in the column.  The interpretation is:
	 *		0		unknown or not computed
	 *		> 0		actual number of distinct values
	 *		< 0		negative of multiplier for number of rows
	 * The special negative case allows us to cope with columns that are
	 * unique (stadistinct = -1) or nearly so (for example, a column in which
	 * non-null values appear about twice on the average could be represented
	 * by stadistinct = -0.5 if there are no nulls, or -0.4 if 20% of the
	 * column is nulls).  Because the number-of-rows statistic in pg_class may
	 * be updated more frequently than pg_statistic is, it's important to be
	 * able to describe such situations as a multiple of the number of rows,
	 * rather than a fixed number of distinct values.  But in other cases a
	 * fixed number is correct (eg, a boolean column).
	 * ----------------
	 */
	float4		stadistinct;

	/* ----------------
	 * To allow keeping statistics on different kinds of datatypes,
	 * we do not hard-wire any particular meaning for the remaining
	 * statistical fields.  Instead, we provide several "slots" in which
	 * statistical data can be placed.  Each slot includes:
	 *		kind			integer code identifying kind of data (see below)
	 *		op				OID of associated operator, if needed
	 *		numbers			float4 array (for statistical values)
	 *		values			anyarray (for representations of data values)
	 * The kind and operator fields are never NULL; they are zeroes in an
	 * unused slot.  The numbers and values fields are NULL in an unused
	 * slot, and might also be NULL in a used slot if the slot kind has
	 * no need for one or the other.
	 *
	 * Slot N is made up of stakindN, staopN, stanumbersN and stavaluesN.
	 * ----------------
	 */

	/* per-slot "kind" codes (a STATISTIC_KIND_XXX value, or 0 if unused) */
	int16		stakind1;
	int16		stakind2;
	int16		stakind3;
	int16		stakind4;
	int16		stakind5;

	/* per-slot operator OIDs (0 if the slot is unused) */
	Oid			staop1;
	Oid			staop2;
	Oid			staop3;
	Oid			staop4;
	Oid			staop5;

#ifdef CATALOG_VARLEN			/* variable-length fields start here */
	/* per-slot float4 arrays of statistical values (may be NULL) */
	float4		stanumbers1[1];
	float4		stanumbers2[1];
	float4		stanumbers3[1];
	float4		stanumbers4[1];
	float4		stanumbers5[1];

	/*
	 * Values in these arrays are values of the column's data type, or of some
	 * related type such as an array element type.  We presently have to cheat
	 * quite a bit to allow polymorphic arrays of this kind, but perhaps
	 * someday it'll be a less bogus facility.
	 */
	anyarray	stavalues1;
	anyarray	stavalues2;
	anyarray	stavalues3;
	anyarray	stavalues4;
	anyarray	stavalues5;
#endif
} FormData_pg_statistic;

/*
 * Number of statistics slots in FormData_pg_statistic, i.e. the count of
 * stakindN / staopN / stanumbersN / stavaluesN field groups declared above.
 */
#define STATISTIC_NUM_SLOTS  5


/* ----------------
 *		Form_pg_statistic corresponds to a pointer to a tuple with
 *		the format of pg_statistic relation.
 * ----------------
 */
typedef FormData_pg_statistic *Form_pg_statistic;

#ifdef EXPOSE_TO_CLIENT_CODE

/*
 * Several statistical slot "kinds" are defined by core PostgreSQL, as
 * documented below.  Also, custom data types can define their own "kind"
 * codes by mutual agreement between a custom typanalyze routine and the
 * selectivity estimation functions of the type's operators.
 *
 * Code reading the pg_statistic relation should not assume that a particular
 * data "kind" will appear in any particular slot.  Instead, search the
 * stakind fields to see if the desired data is available.  (The standard
 * function get_attstatsslot() may be used for this.)
 */

/*
 * The present allocation of "kind" codes is:
 *
 *	1-99:		reserved for assignment by the core PostgreSQL project
 *				(values in this range will be documented in this file)
 *	100-199:	reserved for assignment by the PostGIS project
 *				(values to be documented in PostGIS documentation)
 *	200-299:	reserved for assignment by the ESRI ST_Geometry project
 *				(values to be documented in ESRI ST_Geometry documentation)
 *	300-9999:	reserved for future public assignments
 *
 * For private use you may choose a "kind" code at random in the range
 * 10000-30000.  However, for code that is to be widely disseminated it is
 * better to obtain a publicly defined "kind" code by request from the
 * PostgreSQL Global Development Group.
 */

/*
 * In a "most common values" slot, staop is the OID of the "=" operator
 * used to decide whether values are the same or not.  stavalues contains
 * the K most common non-null values appearing in the column, and stanumbers
 * contains their frequencies (fractions of total row count).  The values
 * shall be ordered in decreasing frequency.  Note that since the arrays are
 * variable-size, K may be chosen by the statistics collector.  Values should
 * not appear in MCV unless they have been observed to occur more than once;
 * a unique column will have no MCV slot.
 */
#define STATISTIC_KIND_MCV	1

/*
 * A "histogram" slot describes the distribution of scalar data.  staop is
 * the OID of the "<" operator that describes the sort ordering.  (In theory,
 * more than one histogram could appear, if a datatype has more than one
 * useful sort operator.)  stavalues contains M (>=2) non-null values that
 * divide the non-null column data values into M-1 bins of approximately equal
 * population.  The first stavalues item is the MIN and the last is the MAX.
 * stanumbers is not used and should be NULL.  IMPORTANT POINT: if an MCV
 * slot is also provided, then the histogram describes the data distribution
 * *after removing the values listed in MCV* (thus, it's a "compressed
 * histogram" in the technical parlance).  This allows a more accurate
 * representation of the distribution of a column with some very-common
 * values.  In a column with only a few distinct values, it's possible that
 * the MCV list describes the entire data population; in this case the
 * histogram reduces to empty and should be omitted.
 */
#define STATISTIC_KIND_HISTOGRAM  2

/*
 * A "correlation" slot describes the correlation between the physical order
 * of table tuples and the ordering of data values of this column, as seen
 * by the "<" operator identified by staop.  (As with the histogram, more
 * than one entry could theoretically appear.)	stavalues is not used and
 * should be NULL.  stanumbers contains a single entry, the correlation
 * coefficient between the sequence of data values and the sequence of
 * their actual tuple positions.  The coefficient ranges from +1 to -1.
 */
#define STATISTIC_KIND_CORRELATION	3

/*
 * A "most common elements" slot is similar to a "most common values" slot,
 * except that it stores the most common non-null *elements* of the column
 * values.  This is useful when the column datatype is an array or some other
 * type with identifiable elements (for instance, tsvector).  staop contains
 * the equality operator appropriate to the element type.  stavalues contains
 * the most common element values, and stanumbers their frequencies.  Unlike
 * MCV slots, frequencies are measured as the fraction of non-null rows the
 * element value appears in, not the frequency of all rows.  Also unlike
 * MCV slots, the values are sorted into the element type's default order
 * (to support binary search for a particular value).  Since this puts the
 * minimum and maximum frequencies at unpredictable spots in stanumbers,
 * there are two extra members of stanumbers, holding copies of the minimum
 * and maximum frequencies.  Optionally, there can be a third extra member,
 * which holds the frequency of null elements (expressed in the same terms:
 * the fraction of non-null rows that contain at least one null element).  If
 * this member is omitted, the column is presumed to contain no null elements.
 *
 * Note: in current usage for tsvector columns, the stavalues elements are of
 * type text, even though their representation within tsvector is not
 * exactly text.
 */
#define STATISTIC_KIND_MCELEM  4

/*
 * A "distinct elements count histogram" slot describes the distribution of
 * the number of distinct element values present in each row of an array-type
 * column.  Only non-null rows are considered, and only non-null elements.
 * staop contains the equality operator appropriate to the element type.
 * stavalues is not used and should be NULL.  The last member of stanumbers is
 * the average count of distinct element values over all non-null rows.  The
 * preceding M (>=2) members form a histogram that divides the population of
 * distinct-elements counts into M-1 bins of approximately equal population.
 * The first of these is the minimum observed count, and the last the maximum.
 */
#define STATISTIC_KIND_DECHIST	5

/*
 * A "length histogram" slot describes the distribution of range lengths in
 * rows of a range-type column.  stanumbers contains a single entry, the
 * fraction of empty ranges.  stavalues is a histogram of non-empty lengths, in
 * a format similar to STATISTIC_KIND_HISTOGRAM: it contains M (>=2) length
 * values that divide the lengths of the column's non-empty ranges into M-1
 * bins of approximately equal population.  The lengths are stored as float8s,
 * as measured by the range type's subdiff function.  Only non-null rows are
 * considered.
 */
#define STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM  6

/*
 * A "bounds histogram" slot is similar to STATISTIC_KIND_HISTOGRAM, but for
 * a range-type column.  stavalues contains M (>=2) range values that divide
 * the column data values into M-1 bins of approximately equal population.
 * Unlike a regular scalar histogram, this is actually two histograms combined
 * into a single array, with the lower bounds of each value forming a
 * histogram of lower bounds, and the upper bounds a histogram of upper
 * bounds.  Only non-NULL, non-empty ranges are included.
 */
#define STATISTIC_KIND_BOUNDS_HISTOGRAM  7

#endif							/* EXPOSE_TO_CLIENT_CODE */

#endif							/* PG_STATISTIC_H */