1 /* Copyright (C) 2014 InfiniDB, Inc.
2
3 This program is free software; you can redistribute it and/or
4 modify it under the terms of the GNU General Public License
5 as published by the Free Software Foundation; version 2 of
6 the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
16 MA 02110-1301, USA. */
17
18
19 /******************************************************************************
20 * $Id: rowestimator.cpp 5642 2009-08-10 21:04:59Z wweeks $
21 *
22 ******************************************************************************/
23 #include <iostream>
24 #include "primitivemsg.h"
25 #include "blocksize.h"
26 #include "rowestimator.h"
27 #include "calpontsystemcatalog.h"
28 #include "brm.h"
29 #include "brmtypes.h"
30 #include "dataconvert.h"
31 #include "configcpp.h"
32
33 #define ROW_EST_DEBUG 0
34 #if ROW_EST_DEBUG
35 #include "stopwatch.h"
36 #endif
37
38 using namespace config;
39 using namespace std;
40 using namespace execplan;
41 using namespace BRM;
42 using namespace logging;
43
44
45 namespace joblist
46 {
47
48 // Returns the sum of the days through a particular month where 1 is Jan, 2 is Feb, ...
49 // This is used for converting a Calpont date to an integer representing the day number since the year 0 for use in
50 // calculating the number of distinct days in a range. It doesn't account for leap years as these are rough estimates
51 // and only need to be accurate within an order of magnitude.
daysThroughMonth(uint32_t mth)52 uint32_t RowEstimator::daysThroughMonth(uint32_t mth)
53 {
54 switch (mth)
55 {
56 case 0:
57 return 0;
58
59 case 1:
60 return 31;
61
62 case 2:
63 return 59;
64
65 case 3:
66 return 90;
67
68 case 4:
69 return 120;
70
71 case 5:
72 return 151;
73
74 case 6:
75 return 181;
76
77 case 7:
78 return 212;
79
80 case 8:
81 return 243;
82
83 case 9:
84 return 273;
85
86 case 10:
87 return 304;
88
89 case 11:
90 return 334;
91
92 default:
93 return 365;
94 }
95 }
96
97 // This function takes a column value and if necessary adjusts it based on rules defined in the requirements.
98 // The values are adjusted so that one can be subtracted from another to find a range, compare, etc.
adjustValue(const execplan::CalpontSystemCatalog::ColType & ct,const uint64_t & value)99 uint64_t RowEstimator::adjustValue(const execplan::CalpontSystemCatalog::ColType& ct,
100 const uint64_t& value)
101 {
102 switch (ct.colDataType)
103 {
104 // Use day precision for dates. We'll use day relative to the year 0 without worrying about leap
105 // years. This is for row count estimation and we are close enough for hand grenades.
106 case CalpontSystemCatalog::DATE:
107 {
108 dataconvert::Date dt(value);
109 return dt.year * 365 + daysThroughMonth(dt.month - 1) + dt.day;
110 }
111
112 // Use second precision for datetime estimates. We'll use number of seconds since the year 0
113 // without worrying about leap years.
114 case CalpontSystemCatalog::DATETIME:
115 {
116 dataconvert::DateTime dtm(value);
117 // 86,400 seconds in a day.
118 return (dtm.year * 365 + daysThroughMonth(dtm.month - 1) + dtm.day - 1) * 86400 +
119 dtm.hour * 3600 + dtm.minute * 60 + dtm.second;
120 }
121
122 case CalpontSystemCatalog::TIMESTAMP:
123 {
124 dataconvert::TimeStamp ts(value);
125 return ts.second;
126 }
127
128 // Use the first character only for estimating chars and varchar ranges.
129 // TODO: Use dictionary column HWM for dictionary columns.
130 case CalpontSystemCatalog::CHAR:
131 case CalpontSystemCatalog::VARCHAR:
132 case CalpontSystemCatalog::TEXT:
133 // Last byte is the first character in the string.
134 return (0xFF & value);
135
136 default:
137 return value;
138 }
139 }
140
141 // Estimates the number of distinct values given a min/max range. When the range has not been set,
142 // rules from the requirements are used based on the column type.
estimateDistinctValues(const execplan::CalpontSystemCatalog::ColType & ct,const uint64_t & min,const uint64_t & max,const char cpStatus)143 uint32_t RowEstimator::estimateDistinctValues(const execplan::CalpontSystemCatalog::ColType& ct,
144 const uint64_t& min,
145 const uint64_t& max,
146 const char cpStatus)
147 {
148 uint64_t ret = 10;
149
150 // If no casual partitioning info available for extent. These rules were defined in the requirements.
151 if (cpStatus != BRM::CP_VALID)
152 {
153 switch (ct.colDataType)
154 {
155
156 case CalpontSystemCatalog::BIT:
157 return 2;
158
159 // Return limit/2 for integers where limit is number of possible values.
160 case CalpontSystemCatalog::TINYINT:
161 return (2 ^ 8) / 2;
162
163 case CalpontSystemCatalog::UTINYINT:
164 return (2 ^ 8);
165
166 case CalpontSystemCatalog::SMALLINT:
167 return (2 ^ 16) / 2;
168
169 case CalpontSystemCatalog::USMALLINT:
170 return (2 ^ 16);
171
172 // Next group all have range greater than 8M (# of rows in an extent), use 8M/2 as the estimate.
173 case CalpontSystemCatalog::MEDINT:
174 case CalpontSystemCatalog::UMEDINT:
175 case CalpontSystemCatalog::INT:
176 case CalpontSystemCatalog::UINT:
177 case CalpontSystemCatalog::BIGINT:
178 case CalpontSystemCatalog::UBIGINT:
179 case CalpontSystemCatalog::FLOAT:
180 case CalpontSystemCatalog::UFLOAT:
181 case CalpontSystemCatalog::DOUBLE:
182 case CalpontSystemCatalog::UDOUBLE:
183 return fRowsPerExtent / 2;
184
185 // Use 1000 for dates.
186 case CalpontSystemCatalog::DATE:
187 case CalpontSystemCatalog::DATETIME:
188 case CalpontSystemCatalog::TIMESTAMP:
189 return 1000;
190
191 // Use 10 for CHARs and VARCHARs. We'll use 10 for whatever else.
192 // Todo: Requirement say use dictionary HWM if dictionary column, not sure that will be doable.
193 default:
194 return 10;
195 }
196 }
197 else
198 {
199 ret = max - min + 1;
200 }
201
202 if (ret > fRowsPerExtent)
203 {
204 ret = fRowsPerExtent;
205 }
206
207 return ret;
208 }
209
210 // Returns a floating point number between 0 and 1 representing the percentage of matching rows for the given predicate against
211 // the given range. This function is used for estimating an individual operation such as col1 = 2.
212 template<class T>
estimateOpFactor(const T & min,const T & max,const T & value,char op,uint8_t lcf,uint32_t distinctValues,char cpStatus)213 float RowEstimator::estimateOpFactor(const T& min, const T& max, const T& value, char op, uint8_t lcf, uint32_t distinctValues, char cpStatus)
214 {
215 float factor = 1.0;
216
217 switch (op)
218 {
219 case COMPARE_LT:
220 case COMPARE_NGE:
221 if (cpStatus == BRM::CP_VALID)
222 {
223 factor = (1.0 * value - min) / (max - min + 1);
224 }
225
226 break;
227
228 case COMPARE_LE:
229 case COMPARE_NGT:
230 if (cpStatus == BRM::CP_VALID)
231 {
232 factor = (1.0 * value - min + 1) / (max - min + 1);
233 }
234
235 break;
236
237 case COMPARE_GT:
238 case COMPARE_NLE:
239 if (cpStatus == BRM::CP_VALID)
240 {
241 factor = (1.0 * max - value) / (1.0 * max - min + 1);
242 }
243
244 break;
245
246 case COMPARE_GE:
247 case COMPARE_NLT:
248 if (cpStatus == BRM::CP_VALID)
249 {
250 // TODO: Best way to convert to floating point arithmetic?
251 factor = (1.0 * max - value + 1) / (max - min + 1);
252 }
253
254 break;
255
256 case COMPARE_EQ:
257 factor = 1.0 / distinctValues;
258 break;
259
260 case COMPARE_NE:
261 factor = 1.0 - (1.0 / distinctValues);
262 break;
263 }
264
265 if (factor < 0.0)
266 {
267 factor = 0.0;
268 }
269 else if (factor > 1.0)
270 {
271 factor = 1.0;
272 }
273
274 return factor;
275 }
276
277 // Estimate the percentage of rows that will be returned for a particular extent.
278 // This function provides the estimate for entire filter such as "col 1 < 100 or col1 > 10000".
estimateRowReturnFactor(const BRM::EMEntry & emEntry,const messageqcpp::ByteStream * bs,const uint16_t NOPS,const execplan::CalpontSystemCatalog::ColType & ct,const uint8_t BOP,const uint32_t & rowsInExtent)279 float RowEstimator::estimateRowReturnFactor(const BRM::EMEntry& emEntry,
280 const messageqcpp::ByteStream* bs,
281 const uint16_t NOPS,
282 const execplan::CalpontSystemCatalog::ColType& ct,
283 const uint8_t BOP,
284 const uint32_t& rowsInExtent)
285 {
286 bool bIsUnsigned = execplan::isUnsigned(ct.colDataType);
287 float factor = 1.0;
288 float tempFactor = 1.0;
289
290 // Adjust values based on column type and estimate the
291 uint64_t adjustedMin = adjustValue(ct, emEntry.partition.cprange.lo_val);
292 uint64_t adjustedMax = adjustValue(ct, emEntry.partition.cprange.hi_val);
293 uint32_t distinctValuesEstimate = estimateDistinctValues(
294 ct, adjustedMin, adjustedMax, emEntry.partition.cprange.isValid);
295
296 // Loop through the operations and estimate the percentage of rows that will qualify.
297 // For example, there are two operations for "col1 > 5 and col1 < 10":
298 // 1) col1 > 5
299 // 2) col2 < 10
300 int length = bs->length(), pos = 0;
301 const char* msgDataPtr = (const char*) bs->buf();
302 int64_t value = 0;
303 bool firstQualifyingOrCondition = true;
304 uint16_t comparisonLimit = (NOPS <= fMaxComparisons) ? NOPS : fMaxComparisons;
305
306 for (int i = 0; i < comparisonLimit; i++)
307 {
308 pos += ct.colWidth + 2; // predicate + op + lcf
309
310 // TODO: Stole this condition from lbidlist.
311 // Investigate whether this can happen / should throw an error.
312 if (pos > length)
313 {
314 return factor;
315 }
316
317 // Get the comparison value for the condition.
318 char op = *msgDataPtr++;
319 uint8_t lcf = *(uint8_t*)msgDataPtr++;
320
321 if (bIsUnsigned)
322 {
323 switch (ct.colWidth)
324 {
325 case 1:
326 {
327 uint8_t val = *(uint8_t*)msgDataPtr;
328 value = val;
329 break;
330 }
331
332 case 2:
333 {
334 uint16_t val = *(uint16_t*)msgDataPtr;
335 value = val;
336 break;
337 }
338
339 case 4:
340 {
341 uint32_t val = *(uint32_t*)msgDataPtr;
342 value = val;
343 break;
344 }
345
346 case 8:
347 default:
348 {
349 uint64_t val = *(uint64_t*)msgDataPtr;
350 value = static_cast<int64_t>(val);
351 break;
352 }
353 }
354 }
355 else
356 {
357 switch (ct.colWidth)
358 {
359 case 1:
360 {
361 int8_t val = *(int8_t*)msgDataPtr;
362 value = val;
363 break;
364 }
365
366 case 2:
367 {
368 int16_t val = *(int16_t*)msgDataPtr;
369 value = val;
370 break;
371 }
372
373 case 4:
374 {
375 int32_t val = *(int32_t*)msgDataPtr;
376 value = val;
377 break;
378 }
379
380 case 8:
381 default:
382 {
383 int64_t val = *(int64_t*)msgDataPtr;
384 value = val;
385 break;
386 }
387 }
388 }
389
390 // TODO: Investigate whether condition below should throw an error.
391 msgDataPtr += ct.colWidth;
392
393 if (pos > length)
394 {
395 return factor;
396 }
397
398 #if ROW_EST_DEBUG
399 cout << " Min-" << emEntry.partition.cprange.lo_val <<
400 ", Max-" << emEntry.partition.cprange.hi_val <<
401 ", Val-" << value;
402 #endif
403
404 // Get the factor for the individual operation.
405 if (bIsUnsigned)
406 {
407 tempFactor = estimateOpFactor<uint64_t>(
408 adjustedMin, adjustedMax, adjustValue(ct, value), op, lcf,
409 distinctValuesEstimate, emEntry.partition.cprange.isValid);
410 }
411 else
412 {
413 tempFactor = estimateOpFactor<int64_t>(
414 adjustedMin, adjustedMax, adjustValue(ct, value), op, lcf,
415 distinctValuesEstimate, emEntry.partition.cprange.isValid);
416 }
417
418 #if ROW_EST_DEBUG
419 cout << ", OperatorFactor-" << tempFactor << ", DistinctValsEst-" << distinctValuesEstimate << endl;
420 #endif
421
422 // Use it in the overall factor.
423 if (BOP == BOP_AND)
424 {
425 // TODO: Handle betweens correctly (same as a >= 5 and a <= 10)
426 factor *= tempFactor;
427 }
428 else if (BOP == BOP_OR)
429 {
430 if (firstQualifyingOrCondition)
431 {
432 factor = tempFactor;
433 firstQualifyingOrCondition = false;
434 }
435 else
436 {
437 factor += tempFactor;
438 }
439 }
440 else
441 {
442 factor = tempFactor;
443 }
444
445 } // for()
446
447 if (factor > 1.0)
448 {
449 factor = 1.0;
450 }
451
452 #if ROW_EST_DEBUG
453
454 if (NOPS > 1)
455 cout << " FilterFactor-" << factor << endl;
456
457 #endif
458
459 return factor;
460 }
461
462 // This function returns the estimated row count for the entire TupleBPS. It samples the last 20 (configurable) extents to
463 // calculate the estimate.
estimateRows(const vector<ColumnCommandJL * > & cpColVec,const std::vector<bool> & scanFlags,BRM::DBRM & dbrm,const execplan::CalpontSystemCatalog::OID oid)464 uint64_t RowEstimator::estimateRows(const vector<ColumnCommandJL*>& cpColVec,
465 const std::vector<bool>& scanFlags,
466 BRM::DBRM& dbrm,
467 const execplan::CalpontSystemCatalog::OID oid)
468
469 {
470 #if ROW_EST_DEBUG
471 StopWatch stopwatch;
472 stopwatch.start("estimateRows");
473 #endif
474
475 uint32_t rowsInLastExtent = fRowsPerExtent;
476 uint32_t extentRows = 0;
477 HWM_t hwm = 0;
478 float factor = 1.0;
479 float tempFactor = 1.0;
480
481 ColumnCommandJL* colCmd = 0;
482 uint32_t extentsSampled = 0;
483 uint64_t totalRowsToBeScanned = 0;
484 uint32_t estimatedExtentRowCount = 0;
485 uint64_t estimatedRowCount = 0;
486 //vector<EMEntry> *extents = NULL;
487
488 // Nothing to do if no scanFlags.
489 if (scanFlags.size() == 0 || cpColVec.size() == 0)
490 {
491 // TODO: Probably should throw an error here.
492 return 0;
493 }
494
495 // Use the HWM for the estimated row count in the last extent.
496 colCmd = cpColVec[0];
497 const vector<EMEntry>& extents = colCmd->getExtents();
498 hwm = extents.back().HWM; // extents is sorted by "global" fbo
499 rowsInLastExtent = ((hwm + 1) * fBlockSize / colCmd->getColType().colWidth) % fRowsPerExtent;
500
501 // Sum up the total number of scanned rows.
502 int32_t idx = scanFlags.size() - 1;
503
504 while (idx >= 0)
505 {
506 if (scanFlags[idx])
507 {
508 extentRows = (idx == (int) scanFlags.size() - 1 ? rowsInLastExtent : fRowsPerExtent);
509
510 // Get the predicate factor.
511 #if ROW_EST_DEBUG
512 cout << endl;
513 cout << "Ext-" << idx << ", rowsToScan-" << extentRows << endl;
514 #endif
515 factor = 1.0;
516
517 for (uint32_t j = 0; j < cpColVec.size(); j++)
518 {
519 colCmd = cpColVec[j];
520 //RowEstimator rowEstimator;
521 #if ROW_EST_DEBUG
522 stopwatch.start("estimateRowReturnFactor");
523 #endif
524 //tempFactor = rowEstimator.estimateRowReturnFactor(
525 tempFactor = estimateRowReturnFactor(
526 colCmd->getExtents()[idx],
527 &(colCmd->getFilterString()),
528 colCmd->getFilterCount(),
529 colCmd->getColType(),
530 colCmd->getBOP(),
531 extentRows);
532 #if ROW_EST_DEBUG
533 stopwatch.stop("estimateRowReturnFactor");
534 #endif
535
536 factor *= tempFactor;
537 }
538
539 extentsSampled++;
540 totalRowsToBeScanned += extentRows;
541 estimatedExtentRowCount = uint64_t(ceil(factor * extentRows));
542
543 if (estimatedExtentRowCount <= 0) estimatedExtentRowCount = 1;
544
545 estimatedRowCount += estimatedExtentRowCount;
546 #if ROW_EST_DEBUG
547 cout << "ExtentFactor-" << factor << ", EstimatedRows-" << estimatedExtentRowCount << endl;
548 #endif
549 }
550
551 //if (extentsSampled == fExtentsToSample || idx == 0)
552 //{
553 //done = true;
554 //}
555 //else
556 //{
557 idx--;
558 //}
559 }
560
561 // If there are more extents than we sampled, add the row counts for the qualifying extents
562 // that we didn't sample to the count of rows that will be scanned.
563 // XXXPAT: Modified this fcn to sample all extents. Leaving this here due to level of arcana
564 // involved. :)
565 if (false && (extentsSampled >= fExtentsToSample) && (idx > 0))
566 {
567 factor = (1.0 * estimatedRowCount) / (1.0 * totalRowsToBeScanned);
568 #if ROW_EST_DEBUG
569 cout << "overall factor-" << factor << endl;
570 #endif
571
572 for (int32_t i = 0; i < idx; i++)
573 {
574 if (scanFlags[i])
575 {
576 // Don't take the expense of checking to see if the last extent was one that wasn't
577 // sampled. It will more than likely have been the first extent sampled since we
578 // are doing them in reverse order. If not, the amount of rows not populated isn't
579 // that significant since there are many qualifying extents.
580 totalRowsToBeScanned += fRowsPerExtent;
581 }
582 }
583
584 estimatedRowCount = uint64_t(ceil(factor * totalRowsToBeScanned));
585 }
586
587 #if ROW_EST_DEBUG
588 cout << "Oid-" << oid << ", TotalEstimatedRows-" << estimatedRowCount << endl;
589 stopwatch.stop("estimateRows");
590 stopwatch.finish();
591 #endif
592 return estimatedRowCount;
593 }
594
595 // @Bug 3503. Fix to use the number of extents to estimate the number of rows in queries that do
596 // joins on dictionaries or other column types that do not use casual partitioning.
597 // We use an estimate of 100% of the rows regardless of any dictionary filters.
estimateRowsForNonCPColumn(ColumnCommandJL & colCmd)598 uint64_t RowEstimator::estimateRowsForNonCPColumn(ColumnCommandJL& colCmd)
599 {
600 uint64_t estimatedRows = 0;
601 int numExtents = colCmd.getExtents().size();
602
603 if (numExtents > 0)
604 {
605 HWM_t hwm = colCmd.getExtents()[numExtents - 1].HWM;
606 uint32_t rowsInLastExtent =
607 ((hwm + 1) * fBlockSize / colCmd.getColType().colWidth) % fRowsPerExtent;
608 estimatedRows = fRowsPerExtent * (numExtents - 1) + rowsInLastExtent;
609 }
610
611 return estimatedRows;
612 }
613
614 } //namespace joblist
615