1 /* Copyright (C) 2014 InfiniDB, Inc.
2 
3    This program is free software; you can redistribute it and/or
4    modify it under the terms of the GNU General Public License
5    as published by the Free Software Foundation; version 2 of
6    the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
16    MA 02110-1301, USA. */
17 
18 
19 /******************************************************************************
20 * $Id: rowestimator.cpp 5642 2009-08-10 21:04:59Z wweeks $
21 *
22 ******************************************************************************/
23 #include <iostream>
24 #include "primitivemsg.h"
25 #include "blocksize.h"
26 #include "rowestimator.h"
27 #include "calpontsystemcatalog.h"
28 #include "brm.h"
29 #include "brmtypes.h"
30 #include "dataconvert.h"
31 #include "configcpp.h"
32 
33 #define ROW_EST_DEBUG 0
34 #if ROW_EST_DEBUG
35 #include "stopwatch.h"
36 #endif
37 
38 using namespace config;
39 using namespace std;
40 using namespace execplan;
41 using namespace BRM;
42 using namespace logging;
43 
44 
45 namespace joblist
46 {
47 
48 // Returns the sum of the days through a particular month where 1 is Jan, 2 is Feb, ...
49 // This is used for converting a Calpont date to an integer representing the day number since the year 0 for use in
50 // calculating the number of distinct days in a range.  It doesn't account for leap years as these are rough estimates
51 // and only need to be accurate within an order of magnitude.
daysThroughMonth(uint32_t mth)52 uint32_t RowEstimator::daysThroughMonth(uint32_t mth)
53 {
54     switch (mth)
55     {
56         case 0:
57             return 0;
58 
59         case 1:
60             return 31;
61 
62         case 2:
63             return 59;
64 
65         case 3:
66             return 90;
67 
68         case 4:
69             return 120;
70 
71         case 5:
72             return 151;
73 
74         case 6:
75             return 181;
76 
77         case 7:
78             return 212;
79 
80         case 8:
81             return 243;
82 
83         case 9:
84             return 273;
85 
86         case 10:
87             return 304;
88 
89         case 11:
90             return 334;
91 
92         default:
93             return 365;
94     }
95 }
96 
97 // This function takes a column value and if necessary adjusts it based on rules defined in the requirements.
98 // The values are adjusted so that one can be subtracted from another to find a range, compare, etc.
adjustValue(const execplan::CalpontSystemCatalog::ColType & ct,const uint64_t & value)99 uint64_t RowEstimator::adjustValue(const execplan::CalpontSystemCatalog::ColType& ct,
100                                    const uint64_t& value)
101 {
102     switch (ct.colDataType)
103     {
104         // Use day precision for dates.  We'll use day relative to the year 0 without worrying about leap
105         // years.  This is for row count estimation and we are close enough for hand grenades.
106         case CalpontSystemCatalog::DATE:
107         {
108             dataconvert::Date dt(value);
109             return dt.year * 365 + daysThroughMonth(dt.month - 1) + dt.day;
110         }
111 
112         // Use second precision for datetime estimates.  We'll use number of seconds since the year 0
113         // without worrying about leap years.
114         case CalpontSystemCatalog::DATETIME:
115         {
116             dataconvert::DateTime dtm(value);
117             // 86,400 seconds in a day.
118             return (dtm.year * 365 + daysThroughMonth(dtm.month - 1) + dtm.day - 1) * 86400 +
119                    dtm.hour * 3600 + dtm.minute * 60 + dtm.second;
120         }
121 
122         case CalpontSystemCatalog::TIMESTAMP:
123         {
124             dataconvert::TimeStamp ts(value);
125             return ts.second;
126         }
127 
128         // Use the first character only for estimating chars and varchar ranges.
129         // TODO:  Use dictionary column HWM for dictionary columns.
130         case CalpontSystemCatalog::CHAR:
131         case CalpontSystemCatalog::VARCHAR:
132         case CalpontSystemCatalog::TEXT:
133             // Last byte is the first character in the string.
134             return (0xFF & value);
135 
136         default:
137             return value;
138     }
139 }
140 
141 // Estimates the number of distinct values given a min/max range.  When the range has not been set,
142 // rules from the requirements are used based on the column type.
estimateDistinctValues(const execplan::CalpontSystemCatalog::ColType & ct,const uint64_t & min,const uint64_t & max,const char cpStatus)143 uint32_t RowEstimator::estimateDistinctValues(const execplan::CalpontSystemCatalog::ColType& ct,
144         const uint64_t& min,
145         const uint64_t& max,
146         const char cpStatus)
147 {
148     uint64_t ret = 10;
149 
150     // If no casual partitioning info available for extent.  These rules were defined in the requirements.
151     if (cpStatus != BRM::CP_VALID)
152     {
153         switch (ct.colDataType)
154         {
155 
156             case CalpontSystemCatalog::BIT:
157                 return 2;
158 
159             // Return limit/2 for integers where limit is number of possible values.
160             case CalpontSystemCatalog::TINYINT:
161                 return (2 ^ 8) / 2;
162 
163             case CalpontSystemCatalog::UTINYINT:
164                 return (2 ^ 8);
165 
166             case CalpontSystemCatalog::SMALLINT:
167                 return (2 ^ 16) / 2;
168 
169             case CalpontSystemCatalog::USMALLINT:
170                 return (2 ^ 16);
171 
172             // Next group all have range greater than 8M (# of rows in an extent), use 8M/2 as the estimate.
173             case CalpontSystemCatalog::MEDINT:
174             case CalpontSystemCatalog::UMEDINT:
175             case CalpontSystemCatalog::INT:
176             case CalpontSystemCatalog::UINT:
177             case CalpontSystemCatalog::BIGINT:
178             case CalpontSystemCatalog::UBIGINT:
179             case CalpontSystemCatalog::FLOAT:
180             case CalpontSystemCatalog::UFLOAT:
181             case CalpontSystemCatalog::DOUBLE:
182             case CalpontSystemCatalog::UDOUBLE:
183                 return fRowsPerExtent / 2;
184 
185             // Use 1000 for dates.
186             case CalpontSystemCatalog::DATE:
187             case CalpontSystemCatalog::DATETIME:
188             case CalpontSystemCatalog::TIMESTAMP:
189                 return 1000;
190 
191             // Use 10 for CHARs and VARCHARs.  We'll use 10 for whatever else.
192             // Todo:  Requirement say use dictionary HWM if dictionary column, not sure that will be doable.
193             default:
194                 return 10;
195         }
196     }
197     else
198     {
199         ret = max - min + 1;
200     }
201 
202     if (ret > fRowsPerExtent)
203     {
204         ret = fRowsPerExtent;
205     }
206 
207     return ret;
208 }
209 
210 // Returns a floating point number between 0 and 1 representing the percentage of matching rows for the given predicate against
211 // the given range.  This function is used for estimating an individual operation such as col1 = 2.
212 template<class T>
estimateOpFactor(const T & min,const T & max,const T & value,char op,uint8_t lcf,uint32_t distinctValues,char cpStatus)213 float RowEstimator::estimateOpFactor(const T& min, const T& max, const T& value, char op, uint8_t lcf, uint32_t distinctValues, char cpStatus)
214 {
215     float factor = 1.0;
216 
217     switch (op)
218     {
219         case COMPARE_LT:
220         case COMPARE_NGE:
221             if (cpStatus == BRM::CP_VALID)
222             {
223                 factor = (1.0 * value - min) / (max - min + 1);
224             }
225 
226             break;
227 
228         case COMPARE_LE:
229         case COMPARE_NGT:
230             if (cpStatus == BRM::CP_VALID)
231             {
232                 factor = (1.0 * value - min + 1) / (max - min + 1);
233             }
234 
235             break;
236 
237         case COMPARE_GT:
238         case COMPARE_NLE:
239             if (cpStatus == BRM::CP_VALID)
240             {
241                 factor = (1.0 * max - value) / (1.0 * max - min + 1);
242             }
243 
244             break;
245 
246         case COMPARE_GE:
247         case COMPARE_NLT:
248             if (cpStatus == BRM::CP_VALID)
249             {
250                 // TODO:  Best way to convert to floating point arithmetic?
251                 factor = (1.0 * max - value + 1) / (max - min + 1);
252             }
253 
254             break;
255 
256         case COMPARE_EQ:
257             factor = 1.0 / distinctValues;
258             break;
259 
260         case COMPARE_NE:
261             factor = 1.0 - (1.0 / distinctValues);
262             break;
263     }
264 
265     if (factor < 0.0)
266     {
267         factor = 0.0;
268     }
269     else if (factor > 1.0)
270     {
271         factor = 1.0;
272     }
273 
274     return factor;
275 }
276 
277 // Estimate the percentage of rows that will be returned for a particular extent.
278 // This function provides the estimate for entire filter such as "col 1 < 100 or col1 > 10000".
estimateRowReturnFactor(const BRM::EMEntry & emEntry,const messageqcpp::ByteStream * bs,const uint16_t NOPS,const execplan::CalpontSystemCatalog::ColType & ct,const uint8_t BOP,const uint32_t & rowsInExtent)279 float RowEstimator::estimateRowReturnFactor(const BRM::EMEntry& emEntry,
280         const messageqcpp::ByteStream* bs,
281         const uint16_t NOPS,
282         const execplan::CalpontSystemCatalog::ColType& ct,
283         const uint8_t BOP,
284         const uint32_t& rowsInExtent)
285 {
286     bool bIsUnsigned = execplan::isUnsigned(ct.colDataType);
287     float factor = 1.0;
288     float tempFactor = 1.0;
289 
290     // Adjust values based on column type and estimate the
291     uint64_t adjustedMin = adjustValue(ct, emEntry.partition.cprange.lo_val);
292     uint64_t adjustedMax = adjustValue(ct, emEntry.partition.cprange.hi_val);
293     uint32_t distinctValuesEstimate = estimateDistinctValues(
294                                           ct, adjustedMin, adjustedMax, emEntry.partition.cprange.isValid);
295 
296     // Loop through the operations and estimate the percentage of rows that will qualify.
297     // For example, there are two operations for "col1 > 5 and col1 < 10":
298     // 1) col1 > 5
299     // 2) col2 < 10
300     int length = bs->length(), pos = 0;
301     const char* msgDataPtr = (const char*) bs->buf();
302     int64_t value = 0;
303     bool firstQualifyingOrCondition = true;
304     uint16_t comparisonLimit = (NOPS <= fMaxComparisons) ? NOPS : fMaxComparisons;
305 
306     for (int i = 0; i < comparisonLimit; i++)
307     {
308         pos += ct.colWidth + 2;  // predicate + op + lcf
309 
310         // TODO:  Stole this condition from lbidlist.
311         // Investigate whether this can happen / should throw an error.
312         if (pos > length)
313         {
314             return factor;
315         }
316 
317         // Get the comparison value for the condition.
318         char op = *msgDataPtr++;
319         uint8_t lcf = *(uint8_t*)msgDataPtr++;
320 
321         if (bIsUnsigned)
322         {
323             switch (ct.colWidth)
324             {
325                 case 1:
326                 {
327                     uint8_t val = *(uint8_t*)msgDataPtr;
328                     value = val;
329                     break;
330                 }
331 
332                 case 2:
333                 {
334                     uint16_t val = *(uint16_t*)msgDataPtr;
335                     value = val;
336                     break;
337                 }
338 
339                 case 4:
340                 {
341                     uint32_t val = *(uint32_t*)msgDataPtr;
342                     value = val;
343                     break;
344                 }
345 
346                 case 8:
347                 default:
348                 {
349                     uint64_t val = *(uint64_t*)msgDataPtr;
350                     value = static_cast<int64_t>(val);
351                     break;
352                 }
353             }
354         }
355         else
356         {
357             switch (ct.colWidth)
358             {
359                 case 1:
360                 {
361                     int8_t val = *(int8_t*)msgDataPtr;
362                     value = val;
363                     break;
364                 }
365 
366                 case 2:
367                 {
368                     int16_t val = *(int16_t*)msgDataPtr;
369                     value = val;
370                     break;
371                 }
372 
373                 case 4:
374                 {
375                     int32_t val = *(int32_t*)msgDataPtr;
376                     value = val;
377                     break;
378                 }
379 
380                 case 8:
381                 default:
382                 {
383                     int64_t val = *(int64_t*)msgDataPtr;
384                     value = val;
385                     break;
386                 }
387             }
388         }
389 
390         // TODO:  Investigate whether condition below should throw an error.
391         msgDataPtr += ct.colWidth;
392 
393         if (pos > length)
394         {
395             return factor;
396         }
397 
398 #if ROW_EST_DEBUG
399         cout << "  Min-" << emEntry.partition.cprange.lo_val <<
400              ", Max-" << emEntry.partition.cprange.hi_val <<
401              ", Val-" << value;
402 #endif
403 
404         // Get the factor for the individual operation.
405         if (bIsUnsigned)
406         {
407             tempFactor = estimateOpFactor<uint64_t>(
408                              adjustedMin, adjustedMax, adjustValue(ct, value), op, lcf,
409                              distinctValuesEstimate, emEntry.partition.cprange.isValid);
410         }
411         else
412         {
413             tempFactor = estimateOpFactor<int64_t>(
414                              adjustedMin, adjustedMax, adjustValue(ct, value), op, lcf,
415                              distinctValuesEstimate, emEntry.partition.cprange.isValid);
416         }
417 
418 #if ROW_EST_DEBUG
419         cout << ", OperatorFactor-" << tempFactor << ", DistinctValsEst-" << distinctValuesEstimate << endl;
420 #endif
421 
422         // Use it in the overall factor.
423         if (BOP == BOP_AND)
424         {
425             // TODO:  Handle betweens correctly (same as a >= 5 and a <= 10)
426             factor *= tempFactor;
427         }
428         else if (BOP == BOP_OR)
429         {
430             if (firstQualifyingOrCondition)
431             {
432                 factor = tempFactor;
433                 firstQualifyingOrCondition = false;
434             }
435             else
436             {
437                 factor += tempFactor;
438             }
439         }
440         else
441         {
442             factor = tempFactor;
443         }
444 
445     } // for()
446 
447     if (factor > 1.0)
448     {
449         factor = 1.0;
450     }
451 
452 #if ROW_EST_DEBUG
453 
454     if (NOPS > 1)
455         cout << "  FilterFactor-" << factor << endl;
456 
457 #endif
458 
459     return factor;
460 }
461 
462 // This function returns the estimated row count for the entire TupleBPS.  It samples the last 20 (configurable) extents to
463 // calculate the estimate.
estimateRows(const vector<ColumnCommandJL * > & cpColVec,const std::vector<bool> & scanFlags,BRM::DBRM & dbrm,const execplan::CalpontSystemCatalog::OID oid)464 uint64_t RowEstimator::estimateRows(const vector<ColumnCommandJL*>& cpColVec,
465                                     const std::vector<bool>& scanFlags,
466                                     BRM::DBRM& dbrm,
467                                     const execplan::CalpontSystemCatalog::OID oid)
468 
469 {
470 #if ROW_EST_DEBUG
471     StopWatch stopwatch;
472     stopwatch.start("estimateRows");
473 #endif
474 
475     uint32_t rowsInLastExtent = fRowsPerExtent;
476     uint32_t extentRows = 0;
477     HWM_t hwm = 0;
478     float factor = 1.0;
479     float tempFactor = 1.0;
480 
481     ColumnCommandJL* colCmd = 0;
482     uint32_t extentsSampled = 0;
483     uint64_t totalRowsToBeScanned = 0;
484     uint32_t estimatedExtentRowCount = 0;
485     uint64_t estimatedRowCount = 0;
486     //vector<EMEntry> *extents = NULL;
487 
488     // Nothing to do if no scanFlags.
489     if (scanFlags.size() == 0 || cpColVec.size() == 0)
490     {
491         // TODO:  Probably should throw an error here.
492         return 0;
493     }
494 
495     // Use the HWM for the estimated row count in the last extent.
496     colCmd = cpColVec[0];
497     const vector<EMEntry>& extents = colCmd->getExtents();
498     hwm = extents.back().HWM;   // extents is sorted by "global" fbo
499     rowsInLastExtent = ((hwm + 1) * fBlockSize / colCmd->getColType().colWidth) % fRowsPerExtent;
500 
501     // Sum up the total number of scanned rows.
502     int32_t idx = scanFlags.size() - 1;
503 
504     while (idx >= 0)
505     {
506         if (scanFlags[idx])
507         {
508             extentRows = (idx == (int) scanFlags.size() - 1 ? rowsInLastExtent : fRowsPerExtent);
509 
510             // Get the predicate factor.
511 #if ROW_EST_DEBUG
512             cout << endl;
513             cout << "Ext-" << idx << ", rowsToScan-" << extentRows << endl;
514 #endif
515             factor = 1.0;
516 
517             for (uint32_t j = 0; j < cpColVec.size(); j++)
518             {
519                 colCmd = cpColVec[j];
520                 //RowEstimator rowEstimator;
521 #if ROW_EST_DEBUG
522                 stopwatch.start("estimateRowReturnFactor");
523 #endif
524                 //tempFactor =  rowEstimator.estimateRowReturnFactor(
525                 tempFactor = estimateRowReturnFactor(
526                                  colCmd->getExtents()[idx],
527                                  &(colCmd->getFilterString()),
528                                  colCmd->getFilterCount(),
529                                  colCmd->getColType(),
530                                  colCmd->getBOP(),
531                                  extentRows);
532 #if ROW_EST_DEBUG
533                 stopwatch.stop("estimateRowReturnFactor");
534 #endif
535 
536                 factor *= tempFactor;
537             }
538 
539             extentsSampled++;
540             totalRowsToBeScanned += extentRows;
541             estimatedExtentRowCount = uint64_t(ceil(factor * extentRows));
542 
543             if (estimatedExtentRowCount <= 0) estimatedExtentRowCount = 1;
544 
545             estimatedRowCount += estimatedExtentRowCount;
546 #if ROW_EST_DEBUG
547             cout << "ExtentFactor-" << factor << ", EstimatedRows-" << estimatedExtentRowCount << endl;
548 #endif
549         }
550 
551         //if (extentsSampled == fExtentsToSample || idx == 0)
552         //{
553             //done = true;
554         //}
555         //else
556         //{
557             idx--;
558         //}
559     }
560 
561     // If there are more extents than we sampled, add the row counts for the qualifying extents
562     // that we didn't sample to the count of rows that will be scanned.
563 	// XXXPAT: Modified this fcn to sample all extents.  Leaving this here due to level of arcana
564 	// involved.  :)
565     if (false && (extentsSampled >= fExtentsToSample) && (idx > 0))
566     {
567         factor = (1.0 * estimatedRowCount) / (1.0 * totalRowsToBeScanned);
568 #if ROW_EST_DEBUG
569         cout << "overall factor-" << factor << endl;
570 #endif
571 
572         for (int32_t i = 0; i < idx; i++)
573         {
574             if (scanFlags[i])
575             {
576                 // Don't take the expense of checking to see if the last extent was one that wasn't
577                 // sampled.  It will more than likely have been the first extent sampled since we
578                 // are doing them in reverse order.  If not, the amount of rows not populated isn't
579                 // that significant since there are many qualifying extents.
580                 totalRowsToBeScanned += fRowsPerExtent;
581             }
582         }
583 
584         estimatedRowCount = uint64_t(ceil(factor * totalRowsToBeScanned));
585     }
586 
587 #if ROW_EST_DEBUG
588     cout << "Oid-" << oid << ", TotalEstimatedRows-" << estimatedRowCount << endl;
589     stopwatch.stop("estimateRows");
590     stopwatch.finish();
591 #endif
592     return estimatedRowCount;
593 }
594 
595 // @Bug 3503.  Fix to use the number of extents to estimate the number of rows in queries that do
596 // joins on dictionaries or other column types that do not use casual partitioning.
597 // We use an estimate of 100% of the rows regardless of any dictionary filters.
estimateRowsForNonCPColumn(ColumnCommandJL & colCmd)598 uint64_t RowEstimator::estimateRowsForNonCPColumn(ColumnCommandJL& colCmd)
599 {
600     uint64_t estimatedRows = 0;
601     int numExtents = colCmd.getExtents().size();
602 
603     if (numExtents > 0)
604     {
605         HWM_t hwm = colCmd.getExtents()[numExtents - 1].HWM;
606         uint32_t rowsInLastExtent =
607             ((hwm + 1) * fBlockSize / colCmd.getColType().colWidth) % fRowsPerExtent;
608         estimatedRows = fRowsPerExtent * (numExtents - 1) + rowsInLastExtent;
609     }
610 
611     return estimatedRows;
612 }
613 
614 } //namespace joblist
615