1 /* Copyright (c) 2014, 2021, Oracle and/or its affiliates.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is also distributed with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have included with MySQL.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License, version 2.0, for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
22
23 /**
24 @file
25 Checks for some semantic constraints on queries using GROUP
26 BY, or aggregate functions, or DISTINCT. Enforced if
27 sql_mode contains 'only_full_group_by'.
28 */
29
30 #include "sql_select.h"
31 #include "opt_trace.h"
32 #include "sql_base.h"
33 #include "aggregate_check.h"
34
35 /**
36 @addtogroup AGGREGATE_CHECKS
37
38 @section USED_TABLES Implementation note: used_tables_for_level() vs used_tables()
39
40 - When we are looking for items to validate, we must enter scalar/row
41 subqueries; if we find an item of our SELECT_LEX inside such subquery, for
42 example an Item_field with depended_from equal to our SELECT_LEX, we
43 must use used_tables_for_level(). Example: when validating t1.a in
44 select (select t1.a from t1 as t2 limit 1) from t1 group by t1.pk;
45 we need t1.a's map in the grouped query; used_tables() would return
46 OUTER_REF_TABLE_BIT.
47 - When we are searching for FDs in PKs, or join conditions, or the GROUP BY
48 clause, we do not enter scalar/row subqueries, so we use
49 used_tables(). Example:
50 select ... from t1 where t1.a=(subq) from t1 group by ...
51 the subq is not used to discover FDs in the grouped query.
52 Or:
53 select (select t2.a from t1 as t2 where t2.a=t1.a group by t2.b) from t1
54 when validating the subq, t1.a is an outer reference, kind of a constant, so
55 tells us that t2.a is FD on {} ; using used_tables_for_level() on t1.a would
56 be nonsense - we are validating the subquery.
57
58 @{
59 */
60
61 /**
62 We need to search for items inside subqueries, in case subqueries contain
63 outer references to tables of a query block having DISTINCT or GROUP BY.
64 We also need to sometimes skip parts of item trees, so the walk processor
65 must be called prefix (to enable skipping) and postfix (to disable
66 skipping).
67 */
68 static const Item::enum_walk walk_options=
69 Item::enum_walk(Item::WALK_PREFIX | Item::WALK_POSTFIX | Item::WALK_SUBQUERY);
70
71 /**
72 Rejects the query if it has a combination of DISTINCT and ORDER BY which
73 could lead to randomly ordered results. More precisely: if, in a query
74 block 'sl', an ORDER BY expression
75 - is not the same expression as one in the SELECT list of 'sl' (1)
76 - and contains a column which:
77 -- is of a table whose qualifying query block is 'sl' (2)
78 -- and is not in the SELECT list of 'sl' (3)
79 then 'sl' should not have DISTINCT.
80
81 @returns true if rejected (my_error() is called)
82 */
check_query(THD * thd)83 bool Distinct_check::check_query(THD *thd)
84 {
85 uint number_in_list= 1;
86 for (ORDER *order= select->order_list.first;
87 order;
88 ++number_in_list, order= order->next)
89 {
90 if (order->in_field_list) // is in SELECT list
91 continue;
92 assert((*order->item)->fixed);
93 uint counter;
94 enum_resolution_type resolution;
95 /*
96 Search if this expression is equal to one in the SELECT
97 list. setup_order()/find_order_in_list() has already done so, but not
98 perfectly, indeed at that time the expression was not fixed, which
99 prevents recognition of certain equalities. Example:
100 create view v select x*2 as b from ...;
101 select distinct sin(b) as z from v order by sin(b);
102 This query is valid because the expression in ORDER BY is the same as
103 the one in SELECT list. But in setup_order():
104 'b' in ORDER BY (not yet fixed) is still a 'generic' Item_field,
105 'b' in SELECT (already fixed) is Item_direct_view_ref referencing 'x*2'
106 (so type()==REF_ITEM).
107 So Item_field::eq() says the 'b's are different, so 'sin(b)' of
108 ORDER BY is not found equal to 'sin(b)' of SELECT.
109
110 On the other hand, the search below will match, because everything is
111 now fixed.
112
113 There is a limitation with subqueries: in this query
114 SELECT (subquery) ... ORDER BY (subquery)
115 we may not be able to recognize that both subqueries are the same (and
116 so we may reject it even though the order is deterministic). It is
117 because Item_subselect::eq() is Item::eq() which is too coarse and
118 misses equalities: it compares names of both items; depending on the
119 position of the subquery in the query, MySQL gives it a name or not; and
120 this name is the raw text of the subquery (so if subqueries' texts
121 differ due to white space....).
122 Subqueries in ORDER BY are non-standard anyway.
123 */
124 Item** const res= find_item_in_list(*order->item, select->item_list,
125 &counter, REPORT_EXCEPT_NOT_FOUND,
126 &resolution);
127 if (res == NULL) // Other error than "not found", my_error() was called
128 return true; /* purecov: inspected */
129 if (res != not_found_item) // is in SELECT list
130 continue;
131 /*
132 [numbers refer to the function's comment]
133 (1) is true. Check (2) and (3) inside walk().
134 */
135 if ((*order->item)->walk(&Item::aggregate_check_distinct,
136 walk_options, (uchar*)this))
137 {
138 if (failed_ident)
139 my_error(ER_FIELD_IN_ORDER_NOT_SELECT, MYF(0), number_in_list,
140 failed_ident->full_name(), "DISTINCT");
141 else
142 my_error(ER_AGGREGATE_IN_ORDER_NOT_SELECT, MYF(0), number_in_list,
143 "DISTINCT");
144 return true;
145 }
146 }
147 return false;
148 }
149
150
151 /**
152 Rejects the query if it does aggregation or grouping, and expressions in
153 its SELECT list, ORDER BY clause, or HAVING condition, may vary inside a
154 group (are not "group-invariant").
155 */
check_query(THD * thd)156 bool Group_check::check_query(THD *thd)
157 {
158 ORDER *order= select->order_list.first;
159
160 // Validate SELECT list
161 List_iterator<Item> select_exprs_it(select->item_list);
162 Item *expr;
163 uint number_in_list= 1;
164 const char *place= "SELECT list";
165 while ((expr= select_exprs_it++))
166 {
167 if (check_expression(thd, expr, true))
168 goto err;
169 ++number_in_list;
170 }
171
172 // aggregate without GROUP BY has no ORDER BY at this stage.
173 assert(!(select->is_implicitly_grouped() && select->is_ordered()));
174 // Validate ORDER BY list
175 if (order)
176 {
177 number_in_list= 1;
178 place= "ORDER BY clause";
179 for ( ; order ; order= order->next)
180 {
181 // If it is in SELECT list it is already checked.
182 if (!order->in_field_list &&
183 check_expression(thd, *order->item, false))
184 goto err;
185 ++number_in_list;
186 }
187 }
188
189 // Validate HAVING condition
190 if (select->having_cond())
191 {
192 number_in_list= 1;
193 place= "HAVING clause";
194 if (check_expression(thd, select->having_cond(), false))
195 goto err;
196 }
197
198 return false;
199
200 err:
201 uint code;
202 const char *text;
203 /*
204 Starting from MySQL 5.7 we want give a better messages than before,
205 to provide more information. But we can't change texts of existing error
206 codes for backward-compatibility reasons, so we introduce new texts;
207 however we want to keep sending the old error codes, for pre-5.7
208 applications used to it.
209 */
210 if (select->is_explicitly_grouped())
211 {
212 code= ER_WRONG_FIELD_WITH_GROUP; // old code
213 text= ER(ER_WRONG_FIELD_WITH_GROUP_V2); // new text
214 }
215 else
216 {
217 code= ER_MIX_OF_GROUP_FUNC_AND_FIELDS; // old code
218 text= ER(ER_MIX_OF_GROUP_FUNC_AND_FIELDS_V2); // new text
219 }
220 my_printf_error(code, text, MYF(0), number_in_list, place,
221 failed_ident->full_name());
222 return true;
223 }
224
225
226 /**
227 Validates one expression (this forms one step of check_query()).
228 @param expr expression
229 @param in_select_list whether this expression is coming from the SELECT
230 list.
231 */
check_expression(THD * thd,Item * expr,bool in_select_list)232 bool Group_check::check_expression(THD *thd, Item *expr,
233 bool in_select_list)
234 {
235 assert(!is_child());
236 if (!in_select_list)
237 {
238 uint counter;
239 enum_resolution_type resolution;
240 // Search if this expression is equal to one in the SELECT list.
241 Item** const res= find_item_in_list(expr,
242 select->item_list,
243 &counter, REPORT_EXCEPT_NOT_FOUND,
244 &resolution);
245 if (res == NULL) // Other error than "not found", my_error() was called
246 return true; /* purecov: inspected */
247 if (res != not_found_item)
248 {
249 // is in SELECT list, which has already been validated.
250 return false;
251 }
252 }
253
254 for (ORDER *grp= select->group_list.first; grp; grp= grp->next)
255 {
256 if ((*grp->item)->type() == Item::INT_ITEM &&
257 (*grp->item)->basic_const_item()) /* group by position */
258 return false;
259 if ((*grp->item)->eq(expr, false))
260 return false; // Expression is in GROUP BY so is ok
261 }
262
263 // Analyze columns/aggregates referenced by the expression
264 return
265 expr->walk(&Item::aggregate_check_group, walk_options, (uchar*)this);
266 }
267
268
269 /**
270 Tells if 'item' is functionally dependent ("FD") on source columns.
271 Source columns are:
272 - if !is_child(), the GROUP BY columns
273 - if is_child(), columns of the result of the query expression under
274 'table' which are themselves part of 'fd' of the parent Group_check.
275
276 We recognize most FDs imposed by SQL2011 (optional feature T301)
277
278 We build sets, named En, by induction.
279 A "column" is defined as base table / view / derived table's column.
280
281 E1 = {source columns} (=group columns, if this is a master Group_check;
282 empty set otherwise).
283
284 En is a set of columns of the result of the WHERE clause of 'select' which
285 are functionally dependent on E1.
286 If is_child(), En columns might rather be of the result of the GROUP BY
287 clause (if there is one), but that's an unimportant detail, ignored further
288 down.
289
290 Given En, build En+1:
291 - let En+1= En
292 - for each {pk/unique index of some table T} found in En, add T.* to En+1
293 (actually, we rather add T's map bit to the table_map whole_tables_fd).
294
295 Then build En+2, by adding y, for each x=y in AND parts of WHERE/ON where
296 x is in En+1 or is constant, and y is a column not in En+1.
297
298 When we meet columns of views or derived tables, we additionally search
299 for FDs in their query expression, which can then give FDs in our
300 query.
301 If En+2==En, this is the end of the induction. Otherwise we loop.
302
303 As we build En, we check if 'item' is in it.
304
305 @param item Item to consider; either a column local to 'select', or a set
306 function whose aggregation query is 'select'
307
308 @returns true if 'item' is functionally dependent on source columns.
309 */
is_fd_on_source(Item * item)310 bool Group_check::is_fd_on_source(Item *item)
311 {
312 if (is_in_fd(item))
313 return true;
314
315 if (!is_child())
316 {
317 /*
318 If it were a child Group_check, its list of source columns
319 would start empty, it would gradually be filled by the master
320 Group_check when it fills its own list.
321 Here it is the master Group_check, so GROUP expressions are considered
322 to be known, from which we build E1.
323 */
324 if (fd.empty())
325 {
326 /*
327 We do a first attempt: is the column part of group columns? This
328 test should be sufficient to accept any query accepted by
329 only_full_group_by in 5.6, and avoids filling the "fd" list with
330 add_to_fd() (and potentially add_to_source_of_mat_table()).
331 It's just an optimization.
332 */
333 for (ORDER *grp= select->group_list.first; grp; grp= grp->next)
334 {
335 if ((*grp->item)->eq(item, false))
336 return true;
337 }
338 // It didn't suffice. Let's start the search for FDs: build E1.
339 for (ORDER *grp= select->group_list.first; grp; grp= grp->next)
340 {
341 Item *const grp_it= *grp->item;
342 add_to_fd(grp_it, local_column(grp_it));
343 }
344 }
345 }
346
347 if (select->olap != UNSPECIFIED_OLAP_TYPE)
348 {
349 /*
350 - the syntactical transformation of ROLLUP is to make a union of
351 queries, and in each such query, some group column references are
352 replaced with a NULL literal.
353 - functional dependencies should be recognized only after that
354 transformation. But there cannot be a key-based or equality-based
355 functional dependency on a NULL literal.
356 Moreover, there are no FDs in a UNION.
357 So if the query has ROLLUP, we can stop here.
358 */
359 return false;
360 }
361
362 // no need to search for keys in those tables:
363 table_map tested_map_for_keys= whole_tables_fd;
364 while (true)
365 {
366 // build En+1
367 const table_map last_whole_tables_fd= whole_tables_fd;
368 for (uint j= 0; j < fd.size(); j++)
369 {
370 Item *const item2= fd.at(j)->real_item(); // Go down view field
371 if (item2->type() != Item::FIELD_ITEM)
372 continue;
373
374 Item_field *const item_field= down_cast<Item_field*>(item2);
375 /**
376 @todo make table_ref non-NULL for gcols, then use it for 'tl'.
377 Do the same in Item_field::used_tables_for_level().
378 */
379 TABLE_LIST *const tl= item_field->field->table->pos_in_table_list;
380
381 if (tested_map_for_keys & tl->map())
382 continue;
383 tested_map_for_keys|= tl->map();
384 for (uint keyno= 0; keyno < tl->table->s->keys; keyno++)
385 {
386 KEY *const key_info= &tl->table->key_info[keyno];
387 if ((key_info->flags & (HA_NOSAME | HA_NULL_PART_KEY)) != HA_NOSAME)
388 continue;
389 uint k;
390 for (k= 0; k < key_info->user_defined_key_parts; k++)
391 {
392 const Field * const key_field= key_info->key_part[k].field;
393 bool key_field_in_fd= false;
394 for (uint l= 0; l < fd.size(); l++)
395 {
396 Item *const item3= fd.at(l)->real_item(); // Go down view field
397 if (item3->type() != Item::FIELD_ITEM)
398 continue;
399 if (static_cast<Item_field *>(item3)->field == key_field)
400 {
401 key_field_in_fd= true;
402 break;
403 }
404 }
405 if (!key_field_in_fd)
406 break;
407 }
408 if (k == key_info->user_defined_key_parts)
409 {
410 /*
411 We just found that intersect(En,table.*) contains all columns of
412 the key, so intersect(En,table.*) -> table.* in 'table'.
413 This is key-based so is an NFFD, so it propagates to the result of
414 the WHERE clause. Thus, intersect(En,table.*) -> table.* in this
415 result, so En -> table.* in this result.
416 We knew that E1 -> En in this result.
417 So, E1 -> table.* there too. So we can add table.* to En+1:
418 */
419 add_to_fd(tl->map());
420 break;
421 }
422 }
423 }
424 if (last_whole_tables_fd != whole_tables_fd && // something new, check again
425 is_in_fd(item))
426 return true;
427
428 // Build En+2
429 uint last_fd= fd.size();
430
431 find_fd_in_joined_table(select->join_list); // [OUTER] JOIN ON
432
433 if (select->where_cond()) // WHERE
434 find_fd_in_cond(select->where_cond(), 0, false);
435
436 table_map map_of_new_fds= 0;
437 for (; last_fd < fd.size(); ++last_fd)
438 map_of_new_fds|= fd.at(last_fd)->used_tables();
439
440 if (map_of_new_fds != 0) // something new, check again
441 {
442 assert((map_of_new_fds & PSEUDO_TABLE_BITS) == 0);
443 if (is_in_fd(item))
444 return true;
445 // Recheck keys only in tables with something new:
446 tested_map_for_keys&= ~map_of_new_fds;
447 }
448 else
449 {
450 // If already searched in expressions underlying identifiers.
451 if (search_in_underlying)
452 return false;
453
454 // Otherwise, iterate once more and dig deeper.
455 search_in_underlying= true;
456
457 if (is_in_fd(item))
458 return true;
459 }
460 } // while(true)
461 }
462
463
464 /*
465 Record that an expression is uniquely determined by source columns.
466
467 @param item Expression to add.
468 @param local_column True: it is a column, should be added to 'fd'. False:
469 it cannot be added to 'fd' but we can still derive some useful knowledge -
470 see the function's body.
471 @param add_to_mat_table True: we should check if it's a column of a mat
472 table and if so we should pass it to the child Group_check.
473 */
add_to_fd(Item * item,bool local_column,bool add_to_mat_table)474 void Group_check::add_to_fd(Item *item, bool local_column,
475 bool add_to_mat_table)
476 {
477 /*
478 Because the "fd" list is limited to columns and because MySQL allows
479 non-column expressions in GROUP BY (unlike the standard), we need this
480 block _even_if_ this is not a column. For example:
481 select d.s from
482 (select b*3 as c, sum(a) as s from t1 group by b*3) as d
483 group by d.c;
484 Say that we are validating the outer query. d.c is a group column,
485 containing the value of t1.b*3; say that we are presently telling the
486 (child) Group_check of the subquery that t1.b*3 is in its source (in
487 other words, the value of t1.b*3 can be considered as determined).
488 t1.b*3 is not a column, so it cannot be put into "fd", however it's the
489 group expression, so it determines sum(a), and so d.* is determined.
490 In this corner case, a "source" can be a source _expression_, not
491 column.
492 */
493 find_group_in_fd(item);
494
495 if (!local_column)
496 return;
497
498 /*
499 A column reference can later give more FDs, record it.
500
501 You may wonder why, if this is a merged view item (Item_*view_ref), we add
502 it to "fd" instead of adding the underlying item. Here is why:
503 create view v1 as select t1.i*2 as z from t1;
504 select v1.z*5 from v1 group by v1.z;
505 When validating z*5, we need to find z in "fd". So:
506 - either we have added z to "fd" here (chosen solution), then we search
507 for "z" and match.
508 - or we have added the real_item of "z", t1.i*2, to "fd" here, then we
509 search for the real_item of "z", t1.i*2 and match. However, matching
510 (using Item::eq()) of certain items is not working well (if the item
511 contains a subquery), which would lead to some incorrect rejections of
512 queries. Moreover, it is good to stick to the definition of what a
513 functionally dependent column can be: it can be a view's column.
514 */
515
516 fd.push_back(down_cast<Item_ident *>(item));
517
518 if (!add_to_mat_table)
519 return;
520
521 item= item->real_item(); // for merged view containing mat table
522 if (item->type() == Item::FIELD_ITEM)
523 {
524 Item_field *const item_field= (Item_field*)item;
525 TABLE_LIST *const tl= item_field->field->table->pos_in_table_list;
526 if (tl->uses_materialization()) // materialized table
527 add_to_source_of_mat_table(item_field, tl);
528 }
529 }
530
531
532 /**
533 This function must be called every time we discover an item which is FD on
534 source columns, or add a bit to whole_tables_fd; it maintains group_in_fd.
535
536 @param item item which is FD; if NULL, means that we instead added a bit
537 to whole_tables_fd.
538 */
find_group_in_fd(Item * item)539 void Group_check::find_group_in_fd(Item *item)
540 {
541 if (group_in_fd == ~0ULL)
542 return; // nothing to do
543 if (select->is_grouped())
544 {
545 /*
546 See if we now have all of query expression's GROUP BY list; an
547 implicitely grouped query has an empty group list.
548 */
549 bool missing= false;
550 int j= 0;
551 for (ORDER *grp= select->group_list.first; grp; ++j, grp= grp->next)
552 {
553 if (!(group_in_fd & (1ULL << j)))
554 {
555 Item *grp_item= *grp->item;
556 if ((local_column(grp_item) &&
557 (grp_item->used_tables() & ~whole_tables_fd) == 0) ||
558 (item && grp_item->eq(item, false)))
559 group_in_fd|= (1ULL << j);
560 else
561 missing= true;
562 }
563 }
564 if (!missing)
565 {
566 /*
567 All GROUP BY exprs are FD on the source. Turn all bits on, for easy
568 testing.
569 */
570 group_in_fd= ~0ULL;
571 }
572 }
573 }
574
575
576 /**
577 @returns the idx-th expression in the SELECT list of our query block.
578 */
select_expression(uint idx)579 Item *Group_check::select_expression(uint idx)
580 {
581 List_iterator<Item> it_select_list_of_subq(*select->get_item_list());
582 Item *expr_under;
583 for (uint k= 0; k <= idx ; k++)
584 expr_under= it_select_list_of_subq++;
585 assert(expr_under);
586 return expr_under;
587 }
588
589
590 /**
591 If we just added a column of a materialized table to 'fd', we record this
592 fact in a new Group_check (mat_gc) for the query expression underlying that
593 table. This can later help us derive new functional dependencies in our
594 Group_check. For example:
595 select d.a, d.b from (select t.a*2 as a, t.a as b from t) group by d.b;
596 When we add d.b to 'fd', in this function we create mat_gc, see that d.b is
597 built from a column of t (t.b), we can say that "t.b is determined", so we
598 add t.b to mat_gc.fd. Later, when we wonder if d.a is functionally
599 dependent on d.b, we process d.a in is_in_fd_of_underlying():
600 we analyze 2*t.a in the context of mat_gc: 2*t.a is FD on t.a, we
601 conclude that d.a is indeed FD on d.b.
602
603 @param item_field column of 'tl', just added to 'fd'
604 @param tl mat table
605 */
add_to_source_of_mat_table(Item_field * item_field,TABLE_LIST * tl)606 void Group_check::add_to_source_of_mat_table(Item_field *item_field,
607 TABLE_LIST *tl)
608 {
609 SELECT_LEX_UNIT *const mat_unit= tl->derived_unit();
610 // Query expression underlying 'tl':
611 SELECT_LEX *const mat_select= mat_unit->first_select();
612 if (mat_unit->is_union() || mat_select->olap != UNSPECIFIED_OLAP_TYPE)
613 return; // If UNION or ROLLUP, no FD
614 // Grab Group_check for this subquery.
615 Group_check *mat_gc;
616 uint j;
617 for (j= 0; j < mat_tables.size(); j++)
618 {
619 mat_gc= mat_tables.at(j);
620 if (mat_gc->select == mat_select)
621 break;
622 }
623 if (j == mat_tables.size()) // not found, create it
624 {
625 mat_gc= new (m_root) Group_check(mat_select, m_root, tl);
626 mat_tables.push_back(mat_gc);
627 }
628 // Find underlying expression of item_field, in SELECT list of mat_select
629 Item *const expr_under=
630 mat_gc->select_expression(item_field->field->field_index);
631
632 // non-nullability of tl's column in tl, is equal to that of expr_under.
633 if (expr_under && !expr_under->maybe_null)
634 mat_gc->non_null_in_source= true;
635
636 mat_gc->add_to_fd(expr_under, mat_gc->local_column(expr_under));
637
638 if (mat_gc->group_in_fd == ~0ULL && // (1)
639 (!(mat_gc->table->map() & select->outer_join) || // (2)
640 mat_gc->non_null_in_source)) // (3)
641 {
642 /*
643 (1): In mat_gc, all GROUP BY expressions of mat_select are dependent on
644 source columns. Thus, all SELECT list expressions are, too (otherwise,
645 the validation of mat_select has or will fail). So, in our Group_check,
646 intersect(En, tl.*) -> tl.* .
647 This FD needs to propagate in our Group_check all the way up to the
648 result of the WHERE clause. It does, if:
649 - either there is no weak side above this table (2) (so NFFD is not
650 needed).
651 - or intersect(En, tl.*) contains a non-nullable column (3) (then
652 the FD is NFFD).
653
654 VE does not need to be deterministic: there is only one row per values
655 of group columns; if those values are known, then any VE, even rand(),
656 is uniquely determined.
657 */
658 add_to_fd(tl->map());
659 }
660 }
661
662
663 /**
664 is_in_fd() is low-level compared to is_fd_on_source(). The former only
665 searches through built FD information; the latter builds this information
666 and calls the former to search in it.
667
668 @param item Item to consider; either a column local to 'select', or a set
669 function whose aggregation query is 'select'
670
671 @returns true if the expression is FD on the source.
672 */
is_in_fd(Item * item)673 bool Group_check::is_in_fd(Item *item)
674 {
675 if (item->type() == Item::SUM_FUNC_ITEM)
676 {
677 /*
678 If all group expressions are FD on the source, this set function also is
679 (one single value per group).
680 */
681 return group_in_fd == ~0ULL;
682 }
683
684 assert(local_column(item));
685 Used_tables ut(select);
686 (void) item->walk(&Item::used_tables_for_level, Item::WALK_POSTFIX,
687 pointer_cast<uchar *>(&ut));
688 if ((ut.used_tables & ~whole_tables_fd) == 0)
689 {
690 /*
691 The item is a column from a table whose all columns are FD.
692 If the table is a view, the item wraps an expression, which
693 uses columns of underlying tables which are all FD; we don't even have
694 to walk the underlying expression.
695 An expression-based FD in a view is not necessarily an NFFD, but here it
696 is, as the bits in whole_tables_fd are on only if the determinant
697 columns are non-NULLable or there is no weak side upwards (see calls to
698 add_to_fd(table_map)).
699 */
700 return true;
701 }
702 for (uint j= 0; j < fd.size(); j++)
703 {
704 Item *const item2= fd.at(j);
705 if (item2->eq(item, 0))
706 return true;
707 /*
708 Say that we have view:
709 create view v1 as select i, 2*i as z from t1; and we do:
710 select z from v1 group by i;
711 v1 is merged.
712 v1.i is Item_*view_ref to t1.i;
713 v1.z is Item_*view_ref to Item_func_mul which has two arguments:
714 Item_int (2) and Item_field (t1.i).
715 We added the grouping column v1.i to "fd". Now we are walking v1.z: we
716 meet Item_field (t1.i). For us to find this t1.i in "fd" we have to
717 reach to real_item() of v1.i.
718 */
719 Item *const real_it2= item2->real_item();
720 if (real_it2 != item2 && real_it2->eq(item, 0))
721 return true;
722 }
723 if (!search_in_underlying)
724 return false;
725 return is_in_fd_of_underlying(down_cast<Item_ident *>(item));
726 }
727
728
729 /**
730 See if we can derive a FD from a column which has an underlying expression.
731
732 For a generated column, see if we can derive a FD from its expression.
733 For a column of a view or derived table, see if we can derive a FD from the
734 underlying query block.
735
736 @param item column
737 @returns true if this column is FD on source
738 */
is_in_fd_of_underlying(Item_ident * item)739 bool Group_check::is_in_fd_of_underlying(Item_ident *item)
740 {
741 if (item->type() == Item::REF_ITEM)
742 {
743 /*
744 It's a merged view's item.
745 Consider
746 create view v1 as select as as a, a*2 as b from t1;
747 select v1.b from v1 group by v1.a;
748 we have this->fd={v1.a}, and we search if v1.b is FD on v1.a. We'll look
749 if t1.a*2 is FD on t1.a.
750 */
751 assert(static_cast<const Item_ref *>(item)->ref_type() ==
752 Item_ref::VIEW_REF);
753 /*
754 Refuse RAND_TABLE_BIT because:
755 - FDs in a view are those of the underlying query expression.
756 - For FDs in a query expression, expressions in the SELECT list must be
757 deterministic.
758 Same is true for materialized tables further down.
759 */
760 if (item->used_tables() & RAND_TABLE_BIT)
761 return false;
762
763 Item *const real_it= item->real_item();
764 Used_tables ut(select);
765 (void) item->walk(&Item::used_tables_for_level, Item::WALK_POSTFIX,
766 pointer_cast<uchar *>(&ut));
767 /*
768 todo When we eliminate all uses of cached_table, we can probably add a
769 derived_table_ref field to Item_direct_view_ref objects and use it here.
770 */
771 TABLE_LIST *const tl= item->cached_table;
772 assert(tl->is_view_or_derived());
773 /*
774 We might find expression-based FDs in the result of the view's query
775 expression; but if this view is on the weak side of an outer join,
776 the FD won't propagate to that outer join's result.
777 */
778 const bool weak_side_upwards= tl->is_inner_table_of_outer_join();
779
780 /*
781 (3) real_it is a deterministic expression of columns which are all FD on
782 the source. This gives a FD in the view, maybe not NFFD. It propagates
783 to our query expression if:
784 (1) Either there is no weak side upwards (NFFD not needed)
785 (2) Or NULLness of columns implies NULLness of expression (so it's
786 NFFD).
787 */
788 if ((!weak_side_upwards || // (1)
789 (ut.used_tables & real_it->not_null_tables())) && // (2)
790 !real_it->walk(&Item::is_column_not_in_fd, walk_options,
791 pointer_cast<uchar*>(this))) // (3)
792 {
793 add_to_fd(item, true);
794 return true;
795 }
796 }
797
798 else if (item->type() == Item::FIELD_ITEM)
799 {
800 Item_field *const item_field= down_cast<Item_field*>(item);
801 /**
802 @todo make table_ref non-NULL for gcols, then use it for 'tl'.
803 Do the same in Item_field::used_tables_for_level().
804 */
805 TABLE_LIST *const tl= item_field->field->table->pos_in_table_list;
806 if (item_field->field->is_gcol()) // Generated column
807 {
808 assert(!tl->uses_materialization());
809 Item *const expr= item_field->field->gcol_info->expr_item;
810 assert(expr->fixed);
811 Used_tables ut(select);
812 item_field->used_tables_for_level(pointer_cast<uchar *>(&ut));
813 const bool weak_side_upwards= tl->is_inner_table_of_outer_join();
814 if ((!weak_side_upwards ||
815 (ut.used_tables & expr->not_null_tables())) &&
816 !expr->walk(&Item::is_column_not_in_fd, walk_options,
817 pointer_cast<uchar*>(this)))
818 {
819 add_to_fd(item, true);
820 return true;
821 }
822 }
823 else if (tl->uses_materialization()) // Materialized derived table
824 {
825 SELECT_LEX *const mat_select= tl->derived_unit()->first_select();
826 uint j;
827 for (j= 0; j < mat_tables.size() ; j++)
828 {
829 if (mat_tables.at(j)->select == mat_select)
830 break;
831 }
832 if (j < mat_tables.size()) // if false, we know nothing about this table
833 {
834 Group_check *const mat_gc= mat_tables.at(j);
835 /*
836 'item' belongs to a materialized table, and certain fields of the
837 subquery are in this->fd.
838 Search if the expression inside 'item' is FD on them.
839 */
840 Item *const expr_under=
841 mat_gc->select_expression(item_field->field->field_index);
842 /*
843 expr_under is the expression underlying 'item'.
844 (1) and (4) it is a deterministic expression of mat_gc source
845 columns, so is FD on them. This gives a FD in the mat table, maybe
846 not NFFD: intersect(En, tl.*) -> item .
847 This FD needs to propagate in our Group_check all the way up to the
848 result of the WHERE clause. It does, if:
849 - either there is no weak side above this table (2) (so NFFD is not
850 needed).
851 - or intersect(En, tl.*) contains a non-nullable column (3) (then
852 the FD is NFFD).
853 */
854 if (!(expr_under->used_tables() & RAND_TABLE_BIT) && // (1)
855 (!(mat_gc->table->map() & select->outer_join) || // (2)
856 mat_gc->non_null_in_source) && // (3)
857 !expr_under->walk(&Item::aggregate_check_group, // (4)
858 walk_options,
859 pointer_cast<uchar*>(mat_gc)))
860 {
861 /*
862 We pass add_to_mat_table==false otherwise add_to_fd() may add
863 expr_under (if it's a field) to mat_gc->fd, uselessly (it is
864 already in mat_gc->fd, as walk() succeeded above). This is just to
865 not make the 'fd' list longer than needed.
866 */
867 add_to_fd(item_field, true, false);
868 return true;
869 }
870 }
871 }
872 }
873
874 return false;
875 }
876
877
878 /**
879 Searches for equality-based functional dependences in an AND-ed part of a
880 condition (a conjunct).
881
882 @param cond complete condition
883 @param conjunct one AND-ed part of 'cond'
884 @param weak_tables If condition is WHERE, it's 0. Otherwise it is the map
885 of weak tables in the join nest which owns the condition.
886 @param weak_side_upwards If condition is WHERE it's false. Otherwise it is
887 true if the join nest owning this condition is embedded in the weak side
888 of some parent outer join (no matter how far up the parent is).
889 */
analyze_conjunct(Item * cond,Item * conjunct,table_map weak_tables,bool weak_side_upwards)890 void Group_check::analyze_conjunct(Item *cond, Item *conjunct,
891 table_map weak_tables,
892 bool weak_side_upwards)
893 {
894 if (conjunct->type() != Item::FUNC_ITEM)
895 return;
896 const Item_func *cnj= static_cast<const Item_func *>(conjunct);
897 if (cnj->functype() != Item_func::EQ_FUNC)
898 return;
899 Item *left_item= cnj->arguments()[0];
900 Item *right_item= cnj->arguments()[1];
901 if (left_item->type() == Item::ROW_ITEM &&
902 right_item->type() == Item::ROW_ITEM)
903 {
904 /*
905 (a,b)=(c,d) is equivalent to 'a=c and b=d', let's iterate on pairs.
906 Note that it's not recursive: we don't handle (a,(b,c))=(d,(e,f)), the
907 Standard does not seem to require it.
908 */
909 Item_row *left_row= down_cast<Item_row*>(left_item);
910 Item_row *right_row= down_cast<Item_row*>(right_item);
911 int elem= left_row->cols();
912 while (--elem >= 0)
913 analyze_scalar_eq(cond, left_row->element_index(elem),
914 right_row->element_index(elem),
915 weak_tables, weak_side_upwards);
916 }
917 else
918 analyze_scalar_eq(cond, left_item, right_item, weak_tables,
919 weak_side_upwards);
920 }
921
922
923 /**
924 Helper function @see analyze_conjunct().
925 */
analyze_scalar_eq(Item * cond,Item * left_item,Item * right_item,table_map weak_tables,bool weak_side_upwards)926 void Group_check::analyze_scalar_eq(Item *cond,
927 Item *left_item, Item *right_item,
928 table_map weak_tables,
929 bool weak_side_upwards)
930 {
931 table_map left_tables= left_item->used_tables();
932 table_map right_tables= right_item->used_tables();
933 bool left_is_column= local_column(left_item);
934 bool right_is_column= local_column(right_item);
935
936 /*
937 We look for something=col_not_FD.
938 If there are weak tables, this column must be weak (something=strong gives
939 us nothing, in an outer join condition).
940 */
941 if (right_is_column &&
942 (!weak_tables || (weak_tables & right_tables)) &&
943 !is_in_fd(right_item))
944 {}
945 else if (left_is_column &&
946 (!weak_tables || (weak_tables & left_tables)) &&
947 !is_in_fd(left_item))
948 {
949 // col_not_FD=something: change to something=col_not_FD
950 std::swap(left_item, right_item);
951 std::swap(left_tables, right_tables);
952 std::swap(left_is_column, right_is_column);
953 }
954 else
955 return; // this equality brings nothing
956
957 // right_item is a column not in fd, see if we can add it.
958
959 if (left_is_column && (weak_tables & left_tables) &&
960 is_in_fd(left_item))
961 {
962 // weak=weak: left->right, and this is NFFD
963 add_to_fd(right_item, true);
964 return;
965 }
966
967 const table_map strong_tables= (~weak_tables) & ~PSEUDO_TABLE_BITS;
968
969 if ((left_is_column &&
970 (strong_tables & left_tables) &&
971 is_in_fd(left_item)) ||
972 left_item->const_item() ||
973 (OUTER_REF_TABLE_BIT & left_tables))
974 {
975 // strong_or_literal_or_outer_ref= right_item
976
977 if (!weak_tables)
978 {
979 /*
980 It cannot be an inner join, due to transformations done in
981 simplify_joins(). So it is WHERE, so right_item is strong.
982 This may be constant=right_item and thus not be an NFFD, but WHERE is
983 exterior to join nests so propagation is not needed.
984 */
985 assert(!weak_side_upwards); // cannot be inner join
986 add_to_fd(right_item, true);
987 }
988 else
989 {
990 // Outer join. So condition must be deterministic.
991 if (cond->used_tables() & RAND_TABLE_BIT)
992 return;
993
994 /*
995 FD will have DJS as source columns, where DJS is the set of strong
996 columns referenced by "cond". FD has to propagate. It does if:
997 - either there is no weak side upwards
998 - or NULLness of DJS columns implies that "cond" is not true.
999 */
1000 if (weak_side_upwards && !(strong_tables & cond->not_null_tables()))
1001 return;
1002
1003 std::pair<Group_check *, table_map> p(this, strong_tables);
1004 if (!cond->walk(&Item::is_strong_side_column_not_in_fd,
1005 walk_options, (uchar*)&p))
1006 {
1007 /*
1008 "cond" is deterministic.
1009 right_item is weak.
1010 strong_or_literal_or_outer= weak.
1011 So DJS->right_item holds in the result of the join, and it
1012 propagates.
1013 As DJS is FD on E1 (the walk() succeeded), E1->right_item in the
1014 result of WHERE.
1015 */
1016 add_to_fd(right_item, true);
1017 }
1018 }
1019 }
1020 }
1021
1022
1023 /**
1024 Searches for equality-based functional dependencies in a condition.
1025
1026 @param cond condition: a WHERE condition or JOIN condition.
1027 @param weak_tables If condition is WHERE, it's 0. Otherwise it is the map
1028 of weak tables in the join nest which owns the condition.
1029 @param weak_side_upwards If condition is WHERE it's false. Otherwise it is
1030 true if the join nest owning this condition is embedded in the right side
1031 of some parent left join.
1032 */
find_fd_in_cond(Item * cond,table_map weak_tables,bool weak_side_upwards)1033 void Group_check::find_fd_in_cond(Item *cond,
1034 table_map weak_tables,
1035 bool weak_side_upwards)
1036 {
1037 if (cond->type() == Item::COND_ITEM)
1038 {
1039 Item_cond *cnd= static_cast<Item_cond *>(cond);
1040 /*
1041 All ANDs already flattened, see:
1042 "(X1 AND X2) AND (Y1 AND Y2) ==> AND (X1, X2, Y1, Y2)"
1043 in sql_yacc, and also Item_cond::fix_fields().
1044 */
1045 if (cnd->functype() != Item_func::COND_AND_FUNC)
1046 return;
1047 List_iterator<Item> li(*(cnd->argument_list()));
1048 Item *item;
1049 while ((item= li++))
1050 analyze_conjunct(cond, item, weak_tables, weak_side_upwards);
1051 }
1052 else // only one conjunct
1053 analyze_conjunct(cond, cond, weak_tables, weak_side_upwards);
1054 }
1055
1056
1057 /**
1058 Searches for equality-based functional dependencies in the condition of a
1059 join nest, and recursively in all contained join nests.
1060
1061 @param join_list members of the join nest
1062 */
find_fd_in_joined_table(List<TABLE_LIST> * join_list)1063 void Group_check::find_fd_in_joined_table(List<TABLE_LIST> *join_list)
1064 {
1065 List_iterator<TABLE_LIST> li(*join_list);
1066 TABLE_LIST *table;
1067 while ((table= li++))
1068 {
1069 if (table->sj_cond())
1070 {
1071 /*
1072 We can ignore this nest as:
1073 - the subquery's WHERE was copied to the semi-join condition
1074 - so where the IN equalities
1075 - sj_cond() is also present in the parent nest's join condition or in
1076 the query block's WHERE condition, which we check somewhere else.
1077 Note that columns from sj-inner tables can help only as "intermediate
1078 link" in the graph of functional dependencies, as they are neither in
1079 GROUP BY (the source) nor in the SELECT list / HAVING / ORDER BY (the
1080 target).
1081 */
1082 continue;
1083 }
1084 table_map used_tables;
1085 NESTED_JOIN *nested_join= table->nested_join;
1086 if (nested_join)
1087 {
1088 find_fd_in_joined_table(&nested_join->join_list);
1089 used_tables= nested_join->used_tables;
1090 }
1091 else
1092 used_tables= table->map();
1093 if (table->join_cond())
1094 {
1095 /*
1096 simplify_joins() moves the ON condition of an inner join to the closest
1097 outer join nest or to WHERE. So this assertion should hold.
1098 Thus, used_tables are weak tables.
1099 */
1100 assert(table->outer_join);
1101 /*
1102 We might find equality-based FDs in the result of this outer join; but
1103 if this outer join is itself on the weak side of a parent outer join,
1104 the FD won't propagate to that parent outer join's result.
1105 */
1106 const bool weak_side_upwards=
1107 table->embedding &&
1108 table->embedding->is_inner_table_of_outer_join();
1109 find_fd_in_cond(table->join_cond(), used_tables, weak_side_upwards);
1110 }
1111 }
1112 }
1113
1114
1115 /// Writes "check information" to the optimizer trace
to_opt_trace(THD * thd)1116 void Group_check::to_opt_trace(THD *thd)
1117 {
1118 #ifdef OPTIMIZER_TRACE
1119 if (fd.empty() && !whole_tables_fd)
1120 return;
1121 Opt_trace_context *ctx= &thd->opt_trace;
1122 if (!ctx->is_started())
1123 return;
1124 Opt_trace_object trace_wrapper(ctx);
1125 Opt_trace_object trace_fds(ctx, "functional_dependencies_of_GROUP_columns");
1126 to_opt_trace2(ctx, &trace_fds);
1127 #endif
1128 }
1129
1130 /**
1131 Utility function for to_opt_trace(), as we need recursion in children
1132 Group_checks. to_opt_trace() only writes a one-time header.
1133 */
to_opt_trace2(Opt_trace_context * ctx,Opt_trace_object * parent)1134 void Group_check::to_opt_trace2(Opt_trace_context *ctx,
1135 Opt_trace_object *parent)
1136 {
1137 #ifdef OPTIMIZER_TRACE
1138 if (table)
1139 parent->add_utf8_table(table);
1140 if (whole_tables_fd)
1141 {
1142 Opt_trace_array array(ctx, "all_columns_of_table_map_bits");
1143 for (uint j= 0; j < MAX_TABLES ; j++)
1144 if (whole_tables_fd & (1ULL << j))
1145 array.add(j);
1146 }
1147 if (!fd.empty())
1148 {
1149 Opt_trace_array array(ctx, "columns");
1150 for (uint j= 0; j < fd.size() ; j++)
1151 array.add_utf8(fd.at(j)->full_name());
1152 }
1153 if (is_child())
1154 {
1155 if (group_in_fd == ~0ULL && select->is_explicitly_grouped())
1156 parent->add("all_group_expressions", true);
1157 }
1158 if (!mat_tables.empty())
1159 {
1160 Opt_trace_array array(ctx, "searched_in_materialized_tables");
1161 for (uint j= 0; j < mat_tables.size() ; j++)
1162 {
1163 Opt_trace_object trace_wrapper(ctx);
1164 mat_tables.at(j)->to_opt_trace2(ctx, &trace_wrapper);
1165 }
1166 }
1167 #endif
1168 }
1169
1170
1171 /**
1172 Does one low-level check on one Item_ident. Called by Item_ident walk
1173 processors.
1174 @param i item to check
1175 @param tm map of strong tables, if type==CHECK_STRONG_SIDE_COLUMN
1176 @param type check to do:
1177 - CHECK_GROUP for Item_ident::aggregate_check_group()
1178 - CHECK_STRONG_SIDE_COLUMN for
1179 Item_ident::is_strong_side_column_not_in_fd()
1180 - CHECK_COLUMN for Item_ident::is_column_not_in_fd()
1181 @returns false if success
1182 */
do_ident_check(Item_ident * i,table_map tm,enum enum_ident_check type)1183 bool Group_check::do_ident_check(Item_ident *i, table_map tm,
1184 enum enum_ident_check type)
1185 {
1186 if (is_stopped(i))
1187 return false;
1188
1189 const Bool3 local= i->local_column(select);
1190 if (local.is_false())
1191 goto ignore_children;
1192
1193 if (local.is_unknown())
1194 return false; // dive in child item
1195
1196 switch (type)
1197 {
1198 case CHECK_GROUP:
1199 if (!is_fd_on_source(i))
1200 {
1201 // It is not FD on source columns:
1202 if (!is_child())
1203 failed_ident= i;
1204 return true;
1205 }
1206 goto ignore_children;
1207 case CHECK_STRONG_SIDE_COLUMN:
1208 {
1209 Used_tables ut(select);
1210 (void) i->walk(&Item::used_tables_for_level, Item::WALK_POSTFIX,
1211 pointer_cast<uchar *>(&ut));
1212 if ((ut.used_tables & tm) && !is_in_fd(i))
1213 return true; // It is a strong-side column and not FD
1214 goto ignore_children;
1215 }
1216 case CHECK_COLUMN:
1217 if (!is_in_fd(i))
1218 return true; // It is a column and not FD
1219 goto ignore_children;
1220 }
1221
1222 ignore_children:
1223 stop_at(i);
1224 return false;
1225 }
1226
1227 /// @} (end of group AGGREGATE_CHECKS ONLY_FULL_GROUP_BY)
1228