1 /* Copyright (c) 2014, 2021, Oracle and/or its affiliates.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License, version 2.0,
5    as published by the Free Software Foundation.
6 
7    This program is also distributed with certain software (including
8    but not limited to OpenSSL) that is licensed under separate terms,
9    as designated in a particular file or component or in included license
10    documentation.  The authors of MySQL hereby grant you an additional
11    permission to link the program and your derivative works with the
12    separately licensed software that they have included with MySQL.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License, version 2.0, for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program; if not, write to the Free Software
21    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA */
22 
23 /**
24    @file
25    Checks for some semantic constraints on queries using GROUP
26    BY, or aggregate functions, or DISTINCT. Enforced if
27    sql_mode contains 'only_full_group_by'.
28 */
29 
30 #include "sql_select.h"
31 #include "opt_trace.h"
32 #include "sql_base.h"
33 #include "aggregate_check.h"
34 
35 /**
36   @addtogroup AGGREGATE_CHECKS
37 
38   @section USED_TABLES Implementation note: used_tables_for_level() vs used_tables()
39 
40   - When we are looking for items to validate, we must enter scalar/row
41   subqueries; if we find an item of our SELECT_LEX inside such subquery, for
42   example an Item_field with depended_from equal to our SELECT_LEX, we
43   must use used_tables_for_level(). Example: when validating t1.a in
44   select (select t1.a from t1 as t2 limit 1) from t1 group by t1.pk;
45   we need t1.a's map in the grouped query; used_tables() would return
46   OUTER_REF_TABLE_BIT.
47   - When we are searching for FDs in PKs, or join conditions, or the GROUP BY
48   clause, we do not enter scalar/row subqueries, so we use
49   used_tables(). Example:
50   select ... from t1 where t1.a=(subq) from t1 group by ...
51   the subq is not used to discover FDs in the grouped query.
52   Or:
53   select (select t2.a from t1 as t2 where t2.a=t1.a group by t2.b) from t1
54   when validating the subq, t1.a is an outer reference, kind of a constant, so
55   tells us that t2.a is FD on {} ; using used_tables_for_level() on t1.a would
56   be nonsense - we are validating the subquery.
57 
58   @{
59 */
60 
61 /**
62   We need to search for items inside subqueries, in case subqueries contain
63   outer references to tables of a query block having DISTINCT or GROUP BY.
64   We also need to sometimes skip parts of item trees, so the walk processor
65   must be called prefix (to enable skipping) and postfix (to disable
66   skipping).
67 */
68 static const Item::enum_walk walk_options=
69   Item::enum_walk(Item::WALK_PREFIX | Item::WALK_POSTFIX | Item::WALK_SUBQUERY);
70 
71 /**
72    Rejects the query if it has a combination of DISTINCT and ORDER BY which
73    could lead to randomly ordered results. More precisely: if, in a query
74    block 'sl', an ORDER BY expression
75    - is not the same expression as one in the SELECT list of 'sl' (1)
76    - and contains a column which:
77    -- is of a table whose qualifying query block is 'sl'          (2)
78    -- and is not in the SELECT list of 'sl'                       (3)
79    then 'sl' should not have DISTINCT.
80 
81    @returns true if rejected (my_error() is called)
82 */
check_query(THD * thd)83 bool Distinct_check::check_query(THD *thd)
84 {
85   uint number_in_list= 1;
86   for (ORDER *order= select->order_list.first;
87        order;
88        ++number_in_list, order= order->next)
89   {
90     if (order->in_field_list) // is in SELECT list
91       continue;
92     assert((*order->item)->fixed);
93     uint counter;
94     enum_resolution_type resolution;
95     /*
96       Search if this expression is equal to one in the SELECT
97       list. setup_order()/find_order_in_list() has already done so, but not
98       perfectly, indeed at that time the expression was not fixed, which
99       prevents recognition of certain equalities. Example:
100         create view v select x*2 as b from ...;
101         select distinct sin(b) as z from v order by sin(b);
102       This query is valid because the expression in ORDER BY is the same as
103       the one in SELECT list. But in setup_order():
104       'b' in ORDER BY (not yet fixed) is still a 'generic' Item_field,
105       'b' in SELECT (already fixed) is Item_direct_view_ref referencing 'x*2'
106       (so type()==REF_ITEM).
107       So Item_field::eq() says the 'b's are different, so 'sin(b)' of
108       ORDER BY is not found equal to 'sin(b)' of SELECT.
109 
110       On the other hand, the search below will match, because everything is
111       now fixed.
112 
113       There is a limitation with subqueries: in this query
114       SELECT (subquery) ... ORDER BY (subquery)
115       we may not be able to recognize that both subqueries are the same (and
116       so we may reject it even though the order is deterministic). It is
117       because Item_subselect::eq() is Item::eq() which is too coarse and
118       misses equalities: it compares names of both items; depending on the
119       position of the subquery in the query, MySQL gives it a name or not; and
120       this name is the raw text of the subquery (so if subqueries' texts
121       differ due to white space....).
122       Subqueries in ORDER BY are non-standard anyway.
123     */
124     Item** const res= find_item_in_list(*order->item, select->item_list,
125                                         &counter, REPORT_EXCEPT_NOT_FOUND,
126                                         &resolution);
127     if (res == NULL)    // Other error than "not found", my_error() was called
128       return true;                              /* purecov: inspected */
129     if (res != not_found_item) // is in SELECT list
130       continue;
131     /*
132       [numbers refer to the function's comment]
133       (1) is true. Check (2) and (3) inside walk().
134     */
135     if ((*order->item)->walk(&Item::aggregate_check_distinct,
136                              walk_options, (uchar*)this))
137     {
138       if (failed_ident)
139         my_error(ER_FIELD_IN_ORDER_NOT_SELECT, MYF(0), number_in_list,
140                  failed_ident->full_name(), "DISTINCT");
141       else
142         my_error(ER_AGGREGATE_IN_ORDER_NOT_SELECT, MYF(0), number_in_list,
143                  "DISTINCT");
144       return true;
145     }
146   }
147   return false;
148 }
149 
150 
151 /**
152    Rejects the query if it does aggregation or grouping, and expressions in
153    its SELECT list, ORDER BY clause, or HAVING condition, may vary inside a
154    group (are not "group-invariant").
155 */
check_query(THD * thd)156 bool Group_check::check_query(THD *thd)
157 {
158   ORDER *order= select->order_list.first;
159 
160   // Validate SELECT list
161   List_iterator<Item> select_exprs_it(select->item_list);
162   Item *expr;
163   uint number_in_list= 1;
164   const char *place= "SELECT list";
165   while ((expr= select_exprs_it++))
166   {
167     if (check_expression(thd, expr, true))
168       goto err;
169     ++number_in_list;
170   }
171 
172   // aggregate without GROUP BY has no ORDER BY at this stage.
173   assert(!(select->is_implicitly_grouped() && select->is_ordered()));
174   // Validate ORDER BY list
175   if (order)
176   {
177     number_in_list= 1;
178     place= "ORDER BY clause";
179     for ( ; order ; order= order->next)
180     {
181       // If it is in SELECT list it is already checked.
182       if (!order->in_field_list &&
183           check_expression(thd, *order->item, false))
184         goto err;
185       ++number_in_list;
186     }
187   }
188 
189   // Validate HAVING condition
190   if (select->having_cond())
191   {
192     number_in_list= 1;
193     place= "HAVING clause";
194     if (check_expression(thd, select->having_cond(), false))
195       goto err;
196   }
197 
198   return false;
199 
200 err:
201   uint code;
202   const char *text;
203   /*
204     Starting from MySQL 5.7 we want give a better messages than before,
205     to provide more information. But we can't change texts of existing error
206     codes for backward-compatibility reasons, so we introduce new texts;
207     however we want to keep sending the old error codes, for pre-5.7
208     applications used to it.
209   */
210   if (select->is_explicitly_grouped())
211   {
212     code= ER_WRONG_FIELD_WITH_GROUP;        // old code
213     text= ER(ER_WRONG_FIELD_WITH_GROUP_V2); // new text
214   }
215   else
216   {
217     code= ER_MIX_OF_GROUP_FUNC_AND_FIELDS;        // old code
218     text= ER(ER_MIX_OF_GROUP_FUNC_AND_FIELDS_V2); // new text
219   }
220   my_printf_error(code, text, MYF(0), number_in_list, place,
221                   failed_ident->full_name());
222   return true;
223 }
224 
225 
226 /**
227    Validates one expression (this forms one step of check_query()).
228    @param  expr  expression
229    @param  in_select_list  whether this expression is coming from the SELECT
230    list.
231 */
check_expression(THD * thd,Item * expr,bool in_select_list)232 bool Group_check::check_expression(THD *thd, Item *expr,
233                                    bool in_select_list)
234 {
235   assert(!is_child());
236   if (!in_select_list)
237   {
238     uint counter;
239     enum_resolution_type resolution;
240     // Search if this expression is equal to one in the SELECT list.
241     Item** const res= find_item_in_list(expr,
242                                         select->item_list,
243                                         &counter, REPORT_EXCEPT_NOT_FOUND,
244                                         &resolution);
245     if (res == NULL) // Other error than "not found", my_error() was called
246       return true;   /* purecov: inspected */
247     if (res != not_found_item)
248     {
249       // is in SELECT list, which has already been validated.
250       return false;
251     }
252   }
253 
254   for (ORDER *grp= select->group_list.first; grp; grp= grp->next)
255   {
256     if ((*grp->item)->type() == Item::INT_ITEM &&
257         (*grp->item)->basic_const_item()) /* group by position */
258       return false;
259     if ((*grp->item)->eq(expr, false))
260       return false;          // Expression is in GROUP BY so is ok
261   }
262 
263   // Analyze columns/aggregates referenced by the expression
264   return
265     expr->walk(&Item::aggregate_check_group, walk_options, (uchar*)this);
266 }
267 
268 
269 /**
270    Tells if 'item' is functionally dependent ("FD") on source columns.
271    Source columns are:
272    - if !is_child(), the GROUP BY columns
273    - if is_child(), columns of the result of the query expression under
274    'table' which are themselves part of 'fd' of the parent Group_check.
275 
276    We recognize most FDs imposed by SQL2011 (optional feature T301)
277 
278    We build sets, named En, by induction.
279    A "column" is defined as base table / view / derived table's column.
280 
281    E1 = {source columns} (=group columns, if this is a master Group_check;
282    empty set otherwise).
283 
284    En is a set of columns of the result of the WHERE clause of 'select' which
285    are functionally dependent on E1.
286    If is_child(), En columns might rather be of the result of the GROUP BY
287    clause (if there is one), but that's an unimportant detail, ignored further
288    down.
289 
290    Given En, build En+1:
291     - let En+1= En
292     - for each {pk/unique index of some table T} found in En, add T.* to En+1
293     (actually, we rather add T's map bit to the table_map whole_tables_fd).
294 
295    Then build En+2, by adding y, for each x=y in AND parts of WHERE/ON where
296    x is in En+1 or is constant, and y is a column not in En+1.
297 
298    When we meet columns of views or derived tables, we additionally search
299    for FDs in their query expression, which can then give FDs in our
300    query.
301    If En+2==En, this is the end of the induction. Otherwise we loop.
302 
303    As we build En, we check if 'item' is in it.
304 
305    @param  item  Item to consider; either a column local to 'select', or a set
306    function whose aggregation query is 'select'
307 
308    @returns true if 'item' is functionally dependent on source columns.
309 */
is_fd_on_source(Item * item)310 bool Group_check::is_fd_on_source(Item *item)
311 {
312   if (is_in_fd(item))
313     return true;
314 
315   if (!is_child())
316   {
317     /*
318       If it were a child Group_check, its list of source columns
319       would start empty, it would gradually be filled by the master
320       Group_check when it fills its own list.
321       Here it is the master Group_check, so GROUP expressions are considered
322       to be known, from which we build E1.
323     */
324     if (fd.empty())
325     {
326       /*
327         We do a first attempt: is the column part of group columns? This
328         test should be sufficient to accept any query accepted by
329         only_full_group_by in 5.6, and avoids filling the "fd" list with
330         add_to_fd() (and potentially add_to_source_of_mat_table()).
331         It's just an optimization.
332       */
333       for (ORDER *grp= select->group_list.first; grp; grp= grp->next)
334       {
335         if ((*grp->item)->eq(item, false))
336           return true;
337       }
338       // It didn't suffice. Let's start the search for FDs: build E1.
339       for (ORDER *grp= select->group_list.first; grp; grp= grp->next)
340       {
341         Item *const grp_it= *grp->item;
342         add_to_fd(grp_it, local_column(grp_it));
343       }
344     }
345   }
346 
347   if (select->olap != UNSPECIFIED_OLAP_TYPE)
348   {
349     /*
350       - the syntactical transformation of ROLLUP is to make a union of
351       queries, and in each such query, some group column references are
352       replaced with a NULL literal.
353       - functional dependencies should be recognized only after that
354       transformation. But there cannot be a key-based or equality-based
355       functional dependency on a NULL literal.
356       Moreover, there are no FDs in a UNION.
357       So if the query has ROLLUP, we can stop here.
358     */
359     return false;
360   }
361 
362   // no need to search for keys in those tables:
363   table_map tested_map_for_keys= whole_tables_fd;
364   while (true)
365   {
366     // build En+1
367     const table_map last_whole_tables_fd= whole_tables_fd;
368     for (uint j= 0; j < fd.size(); j++)
369     {
370       Item *const item2= fd.at(j)->real_item(); // Go down view field
371       if (item2->type() != Item::FIELD_ITEM)
372         continue;
373 
374       Item_field *const item_field= down_cast<Item_field*>(item2);
375       /**
376         @todo make table_ref non-NULL for gcols, then use it for 'tl'.
377         Do the same in Item_field::used_tables_for_level().
378       */
379       TABLE_LIST *const tl= item_field->field->table->pos_in_table_list;
380 
381       if (tested_map_for_keys & tl->map())
382         continue;
383       tested_map_for_keys|= tl->map();
384       for (uint keyno= 0; keyno < tl->table->s->keys; keyno++)
385       {
386         KEY *const key_info= &tl->table->key_info[keyno];
387         if ((key_info->flags & (HA_NOSAME | HA_NULL_PART_KEY)) != HA_NOSAME)
388           continue;
389         uint k;
390         for (k= 0; k < key_info->user_defined_key_parts; k++)
391         {
392           const Field * const key_field= key_info->key_part[k].field;
393           bool key_field_in_fd= false;
394           for (uint l= 0; l < fd.size(); l++)
395           {
396             Item *const item3= fd.at(l)->real_item();    // Go down view field
397             if (item3->type() != Item::FIELD_ITEM)
398               continue;
399             if (static_cast<Item_field *>(item3)->field == key_field)
400             {
401               key_field_in_fd= true;
402               break;
403             }
404           }
405           if (!key_field_in_fd)
406             break;
407         }
408         if (k == key_info->user_defined_key_parts)
409         {
410           /*
411             We just found that intersect(En,table.*) contains all columns of
412             the key, so intersect(En,table.*) -> table.* in 'table'.
413             This is key-based so is an NFFD, so it propagates to the result of
414             the WHERE clause. Thus, intersect(En,table.*) -> table.* in this
415             result, so En -> table.* in this result.
416             We knew that E1 -> En in this result.
417             So, E1 -> table.* there too. So we can add table.* to En+1:
418           */
419           add_to_fd(tl->map());
420           break;
421         }
422       }
423     }
424     if (last_whole_tables_fd != whole_tables_fd && // something new, check again
425         is_in_fd(item))
426       return true;
427 
428     // Build En+2
429     uint last_fd= fd.size();
430 
431     find_fd_in_joined_table(select->join_list); // [OUTER] JOIN ON
432 
433     if (select->where_cond())                   // WHERE
434       find_fd_in_cond(select->where_cond(), 0, false);
435 
436     table_map map_of_new_fds= 0;
437     for (; last_fd < fd.size(); ++last_fd)
438       map_of_new_fds|= fd.at(last_fd)->used_tables();
439 
440     if (map_of_new_fds != 0)     // something new, check again
441     {
442       assert((map_of_new_fds & PSEUDO_TABLE_BITS) == 0);
443       if (is_in_fd(item))
444         return true;
445        // Recheck keys only in tables with something new:
446       tested_map_for_keys&= ~map_of_new_fds;
447     }
448     else
449     {
450       // If already searched in expressions underlying identifiers.
451       if (search_in_underlying)
452         return false;
453 
454       // Otherwise, iterate once more and dig deeper.
455       search_in_underlying= true;
456 
457       if (is_in_fd(item))
458         return true;
459     }
460   } // while(true)
461 }
462 
463 
464 /*
465   Record that an expression is uniquely determined by source columns.
466 
467   @param  item  Expression to add.
468   @param  local_column  True: it is a column, should be added to 'fd'. False:
469   it cannot be added to 'fd' but we can still derive some useful knowledge -
470   see the function's body.
471   @param  add_to_mat_table True: we should check if it's a column of a mat
472   table and if so we should pass it to the child Group_check.
473 */
add_to_fd(Item * item,bool local_column,bool add_to_mat_table)474 void Group_check::add_to_fd(Item *item, bool local_column,
475                             bool add_to_mat_table)
476 {
477   /*
478     Because the "fd" list is limited to columns and because MySQL allows
479     non-column expressions in GROUP BY (unlike the standard), we need this
480     block _even_if_ this is not a column. For example:
481       select d.s from
482         (select b*3 as c, sum(a) as s from t1 group by b*3) as d
483       group by d.c;
484     Say that we are validating the outer query. d.c is a group column,
485     containing the value of t1.b*3; say that we are presently telling the
486     (child) Group_check of the subquery that t1.b*3 is in its source (in
487     other words, the value of t1.b*3 can be considered as determined).
488     t1.b*3 is not a column, so it cannot be put into "fd", however it's the
489     group expression, so it determines sum(a), and so d.* is determined.
490     In this corner case, a "source" can be a source _expression_, not
491     column.
492   */
493   find_group_in_fd(item);
494 
495   if (!local_column)
496     return;
497 
498   /*
499     A column reference can later give more FDs, record it.
500 
501     You may wonder why, if this is a merged view item (Item_*view_ref), we add
502     it to "fd" instead of adding the underlying item. Here is why:
503     create view v1 as select t1.i*2 as z from t1;
504     select v1.z*5 from v1 group by v1.z;
505     When validating z*5, we need to find z in "fd". So:
506     - either we have added z to "fd" here (chosen solution), then we search
507     for "z" and match.
508     - or we have added the real_item of "z", t1.i*2, to "fd" here, then we
509     search for the real_item of "z", t1.i*2 and match. However, matching
510     (using Item::eq()) of certain items is not working well (if the item
511     contains a subquery), which would lead to some incorrect rejections of
512     queries. Moreover, it is good to stick to the definition of what a
513     functionally dependent column can be: it can be a view's column.
514   */
515 
516   fd.push_back(down_cast<Item_ident *>(item));
517 
518   if (!add_to_mat_table)
519     return;
520 
521   item= item->real_item(); // for merged view containing mat table
522   if (item->type() == Item::FIELD_ITEM)
523   {
524     Item_field *const item_field= (Item_field*)item;
525     TABLE_LIST *const tl= item_field->field->table->pos_in_table_list;
526     if (tl->uses_materialization()) // materialized table
527       add_to_source_of_mat_table(item_field, tl);
528   }
529 }
530 
531 
532 /**
533    This function must be called every time we discover an item which is FD on
534    source columns, or add a bit to whole_tables_fd; it maintains group_in_fd.
535 
536    @param item  item which is FD; if NULL, means that we instead added a bit
537    to whole_tables_fd.
538 */
find_group_in_fd(Item * item)539 void Group_check::find_group_in_fd(Item *item)
540 {
541   if (group_in_fd == ~0ULL)
542     return;                                     // nothing to do
543   if (select->is_grouped())
544   {
545     /*
546       See if we now have all of query expression's GROUP BY list; an
547       implicitely grouped query has an empty group list.
548     */
549     bool missing= false;
550     int j= 0;
551     for (ORDER *grp= select->group_list.first; grp; ++j, grp= grp->next)
552     {
553       if (!(group_in_fd & (1ULL << j)))
554       {
555         Item *grp_item= *grp->item;
556         if ((local_column(grp_item) &&
557              (grp_item->used_tables() & ~whole_tables_fd) == 0) ||
558             (item && grp_item->eq(item, false)))
559           group_in_fd|= (1ULL << j);
560         else
561           missing= true;
562       }
563     }
564     if (!missing)
565     {
566       /*
567         All GROUP BY exprs are FD on the source. Turn all bits on, for easy
568         testing.
569       */
570       group_in_fd= ~0ULL;
571     }
572   }
573 }
574 
575 
576 /**
577    @returns the idx-th expression in the SELECT list of our query block.
578 */
select_expression(uint idx)579 Item *Group_check::select_expression(uint idx)
580 {
581   List_iterator<Item> it_select_list_of_subq(*select->get_item_list());
582   Item *expr_under;
583   for (uint k= 0; k <= idx ; k++)
584     expr_under= it_select_list_of_subq++;
585   assert(expr_under);
586   return expr_under;
587 }
588 
589 
590 /**
591    If we just added a column of a materialized table to 'fd', we record this
592    fact in a new Group_check (mat_gc) for the query expression underlying that
593    table. This can later help us derive new functional dependencies in our
594    Group_check. For example:
595      select d.a, d.b from (select t.a*2 as a, t.a as b from t) group by d.b;
596    When we add d.b to 'fd', in this function we create mat_gc, see that d.b is
597    built from a column of t (t.b), we can say that "t.b is determined", so we
598    add t.b to mat_gc.fd. Later, when we wonder if d.a is functionally
599    dependent on d.b, we process d.a in is_in_fd_of_underlying():
600    we analyze 2*t.a in the context of mat_gc: 2*t.a is FD on t.a, we
601    conclude that d.a is indeed FD on d.b.
602 
603    @param  item_field  column of 'tl', just added to 'fd'
604    @param  tl          mat table
605 */
add_to_source_of_mat_table(Item_field * item_field,TABLE_LIST * tl)606 void Group_check::add_to_source_of_mat_table(Item_field *item_field,
607                                              TABLE_LIST *tl)
608 {
609   SELECT_LEX_UNIT *const mat_unit= tl->derived_unit();
610   // Query expression underlying 'tl':
611   SELECT_LEX *const mat_select= mat_unit->first_select();
612   if (mat_unit->is_union() || mat_select->olap != UNSPECIFIED_OLAP_TYPE)
613     return;                        // If UNION or ROLLUP, no FD
614   // Grab Group_check for this subquery.
615   Group_check *mat_gc;
616   uint j;
617   for (j= 0; j < mat_tables.size(); j++)
618   {
619     mat_gc= mat_tables.at(j);
620     if (mat_gc->select == mat_select)
621       break;
622   }
623   if (j == mat_tables.size())        // not found, create it
624   {
625     mat_gc= new (m_root) Group_check(mat_select, m_root, tl);
626     mat_tables.push_back(mat_gc);
627   }
628   // Find underlying expression of item_field, in SELECT list of mat_select
629   Item *const expr_under=
630     mat_gc->select_expression(item_field->field->field_index);
631 
632   // non-nullability of tl's column in tl, is equal to that of expr_under.
633   if (expr_under && !expr_under->maybe_null)
634       mat_gc->non_null_in_source= true;
635 
636   mat_gc->add_to_fd(expr_under, mat_gc->local_column(expr_under));
637 
638   if (mat_gc->group_in_fd == ~0ULL &&                       // (1)
639       (!(mat_gc->table->map() & select->outer_join) ||      // (2)
640        mat_gc->non_null_in_source))                         // (3)
641   {
642     /*
643       (1): In mat_gc, all GROUP BY expressions of mat_select are dependent on
644       source columns. Thus, all SELECT list expressions are, too (otherwise,
645       the validation of mat_select has or will fail). So, in our Group_check,
646       intersect(En, tl.*) -> tl.* .
647       This FD needs to propagate in our Group_check all the way up to the
648       result of the WHERE clause. It does, if:
649       - either there is no weak side above this table (2) (so NFFD is not
650       needed).
651       - or intersect(En, tl.*) contains a non-nullable column (3) (then
652       the FD is NFFD).
653 
654       VE does not need to be deterministic: there is only one row per values
655       of group columns; if those values are known, then any VE, even rand(),
656       is uniquely determined.
657     */
658     add_to_fd(tl->map());
659   }
660 }
661 
662 
663 /**
664    is_in_fd() is low-level compared to is_fd_on_source(). The former only
665    searches through built FD information; the latter builds this information
666    and calls the former to search in it.
667 
668    @param  item  Item to consider; either a column local to 'select', or a set
669    function whose aggregation query is 'select'
670 
671    @returns true if the expression is FD on the source.
672  */
is_in_fd(Item * item)673 bool Group_check::is_in_fd(Item *item)
674 {
675   if (item->type() == Item::SUM_FUNC_ITEM)
676   {
677     /*
678       If all group expressions are FD on the source, this set function also is
679       (one single value per group).
680     */
681     return group_in_fd == ~0ULL;
682   }
683 
684   assert(local_column(item));
685   Used_tables ut(select);
686   (void) item->walk(&Item::used_tables_for_level, Item::WALK_POSTFIX,
687                     pointer_cast<uchar *>(&ut));
688   if ((ut.used_tables & ~whole_tables_fd) == 0)
689   {
690     /*
691       The item is a column from a table whose all columns are FD.
692       If the table is a view, the item wraps an expression, which
693       uses columns of underlying tables which are all FD; we don't even have
694       to walk the underlying expression.
695       An expression-based FD in a view is not necessarily an NFFD, but here it
696       is, as the bits in whole_tables_fd are on only if the determinant
697       columns are non-NULLable or there is no weak side upwards (see calls to
698       add_to_fd(table_map)).
699     */
700     return true;
701   }
702   for (uint j= 0; j < fd.size(); j++)
703   {
704     Item *const item2= fd.at(j);
705     if (item2->eq(item, 0))
706       return true;
707     /*
708       Say that we have view:
709       create view v1 as select i, 2*i as z from t1; and we do:
710       select z from v1 group by i;
711       v1 is merged.
712       v1.i is Item_*view_ref to t1.i;
713       v1.z is Item_*view_ref to Item_func_mul which has two arguments:
714       Item_int (2) and Item_field (t1.i).
715       We added the grouping column v1.i to "fd". Now we are walking v1.z: we
716       meet Item_field (t1.i). For us to find this t1.i in "fd" we have to
717       reach to real_item() of v1.i.
718     */
719     Item *const real_it2= item2->real_item();
720     if (real_it2 != item2 && real_it2->eq(item, 0))
721       return true;
722   }
723   if (!search_in_underlying)
724     return false;
725   return is_in_fd_of_underlying(down_cast<Item_ident *>(item));
726 }
727 
728 
729 /**
730    See if we can derive a FD from a column which has an underlying expression.
731 
732    For a generated column, see if we can derive a FD from its expression.
733    For a column of a view or derived table, see if we can derive a FD from the
734    underlying query block.
735 
736    @param  item  column
737    @returns true  if this column is FD on source
738 */
is_in_fd_of_underlying(Item_ident * item)739 bool Group_check::is_in_fd_of_underlying(Item_ident *item)
740 {
741   if (item->type() == Item::REF_ITEM)
742   {
743     /*
744       It's a merged view's item.
745       Consider
746         create view v1 as select as as a, a*2 as b from t1;
747         select v1.b from v1 group by v1.a;
748       we have this->fd={v1.a}, and we search if v1.b is FD on v1.a. We'll look
749       if t1.a*2 is FD on t1.a.
750     */
751     assert(static_cast<const Item_ref *>(item)->ref_type() ==
752            Item_ref::VIEW_REF);
753     /*
754       Refuse RAND_TABLE_BIT because:
755       - FDs in a view are those of the underlying query expression.
756       - For FDs in a query expression, expressions in the SELECT list must be
757       deterministic.
758       Same is true for materialized tables further down.
759     */
760     if (item->used_tables() & RAND_TABLE_BIT)
761       return false;
762 
763     Item *const real_it= item->real_item();
764     Used_tables ut(select);
765     (void) item->walk(&Item::used_tables_for_level, Item::WALK_POSTFIX,
766                       pointer_cast<uchar *>(&ut));
767     /*
768       todo When we eliminate all uses of cached_table, we can probably add a
769       derived_table_ref field to Item_direct_view_ref objects and use it here.
770     */
771     TABLE_LIST *const tl= item->cached_table;
772     assert(tl->is_view_or_derived());
773     /*
774       We might find expression-based FDs in the result of the view's query
775       expression; but if this view is on the weak side of an outer join,
776       the FD won't propagate to that outer join's result.
777     */
778     const bool weak_side_upwards= tl->is_inner_table_of_outer_join();
779 
780     /*
781       (3) real_it is a deterministic expression of columns which are all FD on
782       the source. This gives a FD in the view, maybe not NFFD. It propagates
783       to our query expression if:
784       (1) Either there is no weak side upwards (NFFD not needed)
785       (2) Or NULLness of columns implies NULLness of expression (so it's
786       NFFD).
787     */
788     if ((!weak_side_upwards ||                  // (1)
789          (ut.used_tables & real_it->not_null_tables())) && // (2)
790         !real_it->walk(&Item::is_column_not_in_fd, walk_options,
791                        pointer_cast<uchar*>(this))) // (3)
792     {
793       add_to_fd(item, true);
794       return true;
795     }
796   }
797 
798   else if (item->type() == Item::FIELD_ITEM)
799   {
800     Item_field *const item_field= down_cast<Item_field*>(item);
801     /**
802       @todo make table_ref non-NULL for gcols, then use it for 'tl'.
803       Do the same in Item_field::used_tables_for_level().
804     */
805     TABLE_LIST *const tl= item_field->field->table->pos_in_table_list;
806     if (item_field->field->is_gcol())         // Generated column
807     {
808       assert(!tl->uses_materialization());
809       Item *const expr= item_field->field->gcol_info->expr_item;
810       assert(expr->fixed);
811       Used_tables ut(select);
812       item_field->used_tables_for_level(pointer_cast<uchar *>(&ut));
813       const bool weak_side_upwards= tl->is_inner_table_of_outer_join();
814       if ((!weak_side_upwards ||
815            (ut.used_tables & expr->not_null_tables())) &&
816           !expr->walk(&Item::is_column_not_in_fd, walk_options,
817                          pointer_cast<uchar*>(this)))
818       {
819         add_to_fd(item, true);
820         return true;
821       }
822     }
823     else if (tl->uses_materialization()) // Materialized derived table
824     {
825       SELECT_LEX *const mat_select= tl->derived_unit()->first_select();
826       uint j;
827       for (j= 0; j < mat_tables.size() ; j++)
828       {
829         if (mat_tables.at(j)->select == mat_select)
830           break;
831       }
832       if (j < mat_tables.size()) // if false, we know nothing about this table
833       {
834         Group_check *const mat_gc= mat_tables.at(j);
835         /*
836           'item' belongs to a materialized table, and certain fields of the
837           subquery are in this->fd.
838           Search if the expression inside 'item' is FD on them.
839         */
840         Item *const expr_under=
841           mat_gc->select_expression(item_field->field->field_index);
842         /*
843           expr_under is the expression underlying 'item'.
844           (1) and (4) it is a deterministic expression of mat_gc source
845           columns, so is FD on them. This gives a FD in the mat table, maybe
846           not NFFD: intersect(En, tl.*) -> item .
847           This FD needs to propagate in our Group_check all the way up to the
848           result of the WHERE clause. It does, if:
849           - either there is no weak side above this table (2) (so NFFD is not
850           needed).
851           - or intersect(En, tl.*) contains a non-nullable column (3) (then
852           the FD is NFFD).
853         */
854         if (!(expr_under->used_tables() & RAND_TABLE_BIT) &&      // (1)
855             (!(mat_gc->table->map() & select->outer_join) ||      // (2)
856              mat_gc->non_null_in_source) &&                       // (3)
857             !expr_under->walk(&Item::aggregate_check_group,       // (4)
858                               walk_options,
859                               pointer_cast<uchar*>(mat_gc)))
860         {
861           /*
862             We pass add_to_mat_table==false otherwise add_to_fd() may add
863             expr_under (if it's a field) to mat_gc->fd, uselessly (it is
864             already in mat_gc->fd, as walk() succeeded above). This is just to
865             not make the 'fd' list longer than needed.
866           */
867           add_to_fd(item_field, true, false);
868           return true;
869         }
870       }
871     }
872   }
873 
874   return false;
875 }
876 
877 
878 /**
879    Searches for equality-based functional dependences in an AND-ed part of a
880    condition (a conjunct).
881 
882    @param  cond        complete condition
883    @param  conjunct    one AND-ed part of 'cond'
884    @param  weak_tables  If condition is WHERE, it's 0. Otherwise it is the map
885    of weak tables in the join nest which owns the condition.
886    @param  weak_side_upwards If condition is WHERE it's false. Otherwise it is
887    true if the join nest owning this condition is embedded in the weak side
888    of some parent outer join (no matter how far up the parent is).
889 */
analyze_conjunct(Item * cond,Item * conjunct,table_map weak_tables,bool weak_side_upwards)890 void Group_check::analyze_conjunct(Item *cond, Item *conjunct,
891                                    table_map weak_tables,
892                                    bool weak_side_upwards)
893 {
894   if (conjunct->type() != Item::FUNC_ITEM)
895     return;
896   const Item_func *cnj= static_cast<const Item_func *>(conjunct);
897   if (cnj->functype() != Item_func::EQ_FUNC)
898     return;
899   Item *left_item= cnj->arguments()[0];
900   Item *right_item= cnj->arguments()[1];
901   if (left_item->type() == Item::ROW_ITEM &&
902       right_item->type() == Item::ROW_ITEM)
903   {
904     /*
905       (a,b)=(c,d) is equivalent to 'a=c and b=d', let's iterate on pairs.
906       Note that it's not recursive: we don't handle (a,(b,c))=(d,(e,f)), the
907       Standard does not seem to require it.
908     */
909     Item_row *left_row= down_cast<Item_row*>(left_item);
910     Item_row *right_row= down_cast<Item_row*>(right_item);
911     int elem= left_row->cols();
912     while (--elem >= 0)
913       analyze_scalar_eq(cond, left_row->element_index(elem),
914                         right_row->element_index(elem),
915                         weak_tables, weak_side_upwards);
916   }
917   else
918     analyze_scalar_eq(cond, left_item, right_item, weak_tables,
919                       weak_side_upwards);
920 }
921 
922 
923 /**
924    Helper function @see analyze_conjunct().
925 */
analyze_scalar_eq(Item * cond,Item * left_item,Item * right_item,table_map weak_tables,bool weak_side_upwards)926 void Group_check::analyze_scalar_eq(Item *cond,
927                                     Item *left_item, Item *right_item,
928                                     table_map weak_tables,
929                                     bool weak_side_upwards)
930 {
931   table_map left_tables= left_item->used_tables();
932   table_map right_tables= right_item->used_tables();
933   bool left_is_column= local_column(left_item);
934   bool right_is_column= local_column(right_item);
935 
936   /*
937     We look for something=col_not_FD.
938     If there are weak tables, this column must be weak (something=strong gives
939     us nothing, in an outer join condition).
940   */
941   if (right_is_column &&
942       (!weak_tables || (weak_tables & right_tables)) &&
943       !is_in_fd(right_item))
944   {}
945   else if (left_is_column &&
946            (!weak_tables || (weak_tables & left_tables)) &&
947            !is_in_fd(left_item))
948   {
949     // col_not_FD=something: change to something=col_not_FD
950     std::swap(left_item, right_item);
951     std::swap(left_tables, right_tables);
952     std::swap(left_is_column, right_is_column);
953   }
954   else
955     return;                                    // this equality brings nothing
956 
957   // right_item is a column not in fd, see if we can add it.
958 
959   if (left_is_column && (weak_tables & left_tables) &&
960       is_in_fd(left_item))
961   {
962     // weak=weak: left->right, and this is NFFD
963     add_to_fd(right_item, true);
964     return;
965   }
966 
967   const table_map strong_tables= (~weak_tables) & ~PSEUDO_TABLE_BITS;
968 
969   if ((left_is_column &&
970        (strong_tables & left_tables) &&
971        is_in_fd(left_item)) ||
972       left_item->const_item() ||
973       (OUTER_REF_TABLE_BIT & left_tables))
974   {
975     // strong_or_literal_or_outer_ref= right_item
976 
977     if (!weak_tables)
978     {
979       /*
980         It cannot be an inner join, due to transformations done in
981         simplify_joins(). So it is WHERE, so right_item is strong.
982         This may be constant=right_item and thus not be an NFFD, but WHERE is
983         exterior to join nests so propagation is not needed.
984       */
985       assert(!weak_side_upwards);          // cannot be inner join
986       add_to_fd(right_item, true);
987     }
988     else
989     {
990       // Outer join. So condition must be deterministic.
991       if (cond->used_tables() & RAND_TABLE_BIT)
992         return;
993 
994       /*
995         FD will have DJS as source columns, where DJS is the set of strong
996         columns referenced by "cond". FD has to propagate. It does if:
997         - either there is no weak side upwards
998         - or NULLness of DJS columns implies that "cond" is not true.
999       */
1000       if (weak_side_upwards && !(strong_tables & cond->not_null_tables()))
1001         return;
1002 
1003       std::pair<Group_check *, table_map> p(this, strong_tables);
1004       if (!cond->walk(&Item::is_strong_side_column_not_in_fd,
1005                       walk_options, (uchar*)&p))
1006       {
1007         /*
1008           "cond" is deterministic.
1009           right_item is weak.
1010           strong_or_literal_or_outer= weak.
1011           So DJS->right_item holds in the result of the join, and it
1012           propagates.
1013           As DJS is FD on E1 (the walk() succeeded), E1->right_item in the
1014           result of WHERE.
1015         */
1016         add_to_fd(right_item, true);
1017       }
1018     }
1019   }
1020 }
1021 
1022 
1023 /**
1024    Searches for equality-based functional dependencies in a condition.
1025 
1026    @param  cond  condition: a WHERE condition or JOIN condition.
1027    @param  weak_tables  If condition is WHERE, it's 0. Otherwise it is the map
1028    of weak tables in the join nest which owns the condition.
1029    @param  weak_side_upwards If condition is WHERE it's false. Otherwise it is
1030    true if the join nest owning this condition is embedded in the right side
1031    of some parent left join.
1032 */
find_fd_in_cond(Item * cond,table_map weak_tables,bool weak_side_upwards)1033 void Group_check::find_fd_in_cond(Item *cond,
1034                                   table_map weak_tables,
1035                                   bool weak_side_upwards)
1036 {
1037   if (cond->type() == Item::COND_ITEM)
1038   {
1039     Item_cond *cnd= static_cast<Item_cond *>(cond);
1040     /*
1041       All ANDs already flattened, see:
1042       "(X1 AND X2) AND (Y1 AND Y2) ==> AND (X1, X2, Y1, Y2)"
1043       in sql_yacc, and also Item_cond::fix_fields().
1044     */
1045     if (cnd->functype() != Item_func::COND_AND_FUNC)
1046       return;
1047     List_iterator<Item> li(*(cnd->argument_list()));
1048     Item *item;
1049     while ((item= li++))
1050       analyze_conjunct(cond, item, weak_tables, weak_side_upwards);
1051   }
1052   else // only one conjunct
1053     analyze_conjunct(cond, cond, weak_tables, weak_side_upwards);
1054 }
1055 
1056 
1057 /**
1058    Searches for equality-based functional dependencies in the condition of a
1059    join nest, and recursively in all contained join nests.
1060 
1061    @param  join_list  members of the join nest
1062 */
find_fd_in_joined_table(List<TABLE_LIST> * join_list)1063 void Group_check::find_fd_in_joined_table(List<TABLE_LIST> *join_list)
1064 {
1065   List_iterator<TABLE_LIST> li(*join_list);
1066   TABLE_LIST *table;
1067   while ((table= li++))
1068   {
1069     if (table->sj_cond())
1070     {
1071       /*
1072         We can ignore this nest as:
1073         - the subquery's WHERE was copied to the semi-join condition
1074         - so where the IN equalities
1075         - sj_cond() is also present in the parent nest's join condition or in
1076         the query block's WHERE condition, which we check somewhere else.
1077         Note that columns from sj-inner tables can help only as "intermediate
1078         link" in the graph of functional dependencies, as they are neither in
1079         GROUP BY (the source) nor in the SELECT list / HAVING / ORDER BY (the
1080         target).
1081       */
1082       continue;
1083     }
1084     table_map used_tables;
1085     NESTED_JOIN *nested_join= table->nested_join;
1086     if (nested_join)
1087     {
1088       find_fd_in_joined_table(&nested_join->join_list);
1089       used_tables= nested_join->used_tables;
1090     }
1091     else
1092       used_tables= table->map();
1093     if (table->join_cond())
1094     {
1095       /*
1096         simplify_joins() moves the ON condition of an inner join to the closest
1097         outer join nest or to WHERE. So this assertion should hold.
1098         Thus, used_tables are weak tables.
1099       */
1100       assert(table->outer_join);
1101       /*
1102         We might find equality-based FDs in the result of this outer join; but
1103         if this outer join is itself on the weak side of a parent outer join,
1104         the FD won't propagate to that parent outer join's result.
1105       */
1106       const bool weak_side_upwards=
1107         table->embedding &&
1108         table->embedding->is_inner_table_of_outer_join();
1109       find_fd_in_cond(table->join_cond(), used_tables, weak_side_upwards);
1110     }
1111   }
1112 }
1113 
1114 
1115 /// Writes "check information" to the optimizer trace
to_opt_trace(THD * thd)1116 void Group_check::to_opt_trace(THD *thd)
1117 {
1118 #ifdef OPTIMIZER_TRACE
1119   if (fd.empty() && !whole_tables_fd)
1120     return;
1121   Opt_trace_context *ctx= &thd->opt_trace;
1122   if (!ctx->is_started())
1123       return;
1124   Opt_trace_object trace_wrapper(ctx);
1125   Opt_trace_object trace_fds(ctx, "functional_dependencies_of_GROUP_columns");
1126   to_opt_trace2(ctx, &trace_fds);
1127 #endif
1128 }
1129 
1130 /**
1131    Utility function for to_opt_trace(), as we need recursion in children
1132    Group_checks. to_opt_trace() only writes a one-time header.
1133 */
to_opt_trace2(Opt_trace_context * ctx,Opt_trace_object * parent)1134 void Group_check::to_opt_trace2(Opt_trace_context *ctx,
1135                                 Opt_trace_object *parent)
1136 {
1137 #ifdef OPTIMIZER_TRACE
1138   if (table)
1139     parent->add_utf8_table(table);
1140   if (whole_tables_fd)
1141   {
1142     Opt_trace_array array(ctx, "all_columns_of_table_map_bits");
1143     for (uint j= 0; j < MAX_TABLES ; j++)
1144       if (whole_tables_fd & (1ULL << j))
1145         array.add(j);
1146   }
1147   if (!fd.empty())
1148   {
1149     Opt_trace_array array(ctx, "columns");
1150     for (uint j= 0; j < fd.size() ; j++)
1151       array.add_utf8(fd.at(j)->full_name());
1152   }
1153   if (is_child())
1154   {
1155     if (group_in_fd == ~0ULL && select->is_explicitly_grouped())
1156       parent->add("all_group_expressions", true);
1157   }
1158   if (!mat_tables.empty())
1159   {
1160     Opt_trace_array array(ctx, "searched_in_materialized_tables");
1161     for (uint j= 0; j < mat_tables.size() ; j++)
1162     {
1163       Opt_trace_object trace_wrapper(ctx);
1164       mat_tables.at(j)->to_opt_trace2(ctx, &trace_wrapper);
1165     }
1166   }
1167 #endif
1168 }
1169 
1170 
1171 /**
1172    Does one low-level check on one Item_ident. Called by Item_ident walk
1173    processors.
1174    @param i     item to check
1175    @param tm    map of strong tables, if type==CHECK_STRONG_SIDE_COLUMN
1176    @param type  check to do:
1177       - CHECK_GROUP for Item_ident::aggregate_check_group()
1178       - CHECK_STRONG_SIDE_COLUMN for
1179       Item_ident::is_strong_side_column_not_in_fd()
1180       - CHECK_COLUMN for Item_ident::is_column_not_in_fd()
1181    @returns false if success
1182 */
do_ident_check(Item_ident * i,table_map tm,enum enum_ident_check type)1183 bool Group_check::do_ident_check(Item_ident *i, table_map tm,
1184                                  enum enum_ident_check type)
1185 {
1186   if (is_stopped(i))
1187     return false;
1188 
1189   const Bool3 local= i->local_column(select);
1190   if (local.is_false())
1191     goto ignore_children;
1192 
1193   if (local.is_unknown())
1194     return false; // dive in child item
1195 
1196   switch (type)
1197   {
1198   case CHECK_GROUP:
1199     if (!is_fd_on_source(i))
1200     {
1201       // It is not FD on source columns:
1202       if (!is_child())
1203         failed_ident= i;
1204       return true;
1205     }
1206     goto ignore_children;
1207   case CHECK_STRONG_SIDE_COLUMN:
1208    {
1209     Used_tables ut(select);
1210     (void) i->walk(&Item::used_tables_for_level, Item::WALK_POSTFIX,
1211                    pointer_cast<uchar *>(&ut));
1212     if ((ut.used_tables & tm) && !is_in_fd(i))
1213       return true;                    // It is a strong-side column and not FD
1214     goto ignore_children;
1215    }
1216   case CHECK_COLUMN:
1217     if (!is_in_fd(i))
1218       return true;                              // It is a column and not FD
1219     goto ignore_children;
1220   }
1221 
1222 ignore_children:
1223     stop_at(i);
1224     return false;
1225 }
1226 
1227 /// @} (end of group AGGREGATE_CHECKS ONLY_FULL_GROUP_BY)
1228