1# This program is copyright 2007-2011 Baron Schwartz, 2011 Percona Ireland Ltd.
2# Feedback and improvements are welcome.
3#
4# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
5# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
6# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
7#
8# This program is free software; you can redistribute it and/or modify it under
9# the terms of the GNU General Public License as published by the Free Software
10# Foundation, version 2; OR the Perl Artistic License.  On UNIX and similar
11# systems, you can issue `man perlgpl' or `man perlartistic' to read these
12# licenses.
13#
14# You should have received a copy of the GNU General Public License along with
15# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16# Place, Suite 330, Boston, MA  02111-1307  USA.
17# ###########################################################################
18# QueryRewriter package
19# ###########################################################################
20{
21# Package: QueryRewriter
22# QueryRewriter rewrites and transforms queries.
23package QueryRewriter;
24
25use strict;
26use warnings FATAL => 'all';
27use English qw(-no_match_vars);
28use constant PTDEBUG => $ENV{PTDEBUG} || 0;
29
30# A list of verbs that can appear in queries.  I know this is incomplete -- it
31# does not have CREATE, DROP, ALTER, TRUNCATE for example.  But I don't need
32# those for my client yet.  Other verbs: KILL, LOCK, UNLOCK
33our $verbs   = qr{^SHOW|^FLUSH|^COMMIT|^ROLLBACK|^BEGIN|SELECT|INSERT
34                  |UPDATE|DELETE|REPLACE|^SET|UNION|^START|^LOCK}xi;
35my $quote_re = qr/"(?:(?!(?<!\\)").)*"|'(?:(?!(?<!\\)').)*'/; # Costly!
36my $bal;
37$bal         = qr/
38                  \(
39                  (?:
40                     (?> [^()]+ )    # Non-parens without backtracking
41                     |
42                     (??{ $bal })    # Group with matching parens
43                  )*
44                  \)
45                 /x;
46
47# The one-line comment pattern is quite crude.  This is intentional for
48# performance.  The multi-line pattern does not match version-comments.
49my $olc_re = qr/(?:--|#)[^'"\r\n]*(?=[\r\n]|\Z)/;  # One-line comments
50my $mlc_re = qr#/\*[^!].*?\*/#sm;                  # But not /*!version */
51my $vlc_re = qr#/\*.*?[0-9]+.*?\*/#sm;             # For SHOW + /*!version */
52my $vlc_rf = qr#^(?:SHOW).*?/\*![0-9]+(.*?)\*/#sm;     # Variation for SHOW
53
54
55sub new {
56   my ( $class, %args ) = @_;
57   my $self = { %args };
58   return bless $self, $class;
59}
60
61# Strips comments out of queries.
62sub strip_comments {
63   my ( $self, $query ) = @_;
64   return unless $query;
65   $query =~ s/$mlc_re//go;
66   $query =~ s/$olc_re//go;
67   if ( $query =~ m/$vlc_rf/i ) { # contains show + version
68      my $qualifier = $1 || '';
69      $query =~ s/$vlc_re/$qualifier/go;
70   }
71   return $query;
72}
73
74# Shortens long queries by normalizing stuff out of them.  $length is used only
75# for IN() lists.  If $length is given, the query is shortened if it's longer
76# than that.
77sub shorten {
78   my ( $self, $query, $length ) = @_;
79   # Shorten multi-value insert/replace, all the way up to on duplicate key
80   # update if it exists.
81   $query =~ s{
82      \A(
83         (?:INSERT|REPLACE)
84         (?:\s+LOW_PRIORITY|DELAYED|HIGH_PRIORITY|IGNORE)?
85         (?:\s\w+)*\s+\S+\s+VALUES\s*\(.*?\)
86      )
87      \s*,\s*\(.*?(ON\s+DUPLICATE|\Z)}
88      {$1 /*... omitted ...*/$2}xsi;
89
90   # Shortcut!  Find out if there's an IN() list with values.
91   return $query unless $query =~ m/IN\s*\(\s*(?!select)/i;
92
93   # Shorten long IN() lists of literals.  But only if the string is longer than
94   # the $length limit.  Assumption: values don't contain commas or closing
95   # parens inside them.
96   my $last_length  = 0;
97   my $query_length = length($query);
98   while (
99      $length          > 0
100      && $query_length > $length
101      && $query_length < ( $last_length || $query_length + 1 )
102   ) {
103      $last_length = $query_length;
104      $query =~ s{
105         (\bIN\s*\()    # The opening of an IN list
106         ([^\)]+)       # Contents of the list, assuming no item contains paren
107         (?=\))           # Close of the list
108      }
109      {
110         $1 . __shorten($2)
111      }gexsi;
112   }
113
114   return $query;
115}
116
117# Used by shorten().  The argument is the stuff inside an IN() list.  The
118# argument might look like this:
119#  1,2,3,4,5,6
120# Or, if this is a second or greater iteration, it could even look like this:
121#  /*... omitted 5 items ...*/ 6,7,8,9
122# In the second case, we need to trim out 6,7,8 and increment "5 items" to "8
123# items".  We assume that the values in the list don't contain commas; if they
124# do, the results could be a little bit wrong, but who cares.  We keep the first
125# 20 items because we don't want to nuke all the samples from the query, we just
126# want to shorten it.
127sub __shorten {
128   my ( $snippet ) = @_;
129   my @vals = split(/,/, $snippet);
130   return $snippet unless @vals > 20;
131   my @keep = splice(@vals, 0, 20);  # Remove and save the first 20 items
132   return
133      join(',', @keep)
134      . "/*... omitted "
135      . scalar(@vals)
136      . " items ...*/";
137}
138
139# Normalizes variable queries to a "query fingerprint" by abstracting away
140# parameters, canonicalizing whitespace, etc.  See
141# http://dev.mysql.com/doc/refman/5.0/en/literals.html for literal syntax.
142# Note: Any changes to this function must be profiled for speed!  Speed of this
143# function is critical for pt-query-digest.  There are known bugs in this,
144# but the balance between maybe-you-get-a-bug and speed favors speed.
145# See past Maatkit revisions of this subroutine for more correct, but slower,
146# regexes.
147sub fingerprint {
148   my ( $self, $query ) = @_;
149
150   # First, we start with a bunch of special cases that we can optimize because
151   # they are special behavior or because they are really big and we want to
152   # throw them away as early as possible.
153   $query =~ m#\ASELECT /\*!40001 SQL_NO_CACHE \*/ \* FROM `# # mysqldump query
154      && return 'mysqldump';
155   # Matches queries like REPLACE /*foo.bar:3/3*/ INTO checksum.checksum
156   $query =~ m#/\*\w+\.\w+:[0-9]/[0-9]\*/#     # pt-table-checksum, etc query
157      && return 'percona-toolkit';
158   # Administrator commands appear to be a comment, so return them as-is
159   $query =~ m/\Aadministrator command: /
160      && return $query;
161   # Special-case for stored procedures.
162   $query =~ m/\A\s*(call\s+\S+)\(/i
163      && return lc($1); # Warning! $1 used, be careful.
164   # mysqldump's INSERT statements will have long values() lists, don't waste
165   # time on them... they also tend to segfault Perl on some machines when you
166   # get to the "# Collapse IN() and VALUES() lists" regex below!
167   if ( my ($beginning) = $query =~ m/\A((?:INSERT|REPLACE)(?: IGNORE)?\s+INTO.+?VALUES\s*\(.*?\))\s*,\s*\(/is ) {
168      $query = $beginning; # Shorten multi-value INSERT statements ASAP
169   }
170
171   $query =~ s/$mlc_re//go;
172   $query =~ s/$olc_re//go;
173   $query =~ s/\Ause \S+\Z/use ?/i       # Abstract the DB in USE
174      && return $query;
175
176   $query =~ s/\\["']//g;                # quoted strings
177   $query =~ s/".*?"/?/sg;               # quoted strings
178   $query =~ s/'.*?'/?/sg;               # quoted strings
179
180   $query =~ s/\bfalse\b|\btrue\b/?/isg; # boolean values
181
182   # MD5 checksums which are always 32 hex chars
183   if ( $self->{match_md5_checksums} ) {
184      $query =~ s/([._-])[a-f0-9]{32}/$1?/g;
185   }
186
187   # Things resembling numbers/hex.
188   if ( !$self->{match_embedded_numbers} ) {
189      # For speed, this regex is extremely broad in its definition
190      # of what looks like a number.
191      $query =~ s/[0-9+-][0-9a-f.xb+-]*/?/g;
192   }
193   else {
194      $query =~ s/\b[0-9+-][0-9a-f.xb+-]*/?/g;
195   }
196
197   # Clean up leftovers
198   if ( $self->{match_md5_checksums} ) {
199      $query =~ s/[xb+-]\?/?/g;
200   }
201   else {
202      $query =~ s/[xb.+-]\?/?/g;
203   }
204
205   $query =~ s/\A\s+//;                  # Chop off leading whitespace
206   chomp $query;                         # Kill trailing whitespace
207   $query =~ tr[ \n\t\r\f][ ]s;          # Collapse whitespace
208   $query = lc $query;
209   $query =~ s/\bnull\b/?/g;             # Get rid of NULLs
210   $query =~ s{                          # Collapse IN and VALUES lists
211               \b(in|values?)(?:[\s,]*\([\s?,]*\))+
212              }
213              {$1(?+)}gx;
214   $query =~ s{                          # Collapse UNION
215               \b(select\s.*?)(?:(\sunion(?:\sall)?)\s\1)+
216              }
217              {$1 /*repeat$2*/}xg;
218   $query =~ s/\blimit \?(?:, ?\?| offset \?)?/limit ?/; # LIMIT
219   # The following are disabled because of speed issues.  Should we try to
220   # normalize whitespace between and around operators?  My gut feeling is no.
221   # $query =~ s/ , | ,|, /,/g;    # Normalize commas
222   # $query =~ s/ = | =|= /=/g;       # Normalize equals
223   # $query =~ s# [,=+*/-] ?|[,=+*/-] #+#g;    # Normalize operators
224
225   # Remove ASC keywords from ORDER BY clause so these queries fingerprint
226   # the same:
227   #   SELECT * FROM `products`  ORDER BY name ASC, shape ASC;
228   #   SELECT * FROM `products`  ORDER BY name, shape;
229   # ASC is default so "ORDER BY col ASC" is really the same as just
230   # "ORDER BY col".
231   # http://code.google.com/p/maatkit/issues/detail?id=1030
232   if ( $query =~ m/\bORDER BY /gi ) {  # Find, anchor on ORDER BY clause
233      # Replace any occurrence of "ASC" after anchor until end of query.
234      # I have verified this with regex debug: it's a single forward pass
235      # without backtracking.  Probably as fast as it gets.
236      1 while $query =~ s/\G(.+?)\s+ASC/$1/gi && pos $query;
237   }
238
239   return $query;
240}
241
242# Gets the verbs from an SQL query, such as SELECT, UPDATE, etc.
243sub distill_verbs {
244   my ( $self, $query ) = @_;
245
246   # Simple verbs that normally don't have comments, extra clauses, etc.
247   $query =~ m/\A\s*call\s+(\S+)\(/i && return "CALL $1";
248   $query =~ m/\A\s*use\s+/          && return "USE";
249   $query =~ m/\A\s*UNLOCK TABLES/i  && return "UNLOCK";
250   $query =~ m/\A\s*xa\s+(\S+)/i     && return "XA_$1";
251
252   if ( $query =~ m/\A\s*LOAD/i ) {
253      my ($tbl) = $query =~ m/INTO TABLE\s+(\S+)/i;
254      $tbl ||= '';
255      $tbl =~ s/`//g;
256      return "LOAD DATA $tbl";
257   }
258
259   if ( $query =~ m/\Aadministrator command:/ ) {
260      $query =~ s/administrator command:/ADMIN/;
261      $query = uc $query;
262      return $query;
263   }
264
265   # All other, more complex verbs.
266   $query = $self->strip_comments($query);
267
268   # SHOW statements are either 2 or 3 words: SHOW A (B), where A and B
269   # are words; B is optional.  E.g. "SHOW TABLES" or "SHOW SLAVE STATUS".
270   # There's a few common keywords that may show up in place of A, so we
271   # remove them first.  Then there's some keywords that signify extra clauses
272   # that may show up in place of B and since these clauses are at the
273   # end of the statement, we remove everything from the clause onward.
274   if ( $query =~ m/\A\s*SHOW\s+/i ) {
275      PTDEBUG && _d($query);
276
277      # Remove common keywords.
278      $query = uc $query;
279      $query =~ s/\s+(?:SESSION|FULL|STORAGE|ENGINE)\b/ /g;
280      # This should be in the regex above but Perl doesn't seem to match
281      # COUNT\(.+\) properly when it's grouped.
282      $query =~ s/\s+COUNT[^)]+\)//g;
283
284      # Remove clause keywords and everything after.
285      $query =~ s/\s+(?:FOR|FROM|LIKE|WHERE|LIMIT|IN)\b.+//ms;
286
287      # The query should now be like SHOW A B C ... delete everything after B,
288      # collapse whitespace, and we're done.
289      $query =~ s/\A(SHOW(?:\s+\S+){1,2}).*\Z/$1/s;
290      $query =~ s/\s+/ /g;
291      PTDEBUG && _d($query);
292      return $query;
293   }
294
295   # Data defintion statements verbs like CREATE and ALTER.
296   # The two evals are a hack to keep Perl from warning that
297   # "QueryParser::data_def_stmts" used only once: possible typo at...".
298   # Some day we'll group all our common regex together in a packet and
299   # export/import them properly.
300   eval $QueryParser::data_def_stmts;
301   eval $QueryParser::tbl_ident;
302   my ( $dds ) = $query =~ /^\s*($QueryParser::data_def_stmts)\b/i;
303   if ( $dds) {
304      # https://bugs.launchpad.net/percona-toolkit/+bug/821690
305      $query =~ s/\s+IF(?:\s+NOT)?\s+EXISTS/ /i;
306      my ( $obj ) = $query =~ m/$dds.+(DATABASE|TABLE)\b/i;
307      $obj = uc $obj if $obj;
308      PTDEBUG && _d('Data def statment:', $dds, 'obj:', $obj);
309      my ($db_or_tbl)
310         = $query =~ m/(?:TABLE|DATABASE)\s+($QueryParser::tbl_ident)(\s+.*)?/i;
311      PTDEBUG && _d('Matches db or table:', $db_or_tbl);
312      return uc($dds . ($obj ? " $obj" : '')), $db_or_tbl;
313   }
314
315   # All other verbs, like SELECT, INSERT, UPDATE, etc.  First, get
316   # the query type -- just extract all the verbs and collapse them
317   # together.
318   my @verbs = $query =~ m/\b($verbs)\b/gio;
319   @verbs    = do {
320      my $last = '';
321      grep { my $pass = $_ ne $last; $last = $_; $pass } map { uc } @verbs;
322   };
323
324   # http://code.google.com/p/maatkit/issues/detail?id=1176
325   # A SELECT query can't have any verbs other than UNION.
326   # Subqueries (SELECT SELECT) are reduced to 1 SELECT in the
327   # do loop above.  And there's no valid SQL syntax like
328   # SELECT ... DELETE (there are valid multi-verb syntaxes, like
329   # INSERT ... SELECT).  So if it's a SELECT with multiple verbs,
330   # we need to check it else SELECT c FROM t WHERE col='delete'
331   # will incorrectly distill as SELECT DELETE t.
332   if ( ($verbs[0] || '') eq 'SELECT' && @verbs > 1 ) {
333      PTDEBUG && _d("False-positive verbs after SELECT:", @verbs[1..$#verbs]);
334      my $union = grep { $_ eq 'UNION' } @verbs;
335      @verbs    = $union ? qw(SELECT UNION) : qw(SELECT);
336   }
337
338   # This used to be "my $verbs" but older verisons of Perl complain that
339   # ""my" variable $verbs masks earlier declaration in same scope" where
340   # the earlier declaration is our $verbs.
341   # http://code.google.com/p/maatkit/issues/detail?id=957
342   my $verb_str = join(q{ }, @verbs);
343   return $verb_str;
344}
345
346sub __distill_tables {
347   my ( $self, $query, $table, %args ) = @_;
348   my $qp = $args{QueryParser} || $self->{QueryParser};
349   die "I need a QueryParser argument" unless $qp;
350
351   # "Fingerprint" the tables.
352   my @tables = map {
353      $_ =~ s/`//g;
354      $_ =~ s/(_?)[0-9]+/$1?/g;
355      $_;
356   } grep { defined $_ } $qp->get_tables($query);
357
358   push @tables, $table if $table;
359
360   # Collapse the table list
361   @tables = do {
362      my $last = '';
363      grep { my $pass = $_ ne $last; $last = $_; $pass } @tables;
364   };
365
366   return @tables;
367}
368
369# This is kind of like fingerprinting, but it super-fingerprints to something
370# that shows the query type and the tables/objects it accesses.
371sub distill {
372   my ( $self, $query, %args ) = @_;
373
374   if ( $args{generic} ) {
375      # Do a generic distillation which returns the first two words
376      # of a simple "cmd arg" query, like memcached and HTTP stuff.
377      my ($cmd, $arg) = $query =~ m/^(\S+)\s+(\S+)/;
378      return '' unless $cmd;
379      $query = (uc $cmd) . ($arg ? " $arg" : '');
380   }
381   else {
382      # distill_verbs() may return a table if it's a special statement
383      # like TRUNCATE TABLE foo.  __distill_tables() handles some but not
384      # all special statements so we pass the special table from distill_verbs()
385      # to __distill_tables() in case it's a statement that the latter
386      # can't handle.  If it can handle it, it will eliminate any duplicate
387      # tables.
388      my ($verbs, $table)  = $self->distill_verbs($query, %args);
389
390      if ( $verbs && $verbs =~ m/^SHOW/ ) {
391         # Ignore tables for SHOW statements and normalize some
392         # aliases like SCHMEA==DATABASE, KEYS==INDEX.
393         my %alias_for = qw(
394            SCHEMA   DATABASE
395            KEYS     INDEX
396            INDEXES  INDEX
397         );
398         map { $verbs =~ s/$_/$alias_for{$_}/ } keys %alias_for;
399         $query = $verbs;
400      }
401      elsif ( $verbs && $verbs =~ m/^LOAD DATA/ ) {
402         return $verbs;
403      }
404      else {
405         # For everything else, distill the tables.
406         my @tables = $self->__distill_tables($query, $table, %args);
407         $query     = join(q{ }, $verbs, @tables);
408      }
409   }
410
411   if ( $args{trf} ) {
412      $query = $args{trf}->($query, %args);
413   }
414
415   return $query;
416}
417
418sub convert_to_select {
419   my ( $self, $query ) = @_;
420   return unless $query;
421
422   # Trying to convert statments that have subqueries as values to column
423   # assignments doesn't work.  E.g. SET col=(SELECT ...).  But subqueries
424   # do work in other cases like JOIN (SELECT ...).
425   # http://code.google.com/p/maatkit/issues/detail?id=347
426   return if $query =~ m/=\s*\(\s*SELECT /i;
427
428   $query =~ s{
429                 \A.*?
430                 update(?:\s+(?:low_priority|ignore))?\s+(.*?)
431                 \s+set\b(.*?)
432                 (?:\s*where\b(.*?))?
433                 (limit\s*[0-9]+(?:\s*,\s*[0-9]+)?)?
434                 \Z
435              }
436              {__update_to_select($1, $2, $3, $4)}exsi
437      # INSERT|REPLACE tbl (cols) VALUES (vals)
438      || $query =~ s{
439                    \A.*?
440                    (?:insert(?:\s+ignore)?|replace)\s+
441                    .*?\binto\b(.*?)\(([^\)]+)\)\s*
442                    values?\s*(\(.*?\))\s*
443                    (?:\blimit\b|on\s+duplicate\s+key.*)?\s*
444                    \Z
445                 }
446                 {__insert_to_select($1, $2, $3)}exsi
447      # INSERT|REPLACE tbl SET vals
448      || $query =~ s{
449                    \A.*?
450                    (?:insert(?:\s+ignore)?|replace)\s+
451                    (?:.*?\binto)\b(.*?)\s*
452                    set\s+(.*?)\s*
453                    (?:\blimit\b|on\s+duplicate\s+key.*)?\s*
454                    \Z
455                 }
456                 {__insert_to_select_with_set($1, $2)}exsi
457      || $query =~ s{
458                    \A.*?
459                    delete\s+(.*?)
460                    \bfrom\b(.*)
461                    \Z
462                 }
463                 {__delete_to_select($1, $2)}exsi;
464   $query =~ s/\s*on\s+duplicate\s+key\s+update.*\Z//si;
465   $query =~ s/\A.*?(?=\bSELECT\s*\b)//ism;
466   return $query;
467}
468
469sub convert_select_list {
470   my ( $self, $query ) = @_;
471   $query =~ s{
472               \A\s*select(.*?)\bfrom\b
473              }
474              {$1 =~ m/\*/ ? "select 1 from" : "select isnull(coalesce($1)) from"}exi;
475   return $query;
476}
477
478sub __delete_to_select {
479   my ( $delete, $join ) = @_;
480   if ( $join =~ m/\bjoin\b/ ) {
481      return "select 1 from $join";
482   }
483   return "select * from $join";
484}
485
486sub __insert_to_select {
487   my ( $tbl, $cols, $vals ) = @_;
488   PTDEBUG && _d('Args:', @_);
489   my @cols = split(/,/, $cols);
490   PTDEBUG && _d('Cols:', @cols);
491   $vals =~ s/^\(|\)$//g; # Strip leading/trailing parens
492   my @vals = $vals =~ m/($quote_re|[^,]*${bal}[^,]*|[^,]+)/g;
493   PTDEBUG && _d('Vals:', @vals);
494   if ( @cols == @vals ) {
495      return "select * from $tbl where "
496         . join(' and ', map { "$cols[$_]=$vals[$_]" } (0..$#cols));
497   }
498   else {
499      return "select * from $tbl limit 1";
500   }
501}
502
503sub __insert_to_select_with_set {
504   my ( $from, $set ) = @_;
505   $set =~ s/,/ and /g;
506   return "select * from $from where $set ";
507}
508
509sub __update_to_select {
510   my ( $from, $set, $where, $limit ) = @_;
511   return "select $set from $from "
512      . ( $where ? "where $where" : '' )
513      . ( $limit ? " $limit "      : '' );
514}
515
516sub wrap_in_derived {
517   my ( $self, $query ) = @_;
518   return unless $query;
519   return $query =~ m/\A\s*select/i
520      ? "select 1 from ($query) as x limit 1"
521      : $query;
522}
523
524sub _d {
525   my ($package, undef, $line) = caller 0;
526   @_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; }
527        map { defined $_ ? $_ : 'undef' }
528        @_;
529   print STDERR "# $package:$line $PID ", join(' ', @_), "\n";
530}
531
5321;
533}
534# ###########################################################################
535# End QueryRewriter package
536# ###########################################################################
537