1# This program is copyright 2007-2011 Baron Schwartz, 2011 Percona Ireland Ltd. 2# Feedback and improvements are welcome. 3# 4# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED 5# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF 6# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 7# 8# This program is free software; you can redistribute it and/or modify it under 9# the terms of the GNU General Public License as published by the Free Software 10# Foundation, version 2; OR the Perl Artistic License. On UNIX and similar 11# systems, you can issue `man perlgpl' or `man perlartistic' to read these 12# licenses. 13# 14# You should have received a copy of the GNU General Public License along with 15# this program; if not, write to the Free Software Foundation, Inc., 59 Temple 16# Place, Suite 330, Boston, MA 02111-1307 USA. 17# ########################################################################### 18# QueryRewriter package 19# ########################################################################### 20{ 21# Package: QueryRewriter 22# QueryRewriter rewrites and transforms queries. 23package QueryRewriter; 24 25use strict; 26use warnings FATAL => 'all'; 27use English qw(-no_match_vars); 28use constant PTDEBUG => $ENV{PTDEBUG} || 0; 29 30# A list of verbs that can appear in queries. I know this is incomplete -- it 31# does not have CREATE, DROP, ALTER, TRUNCATE for example. But I don't need 32# those for my client yet. Other verbs: KILL, LOCK, UNLOCK 33our $verbs = qr{^SHOW|^FLUSH|^COMMIT|^ROLLBACK|^BEGIN|SELECT|INSERT 34 |UPDATE|DELETE|REPLACE|^SET|UNION|^START|^LOCK}xi; 35my $quote_re = qr/"(?:(?!(?<!\\)").)*"|'(?:(?!(?<!\\)').)*'/; # Costly! 36my $bal; 37$bal = qr/ 38 \( 39 (?: 40 (?> [^()]+ ) # Non-parens without backtracking 41 | 42 (??{ $bal }) # Group with matching parens 43 )* 44 \) 45 /x; 46 47# The one-line comment pattern is quite crude. This is intentional for 48# performance. The multi-line pattern does not match version-comments. 49my $olc_re = qr/(?:--|#)[^'"\r\n]*(?=[\r\n]|\Z)/; # One-line comments 50my $mlc_re = qr#/\*[^!].*?\*/#sm; # But not /*!version */ 51my $vlc_re = qr#/\*.*?[0-9]+.*?\*/#sm; # For SHOW + /*!version */ 52my $vlc_rf = qr#^(?:SHOW).*?/\*![0-9]+(.*?)\*/#sm; # Variation for SHOW 53 54 55sub new { 56 my ( $class, %args ) = @_; 57 my $self = { %args }; 58 return bless $self, $class; 59} 60 61# Strips comments out of queries. 62sub strip_comments { 63 my ( $self, $query ) = @_; 64 return unless $query; 65 $query =~ s/$mlc_re//go; 66 $query =~ s/$olc_re//go; 67 if ( $query =~ m/$vlc_rf/i ) { # contains show + version 68 my $qualifier = $1 || ''; 69 $query =~ s/$vlc_re/$qualifier/go; 70 } 71 return $query; 72} 73 74# Shortens long queries by normalizing stuff out of them. $length is used only 75# for IN() lists. If $length is given, the query is shortened if it's longer 76# than that. 77sub shorten { 78 my ( $self, $query, $length ) = @_; 79 # Shorten multi-value insert/replace, all the way up to on duplicate key 80 # update if it exists. 81 $query =~ s{ 82 \A( 83 (?:INSERT|REPLACE) 84 (?:\s+LOW_PRIORITY|DELAYED|HIGH_PRIORITY|IGNORE)? 85 (?:\s\w+)*\s+\S+\s+VALUES\s*\(.*?\) 86 ) 87 \s*,\s*\(.*?(ON\s+DUPLICATE|\Z)} 88 {$1 /*... omitted ...*/$2}xsi; 89 90 # Shortcut! Find out if there's an IN() list with values. 91 return $query unless $query =~ m/IN\s*\(\s*(?!select)/i; 92 93 # Shorten long IN() lists of literals. But only if the string is longer than 94 # the $length limit. Assumption: values don't contain commas or closing 95 # parens inside them. 96 my $last_length = 0; 97 my $query_length = length($query); 98 while ( 99 $length > 0 100 && $query_length > $length 101 && $query_length < ( $last_length || $query_length + 1 ) 102 ) { 103 $last_length = $query_length; 104 $query =~ s{ 105 (\bIN\s*\() # The opening of an IN list 106 ([^\)]+) # Contents of the list, assuming no item contains paren 107 (?=\)) # Close of the list 108 } 109 { 110 $1 . __shorten($2) 111 }gexsi; 112 } 113 114 return $query; 115} 116 117# Used by shorten(). The argument is the stuff inside an IN() list. The 118# argument might look like this: 119# 1,2,3,4,5,6 120# Or, if this is a second or greater iteration, it could even look like this: 121# /*... omitted 5 items ...*/ 6,7,8,9 122# In the second case, we need to trim out 6,7,8 and increment "5 items" to "8 123# items". We assume that the values in the list don't contain commas; if they 124# do, the results could be a little bit wrong, but who cares. We keep the first 125# 20 items because we don't want to nuke all the samples from the query, we just 126# want to shorten it. 127sub __shorten { 128 my ( $snippet ) = @_; 129 my @vals = split(/,/, $snippet); 130 return $snippet unless @vals > 20; 131 my @keep = splice(@vals, 0, 20); # Remove and save the first 20 items 132 return 133 join(',', @keep) 134 . "/*... omitted " 135 . scalar(@vals) 136 . " items ...*/"; 137} 138 139# Normalizes variable queries to a "query fingerprint" by abstracting away 140# parameters, canonicalizing whitespace, etc. See 141# http://dev.mysql.com/doc/refman/5.0/en/literals.html for literal syntax. 142# Note: Any changes to this function must be profiled for speed! Speed of this 143# function is critical for pt-query-digest. There are known bugs in this, 144# but the balance between maybe-you-get-a-bug and speed favors speed. 145# See past Maatkit revisions of this subroutine for more correct, but slower, 146# regexes. 147sub fingerprint { 148 my ( $self, $query ) = @_; 149 150 # First, we start with a bunch of special cases that we can optimize because 151 # they are special behavior or because they are really big and we want to 152 # throw them away as early as possible. 153 $query =~ m#\ASELECT /\*!40001 SQL_NO_CACHE \*/ \* FROM `# # mysqldump query 154 && return 'mysqldump'; 155 # Matches queries like REPLACE /*foo.bar:3/3*/ INTO checksum.checksum 156 $query =~ m#/\*\w+\.\w+:[0-9]/[0-9]\*/# # pt-table-checksum, etc query 157 && return 'percona-toolkit'; 158 # Administrator commands appear to be a comment, so return them as-is 159 $query =~ m/\Aadministrator command: / 160 && return $query; 161 # Special-case for stored procedures. 162 $query =~ m/\A\s*(call\s+\S+)\(/i 163 && return lc($1); # Warning! $1 used, be careful. 164 # mysqldump's INSERT statements will have long values() lists, don't waste 165 # time on them... they also tend to segfault Perl on some machines when you 166 # get to the "# Collapse IN() and VALUES() lists" regex below! 167 if ( my ($beginning) = $query =~ m/\A((?:INSERT|REPLACE)(?: IGNORE)?\s+INTO.+?VALUES\s*\(.*?\))\s*,\s*\(/is ) { 168 $query = $beginning; # Shorten multi-value INSERT statements ASAP 169 } 170 171 $query =~ s/$mlc_re//go; 172 $query =~ s/$olc_re//go; 173 $query =~ s/\Ause \S+\Z/use ?/i # Abstract the DB in USE 174 && return $query; 175 176 $query =~ s/\\["']//g; # quoted strings 177 $query =~ s/".*?"/?/sg; # quoted strings 178 $query =~ s/'.*?'/?/sg; # quoted strings 179 180 $query =~ s/\bfalse\b|\btrue\b/?/isg; # boolean values 181 182 # MD5 checksums which are always 32 hex chars 183 if ( $self->{match_md5_checksums} ) { 184 $query =~ s/([._-])[a-f0-9]{32}/$1?/g; 185 } 186 187 # Things resembling numbers/hex. 188 if ( !$self->{match_embedded_numbers} ) { 189 # For speed, this regex is extremely broad in its definition 190 # of what looks like a number. 191 $query =~ s/[0-9+-][0-9a-f.xb+-]*/?/g; 192 } 193 else { 194 $query =~ s/\b[0-9+-][0-9a-f.xb+-]*/?/g; 195 } 196 197 # Clean up leftovers 198 if ( $self->{match_md5_checksums} ) { 199 $query =~ s/[xb+-]\?/?/g; 200 } 201 else { 202 $query =~ s/[xb.+-]\?/?/g; 203 } 204 205 $query =~ s/\A\s+//; # Chop off leading whitespace 206 chomp $query; # Kill trailing whitespace 207 $query =~ tr[ \n\t\r\f][ ]s; # Collapse whitespace 208 $query = lc $query; 209 $query =~ s/\bnull\b/?/g; # Get rid of NULLs 210 $query =~ s{ # Collapse IN and VALUES lists 211 \b(in|values?)(?:[\s,]*\([\s?,]*\))+ 212 } 213 {$1(?+)}gx; 214 $query =~ s{ # Collapse UNION 215 \b(select\s.*?)(?:(\sunion(?:\sall)?)\s\1)+ 216 } 217 {$1 /*repeat$2*/}xg; 218 $query =~ s/\blimit \?(?:, ?\?| offset \?)?/limit ?/; # LIMIT 219 # The following are disabled because of speed issues. Should we try to 220 # normalize whitespace between and around operators? My gut feeling is no. 221 # $query =~ s/ , | ,|, /,/g; # Normalize commas 222 # $query =~ s/ = | =|= /=/g; # Normalize equals 223 # $query =~ s# [,=+*/-] ?|[,=+*/-] #+#g; # Normalize operators 224 225 # Remove ASC keywords from ORDER BY clause so these queries fingerprint 226 # the same: 227 # SELECT * FROM `products` ORDER BY name ASC, shape ASC; 228 # SELECT * FROM `products` ORDER BY name, shape; 229 # ASC is default so "ORDER BY col ASC" is really the same as just 230 # "ORDER BY col". 231 # http://code.google.com/p/maatkit/issues/detail?id=1030 232 if ( $query =~ m/\bORDER BY /gi ) { # Find, anchor on ORDER BY clause 233 # Replace any occurrence of "ASC" after anchor until end of query. 234 # I have verified this with regex debug: it's a single forward pass 235 # without backtracking. Probably as fast as it gets. 236 1 while $query =~ s/\G(.+?)\s+ASC/$1/gi && pos $query; 237 } 238 239 return $query; 240} 241 242# Gets the verbs from an SQL query, such as SELECT, UPDATE, etc. 243sub distill_verbs { 244 my ( $self, $query ) = @_; 245 246 # Simple verbs that normally don't have comments, extra clauses, etc. 247 $query =~ m/\A\s*call\s+(\S+)\(/i && return "CALL $1"; 248 $query =~ m/\A\s*use\s+/ && return "USE"; 249 $query =~ m/\A\s*UNLOCK TABLES/i && return "UNLOCK"; 250 $query =~ m/\A\s*xa\s+(\S+)/i && return "XA_$1"; 251 252 if ( $query =~ m/\A\s*LOAD/i ) { 253 my ($tbl) = $query =~ m/INTO TABLE\s+(\S+)/i; 254 $tbl ||= ''; 255 $tbl =~ s/`//g; 256 return "LOAD DATA $tbl"; 257 } 258 259 if ( $query =~ m/\Aadministrator command:/ ) { 260 $query =~ s/administrator command:/ADMIN/; 261 $query = uc $query; 262 return $query; 263 } 264 265 # All other, more complex verbs. 266 $query = $self->strip_comments($query); 267 268 # SHOW statements are either 2 or 3 words: SHOW A (B), where A and B 269 # are words; B is optional. E.g. "SHOW TABLES" or "SHOW SLAVE STATUS". 270 # There's a few common keywords that may show up in place of A, so we 271 # remove them first. Then there's some keywords that signify extra clauses 272 # that may show up in place of B and since these clauses are at the 273 # end of the statement, we remove everything from the clause onward. 274 if ( $query =~ m/\A\s*SHOW\s+/i ) { 275 PTDEBUG && _d($query); 276 277 # Remove common keywords. 278 $query = uc $query; 279 $query =~ s/\s+(?:SESSION|FULL|STORAGE|ENGINE)\b/ /g; 280 # This should be in the regex above but Perl doesn't seem to match 281 # COUNT\(.+\) properly when it's grouped. 282 $query =~ s/\s+COUNT[^)]+\)//g; 283 284 # Remove clause keywords and everything after. 285 $query =~ s/\s+(?:FOR|FROM|LIKE|WHERE|LIMIT|IN)\b.+//ms; 286 287 # The query should now be like SHOW A B C ... delete everything after B, 288 # collapse whitespace, and we're done. 289 $query =~ s/\A(SHOW(?:\s+\S+){1,2}).*\Z/$1/s; 290 $query =~ s/\s+/ /g; 291 PTDEBUG && _d($query); 292 return $query; 293 } 294 295 # Data defintion statements verbs like CREATE and ALTER. 296 # The two evals are a hack to keep Perl from warning that 297 # "QueryParser::data_def_stmts" used only once: possible typo at...". 298 # Some day we'll group all our common regex together in a packet and 299 # export/import them properly. 300 eval $QueryParser::data_def_stmts; 301 eval $QueryParser::tbl_ident; 302 my ( $dds ) = $query =~ /^\s*($QueryParser::data_def_stmts)\b/i; 303 if ( $dds) { 304 # https://bugs.launchpad.net/percona-toolkit/+bug/821690 305 $query =~ s/\s+IF(?:\s+NOT)?\s+EXISTS/ /i; 306 my ( $obj ) = $query =~ m/$dds.+(DATABASE|TABLE)\b/i; 307 $obj = uc $obj if $obj; 308 PTDEBUG && _d('Data def statment:', $dds, 'obj:', $obj); 309 my ($db_or_tbl) 310 = $query =~ m/(?:TABLE|DATABASE)\s+($QueryParser::tbl_ident)(\s+.*)?/i; 311 PTDEBUG && _d('Matches db or table:', $db_or_tbl); 312 return uc($dds . ($obj ? " $obj" : '')), $db_or_tbl; 313 } 314 315 # All other verbs, like SELECT, INSERT, UPDATE, etc. First, get 316 # the query type -- just extract all the verbs and collapse them 317 # together. 318 my @verbs = $query =~ m/\b($verbs)\b/gio; 319 @verbs = do { 320 my $last = ''; 321 grep { my $pass = $_ ne $last; $last = $_; $pass } map { uc } @verbs; 322 }; 323 324 # http://code.google.com/p/maatkit/issues/detail?id=1176 325 # A SELECT query can't have any verbs other than UNION. 326 # Subqueries (SELECT SELECT) are reduced to 1 SELECT in the 327 # do loop above. And there's no valid SQL syntax like 328 # SELECT ... DELETE (there are valid multi-verb syntaxes, like 329 # INSERT ... SELECT). So if it's a SELECT with multiple verbs, 330 # we need to check it else SELECT c FROM t WHERE col='delete' 331 # will incorrectly distill as SELECT DELETE t. 332 if ( ($verbs[0] || '') eq 'SELECT' && @verbs > 1 ) { 333 PTDEBUG && _d("False-positive verbs after SELECT:", @verbs[1..$#verbs]); 334 my $union = grep { $_ eq 'UNION' } @verbs; 335 @verbs = $union ? qw(SELECT UNION) : qw(SELECT); 336 } 337 338 # This used to be "my $verbs" but older verisons of Perl complain that 339 # ""my" variable $verbs masks earlier declaration in same scope" where 340 # the earlier declaration is our $verbs. 341 # http://code.google.com/p/maatkit/issues/detail?id=957 342 my $verb_str = join(q{ }, @verbs); 343 return $verb_str; 344} 345 346sub __distill_tables { 347 my ( $self, $query, $table, %args ) = @_; 348 my $qp = $args{QueryParser} || $self->{QueryParser}; 349 die "I need a QueryParser argument" unless $qp; 350 351 # "Fingerprint" the tables. 352 my @tables = map { 353 $_ =~ s/`//g; 354 $_ =~ s/(_?)[0-9]+/$1?/g; 355 $_; 356 } grep { defined $_ } $qp->get_tables($query); 357 358 push @tables, $table if $table; 359 360 # Collapse the table list 361 @tables = do { 362 my $last = ''; 363 grep { my $pass = $_ ne $last; $last = $_; $pass } @tables; 364 }; 365 366 return @tables; 367} 368 369# This is kind of like fingerprinting, but it super-fingerprints to something 370# that shows the query type and the tables/objects it accesses. 371sub distill { 372 my ( $self, $query, %args ) = @_; 373 374 if ( $args{generic} ) { 375 # Do a generic distillation which returns the first two words 376 # of a simple "cmd arg" query, like memcached and HTTP stuff. 377 my ($cmd, $arg) = $query =~ m/^(\S+)\s+(\S+)/; 378 return '' unless $cmd; 379 $query = (uc $cmd) . ($arg ? " $arg" : ''); 380 } 381 else { 382 # distill_verbs() may return a table if it's a special statement 383 # like TRUNCATE TABLE foo. __distill_tables() handles some but not 384 # all special statements so we pass the special table from distill_verbs() 385 # to __distill_tables() in case it's a statement that the latter 386 # can't handle. If it can handle it, it will eliminate any duplicate 387 # tables. 388 my ($verbs, $table) = $self->distill_verbs($query, %args); 389 390 if ( $verbs && $verbs =~ m/^SHOW/ ) { 391 # Ignore tables for SHOW statements and normalize some 392 # aliases like SCHMEA==DATABASE, KEYS==INDEX. 393 my %alias_for = qw( 394 SCHEMA DATABASE 395 KEYS INDEX 396 INDEXES INDEX 397 ); 398 map { $verbs =~ s/$_/$alias_for{$_}/ } keys %alias_for; 399 $query = $verbs; 400 } 401 elsif ( $verbs && $verbs =~ m/^LOAD DATA/ ) { 402 return $verbs; 403 } 404 else { 405 # For everything else, distill the tables. 406 my @tables = $self->__distill_tables($query, $table, %args); 407 $query = join(q{ }, $verbs, @tables); 408 } 409 } 410 411 if ( $args{trf} ) { 412 $query = $args{trf}->($query, %args); 413 } 414 415 return $query; 416} 417 418sub convert_to_select { 419 my ( $self, $query ) = @_; 420 return unless $query; 421 422 # Trying to convert statments that have subqueries as values to column 423 # assignments doesn't work. E.g. SET col=(SELECT ...). But subqueries 424 # do work in other cases like JOIN (SELECT ...). 425 # http://code.google.com/p/maatkit/issues/detail?id=347 426 return if $query =~ m/=\s*\(\s*SELECT /i; 427 428 $query =~ s{ 429 \A.*? 430 update(?:\s+(?:low_priority|ignore))?\s+(.*?) 431 \s+set\b(.*?) 432 (?:\s*where\b(.*?))? 433 (limit\s*[0-9]+(?:\s*,\s*[0-9]+)?)? 434 \Z 435 } 436 {__update_to_select($1, $2, $3, $4)}exsi 437 # INSERT|REPLACE tbl (cols) VALUES (vals) 438 || $query =~ s{ 439 \A.*? 440 (?:insert(?:\s+ignore)?|replace)\s+ 441 .*?\binto\b(.*?)\(([^\)]+)\)\s* 442 values?\s*(\(.*?\))\s* 443 (?:\blimit\b|on\s+duplicate\s+key.*)?\s* 444 \Z 445 } 446 {__insert_to_select($1, $2, $3)}exsi 447 # INSERT|REPLACE tbl SET vals 448 || $query =~ s{ 449 \A.*? 450 (?:insert(?:\s+ignore)?|replace)\s+ 451 (?:.*?\binto)\b(.*?)\s* 452 set\s+(.*?)\s* 453 (?:\blimit\b|on\s+duplicate\s+key.*)?\s* 454 \Z 455 } 456 {__insert_to_select_with_set($1, $2)}exsi 457 || $query =~ s{ 458 \A.*? 459 delete\s+(.*?) 460 \bfrom\b(.*) 461 \Z 462 } 463 {__delete_to_select($1, $2)}exsi; 464 $query =~ s/\s*on\s+duplicate\s+key\s+update.*\Z//si; 465 $query =~ s/\A.*?(?=\bSELECT\s*\b)//ism; 466 return $query; 467} 468 469sub convert_select_list { 470 my ( $self, $query ) = @_; 471 $query =~ s{ 472 \A\s*select(.*?)\bfrom\b 473 } 474 {$1 =~ m/\*/ ? "select 1 from" : "select isnull(coalesce($1)) from"}exi; 475 return $query; 476} 477 478sub __delete_to_select { 479 my ( $delete, $join ) = @_; 480 if ( $join =~ m/\bjoin\b/ ) { 481 return "select 1 from $join"; 482 } 483 return "select * from $join"; 484} 485 486sub __insert_to_select { 487 my ( $tbl, $cols, $vals ) = @_; 488 PTDEBUG && _d('Args:', @_); 489 my @cols = split(/,/, $cols); 490 PTDEBUG && _d('Cols:', @cols); 491 $vals =~ s/^\(|\)$//g; # Strip leading/trailing parens 492 my @vals = $vals =~ m/($quote_re|[^,]*${bal}[^,]*|[^,]+)/g; 493 PTDEBUG && _d('Vals:', @vals); 494 if ( @cols == @vals ) { 495 return "select * from $tbl where " 496 . join(' and ', map { "$cols[$_]=$vals[$_]" } (0..$#cols)); 497 } 498 else { 499 return "select * from $tbl limit 1"; 500 } 501} 502 503sub __insert_to_select_with_set { 504 my ( $from, $set ) = @_; 505 $set =~ s/,/ and /g; 506 return "select * from $from where $set "; 507} 508 509sub __update_to_select { 510 my ( $from, $set, $where, $limit ) = @_; 511 return "select $set from $from " 512 . ( $where ? "where $where" : '' ) 513 . ( $limit ? " $limit " : '' ); 514} 515 516sub wrap_in_derived { 517 my ( $self, $query ) = @_; 518 return unless $query; 519 return $query =~ m/\A\s*select/i 520 ? "select 1 from ($query) as x limit 1" 521 : $query; 522} 523 524sub _d { 525 my ($package, undef, $line) = caller 0; 526 @_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; } 527 map { defined $_ ? $_ : 'undef' } 528 @_; 529 print STDERR "# $package:$line $PID ", join(' ', @_), "\n"; 530} 531 5321; 533} 534# ########################################################################### 535# End QueryRewriter package 536# ########################################################################### 537