1# This program is copyright 2007-2011 Baron Schwartz, 2011 Percona Ireland Ltd. 2# Feedback and improvements are welcome. 3# 4# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED 5# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF 6# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 7# 8# This program is free software; you can redistribute it and/or modify it under 9# the terms of the GNU General Public License as published by the Free Software 10# Foundation, version 2; OR the Perl Artistic License. On UNIX and similar 11# systems, you can issue `man perlgpl' or `man perlartistic' to read these 12# licenses. 13# 14# You should have received a copy of the GNU General Public License along with 15# this program; if not, write to the Free Software Foundation, Inc., 59 Temple 16# Place, Suite 330, Boston, MA 02111-1307 USA. 17# ########################################################################### 18# SlowLogParser package 19# ########################################################################### 20{ 21# Package: SlowLogParser 22# SlowLogParser parses MySQL slow logs. 23package SlowLogParser; 24 25use strict; 26use warnings FATAL => 'all'; 27use English qw(-no_match_vars); 28use constant PTDEBUG => $ENV{PTDEBUG} || 0; 29 30use Data::Dumper; 31$Data::Dumper::Indent = 1; 32$Data::Dumper::Sortkeys = 1; 33$Data::Dumper::Quotekeys = 0; 34 35sub new { 36 my ( $class ) = @_; 37 my $self = { 38 pending => [], 39 last_event_offset => undef, 40 }; 41 return bless $self, $class; 42} 43 44my $slow_log_ts_line = qr/^# Time: ((?:[0-9: ]{15})|(?:[-0-9: T]{19}))/; 45my $slow_log_uh_line = qr/# User\@Host: ([^\[]+|\[[^[]+\]).*?@ (\S*) \[(.*)\]\s*(?:Id:\s*(\d+))?/; 46# These can appear in the log file when it's opened -- for example, when someone 47# runs FLUSH LOGS or the server starts. 48# /usr/sbin/mysqld, Version: 5.0.67-0ubuntu6-log ((Ubuntu)). started with: 49# Tcp port: 3306 Unix socket: /var/run/mysqld/mysqld.sock 50# Time Id Command Argument 51# These lines vary depending on OS and whether it's embedded. 52my $slow_log_hd_line = qr{ 53 ^(?: 54 T[cC][pP]\s[pP]ort:\s+\d+ # case differs on windows/unix 55 | 56 [/A-Z].*mysqld,\sVersion.*(?:started\swith:|embedded\slibrary) 57 | 58 Time\s+Id\s+Command 59 ).*\n 60 }xm; 61 62# This method accepts an open slow log filehandle and callback functions. 63# It reads events from the filehandle and calls the callbacks with each event. 64# It may find more than one event per call. $misc is some placeholder for the 65# future and for compatibility with other query sources. 66# 67# Each event is a hashref of attribute => value pairs like: 68# my $event = { 69# ts => '', # Timestamp 70# id => '', # Connection ID 71# arg => '', # Argument to the command 72# other attributes... 73# }; 74# 75# Returns the number of events it finds. 76# 77# NOTE: If you change anything inside this subroutine, you need to profile 78# the result. Sometimes a line of code has been changed from an alternate 79# form for performance reasons -- sometimes as much as 20x better performance. 80sub parse_event { 81 my ( $self, %args ) = @_; 82 my @required_args = qw(next_event tell); 83 foreach my $arg ( @required_args ) { 84 die "I need a $arg argument" unless $args{$arg}; 85 } 86 my ($next_event, $tell) = @args{@required_args}; 87 88 # Read a whole stmt at a time. But, to make things even more fun, sometimes 89 # part of the log entry might continue past the separator. In these cases we 90 # peek ahead (see code below.) We do it this way because in the general 91 # case, reading line-by-line is too slow, and the special-case code is 92 # acceptable. And additionally, the line terminator doesn't work for all 93 # cases; the header lines might follow a statement, causing the paragraph 94 # slurp to grab more than one statement at a time. 95 my $pending = $self->{pending}; 96 local $INPUT_RECORD_SEPARATOR = ";\n#"; 97 my $trimlen = length($INPUT_RECORD_SEPARATOR); 98 my $pos_in_log = $tell->(); 99 my $stmt; 100 101 EVENT: 102 while ( 103 defined($stmt = shift @$pending) 104 or defined($stmt = $next_event->()) 105 ) { 106 my @properties = ('cmd', 'Query', 'pos_in_log', $pos_in_log); 107 $self->{last_event_offset} = $pos_in_log; 108 $pos_in_log = $tell->(); 109 110 # If there were such lines in the file, we may have slurped > 1 event. 111 # Delete the lines and re-split if there were deletes. This causes the 112 # pos_in_log to be inaccurate, but that's really okay. 113 if ( $stmt =~ s/$slow_log_hd_line//go ){ # Throw away header lines in log 114 my @chunks = split(/$INPUT_RECORD_SEPARATOR/o, $stmt); 115 if ( @chunks > 1 ) { 116 PTDEBUG && _d("Found multiple chunks"); 117 $stmt = shift @chunks; 118 unshift @$pending, @chunks; 119 } 120 } 121 122 # There might not be a leading '#' because $INPUT_RECORD_SEPARATOR will 123 # have gobbled that up. And the end may have all/part of the separator. 124 $stmt = '#' . $stmt unless $stmt =~ m/\A#/; 125 $stmt =~ s/;\n#?\Z//; 126 127 # The beginning of a slow-query-log event should be something like 128 # # Time: 071015 21:43:52 129 # Or, it might look like this, sometimes at the end of the Time: line: 130 # # User@Host: root[root] @ localhost [] 131 132 # The following line contains variables intended to be sure we do 133 # particular things once and only once, for those regexes that will 134 # match only one line per event, so we don't keep trying to re-match 135 # regexes. 136 my ($got_ts, $got_uh, $got_ac, $got_db, $got_set, $got_embed); 137 my $pos = 0; 138 my $len = length($stmt); 139 my $found_arg = 0; 140 LINE: 141 while ( $stmt =~ m/^(.*)$/mg ) { # /g is important, requires scalar match. 142 $pos = pos($stmt); # Be careful not to mess this up! 143 my $line = $1; # Necessary for /g and pos() to work. 144 PTDEBUG && _d($line); 145 146 # Handle meta-data lines. These are case-sensitive. If they appear in 147 # the log with a different case, they are from a user query, not from 148 # something printed out by sql/log.cc. 149 if ($line =~ m/^(?:#|use |SET (?:last_insert_id|insert_id|timestamp))/o) { 150 151 # Maybe it's the beginning of the slow query log event. XXX 152 # something to know: Perl profiling reports this line as the hot 153 # spot for any of the conditions in the whole if/elsif/elsif 154 # construct. So if this line looks "hot" then profile each 155 # condition separately. 156 if ( !$got_ts && (my ( $time ) = $line =~ m/$slow_log_ts_line/o)) { 157 PTDEBUG && _d("Got ts", $time); 158 push @properties, 'ts', $time; 159 ++$got_ts; 160 # The User@Host might be concatenated onto the end of the Time. 161 if ( !$got_uh 162 && ( my ( $user, $host, $ip, $thread_id ) = $line =~ m/$slow_log_uh_line/o ) 163 ) { 164 PTDEBUG && _d("Got user, host, ip", $user, $host, $ip); 165 $host ||= $ip; # sometimes host is missing when using skip-name-resolve (LP #issue 1262456) 166 push @properties, 'user', $user, 'host', $host, 'ip', $ip; 167 # 5.6 has the thread id on the User@Host line 168 if ( $thread_id ) { 169 push @properties, 'Thread_id', $thread_id; 170 } 171 ++$got_uh; 172 } 173 } 174 175 # Maybe it's the user/host line of a slow query log 176 # # User@Host: root[root] @ localhost [] 177 elsif ( !$got_uh 178 && ( my ( $user, $host, $ip, $thread_id ) = $line =~ m/$slow_log_uh_line/o ) 179 ) { 180 PTDEBUG && _d("Got user, host, ip", $user, $host, $ip); 181 $host ||= $ip; # sometimes host is missing when using skip-name-resolve (LP #issue 1262456) 182 push @properties, 'user', $user, 'host', $host, 'ip', $ip; 183 # 5.6 has the thread id on the User@Host line 184 if ( $thread_id ) { 185 push @properties, 'Thread_id', $thread_id; 186 } 187 ++$got_uh; 188 } 189 190 # A line that looks like meta-data but is not: 191 # # administrator command: Quit; 192 elsif (!$got_ac && $line =~ m/^# (?:administrator command:.*)$/) { 193 PTDEBUG && _d("Got admin command"); 194 $line =~ s/^#\s+//; # string leading "# ". 195 push @properties, 'cmd', 'Admin', 'arg', $line; 196 push @properties, 'bytes', length($properties[-1]); 197 ++$found_arg; 198 ++$got_ac; 199 } 200 201 # Maybe it's the timing line of a slow query log, or another line 202 # such as that... they typically look like this: 203 # # Query_time: 2 Lock_time: 0 Rows_sent: 1 Rows_examined: 0 204 elsif ( $line =~ m/^# +[A-Z][A-Za-z_]+: \S+/ ) { # Make the test cheap! 205 PTDEBUG && _d("Got some line with properties"); 206 207 # http://code.google.com/p/maatkit/issues/detail?id=1104 208 if ( $line =~ m/Schema:\s+\w+: / ) { 209 PTDEBUG && _d('Removing empty Schema attrib'); 210 $line =~ s/Schema:\s+//; 211 PTDEBUG && _d($line); 212 } 213 214 # I tried using split, but coping with the above bug makes it 215 # slower than a complex regex match. 216 my @temp = $line =~ m/(\w+):\s+(\S+|\Z)/g; 217 push @properties, @temp; 218 } 219 220 # Include the current default database given by 'use <db>;' Again 221 # as per the code in sql/log.cc this is case-sensitive. 222 elsif ( !$got_db && (my ( $db ) = $line =~ m/^use ([^;]+)/ ) ) { 223 PTDEBUG && _d("Got a default database:", $db); 224 push @properties, 'db', $db; 225 ++$got_db; 226 } 227 228 # Some things you might see in the log output, as printed by 229 # sql/log.cc (this time the SET is uppercaes, and again it is 230 # case-sensitive). 231 # SET timestamp=foo; 232 # SET timestamp=foo,insert_id=123; 233 # SET insert_id=123; 234 elsif (!$got_set && (my ($setting) = $line =~ m/^SET\s+([^;]*)/)) { 235 # Note: this assumes settings won't be complex things like 236 # SQL_MODE, which as of 5.0.51 appears to be true (see sql/log.cc, 237 # function MYSQL_LOG::write(THD, char*, uint, time_t)). 238 PTDEBUG && _d("Got some setting:", $setting); 239 push @properties, split(/,|\s*=\s*/, $setting); 240 ++$got_set; 241 } 242 243 # Handle pathological special cases. The "# administrator command" 244 # is one example: it can come AFTER lines that are not commented, 245 # so it looks like it belongs to the next event, and it won't be 246 # in $stmt. Profiling shows this is an expensive if() so we do 247 # this only if we've seen the user/host line. 248 if ( !$found_arg && $pos == $len ) { 249 PTDEBUG && _d("Did not find arg, looking for special cases"); 250 local $INPUT_RECORD_SEPARATOR = ";\n"; # get next line 251 if ( defined(my $l = $next_event->()) ) { 252 if ( $l =~ /^\s*[A-Z][a-z_]+: / ) { 253 PTDEBUG && _d("Found NULL query before", $l); 254 # https://bugs.launchpad.net/percona-toolkit/+bug/1082599 255 # This is really pathological but it happens: 256 # header_for_query_1 257 # SET timestamp=123; 258 # use db; 259 # header_for_query_2 260 # In this case, "get next line" ^ will actually fetch 261 # header_for_query_2 and the first line of any arg data, 262 # so to get the rest of the arg data, we switch back to 263 # the default input rec. sep. 264 local $INPUT_RECORD_SEPARATOR = ";\n#"; 265 my $rest_of_event = $next_event->(); 266 push @{$self->{pending}}, $l . $rest_of_event; 267 push @properties, 'cmd', 'Query', 'arg', '/* No query */'; 268 push @properties, 'bytes', 0; 269 $found_arg++; 270 } 271 else { 272 chomp $l; 273 $l =~ s/^\s+//; 274 PTDEBUG && _d("Found admin statement", $l); 275 push @properties, 'cmd', 'Admin', 'arg', $l; 276 push @properties, 'bytes', length($properties[-1]); 277 $found_arg++; 278 } 279 } 280 else { 281 # Unrecoverable -- who knows what happened. This is possible, 282 # for example, if someone does something like "head -c 10000 283 # /path/to/slow.log | mk-log-parser". Or if there was a 284 # server crash and the file has no newline. 285 PTDEBUG && _d("I can't figure out what to do with this line"); 286 next EVENT; 287 } 288 } 289 } 290 else { 291 # This isn't a meta-data line. It's the first line of the 292 # whole query. Grab from here to the end of the string and 293 # put that into the 'arg' for the event. Then we are done. 294 # Note that if this line really IS the query but we skip in 295 # the 'if' above because it looks like meta-data, later 296 # we'll remedy that. 297 PTDEBUG && _d("Got the query/arg line"); 298 my $arg = substr($stmt, $pos - length($line)); 299 push @properties, 'arg', $arg, 'bytes', length($arg); 300 # Handle embedded attributes. 301 if ( $args{misc} && $args{misc}->{embed} 302 && ( my ($e) = $arg =~ m/($args{misc}->{embed})/) 303 ) { 304 push @properties, $e =~ m/$args{misc}->{capture}/g; 305 } 306 last LINE; 307 } 308 } 309 310 # Don't dump $event; want to see full dump of all properties, and after 311 # it's been cast into a hash, duplicated keys will be gone. 312 PTDEBUG && _d('Properties of event:', Dumper(\@properties)); 313 my $event = { @properties }; 314 if ( !$event->{arg} ) { 315 PTDEBUG && _d('Partial event, no arg'); 316 } 317 else { 318 $self->{last_event_offset} = undef; 319 if ( $args{stats} ) { 320 $args{stats}->{events_read}++; 321 $args{stats}->{events_parsed}++; 322 } 323 } 324 return $event; 325 } # EVENT 326 327 @$pending = (); 328 $args{oktorun}->(0) if $args{oktorun}; 329 return; 330} 331 332sub _d { 333 my ($package, undef, $line) = caller 0; 334 @_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; } 335 map { defined $_ ? $_ : 'undef' } 336 @_; 337 print STDERR "# $package:$line $PID ", join(' ', @_), "\n"; 338} 339 3401; 341} 342# ########################################################################### 343# End SlowLogParser package 344# ########################################################################### 345