1# This program is copyright 2007-2011 Baron Schwartz, 2011 Percona Ireland Ltd.
2# Feedback and improvements are welcome.
3#
4# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
5# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
6# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
7#
8# This program is free software; you can redistribute it and/or modify it under
9# the terms of the GNU General Public License as published by the Free Software
10# Foundation, version 2; OR the Perl Artistic License.  On UNIX and similar
11# systems, you can issue `man perlgpl' or `man perlartistic' to read these
12# licenses.
13#
14# You should have received a copy of the GNU General Public License along with
15# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16# Place, Suite 330, Boston, MA  02111-1307  USA.
17# ###########################################################################
18# SlowLogParser package
19# ###########################################################################
20{
21# Package: SlowLogParser
22# SlowLogParser parses MySQL slow logs.
23package SlowLogParser;
24
25use strict;
26use warnings FATAL => 'all';
27use English qw(-no_match_vars);
28use constant PTDEBUG => $ENV{PTDEBUG} || 0;
29
30use Data::Dumper;
31$Data::Dumper::Indent    = 1;
32$Data::Dumper::Sortkeys  = 1;
33$Data::Dumper::Quotekeys = 0;
34
35sub new {
36   my ( $class ) = @_;
37   my $self = {
38      pending => [],
39      last_event_offset => undef,
40   };
41   return bless $self, $class;
42}
43
44my $slow_log_ts_line = qr/^# Time: ((?:[0-9: ]{15})|(?:[-0-9: T]{19}))/;
45my $slow_log_uh_line = qr/# User\@Host: ([^\[]+|\[[^[]+\]).*?@ (\S*) \[(.*)\]\s*(?:Id:\s*(\d+))?/;
46# These can appear in the log file when it's opened -- for example, when someone
47# runs FLUSH LOGS or the server starts.
48# /usr/sbin/mysqld, Version: 5.0.67-0ubuntu6-log ((Ubuntu)). started with:
49# Tcp port: 3306  Unix socket: /var/run/mysqld/mysqld.sock
50# Time                 Id Command    Argument
51# These lines vary depending on OS and whether it's embedded.
52my $slow_log_hd_line = qr{
53      ^(?:
54      T[cC][pP]\s[pP]ort:\s+\d+ # case differs on windows/unix
55      |
56      [/A-Z].*mysqld,\sVersion.*(?:started\swith:|embedded\slibrary)
57      |
58      Time\s+Id\s+Command
59      ).*\n
60   }xm;
61
62# This method accepts an open slow log filehandle and callback functions.
63# It reads events from the filehandle and calls the callbacks with each event.
64# It may find more than one event per call.  $misc is some placeholder for the
65# future and for compatibility with other query sources.
66#
67# Each event is a hashref of attribute => value pairs like:
68#  my $event = {
69#     ts  => '',    # Timestamp
70#     id  => '',    # Connection ID
71#     arg => '',    # Argument to the command
72#     other attributes...
73#  };
74#
75# Returns the number of events it finds.
76#
77# NOTE: If you change anything inside this subroutine, you need to profile
78# the result.  Sometimes a line of code has been changed from an alternate
79# form for performance reasons -- sometimes as much as 20x better performance.
80sub parse_event {
81   my ( $self, %args ) = @_;
82   my @required_args = qw(next_event tell);
83   foreach my $arg ( @required_args ) {
84      die "I need a $arg argument" unless $args{$arg};
85   }
86   my ($next_event, $tell) = @args{@required_args};
87
88   # Read a whole stmt at a time.  But, to make things even more fun, sometimes
89   # part of the log entry might continue past the separator.  In these cases we
90   # peek ahead (see code below.)  We do it this way because in the general
91   # case, reading line-by-line is too slow, and the special-case code is
92   # acceptable.  And additionally, the line terminator doesn't work for all
93   # cases; the header lines might follow a statement, causing the paragraph
94   # slurp to grab more than one statement at a time.
95   my $pending = $self->{pending};
96   local $INPUT_RECORD_SEPARATOR = ";\n#";
97   my $trimlen    = length($INPUT_RECORD_SEPARATOR);
98   my $pos_in_log = $tell->();
99   my $stmt;
100
101   EVENT:
102   while (
103         defined($stmt = shift @$pending)
104      or defined($stmt = $next_event->())
105   ) {
106      my @properties = ('cmd', 'Query', 'pos_in_log', $pos_in_log);
107      $self->{last_event_offset} = $pos_in_log;
108      $pos_in_log = $tell->();
109
110      # If there were such lines in the file, we may have slurped > 1 event.
111      # Delete the lines and re-split if there were deletes.  This causes the
112      # pos_in_log to be inaccurate, but that's really okay.
113      if ( $stmt =~ s/$slow_log_hd_line//go ){ # Throw away header lines in log
114         my @chunks = split(/$INPUT_RECORD_SEPARATOR/o, $stmt);
115         if ( @chunks > 1 ) {
116            PTDEBUG && _d("Found multiple chunks");
117            $stmt = shift @chunks;
118            unshift @$pending, @chunks;
119         }
120      }
121
122      # There might not be a leading '#' because $INPUT_RECORD_SEPARATOR will
123      # have gobbled that up.  And the end may have all/part of the separator.
124      $stmt = '#' . $stmt unless $stmt =~ m/\A#/;
125      $stmt =~ s/;\n#?\Z//;
126
127      # The beginning of a slow-query-log event should be something like
128      # # Time: 071015 21:43:52
129      # Or, it might look like this, sometimes at the end of the Time: line:
130      # # User@Host: root[root] @ localhost []
131
132      # The following line contains variables intended to be sure we do
133      # particular things once and only once, for those regexes that will
134      # match only one line per event, so we don't keep trying to re-match
135      # regexes.
136      my ($got_ts, $got_uh, $got_ac, $got_db, $got_set, $got_embed);
137      my $pos = 0;
138      my $len = length($stmt);
139      my $found_arg = 0;
140      LINE:
141      while ( $stmt =~ m/^(.*)$/mg ) { # /g is important, requires scalar match.
142         $pos     = pos($stmt);  # Be careful not to mess this up!
143         my $line = $1;          # Necessary for /g and pos() to work.
144         PTDEBUG && _d($line);
145
146         # Handle meta-data lines.  These are case-sensitive.  If they appear in
147         # the log with a different case, they are from a user query, not from
148         # something printed out by sql/log.cc.
149         if ($line =~ m/^(?:#|use |SET (?:last_insert_id|insert_id|timestamp))/o) {
150
151            # Maybe it's the beginning of the slow query log event.  XXX
152            # something to know: Perl profiling reports this line as the hot
153            # spot for any of the conditions in the whole if/elsif/elsif
154            # construct.  So if this line looks "hot" then profile each
155            # condition separately.
156            if ( !$got_ts && (my ( $time ) = $line =~ m/$slow_log_ts_line/o)) {
157               PTDEBUG && _d("Got ts", $time);
158               push @properties, 'ts', $time;
159               ++$got_ts;
160               # The User@Host might be concatenated onto the end of the Time.
161               if ( !$got_uh
162                  && ( my ( $user, $host, $ip, $thread_id ) = $line =~ m/$slow_log_uh_line/o )
163               ) {
164                  PTDEBUG && _d("Got user, host, ip", $user, $host, $ip);
165                  $host ||= $ip;  # sometimes host is missing when using skip-name-resolve (LP #issue 1262456)
166                  push @properties, 'user', $user, 'host', $host, 'ip', $ip;
167                  # 5.6 has the thread id on the User@Host line
168                  if ( $thread_id ) {
169                     push @properties, 'Thread_id', $thread_id;
170                 }
171                 ++$got_uh;
172               }
173            }
174
175            # Maybe it's the user/host line of a slow query log
176            # # User@Host: root[root] @ localhost []
177            elsif ( !$got_uh
178                  && ( my ( $user, $host, $ip, $thread_id ) = $line =~ m/$slow_log_uh_line/o )
179            ) {
180                  PTDEBUG && _d("Got user, host, ip", $user, $host, $ip);
181                  $host ||= $ip;  # sometimes host is missing when using skip-name-resolve (LP #issue 1262456)
182                  push @properties, 'user', $user, 'host', $host, 'ip', $ip;
183                  # 5.6 has the thread id on the User@Host line
184                  if ( $thread_id ) {
185                     push @properties, 'Thread_id', $thread_id;
186                 }
187               ++$got_uh;
188            }
189
190            # A line that looks like meta-data but is not:
191            # # administrator command: Quit;
192            elsif (!$got_ac && $line =~ m/^# (?:administrator command:.*)$/) {
193               PTDEBUG && _d("Got admin command");
194               $line =~ s/^#\s+//;  # string leading "# ".
195               push @properties, 'cmd', 'Admin', 'arg', $line;
196               push @properties, 'bytes', length($properties[-1]);
197               ++$found_arg;
198               ++$got_ac;
199            }
200
201            # Maybe it's the timing line of a slow query log, or another line
202            # such as that... they typically look like this:
203            # # Query_time: 2  Lock_time: 0  Rows_sent: 1  Rows_examined: 0
204            elsif ( $line =~ m/^# +[A-Z][A-Za-z_]+: \S+/ ) { # Make the test cheap!
205               PTDEBUG && _d("Got some line with properties");
206
207               # http://code.google.com/p/maatkit/issues/detail?id=1104
208               if ( $line =~ m/Schema:\s+\w+: / ) {
209                  PTDEBUG && _d('Removing empty Schema attrib');
210                  $line =~ s/Schema:\s+//;
211                  PTDEBUG && _d($line);
212               }
213
214               # I tried using split, but coping with the above bug makes it
215               # slower than a complex regex match.
216               my @temp = $line =~ m/(\w+):\s+(\S+|\Z)/g;
217               push @properties, @temp;
218            }
219
220            # Include the current default database given by 'use <db>;'  Again
221            # as per the code in sql/log.cc this is case-sensitive.
222            elsif ( !$got_db && (my ( $db ) = $line =~ m/^use ([^;]+)/ ) ) {
223               PTDEBUG && _d("Got a default database:", $db);
224               push @properties, 'db', $db;
225               ++$got_db;
226            }
227
228            # Some things you might see in the log output, as printed by
229            # sql/log.cc (this time the SET is uppercaes, and again it is
230            # case-sensitive).
231            # SET timestamp=foo;
232            # SET timestamp=foo,insert_id=123;
233            # SET insert_id=123;
234            elsif (!$got_set && (my ($setting) = $line =~ m/^SET\s+([^;]*)/)) {
235               # Note: this assumes settings won't be complex things like
236               # SQL_MODE, which as of 5.0.51 appears to be true (see sql/log.cc,
237               # function MYSQL_LOG::write(THD, char*, uint, time_t)).
238               PTDEBUG && _d("Got some setting:", $setting);
239               push @properties, split(/,|\s*=\s*/, $setting);
240               ++$got_set;
241            }
242
243            # Handle pathological special cases. The "# administrator command"
244            # is one example: it can come AFTER lines that are not commented,
245            # so it looks like it belongs to the next event, and it won't be
246            # in $stmt. Profiling shows this is an expensive if() so we do
247            # this only if we've seen the user/host line.
248            if ( !$found_arg && $pos == $len ) {
249               PTDEBUG && _d("Did not find arg, looking for special cases");
250               local $INPUT_RECORD_SEPARATOR = ";\n";  # get next line
251               if ( defined(my $l = $next_event->()) ) {
252                  if ( $l =~ /^\s*[A-Z][a-z_]+: / ) {
253                     PTDEBUG && _d("Found NULL query before", $l);
254                     # https://bugs.launchpad.net/percona-toolkit/+bug/1082599
255                     # This is really pathological but it happens:
256                     #   header_for_query_1
257                     #   SET timestamp=123;
258                     #   use db;
259                     #   header_for_query_2
260                     # In this case, "get next line" ^ will actually fetch
261                     # header_for_query_2 and the first line of any arg data,
262                     # so to get the rest of the arg data, we switch back to
263                     # the default input rec. sep.
264                     local $INPUT_RECORD_SEPARATOR = ";\n#";
265                     my $rest_of_event = $next_event->();
266                     push @{$self->{pending}}, $l . $rest_of_event;
267                     push @properties, 'cmd', 'Query', 'arg', '/* No query */';
268                     push @properties, 'bytes', 0;
269                     $found_arg++;
270                  }
271                  else {
272                     chomp $l;
273                     $l =~ s/^\s+//;
274                     PTDEBUG && _d("Found admin statement", $l);
275                     push @properties, 'cmd', 'Admin', 'arg', $l;
276                     push @properties, 'bytes', length($properties[-1]);
277                     $found_arg++;
278                  }
279               }
280               else {
281                  # Unrecoverable -- who knows what happened.  This is possible,
282                  # for example, if someone does something like "head -c 10000
283                  # /path/to/slow.log | mk-log-parser".  Or if there was a
284                  # server crash and the file has no newline.
285                  PTDEBUG && _d("I can't figure out what to do with this line");
286                  next EVENT;
287               }
288            }
289         }
290         else {
291            # This isn't a meta-data line.  It's the first line of the
292            # whole query. Grab from here to the end of the string and
293            # put that into the 'arg' for the event.  Then we are done.
294            # Note that if this line really IS the query but we skip in
295            # the 'if' above because it looks like meta-data, later
296            # we'll remedy that.
297            PTDEBUG && _d("Got the query/arg line");
298            my $arg = substr($stmt, $pos - length($line));
299            push @properties, 'arg', $arg, 'bytes', length($arg);
300            # Handle embedded attributes.
301            if ( $args{misc} && $args{misc}->{embed}
302               && ( my ($e) = $arg =~ m/($args{misc}->{embed})/)
303            ) {
304               push @properties, $e =~ m/$args{misc}->{capture}/g;
305            }
306            last LINE;
307         }
308      }
309
310      # Don't dump $event; want to see full dump of all properties, and after
311      # it's been cast into a hash, duplicated keys will be gone.
312      PTDEBUG && _d('Properties of event:', Dumper(\@properties));
313      my $event = { @properties };
314      if ( !$event->{arg} ) {
315         PTDEBUG && _d('Partial event, no arg');
316      }
317      else {
318         $self->{last_event_offset} = undef;
319         if ( $args{stats} ) {
320            $args{stats}->{events_read}++;
321            $args{stats}->{events_parsed}++;
322         }
323      }
324      return $event;
325   } # EVENT
326
327   @$pending = ();
328   $args{oktorun}->(0) if $args{oktorun};
329   return;
330}
331
332sub _d {
333   my ($package, undef, $line) = caller 0;
334   @_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; }
335        map { defined $_ ? $_ : 'undef' }
336        @_;
337   print STDERR "# $package:$line $PID ", join(' ', @_), "\n";
338}
339
3401;
341}
342# ###########################################################################
343# End SlowLogParser package
344# ###########################################################################
345