1 /* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
2 
3   This program is free software; you can redistribute it and/or modify
4   it under the terms of the GNU General Public License as published by
5   the Free Software Foundation; version 2 of the License.
6 
7   This program is distributed in the hope that it will be useful,
8   but WITHOUT ANY WARRANTY; without even the implied warranty of
9   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10   GNU General Public License for more details.
11 
12   You should have received a copy of the GNU General Public License
13   along with this program; if not, write to the Free Software Foundation,
14   51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
15 
16 /**
17   @file storage/perfschema/pfs.cc
18   The performance schema implementation of all instruments.
19 */
20 
21 #include "my_global.h"
22 #include "pfs.h"
23 #include "pfs_instr_class.h"
24 #include "pfs_instr.h"
25 #include "pfs_global.h"
26 #include "pfs_column_values.h"
27 #include "pfs_timer.h"
28 #include "pfs_events_waits.h"
29 
30 /* Pending WL#4895 PERFORMANCE_SCHEMA Instrumenting Table IO */
31 #undef HAVE_TABLE_WAIT
32 
33 /**
34   @page PAGE_PERFORMANCE_SCHEMA The Performance Schema main page
35   MySQL PERFORMANCE_SCHEMA implementation.
36 
37   @section INTRO Introduction
38   The PERFORMANCE_SCHEMA is a way to introspect the internal execution of
39   the server at runtime.
40   The performance schema focuses primarily on performance data,
41   as opposed to the INFORMATION_SCHEMA whose purpose is to inspect metadata.
42 
43   From a user point of view, the performance schema consists of:
44   - a dedicated database schema, named PERFORMANCE_SCHEMA,
45   - SQL tables, used to query the server internal state or change
46   configuration settings.
47 
48   From an implementation point of view, the performance schema is a dedicated
49   Storage Engine which exposes data collected by 'Instrumentation Points'
50   placed in the server code.
51 
52   @section INTERFACES Multiple interfaces
53 
54   The performance schema exposes many different interfaces,
55   for different components, and for different purposes.
56 
57   @subsection INT_INSTRUMENTING Instrumenting interface
58 
59   All the data representing the server internal state exposed
60   in the performance schema must be first collected:
61   this is the role of the instrumenting interface.
62   The instrumenting interface is a coding interface provided
63   by implementors (of the performance schema) to implementors
64   (of the server or server components).
65 
66   This interface is available to:
67   - C implementations
68   - C++ implementations
69   - the core SQL layer (/sql)
70   - the mysys library (/mysys)
71   - MySQL plugins, including storage engines,
72   - third party plugins, including third party storage engines.
73 
74   For details, see the @ref PAGE_INSTRUMENTATION_INTERFACE
75   "instrumentation interface page".
76 
77   @subsection INT_COMPILING Compiling interface
78 
79   The implementation of the performance schema can be enabled or disabled at
80   build time, when building MySQL from the source code.
81 
82   When building with the performance schema code, some compilation flags
83   are available to change the default values used in the code, if required.
84 
85   For more details, see:
86   @verbatim ./configure --help @endverbatim
87 
88   To compile with the performance schema:
89   @verbatim ./configure --with-perfschema @endverbatim
90 
91   The implementation of all the compiling options is located in
92   @verbatim ./storage/perfschema/plug.in @endverbatim
93 
94   @subsection INT_STARTUP Server startup interface
95 
96   The server startup interface consists of the "./mysqld ..."
97   command line used to start the server.
98   When the performance schema is compiled in the server binary,
99   extra command line options are available.
100 
101   These extra start options allow the DBA to:
102   - enable or disable the performance schema
103   - specify some sizing parameters.
104 
105   To see help for the performance schema startup options, see:
106   @verbatim ./sql/mysqld --verbose --help  @endverbatim
107 
108   The implementation of all the startup options is located in
109   @verbatim ./sql/mysqld.cc, my_long_options[] @endverbatim
110 
111   @subsection INT_BOOTSTRAP Server bootstrap interface
112 
113   The bootstrap interface is a private interface exposed by
114   the performance schema, and used by the SQL layer.
115   Its role is to advertise all the SQL tables natively
116   supported by the performance schema to the SQL server.
117   The code consists of creating MySQL tables for the
118   performance schema itself, and is used in './mysqld --bootstrap'
119   mode when a server is installed.
120 
121   The implementation of the database creation script is located in
122   @verbatim ./scripts/mysql_system_tables.sql @endverbatim
123 
124   @subsection INT_CONFIG Runtime configuration interface
125 
126   When the performance schema is used at runtime, various configuration
127   parameters can be used to specify what kind of data is collected,
128   what kind of aggregations are computed, what kind of timers are used,
129   what events are timed, etc.
130 
131   For all these capabilities, not a single statement or special syntax
132   was introduced in the parser.
133   Instead of new SQL statements, the interface consists of DML
134   (SELECT, INSERT, UPDATE, DELETE) against special "SETUP" tables.
135 
136   For example:
137   @verbatim mysql> update performance_schema.SETUP_INSTRUMENTS
138     set ENABLED='YES', TIMED='YES';
139   Query OK, 234 rows affected (0.00 sec)
140   Rows matched: 234  Changed: 234  Warnings: 0 @endverbatim
141 
142   @subsection INT_STATUS Internal audit interface
143 
144   The internal audit interface is provided to the DBA to inspect if the
145   performance schema code itself is functioning properly.
146   This interface is necessary because a failure caused while
147   instrumenting code in the server should not cause failures in the
148   MySQL server itself, so that the performance schema implementation
149   never raises errors during runtime execution.
150 
151   This auditing interface consists of:
152   @verbatim SHOW ENGINE PERFORMANCE_SCHEMA STATUS; @endverbatim
153   It displays data related to the memory usage of the performance schema,
154   as well as statistics about lost events, if any.
155 
156   The SHOW STATUS command is implemented in
157   @verbatim ./storage/perfschema/pfs_engine_table.cc @endverbatim
158 
159   @subsection INT_QUERY Query interface
160 
161   The query interface is used to query the internal state of a running server.
162   It is provided as SQL tables.
163 
164   For example:
165   @verbatim mysql> select * from performance_schema.EVENTS_WAITS_CURRENT;
166   @endverbatim
167 
168   @section DESIGN_PRINCIPLES Design principles
169 
170   @subsection PRINCIPLE_BEHAVIOR No behavior changes
171 
172   The primary goal of the performance schema is to measure (instrument) the
173   execution of the server. A good measure should not cause any change
174   in behavior.
175 
176   To achieve this, the overall design of the performance schema complies
177   with the following very severe design constraints:
178 
179   The parser is unchanged. There are no new keywords, no new statements.
180   This guarantees that existing applications will run the same way with or
181   without the performance schema.
182 
183   All the instrumentation points return "void", there are no error codes.
184   Even if the performance schema internally fails, execution of the server
185   code will proceed.
186 
187   None of the instrumentation points allocate memory.
188   All the memory used by the performance schema is pre-allocated at startup,
189   and is considered "static" during the server life time.
190 
191   None of the instrumentation points use any pthread_mutex, pthread_rwlock,
192   or pthread_cond (or platform equivalents).
193   Executing the instrumentation point should not cause thread scheduling to
194   change in the server.
195 
196   In other words, the implementation of the instrumentation points,
197   including all the code called by the instrumentation points, is:
198   - malloc free
199   - mutex free
200   - rwlock free
201 
202   TODO: All the code located in storage/perfschema is malloc free,
203   but unfortunately the usage of LF_HASH introduces some memory allocation.
204   This should be revised if possible, to use a lock-free,
205   malloc-free hash code table.
206 
207   @subsection PRINCIPLE_PERFORMANCE No performance hit
208 
209   The instrumentation of the server should be as fast as possible.
210   In cases when there are choices between:
211   - doing some processing when recording the performance data
212   in the instrumentation,
213   - doing some processing when retrieving the performance data,
214 
215   priority is given in the design to make the instrumentation faster,
216   pushing some complexity to data retrieval.
217 
218   As a result, some parts of the design, related to:
219   - the setup code path,
220   - the query code path,
221 
222   might appear to be sub-optimal.
223 
224   The criterion used here is to optimize primarily the critical path (data
225   collection), possibly at the expense of non-critical code paths.
226 
227   @subsection PRINCIPLE_NOT_INTRUSIVE Unintrusive instrumentation
228 
229   For the performance schema in general to be successful, the barrier
230   of entry for a developer should be low, so it's easy to instrument code.
231 
232   In particular, the instrumentation interface:
233   - is available for C and C++ code (so it's a C interface),
234   - does not require parameters that the calling code can't easily provide,
235   - supports partial instrumentation (for example, instrumenting mutexes does
236   not require that every mutex is instrumented)
237 
238   @subsection PRINCIPLE_EXTENDABLE Extendable instrumentation
239 
240   As the content of the performance schema improves,
241   with more tables exposed and more data collected,
242   the instrumentation interface will also be augmented
243   to support instrumenting new concepts.
244   Existing instrumentations should not be affected when additional
245   instrumentation is made available, and making a new instrumentation
246   available should not require existing instrumented code to support it.
247 
248   @subsection PRINCIPLE_VERSIONED Versioned instrumentation
249 
250   Given that the instrumentation offered by the performance schema will
251   be augmented with time, when more features are implemented,
252   the interface itself should be versioned, to keep compatibility
253   with previous instrumented code.
254 
255   For example, after both plugin-A and plugin-B have been instrumented for
256   mutexes, read write locks and conditions, using the instrumentation
257   interface, we can anticipate that the instrumentation interface
258   is expanded to support file based operations.
259 
260   Plugin-A, a file based storage engine, will most likely use the expanded
261   interface and instrument its file usage, using the version 2
262   interface, while Plugin-B, a network based storage engine, will not change
263   its code and not release a new binary.
264 
265   When later the instrumentation interface is expanded to support network
266   based operations (which will define interface version 3), the Plugin-B code
267   can then be changed to make use of it.
268 
269   Note, this is just an example to illustrate the design concept here.
270   Both mutexes and file instrumentation are already available
271   since version 1 of the instrumentation interface.
272 
273   @subsection PRINCIPLE_DEPLOYMENT Easy deployment
274 
275   Internally, we might want every plugin implementation to upgrade the
276   instrumented code to the latest available, but this will cause additional
277   work and this is not practical if the code change is monolithic.
278 
279   Externally, for third party plugin implementors, asking implementors to
280   always stay aligned to the latest instrumentation and make new releases,
281   even when the change does not provide new functionality for them,
282   is a bad idea.
283 
284   For example, requiring a network based engine to re-release because the
285   instrumentation interface changed for file based operations, will create
286   too many deployment issues.
287 
288   So, the performance schema implementation must support concurrently,
289   in the same deployment, multiple versions of the instrumentation
290   interface, and ensure binary compatibility with each version.
291 
292   In addition to this, the performance schema can be included or excluded
293   from the server binary, using build time configuration options.
294 
295   Regardless, the following types of deployment are valid:
296   - a server supporting the performance schema + a storage engine
297   that is not instrumented
298   - a server not supporting the performance schema + a storage engine
299   that is instrumented
300 */
301 
302 /**
303   @page PAGE_INSTRUMENTATION_INTERFACE
304   Performance schema: instrumentation interface page.
305   MySQL performance schema instrumentation interface.
306 
307   @section INTRO Introduction
308 
309   The instrumentation interface consists of two layers:
310   - a raw ABI (Application Binary Interface) layer, that exposes the primitive
311   instrumentation functions exported by the performance schema instrumentation
312   - an API (Application Programming Interface) layer,
313   that provides many helpers for a developer instrumenting some code,
314   to make the instrumentation as easy as possible.
315 
316   The ABI layer consists of:
317 @code
318 #include "mysql/psi/psi.h"
319 @endcode
320 
321   The API layer consists of:
322 @code
323 #include "mysql/psi/mutex_mutex.h"
324 #include "mysql/psi/mutex_file.h"
325 @endcode
326 
327   The first helper is for mutexes, rwlocks and conditions,
328   the second for file io.
329 
330   The API layer exposes C macros and typedefs which will expand:
331   - either to non-instrumented code, when compiled without the performance
332   schema instrumentation
333   - or to instrumented code, that will issue the raw calls to the ABI layer
334   so that the implementation can collect data.
335 
336   Note that all the names introduced (for example, @c mysql_mutex_lock) do not
337   collide with any other namespace.
338   In particular, the macro @c mysql_mutex_lock is on purpose not named
339   @c pthread_mutex_lock.
340   This is to:
341   - avoid overloading @c pthread_mutex_lock with yet another macro,
342   which is dangerous as it can affect user code and pollute
343   the end-user namespace.
344   - allow the developer instrumenting code to selectively instrument
345   some code but not all.
346 
347   @section PRINCIPLES Design principles
348 
349   The ABI part is designed as a facade, that exposes basic primitives.
350   The expectation is that each primitive will be very stable over time,
351   but the list will constantly grow when more instruments are supported.
352   To support binary compatibility with plugins compiled with a different
353   version of the instrumentation, the ABI itself is versioned
354   (see @c PSI_v1, @c PSI_v2).
355 
356   For a given instrumentation point in the API, the basic coding pattern
357   used is:
358   - (a) If the performance schema is not initialized, do nothing
359   - (b) If the object acted upon is not instrumented, do nothing
360   - (c) otherwise, notify the performance schema of the operation
361   about to be performed.
362 
363   The implementation of the instrumentation interface can:
364   - decide that it is not interested by the event, and return NULL.
365   In this context, 'interested' means whether the instrumentation for
366   this object + event is turned on in the performance schema configuration
367   (the SETUP_ tables).
368   - decide that this event is to be instrumented.
369   In this case, the instrumentation returns an opaque pointer,
370   that acts as a listener.
371 
372   If a listener is returned, the instrumentation point then:
373   - (d) invokes the "start" event method
374   - (e) executes the instrumented code.
375   - (f) invokes the "end" event method.
376 
377   If no listener is returned, only the instrumented code (e) is invoked.
378 
379   The following code fragment is annotated to show how in detail this pattern
380   is implemented, when the instrumentation is compiled in:
381 
382 @verbatim
383 static inline int mysql_mutex_lock(
384   mysql_mutex_t *that, myf flags, const char *src_file, uint src_line)
385 {
386   int result;
387   struct PSI_mutex_locker *locker= NULL;
388 
389   ...... (a) .......... (b)
390   if (PSI_server && that->m_psi)
391 
392   .......................... (c)
393     if ((locker= PSI_server->get_thread_mutex_locker(that->m_psi,
394                                                      PSI_MUTEX_LOCK)))
395 
396   ............... (d)
397       PSI_server->start_mutex_wait(locker, src_file, src_line);
398 
399   ........ (e)
400   result= pthread_mutex_lock(&that->m_mutex);
401 
402   if (locker)
403 
404   ............. (f)
405     PSI_server->end_mutex_wait(locker, result);
406 
407   return result;
408 }
409 @endverbatim
410 
411   When the performance schema instrumentation is not compiled in,
412   the code becomes simply a wrapper, expanded in line by the compiler:
413 
414 @verbatim
415 static inline int mysql_mutex_lock(...)
416 {
417   int result;
418 
419   ........ (e)
420   result= pthread_mutex_lock(&that->m_mutex);
421 
422   return result;
423 }
424 @endverbatim
425 */
426 
427 /**
428   @page PAGE_AGGREGATES Performance schema: the aggregates page.
429   Performance schema aggregates.
430 
431   @section INTRO Introduction
432 
433   Aggregates tables are tables that can be formally defined as
434   SELECT ... from EVENTS_WAITS_HISTORY_INFINITE ... group by 'group clause'.
435 
436   Each group clause defines a different kind of aggregate, and corresponds to
437   a different table exposed by the performance schema.
438 
439   Aggregates can be either:
440   - computed on the fly,
441   - computed on demand, based on other available data.
442 
443   'EVENTS_WAITS_HISTORY_INFINITE' is a table that does not exist,
444   the best approximation is EVENTS_WAITS_HISTORY_LONG.
445   Aggregates computed on the fly in fact are based on EVENTS_WAITS_CURRENT,
446   while aggregates computed on demand are based on other
447   EVENTS_WAITS_SUMMARY_BY_xxx tables.
448 
449   To better understand the implementation itself, a bit of math is
450   required first, to understand the model behind the code:
451   the code is deceptively simple, the real complexity resides
452   in the flyweight of pointers between various performance schema buffers.
453 
454   @section DIMENSION Concept of dimension
455 
456   An event measured by the instrumentation has many attributes.
457   An event is represented as a data point P(x1, x2, ..., xN),
458   where each x_i coordinate represents a given attribute value.
459 
460   Examples of attributes are:
461   - the time waited
462   - the object waited on
463   - the instrument waited on
464   - the thread that waited
465   - the operation performed
466   - per object or per operation additional attributes, such as spins,
467   number of bytes, etc.
468 
469   Computing an aggregate per thread is fundamentally different from
470   computing an aggregate by instrument, so the "_BY_THREAD" and
471   "_BY_EVENT_NAME" aggregates are different dimensions,
472   operating on different x_i and x_j coordinates.
473   These aggregates are "orthogonal".
474 
475   @section PROJECTION Concept of projection
476 
477   A given x_i attribute value can convey either just one basic information,
478   such as a number of bytes, or can convey implied information,
479   such as an object fully qualified name.
480 
481   For example, from the value "test.t1", the name of the object schema
482   "test" can be separated from the object name "t1", so that now aggregates
483   by object schema can be implemented.
484 
485   In math terms, that corresponds to defining a function:
486   F_i (x): x --> y
487   Applying this function to our point P gives another point P':
488 
489   F_i (P):
490   P(x1, x2, ..., x{i-1}, x_i, x{i+1}, ..., x_N)
491   --> P' (x1, x2, ..., x{i-1}, f_i(x_i), x{i+1}, ..., x_N)
492 
493   That function in fact defines an aggregate!
494   In SQL terms, this aggregate would look like the following table:
495 
496 @verbatim
497   CREATE VIEW EVENTS_WAITS_SUMMARY_BY_Func_i AS
498   SELECT col_1, col_2, ..., col_{i-1},
499          Func_i(col_i),
500          COUNT(col_i),
501          MIN(col_i), AVG(col_i), MAX(col_i), -- if col_i is a numeric value
502          col_{i+1}, ..., col_N
503          FROM EVENTS_WAITS_HISTORY_INFINITE
504          group by col_1, col_2, ..., col_{i-1}, col{i+1}, ..., col_N.
505 @endverbatim
506 
507   Note that not all columns have to be included,
508   in particular some columns that are dependent on the x_i column should
509   be removed, so that in practice, MySQL's aggregation method tends to
510   remove many attributes at each aggregation steps.
511 
512   For example, when aggregating wait events by object instances,
513   - the wait_time and number_of_bytes can be summed,
514   and sum(wait_time) now becomes an object instance attribute.
515   - the source, timer_start, timer_end columns are not in the
516   _BY_INSTANCE table, because these attributes are only
517   meaningful for a wait.
518 
519   @section COMPOSITION Concept of composition
520 
521   Now, the "test.t1" --> "test" example was purely theory,
522   just to explain the concept, and does not lead very far.
523   Let's look at a more interesting example of data that can be derived
524   from the row event.
525 
526   An event creates a transient object, PFS_wait_locker, per operation.
527   This object's life cycle is extremely short: it's created just
528   before the start_wait() instrumentation call, and is destroyed in
529   the end_wait() call.
530 
531   The wait locker itself contains a pointer to the object instance
532   waited on.
533   That allows to implement a wait_locker --> object instance projection,
534   with m_target.
535   The object instance life cycle depends on _init and _destroy calls
536   from the code, such as mysql_mutex_init()
537   and mysql_mutex_destroy() for a mutex.
538 
539   The object instance waited on contains a pointer to the object class,
540   which is represented by the instrument name.
541   That allows to implement an object instance --> object class projection.
542   The object class life cycle is permanent, as instruments are loaded in
543   the server and never removed.
544 
545   The object class is named in such a way
546   (for example, "wait/sync/mutex/sql/LOCK_open",
547   "wait/io/file/maria/data_file) that the component ("sql", "maria")
548   that it belongs to can be inferred.
549   That allows to implement an object class --> server component projection.
550 
551   Back to math again, we have, for example for mutexes:
552 
553   F1 (l) : PFS_wait_locker l --> PFS_mutex m = l->m_target.m_mutex
554 
555   F1_to_2 (m) : PFS_mutex m --> PFS_mutex_class i = m->m_class
556 
557   F2_to_3 (i) : PFS_mutex_class i --> const char *component =
558                                         substring(i->m_name, ...)
559 
560   Per components aggregates are not implemented, this is just an illustration.
561 
562   F1 alone defines this aggregate:
563 
564   EVENTS_WAITS_HISTORY_INFINITE --> EVENTS_WAITS_SUMMARY_BY_INSTANCE
565   (or MUTEX_INSTANCE)
566 
567   F1_to_2 alone could define this aggregate:
568 
569   EVENTS_WAITS_SUMMARY_BY_INSTANCE --> EVENTS_WAITS_SUMMARY_BY_EVENT_NAME
570 
571   Alternatively, using function composition, with
572   F2 = F1_to_2 o F1, F2 defines:
573 
574   EVENTS_WAITS_HISTORY_INFINITE --> EVENTS_WAITS_SUMMARY_BY_EVENT_NAME
575 
576   Likewise, F_2_to_3 defines:
577 
578   EVENTS_WAITS_SUMMARY_BY_EVENT_NAME --> EVENTS_WAITS_SUMMARY_BY_COMPONENT
579 
580   and F3 = F_2_to_3 o F_1_to_2 o F1 defines:
581 
582   EVENTS_WAITS_HISTORY_INFINITE --> EVENTS_WAITS_SUMMARY_BY_COMPONENT
583 
584   What does all this have to do with the code?
585 
586   Function composition such as F_2_to_3 o F_1_to_2 o F1 is implemented
587   as PFS_single_stat_chain, where each link in the chain represents
588   an individual F_{i}_to_{i+1} aggregation step.
589 
590   A single call to aggregate_single_stat_chain() updates all the tables
591   described in the statistics chain.
592 
593   @section STAT_CHAIN Statistics chains
594 
595   Statistics chains are only used for on the fly aggregates,
596   and are therefore all based initially on the '_CURRENT' base table that
597   contains the data recorded.
598   The following table aggregates are implemented with a statistics chain:
599 
600   EVENTS_WAITS_CURRENT --> EVENTS_WAITS_SUMMARY_BY_INSTANCE
601   --> EVENTS_WAITS_SUMMARY_BY_EVENT_NAME
602 
603   This relationship is between classes.
604 
605   In terms of object instances, or records, this chain is implemented
606   as a flyweight.
607 
608   For example, assuming the following scenario:
609   - A mutex class "M" is instrumented, the instrument name
610   is "wait/sync/mutex/sql/M"
611   - This mutex instrument has been instantiated twice,
612   mutex instances are noted M-1 and M-2
613   - Threads T-A and T-B are locking mutex instance M-1
614   - Threads T-C and T-D are locking mutex instance M-2
615 
616   The performance schema will record the following data:
617   - EVENTS_WAITS_CURRENT has 4 rows, one for each mutex locker
618   - EVENTS_WAITS_SUMMARY_BY_INSTANCE shows 2 rows, for M-1 and M-2
619   - EVENTS_WAITS_SUMMARY_BY_EVENT_NAME shows 1 row, for M
620 
621   The graph of structures will look like:
622 
623 @verbatim
624   PFS_wait_locker (T-A, M-1) ----------
625                                       |
626                                       v
627                                  PFS_mutex (M-1)
628                                  - m_wait_stat    ------------
629                                       ^                      |
630                                       |                      |
631   PFS_wait_locker (T-B, M-1) ----------                      |
632                                                              v
633                                                         PFS_mutex_class (M)
634                                                         - m_wait_stat
635   PFS_wait_locker (T-C, M-2) ----------                      ^
636                                       |                      |
637                                       v                      |
638                                  PFS_mutex (M-2)             |
639                                  - m_wait_stat    ------------
640                                       ^
641                                       |
642   PFS_wait_locker (T-D, M-2) ----------
643 
644             ||                        ||                     ||
645             ||                        ||                     ||
646             vv                        vv                     vv
647 
648   EVENTS_WAITS_CURRENT ..._SUMMARY_BY_INSTANCE ..._SUMMARY_BY_EVENT_NAME
649 @endverbatim
650 
651   @section ON_THE_FLY On the fly aggregates
652 
653   'On the fly' aggregates are computed during the code execution.
654   This is necessary because the data the aggregate is based on is volatile,
655   and can not be kept indefinitely.
656 
657   @section HIGHER_LEVEL Higher level aggregates
658 
659   Note: no higher level aggregate is implemented yet,
660   this section is a place holder.
661 */
662 
663 /**
664   @defgroup Performance_schema Performance Schema
665   The performance schema component.
666   For details, see the
667   @ref PAGE_PERFORMANCE_SCHEMA "performance schema main page".
668 
669   @defgroup Performance_schema_implementation Performance Schema Implementation
670   @ingroup Performance_schema
671 
672   @defgroup Performance_schema_tables Performance Schema Tables
673   @ingroup Performance_schema_implementation
674 */
675 
676 pthread_key(PFS_thread*, THR_PFS);
677 bool THR_PFS_initialized= false;
678 
679 static enum_operation_type mutex_operation_map[]=
680 {
681   OPERATION_TYPE_LOCK,
682   OPERATION_TYPE_TRYLOCK
683 };
684 
685 static enum_operation_type rwlock_operation_map[]=
686 {
687   OPERATION_TYPE_READLOCK,
688   OPERATION_TYPE_WRITELOCK,
689   OPERATION_TYPE_TRYREADLOCK,
690   OPERATION_TYPE_TRYWRITELOCK
691 };
692 
693 static enum_operation_type cond_operation_map[]=
694 {
695   OPERATION_TYPE_WAIT,
696   OPERATION_TYPE_TIMEDWAIT
697 };
698 
699 /**
700   Conversion map from PSI_file_operation to enum_operation_type.
701   Indexed by enum PSI_file_operation.
702 */
703 static enum_operation_type file_operation_map[]=
704 {
705   OPERATION_TYPE_FILECREATE,
706   OPERATION_TYPE_FILECREATETMP,
707   OPERATION_TYPE_FILEOPEN,
708   OPERATION_TYPE_FILESTREAMOPEN,
709   OPERATION_TYPE_FILECLOSE,
710   OPERATION_TYPE_FILESTREAMCLOSE,
711   OPERATION_TYPE_FILEREAD,
712   OPERATION_TYPE_FILEWRITE,
713   OPERATION_TYPE_FILESEEK,
714   OPERATION_TYPE_FILETELL,
715   OPERATION_TYPE_FILEFLUSH,
716   OPERATION_TYPE_FILESTAT,
717   OPERATION_TYPE_FILEFSTAT,
718   OPERATION_TYPE_FILECHSIZE,
719   OPERATION_TYPE_FILEDELETE,
720   OPERATION_TYPE_FILERENAME,
721   OPERATION_TYPE_FILESYNC
722 };
723 
724 /**
725   Build the prefix name of a class of instruments in a category.
726   For example, this function builds the string 'wait/sync/mutex/sql/' from
727   a prefix 'wait/sync/mutex' and a category 'sql'.
728   This prefix is used later to build each instrument name, such as
729   'wait/sync/mutex/sql/LOCK_open'.
730   @param prefix               Prefix for this class of instruments
731   @param category             Category name
732   @param [out] output         Buffer of length PFS_MAX_INFO_NAME_LENGTH.
733   @param [out] output_length  Length of the resulting output string.
734   @return 0 for success, non zero for errors
735 */
build_prefix(const LEX_STRING * prefix,const char * category,char * output,int * output_length)736 static int build_prefix(const LEX_STRING *prefix, const char *category,
737                         char *output, int *output_length)
738 {
739   int len= strlen(category);
740   char *out_ptr= output;
741   int prefix_length= prefix->length;
742 
743   if (unlikely((prefix_length + len + 1) >=
744                PFS_MAX_FULL_PREFIX_NAME_LENGTH))
745   {
746     pfs_print_error("build_prefix: prefix+category is too long <%s> <%s>\n",
747                     prefix->str, category);
748     return 1;
749   }
750 
751   if (unlikely(strchr(category, '/') != NULL))
752   {
753     pfs_print_error("build_prefix: invalid category <%s>\n",
754                     category);
755     return 1;
756   }
757 
758   /* output = prefix + category + '/' */
759   memcpy(out_ptr, prefix->str, prefix_length);
760   out_ptr+= prefix_length;
761   memcpy(out_ptr, category, len);
762   out_ptr+= len;
763   *out_ptr= '/';
764   out_ptr++;
765   *output_length= out_ptr - output;
766 
767   return 0;
768 }
769 
/**
  Common implementation body for all register_<instrument>_v1() functions.
  For each element of the @c info array:
  - builds the full instrument name as PREFIX + category + '/' + m_name,
  - registers the name with REGISTER_FUNC,
  - stores the assigned instrument key in *(info->m_key),
    or 0 when the prefix/category is invalid or the full name is too long.
  Expects @c category, @c info and @c count in the enclosing scope.
*/
#define REGISTER_BODY_V1(KEY_T, PREFIX, REGISTER_FUNC)                \
  KEY_T key;                                                          \
  char formatted_name[PFS_MAX_INFO_NAME_LENGTH];                      \
  int prefix_length;                                                  \
  int len;                                                            \
  int full_length;                                                    \
                                                                      \
  DBUG_ASSERT(category != NULL);                                      \
  DBUG_ASSERT(info != NULL);                                          \
  if (unlikely(build_prefix(&PREFIX, category,                        \
                   formatted_name, &prefix_length)))                  \
  {                                                                   \
    for (; count>0; count--, info++)                                  \
      *(info->m_key)= 0;                                              \
    return ;                                                          \
  }                                                                   \
                                                                      \
  for (; count>0; count--, info++)                                    \
  {                                                                   \
    DBUG_ASSERT(info->m_key != NULL);                                 \
    DBUG_ASSERT(info->m_name != NULL);                                \
    len= strlen(info->m_name);                                        \
    full_length= prefix_length + len;                                 \
    if (likely(full_length <= PFS_MAX_INFO_NAME_LENGTH))              \
    {                                                                 \
      memcpy(formatted_name + prefix_length, info->m_name, len);      \
      key= REGISTER_FUNC(formatted_name, full_length, info->m_flags); \
    }                                                                 \
    else                                                              \
    {                                                                 \
      pfs_print_error("REGISTER_BODY_V1: name too long <%s> <%s>\n",  \
                      category, info->m_name);                        \
      key= 0;                                                         \
    }                                                                 \
                                                                      \
    *(info->m_key)= key;                                              \
  }                                                                   \
  return;
808 
809 /* Use C linkage for the interface functions. */
810 
811 C_MODE_START
812 
/**
  Implementation of the mutex instrumentation interface.
  Registers @c count mutex instruments under @c category.
  @sa PSI_v1::register_mutex.
*/
static void register_mutex_v1(const char *category,
                              PSI_mutex_info_v1 *info,
                              int count)
{
  REGISTER_BODY_V1(PSI_mutex_key,
                   mutex_instrument_prefix,
                   register_mutex_class)
}
821 
/**
  Implementation of the rwlock instrumentation interface.
  Registers @c count rwlock instruments under @c category.
  @sa PSI_v1::register_rwlock.
*/
static void register_rwlock_v1(const char *category,
                               PSI_rwlock_info_v1 *info,
                               int count)
{
  REGISTER_BODY_V1(PSI_rwlock_key,
                   rwlock_instrument_prefix,
                   register_rwlock_class)
}
830 
/**
  Implementation of the condition instrumentation interface.
  Registers @c count condition instruments under @c category.
  @sa PSI_v1::register_cond.
*/
static void register_cond_v1(const char *category,
                             PSI_cond_info_v1 *info,
                             int count)
{
  REGISTER_BODY_V1(PSI_cond_key,
                   cond_instrument_prefix,
                   register_cond_class)
}
839 
/**
  Implementation of the thread instrumentation interface.
  Registers @c count thread instruments under @c category.
  @sa PSI_v1::register_thread.
*/
static void register_thread_v1(const char *category,
                               PSI_thread_info_v1 *info,
                               int count)
{
  REGISTER_BODY_V1(PSI_thread_key,
                   thread_instrument_prefix,
                   register_thread_class)
}
848 
/**
  Implementation of the file instrumentation interface.
  Registers @c count file instruments under @c category.
  @sa PSI_v1::register_file.
*/
static void register_file_v1(const char *category,
                             PSI_file_info_v1 *info,
                             int count)
{
  REGISTER_BODY_V1(PSI_file_key,
                   file_instrument_prefix,
                   register_file_class)
}
857 
/**
  Common implementation body for all init_<instrument>_v1() functions.
  Returns NULL when the calling thread is not instrumented or disabled,
  or when the instrument class for KEY is unknown or disabled; otherwise
  creates the instrumented object of type PFS_##T and returns it as the
  corresponding PSI type.
*/
#define INIT_BODY_V1(T, KEY, ID)                                            \
  PFS_##T##_class *klass;                                                   \
  PFS_##T *pfs;                                                             \
  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS); \
  if (unlikely(pfs_thread == NULL))                                         \
    return NULL;                                                            \
  if (! pfs_thread->m_enabled)                                              \
    return NULL;                                                            \
  klass= find_##T##_class(KEY);                                             \
  if (unlikely(klass == NULL))                                              \
    return NULL;                                                            \
  if (! klass->m_enabled)                                                   \
    return NULL;                                                            \
  pfs= create_##T(klass, ID);                                               \
  return reinterpret_cast<PSI_##T *> (pfs)
873 
/**
  Implementation of the mutex instrumentation interface.
  Creates the mutex instrumentation for @c identity, or NULL
  when the thread or the instrument class is disabled.
  @sa PSI_v1::init_mutex.
*/
static PSI_mutex*
init_mutex_v1(PSI_mutex_key key, const void *identity)
{
  INIT_BODY_V1(mutex, key, identity);
}
879 
destroy_mutex_v1(PSI_mutex * mutex)880 static void destroy_mutex_v1(PSI_mutex* mutex)
881 {
882   PFS_mutex *pfs= reinterpret_cast<PFS_mutex*> (mutex);
883   destroy_mutex(pfs);
884 }
885 
/**
  Implementation of the rwlock instrumentation interface.
  Creates the rwlock instrumentation for @c identity, or NULL
  when the thread or the instrument class is disabled.
  @sa PSI_v1::init_rwlock.
*/
static PSI_rwlock*
init_rwlock_v1(PSI_rwlock_key key, const void *identity)
{
  INIT_BODY_V1(rwlock, key, identity);
}
891 
destroy_rwlock_v1(PSI_rwlock * rwlock)892 static void destroy_rwlock_v1(PSI_rwlock* rwlock)
893 {
894   PFS_rwlock *pfs= reinterpret_cast<PFS_rwlock*> (rwlock);
895   destroy_rwlock(pfs);
896 }
897 
/**
  Implementation of the condition instrumentation interface.
  Creates the condition instrumentation for @c identity, or NULL
  when the thread or the instrument class is disabled.
  @sa PSI_v1::init_cond.
*/
static PSI_cond*
init_cond_v1(PSI_cond_key key, const void *identity)
{
  INIT_BODY_V1(cond, key, identity);
}
903 
destroy_cond_v1(PSI_cond * cond)904 static void destroy_cond_v1(PSI_cond* cond)
905 {
906   PFS_cond *pfs= reinterpret_cast<PFS_cond*> (cond);
907   destroy_cond(pfs);
908 }
909 
/**
  Implementation of the table instrumentation interface.
  @sa PSI_v1::get_table_share.
  Note: HAVE_TABLE_WAIT is explicitly #undef-ed at the top of this file
  (pending WL#4895), so this currently always returns NULL.
*/
static PSI_table_share*
get_table_share_v1(const char *schema_name, int schema_name_length,
                   const char *table_name, int table_name_length,
                   const void *identity)
{
#ifdef HAVE_TABLE_WAIT
  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
  if (unlikely(pfs_thread == NULL))
    return NULL;
  PFS_table_share* share;
  share= find_or_create_table_share(pfs_thread,
                                    schema_name, schema_name_length,
                                    table_name, table_name_length);
  return reinterpret_cast<PSI_table_share*> (share);
#else
  return NULL;
#endif
}
928 
/**
  Implementation of the table instrumentation interface.
  Intentionally a no-op for now.
  @sa PSI_v1::release_table_share.
*/
static void release_table_share_v1(PSI_table_share* share)
{
  /*
    To be implemented by WL#4895 PERFORMANCE_SCHEMA Instrumenting Table IO.
  */
}
935 
936 static PSI_table*
open_table_v1(PSI_table_share * share,const void * identity)937 open_table_v1(PSI_table_share *share, const void *identity)
938 {
939   PFS_table_share *pfs_table_share=
940     reinterpret_cast<PFS_table_share*> (share);
941   PFS_table *pfs_table;
942   DBUG_ASSERT(pfs_table_share);
943   pfs_table= create_table(pfs_table_share, identity);
944   return reinterpret_cast<PSI_table *> (pfs_table);
945 }
946 
close_table_v1(PSI_table * table)947 static void close_table_v1(PSI_table *table)
948 {
949   PFS_table *pfs= reinterpret_cast<PFS_table*> (table);
950   DBUG_ASSERT(pfs);
951   destroy_table(pfs);
952 }
953 
create_file_v1(PSI_file_key key,const char * name,File file)954 static void create_file_v1(PSI_file_key key, const char *name, File file)
955 {
956   int index= (int) file;
957   if (unlikely(index < 0))
958     return;
959   PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
960   if (unlikely(pfs_thread == NULL))
961     return;
962   if (! pfs_thread->m_enabled)
963     return;
964   PFS_file_class *klass= find_file_class(key);
965   if (unlikely(klass == NULL))
966     return;
967   if (! klass->m_enabled)
968     return;
969   if (likely(index < file_handle_max))
970   {
971     uint len= strlen(name);
972     PFS_file *pfs= find_or_create_file(pfs_thread, klass, name, len);
973     file_handle_array[index]= pfs;
974   }
975   else
976     file_handle_lost++;
977 }
978 
/**
  Argument block passed from spawn_thread_v1() to pfs_spawn_thread().
  Heap allocated by the spawning thread, freed by the spawned thread.
*/
struct PFS_spawn_thread_arg
{
  PFS_thread *m_parent_thread;            /* instrumentation of the spawner */
  PSI_thread_key m_child_key;             /* instrument class of the child */
  const void *m_child_identity;           /* identity for the child thread */
  void *(*m_user_start_routine)(void*);   /* real thread entry point */
  void *m_user_arg;                       /* argument for the entry point */
};
987 
pfs_spawn_thread(void * arg)988 void* pfs_spawn_thread(void *arg)
989 {
990   PFS_spawn_thread_arg *typed_arg= (PFS_spawn_thread_arg*) arg;
991   void *user_arg;
992   void *(*user_start_routine)(void*);
993 
994   PFS_thread *pfs;
995 
996   /* First, attach instrumentation to this newly created pthread. */
997   PFS_thread_class *klass= find_thread_class(typed_arg->m_child_key);
998   if (likely(klass != NULL))
999     pfs= create_thread(klass, typed_arg->m_child_identity, 0);
1000   else
1001     pfs= NULL;
1002   my_pthread_setspecific_ptr(THR_PFS, pfs);
1003 
1004   /*
1005     Secondly, free the memory allocated in spawn_thread_v1().
1006     It is preferable to do this before invoking the user
1007     routine, to avoid memory leaks at shutdown, in case
1008     the server exits without waiting for this thread.
1009   */
1010   user_start_routine= typed_arg->m_user_start_routine;
1011   user_arg= typed_arg->m_user_arg;
1012   my_free(typed_arg);
1013 
1014   /* Then, execute the user code for this thread. */
1015   (*user_start_routine)(user_arg);
1016 
1017   return NULL;
1018 }
1019 
spawn_thread_v1(PSI_thread_key key,pthread_t * thread,const pthread_attr_t * attr,void * (* start_routine)(void *),void * arg)1020 static int spawn_thread_v1(PSI_thread_key key,
1021                            pthread_t *thread, const pthread_attr_t *attr,
1022                            void *(*start_routine)(void*), void *arg)
1023 {
1024   PFS_spawn_thread_arg *psi_arg;
1025 
1026   /* psi_arg can not be global, and can not be a local variable. */
1027   psi_arg= (PFS_spawn_thread_arg*) my_malloc(sizeof(PFS_spawn_thread_arg),
1028                                              MYF(MY_WME));
1029   if (unlikely(psi_arg == NULL))
1030     return EAGAIN;
1031 
1032   psi_arg->m_parent_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
1033   psi_arg->m_child_key= key;
1034   psi_arg->m_child_identity= (arg ? arg : thread);
1035   psi_arg->m_user_start_routine= start_routine;
1036   psi_arg->m_user_arg= arg;
1037 
1038   int result= pthread_create(thread, attr, pfs_spawn_thread, psi_arg);
1039   if (unlikely(result != 0))
1040     my_free(psi_arg);
1041   return result;
1042 }
1043 
1044 static PSI_thread*
new_thread_v1(PSI_thread_key key,const void * identity,ulong thread_id)1045 new_thread_v1(PSI_thread_key key, const void *identity, ulong thread_id)
1046 {
1047   PFS_thread *pfs;
1048 
1049   PFS_thread_class *klass= find_thread_class(key);
1050   if (likely(klass != NULL))
1051     pfs= create_thread(klass, identity, thread_id);
1052   else
1053     pfs= NULL;
1054 
1055   return reinterpret_cast<PSI_thread*> (pfs);
1056 }
1057 
set_thread_id_v1(PSI_thread * thread,unsigned long id)1058 static void set_thread_id_v1(PSI_thread *thread, unsigned long id)
1059 {
1060   DBUG_ASSERT(thread);
1061   PFS_thread *pfs= reinterpret_cast<PFS_thread*> (thread);
1062   pfs->m_thread_id= id;
1063 }
1064 
1065 static PSI_thread*
get_thread_v1(void)1066 get_thread_v1(void)
1067 {
1068   PFS_thread *pfs= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
1069   return reinterpret_cast<PSI_thread*> (pfs);
1070 }
1071 
set_thread_v1(PSI_thread * thread)1072 static void set_thread_v1(PSI_thread* thread)
1073 {
1074   PFS_thread *pfs= reinterpret_cast<PFS_thread*> (thread);
1075   my_pthread_setspecific_ptr(THR_PFS, pfs);
1076 }
1077 
delete_current_thread_v1(void)1078 static void delete_current_thread_v1(void)
1079 {
1080   PFS_thread *thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
1081   if (thread != NULL)
1082   {
1083     my_pthread_setspecific_ptr(THR_PFS, NULL);
1084     destroy_thread(thread);
1085   }
1086 }
1087 
delete_thread_v1(PSI_thread * thread)1088 static void delete_thread_v1(PSI_thread *thread)
1089 {
1090   PFS_thread *pfs= reinterpret_cast<PFS_thread*> (thread);
1091   if (pfs != NULL)
1092     destroy_thread(pfs);
1093 }
1094 
1095 static PSI_mutex_locker*
get_thread_mutex_locker_v1(PSI_mutex_locker_state * state,PSI_mutex * mutex,PSI_mutex_operation op)1096 get_thread_mutex_locker_v1(PSI_mutex_locker_state *state,
1097                            PSI_mutex *mutex, PSI_mutex_operation op)
1098 {
1099   PFS_mutex *pfs_mutex= reinterpret_cast<PFS_mutex*> (mutex);
1100   DBUG_ASSERT((int) op >= 0);
1101   DBUG_ASSERT((uint) op < array_elements(mutex_operation_map));
1102   DBUG_ASSERT(pfs_mutex != NULL);
1103   DBUG_ASSERT(pfs_mutex->m_class != NULL);
1104   if (! flag_events_waits_current)
1105     return NULL;
1106   if (! pfs_mutex->m_class->m_enabled)
1107     return NULL;
1108   PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
1109   if (unlikely(pfs_thread == NULL))
1110     return NULL;
1111   if (! pfs_thread->m_enabled)
1112     return NULL;
1113   if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE))
1114   {
1115     locker_lost++;
1116     return NULL;
1117   }
1118   PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack
1119     [pfs_thread->m_wait_locker_count];
1120 
1121   pfs_locker->m_target.m_mutex= pfs_mutex;
1122   pfs_locker->m_waits_current.m_thread= pfs_thread;
1123   pfs_locker->m_waits_current.m_class= pfs_mutex->m_class;
1124   if (pfs_mutex->m_class->m_timed)
1125   {
1126     pfs_locker->m_timer_name= wait_timer;
1127     pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING;
1128   }
1129   else
1130     pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED;
1131   pfs_locker->m_waits_current.m_object_instance_addr= pfs_mutex->m_identity;
1132   pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++;
1133   pfs_locker->m_waits_current.m_operation= mutex_operation_map[(int) op];
1134   pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_MUTEX;
1135 
1136   pfs_thread->m_wait_locker_count++;
1137   return reinterpret_cast<PSI_mutex_locker*> (pfs_locker);
1138 }
1139 
/**
  Implementation of the rwlock instrumentation interface.
  Builds a wait locker on the calling thread's locker stack for one
  rwlock operation, or returns NULL when nothing should be recorded
  (consumer off, instrument or thread disabled, or locker stack full).
  @param state   locker state (currently unused)
  @param rwlock  the instrumented rwlock
  @param op      the rwlock operation about to be performed
  @sa PSI_v1::get_thread_rwlock_locker.
*/
static PSI_rwlock_locker*
get_thread_rwlock_locker_v1(PSI_rwlock_locker_state *state,
                            PSI_rwlock *rwlock, PSI_rwlock_operation op)
{
  PFS_rwlock *pfs_rwlock= reinterpret_cast<PFS_rwlock*> (rwlock);
  DBUG_ASSERT(static_cast<int> (op) >= 0);
  DBUG_ASSERT(static_cast<uint> (op) < array_elements(rwlock_operation_map));
  DBUG_ASSERT(pfs_rwlock != NULL);
  DBUG_ASSERT(pfs_rwlock->m_class != NULL);
  if (! flag_events_waits_current)
    return NULL;
  if (! pfs_rwlock->m_class->m_enabled)
    return NULL;
  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
  if (unlikely(pfs_thread == NULL))
    return NULL;
  if (! pfs_thread->m_enabled)
    return NULL;
  /* The per-thread locker stack is bounded; overflow is counted as lost. */
  if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE))
  {
    locker_lost++;
    return NULL;
  }
  PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack
    [pfs_thread->m_wait_locker_count];

  pfs_locker->m_target.m_rwlock= pfs_rwlock;
  pfs_locker->m_waits_current.m_thread= pfs_thread;
  pfs_locker->m_waits_current.m_class= pfs_rwlock->m_class;
  /* TIMER_STATE_STARTING: the timer value itself is read later. */
  if (pfs_rwlock->m_class->m_timed)
  {
    pfs_locker->m_timer_name= wait_timer;
    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING;
  }
  else
    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED;
  pfs_locker->m_waits_current.m_object_instance_addr= pfs_rwlock->m_identity;
  pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++;
  pfs_locker->m_waits_current.m_operation=
    rwlock_operation_map[static_cast<int> (op)];
  pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_RWLOCK;

  pfs_thread->m_wait_locker_count++;
  return reinterpret_cast<PSI_rwlock_locker*> (pfs_locker);
}
1185 
/**
  Implementation of the condition instrumentation interface.
  Builds a wait locker on the calling thread's locker stack for one
  condition operation, or returns NULL when nothing should be recorded.
  @param state  locker state (currently unused)
  @param cond   the instrumented condition
  @param op     the condition operation about to be performed
  @sa PSI_v1::get_thread_cond_locker.
*/
static PSI_cond_locker*
get_thread_cond_locker_v1(PSI_cond_locker_state *state,
                          PSI_cond *cond, PSI_mutex * /* unused: mutex */,
                          PSI_cond_operation op)
{
  /*
    Note about the unused PSI_mutex *mutex parameter:
    In the pthread library, a call to pthread_cond_wait()
    causes an unlock() + lock() on the mutex associated with the condition.
    This mutex operation is not instrumented, so the mutex will still
    appear as locked when a thread is waiting on a condition.
    This has no impact now, as unlock_mutex() is not recording events.
    When unlock_mutex() is implemented by later work logs,
    this parameter here will be used to adjust the mutex state,
    in start_cond_wait_v1() and end_cond_wait_v1().
  */
  PFS_cond *pfs_cond= reinterpret_cast<PFS_cond*> (cond);
  DBUG_ASSERT(static_cast<int> (op) >= 0);
  DBUG_ASSERT(static_cast<uint> (op) < array_elements(cond_operation_map));
  DBUG_ASSERT(pfs_cond != NULL);
  DBUG_ASSERT(pfs_cond->m_class != NULL);
  if (! flag_events_waits_current)
    return NULL;
  if (! pfs_cond->m_class->m_enabled)
    return NULL;
  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
  if (unlikely(pfs_thread == NULL))
    return NULL;
  if (! pfs_thread->m_enabled)
    return NULL;
  /* The per-thread locker stack is bounded; overflow is counted as lost. */
  if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE))
  {
    locker_lost++;
    return NULL;
  }
  PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack
    [pfs_thread->m_wait_locker_count];

  pfs_locker->m_target.m_cond= pfs_cond;
  pfs_locker->m_waits_current.m_thread= pfs_thread;
  pfs_locker->m_waits_current.m_class= pfs_cond->m_class;
  if (pfs_cond->m_class->m_timed)
  {
    pfs_locker->m_timer_name= wait_timer;
    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING;
  }
  else
    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED;
  pfs_locker->m_waits_current.m_object_instance_addr= pfs_cond->m_identity;
  pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++;
  pfs_locker->m_waits_current.m_operation=
    cond_operation_map[static_cast<int> (op)];
  pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_COND;

  pfs_thread->m_wait_locker_count++;
  return reinterpret_cast<PSI_cond_locker*> (pfs_locker);
}
1243 
/**
  Implementation of the table instrumentation interface.
  Builds a wait locker on the calling thread's locker stack for one
  table io wait, or returns NULL when nothing should be recorded.
  Note: no m_operation is set here, unlike the other lockers.
  @param state  locker state (currently unused)
  @param table  the instrumented table
  @sa PSI_v1::get_thread_table_locker.
*/
static PSI_table_locker*
get_thread_table_locker_v1(PSI_table_locker_state *state,
                           PSI_table *table)
{
  PFS_table *pfs_table= reinterpret_cast<PFS_table*> (table);
  DBUG_ASSERT(pfs_table != NULL);
  DBUG_ASSERT(pfs_table->m_share != NULL);
  if (! flag_events_waits_current)
    return NULL;
  if (! pfs_table->m_share->m_enabled)
    return NULL;
  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
  if (unlikely(pfs_thread == NULL))
    return NULL;
  if (! pfs_thread->m_enabled)
    return NULL;
  /* The per-thread locker stack is bounded; overflow is counted as lost. */
  if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE))
  {
    locker_lost++;
    return NULL;
  }
  PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack
    [pfs_thread->m_wait_locker_count];

  pfs_locker->m_target.m_table= pfs_table;
  pfs_locker->m_waits_current.m_thread= pfs_thread;
  /* All tables share one instrument class. */
  pfs_locker->m_waits_current.m_class= &global_table_class;
  if (pfs_table->m_share->m_timed)
  {
    pfs_locker->m_timer_name= wait_timer;
    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING;
  }
  else
    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED;
  pfs_locker->m_waits_current.m_object_instance_addr= pfs_table->m_identity;
  pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++;
  pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_TABLE;

  pfs_thread->m_wait_locker_count++;
  return reinterpret_cast<PSI_table_locker*> (pfs_locker);
}
1285 
/**
  Implementation of the file instrumentation interface.
  Builds a wait locker for a file operation identified by file name;
  looks up or creates the file instrumentation for @c name first.
  Returns NULL when nothing should be recorded.
  @param state     locker state (currently unused)
  @param key       instrument class of the file
  @param op        the file operation about to be performed
  @param name      the file name
  @param identity  unused here; part of the interface signature
  @sa PSI_v1::get_thread_file_name_locker.
*/
static PSI_file_locker*
get_thread_file_name_locker_v1(PSI_file_locker_state *state,
                               PSI_file_key key,
                               PSI_file_operation op,
                               const char *name, const void *identity)
{
  DBUG_ASSERT(static_cast<int> (op) >= 0);
  DBUG_ASSERT(static_cast<uint> (op) < array_elements(file_operation_map));

  if (! flag_events_waits_current)
    return NULL;
  PFS_file_class *klass= find_file_class(key);
  if (unlikely(klass == NULL))
    return NULL;
  if (! klass->m_enabled)
    return NULL;
  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
  if (unlikely(pfs_thread == NULL))
    return NULL;
  if (! pfs_thread->m_enabled)
    return NULL;
  /* The per-thread locker stack is bounded; overflow is counted as lost. */
  if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE))
  {
    locker_lost++;
    return NULL;
  }
  uint len= strlen(name);
  PFS_file *pfs_file= find_or_create_file(pfs_thread, klass, name, len);
  if (unlikely(pfs_file == NULL))
    return NULL;

  PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack
    [pfs_thread->m_wait_locker_count];

  pfs_locker->m_target.m_file= pfs_file;
  pfs_locker->m_waits_current.m_thread= pfs_thread;
  pfs_locker->m_waits_current.m_class= pfs_file->m_class;
  if (pfs_file->m_class->m_timed)
  {
    pfs_locker->m_timer_name= wait_timer;
    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING;
  }
  else
    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED;
  pfs_locker->m_waits_current.m_object_instance_addr= pfs_file;
  pfs_locker->m_waits_current.m_object_name= pfs_file->m_filename;
  pfs_locker->m_waits_current.m_object_name_length=
    pfs_file->m_filename_length;
  pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++;
  pfs_locker->m_waits_current.m_operation=
    file_operation_map[static_cast<int> (op)];
  pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_FILE;

  pfs_thread->m_wait_locker_count++;
  return reinterpret_cast<PSI_file_locker*> (pfs_locker);
}
1342 
/**
  Implementation of the file instrumentation interface.
  Builds a wait locker for a file operation on an already instrumented
  file stream, or returns NULL when nothing should be recorded.
  @param state  locker state (currently unused)
  @param file   the instrumented file
  @param op     the file operation about to be performed
  @sa PSI_v1::get_thread_file_stream_locker.
*/
static PSI_file_locker*
get_thread_file_stream_locker_v1(PSI_file_locker_state *state,
                                 PSI_file *file, PSI_file_operation op)
{
  PFS_file *pfs_file= reinterpret_cast<PFS_file*> (file);

  DBUG_ASSERT(static_cast<int> (op) >= 0);
  DBUG_ASSERT(static_cast<uint> (op) < array_elements(file_operation_map));
  DBUG_ASSERT(pfs_file != NULL);
  DBUG_ASSERT(pfs_file->m_class != NULL);

  if (! flag_events_waits_current)
    return NULL;
  if (! pfs_file->m_class->m_enabled)
    return NULL;
  PFS_thread *pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
  if (unlikely(pfs_thread == NULL))
    return NULL;
  if (! pfs_thread->m_enabled)
    return NULL;
  /* The per-thread locker stack is bounded; overflow is counted as lost. */
  if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE))
  {
    locker_lost++;
    return NULL;
  }
  PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack
    [pfs_thread->m_wait_locker_count];

  pfs_locker->m_target.m_file= pfs_file;
  pfs_locker->m_waits_current.m_thread= pfs_thread;
  pfs_locker->m_waits_current.m_class= pfs_file->m_class;
  if (pfs_file->m_class->m_timed)
  {
    pfs_locker->m_timer_name= wait_timer;
    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING;
  }
  else
    pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED;
  pfs_locker->m_waits_current.m_object_instance_addr= pfs_file;
  pfs_locker->m_waits_current.m_object_name= pfs_file->m_filename;
  pfs_locker->m_waits_current.m_object_name_length=
    pfs_file->m_filename_length;
  pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++;
  pfs_locker->m_waits_current.m_operation=
    file_operation_map[static_cast<int> (op)];
  pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_FILE;

  pfs_thread->m_wait_locker_count++;
  return reinterpret_cast<PSI_file_locker*> (pfs_locker);
}
1393 
/**
  Implementation of the file instrumentation interface.
  Builds a wait locker for a file operation identified by descriptor
  number, using file_handle_array; returns NULL when the descriptor is
  unknown or nothing should be recorded.
  For PSI_FILE_CLOSE, also removes the descriptor <--> instrument
  association (see the comment in the body).
  @param state  locker state (currently unused)
  @param file   the file descriptor
  @param op     the file operation about to be performed
  @sa PSI_v1::get_thread_file_descriptor_locker.
*/
static PSI_file_locker*
get_thread_file_descriptor_locker_v1(PSI_file_locker_state *state,
                                     File file, PSI_file_operation op)
{
  int index= static_cast<int> (file);

  DBUG_ASSERT(static_cast<int> (op) >= 0);
  DBUG_ASSERT(static_cast<uint> (op) < array_elements(file_operation_map));

  if (! flag_events_waits_current)
    return NULL;
  if (likely((index >= 0) && (index < file_handle_max)))
  {
    PFS_file *pfs_file= file_handle_array[index];
    if (likely(pfs_file != NULL))
    {
      PFS_thread *pfs_thread;

      /*
        We are about to close a file by descriptor number,
        and the calling code still holds the descriptor.
        Cleanup the file descriptor <--> file instrument association.
        Remove the instrumentation *before* the close to avoid race
        conditions with another thread opening a file
        (that could be given the same descriptor).
      */
      if (op == PSI_FILE_CLOSE)
        file_handle_array[index]= NULL;

      DBUG_ASSERT(pfs_file->m_class != NULL);
      if (! pfs_file->m_class->m_enabled)
        return NULL;
      pfs_thread= my_pthread_getspecific_ptr(PFS_thread*, THR_PFS);
      if (unlikely(pfs_thread == NULL))
        return NULL;
      if (! pfs_thread->m_enabled)
        return NULL;
      /* Bounded locker stack: overflow is counted as lost. */
      if (unlikely(pfs_thread->m_wait_locker_count >= LOCKER_STACK_SIZE))
      {
        locker_lost++;
        return NULL;
      }
      PFS_wait_locker *pfs_locker= &pfs_thread->m_wait_locker_stack
        [pfs_thread->m_wait_locker_count];

      pfs_locker->m_target.m_file= pfs_file;
      pfs_locker->m_waits_current.m_thread= pfs_thread;
      pfs_locker->m_waits_current.m_class= pfs_file->m_class;
      if (pfs_file->m_class->m_timed)
      {
        pfs_locker->m_timer_name= wait_timer;
        pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_STARTING;
      }
      else
        pfs_locker->m_waits_current.m_timer_state= TIMER_STATE_UNTIMED;
      pfs_locker->m_waits_current.m_object_instance_addr= pfs_file;
      pfs_locker->m_waits_current.m_object_name= pfs_file->m_filename;
      pfs_locker->m_waits_current.m_object_name_length=
        pfs_file->m_filename_length;
      pfs_locker->m_waits_current.m_event_id= pfs_thread->m_event_id++;
      pfs_locker->m_waits_current.m_operation=
        file_operation_map[static_cast<int> (op)];
      pfs_locker->m_waits_current.m_wait_class= WAIT_CLASS_FILE;

      pfs_thread->m_wait_locker_count++;
      return reinterpret_cast<PSI_file_locker*> (pfs_locker);
    }
  }
  return NULL;
}
1464 
/**
  Implementation of the mutex instrumentation interface.
  Records a mutex unlock: clears the owner and last-locked timestamp.
  No wait event is produced for this operation.
  @sa PSI_v1::unlock_mutex.
*/
static void unlock_mutex_v1(PSI_mutex *mutex)
{
  PFS_mutex *pfs_mutex= reinterpret_cast<PFS_mutex*> (mutex);
  DBUG_ASSERT(pfs_mutex != NULL);

  /*
    Note that this code is still protected by the instrumented mutex,
    and therefore is thread safe. See inline_mysql_mutex_unlock().
  */

  /* Always update the instrumented state */
  pfs_mutex->m_owner= NULL;
  pfs_mutex->m_last_locked= 0;

#ifdef LATER_WL2333
  /*
    See WL#2333: SHOW ENGINE ... LOCK STATUS.
    PFS_mutex::m_lock_stat is not exposed in user visible tables
    currently, so there is no point spending time computing it.
    NOTE(review): 'thread' below is not defined in this function's scope;
    this block is compiled out (LATER_WL2333 undefined) and must be fixed
    before it is ever enabled.
  */
  PFS_thread *pfs_thread= reinterpret_cast<PFS_thread*> (thread);
  DBUG_ASSERT(pfs_thread != NULL);

  if (unlikely(! flag_events_waits_current))
    return;
  if (! pfs_mutex->m_class->m_enabled)
    return;
  if (! pfs_thread->m_enabled)
    return;

  if (pfs_mutex->m_class->m_timed)
  {
    ulonglong locked_time;
    locked_time= get_timer_value(wait_timer) - pfs_mutex->m_last_locked;
    aggregate_single_stat_chain(&pfs_mutex->m_lock_stat, locked_time);
  }
#endif
}
1503 
/**
  Implementation of the rwlock instrumentation interface.
  Records a rwlock unlock, for either a writer or a reader.
  No wait event is produced for this operation.
  @sa PSI_v1::unlock_rwlock.
*/
static void unlock_rwlock_v1(PSI_rwlock *rwlock)
{
  PFS_rwlock *pfs_rwlock= reinterpret_cast<PFS_rwlock*> (rwlock);
  DBUG_ASSERT(pfs_rwlock != NULL);
  bool last_writer= false;
  bool last_reader= false;

  /*
    Note that this code is still protected by the instrumented rwlock,
    and therefore is:
    - thread safe for write locks
    - almost thread safe for read locks (pfs_rwlock->m_readers is unsafe).
    See inline_mysql_rwlock_unlock()
  */

  /* Always update the instrumented state */
  if (pfs_rwlock->m_writer)
  {
    /* Nominal case, a writer is unlocking. */
    last_writer= true;
    pfs_rwlock->m_writer= NULL;
    /* Reset the readers stats, they could be off */
    pfs_rwlock->m_readers= 0;
  }
  else if (likely(pfs_rwlock->m_readers > 0))
  {
    /* Nominal case, a reader is unlocking. */
    if (--(pfs_rwlock->m_readers) == 0)
      last_reader= true;
  }
  else
  {
    /*
      Edge case, we have no writer and no readers,
      on an unlock event.
      This is possible for:
      - partial instrumentation
      - instrumentation disabled at runtime,
        see when get_thread_rwlock_locker_v1() returns NULL
      No further action is taken here, the next
      write lock will put the statistics is a valid state.
    */
  }

#ifdef LATER_WL2333
  /*
    See WL#2333: SHOW ENGINE ... LOCK STATUS.
    NOTE(review): 'thread' below is not defined in this function's scope;
    this block is compiled out (LATER_WL2333 undefined) and must be fixed
    before it is ever enabled.
  */
  PFS_thread *pfs_thread= reinterpret_cast<PFS_thread*> (thread);
  DBUG_ASSERT(pfs_thread != NULL);

  if (unlikely(! flag_events_waits_current))
    return;
  if (! pfs_rwlock->m_class->m_enabled)
    return;
  if (! pfs_thread->m_enabled)
    return;

  ulonglong locked_time;
  if (last_writer)
  {
    if (pfs_rwlock->m_class->m_timed)
    {
      locked_time= get_timer_value(wait_timer) - pfs_rwlock->m_last_written;
      aggregate_single_stat_chain(&pfs_rwlock->m_write_lock_stat, locked_time);
    }
  }
  else if (last_reader)
  {
    if (pfs_rwlock->m_class->m_timed)
    {
      locked_time= get_timer_value(wait_timer) - pfs_rwlock->m_last_read;
      aggregate_single_stat_chain(&pfs_rwlock->m_read_lock_stat, locked_time);
    }
  }
#else
  (void) last_reader;
  (void) last_writer;
#endif
}
1582 
signal_cond_v1(PSI_cond * cond)1583 static void signal_cond_v1(PSI_cond* cond)
1584 {
1585   PFS_cond *pfs_cond= reinterpret_cast<PFS_cond*> (cond);
1586   DBUG_ASSERT(pfs_cond != NULL);
1587 
1588   pfs_cond->m_cond_stat.m_signal_count++;
1589 }
1590 
broadcast_cond_v1(PSI_cond * cond)1591 static void broadcast_cond_v1(PSI_cond* cond)
1592 {
1593   PFS_cond *pfs_cond= reinterpret_cast<PFS_cond*> (cond);
1594   DBUG_ASSERT(pfs_cond != NULL);
1595 
1596   pfs_cond->m_cond_stat.m_broadcast_count++;
1597 }
1598 
start_mutex_wait_v1(PSI_mutex_locker * locker,const char * src_file,uint src_line)1599 static void start_mutex_wait_v1(PSI_mutex_locker* locker,
1600                                 const char *src_file, uint src_line)
1601 {
1602   PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
1603   DBUG_ASSERT(pfs_locker != NULL);
1604 
1605   PFS_events_waits *wait= &pfs_locker->m_waits_current;
1606   if (wait->m_timer_state == TIMER_STATE_STARTING)
1607   {
1608     wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name);
1609     wait->m_timer_state= TIMER_STATE_STARTED;
1610   }
1611   wait->m_source_file= src_file;
1612   wait->m_source_line= src_line;
1613 }
1614 
end_mutex_wait_v1(PSI_mutex_locker * locker,int rc)1615 static void end_mutex_wait_v1(PSI_mutex_locker* locker, int rc)
1616 {
1617   PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
1618   DBUG_ASSERT(pfs_locker != NULL);
1619   PFS_events_waits *wait= &pfs_locker->m_waits_current;
1620 
1621   if (wait->m_timer_state == TIMER_STATE_STARTED)
1622   {
1623     wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name);
1624     wait->m_timer_state= TIMER_STATE_TIMED;
1625   }
1626   if (flag_events_waits_history)
1627     insert_events_waits_history(wait->m_thread, wait);
1628   if (flag_events_waits_history_long)
1629     insert_events_waits_history_long(wait);
1630 
1631   if (rc == 0)
1632   {
1633     /* Thread safe: we are protected by the instrumented mutex */
1634     PFS_mutex *mutex= pfs_locker->m_target.m_mutex;
1635     PFS_single_stat_chain *stat= find_per_thread_mutex_class_wait_stat(wait->m_thread, mutex->m_class);
1636     mutex->m_owner= wait->m_thread;
1637     mutex->m_last_locked= wait->m_timer_end;
1638 
1639     /* If timed then aggregate stats, else increment the value counts only */
1640     if (wait->m_timer_state == TIMER_STATE_TIMED)
1641     {
1642       ulonglong wait_time= wait->m_timer_end - wait->m_timer_start;
1643       aggregate_single_stat_chain(&mutex->m_wait_stat, wait_time);
1644       aggregate_single_stat_chain(stat, wait_time);
1645     }
1646     else
1647     {
1648       increment_single_stat_chain(&mutex->m_wait_stat);
1649       increment_single_stat_chain(stat);
1650     }
1651   }
1652   wait->m_thread->m_wait_locker_count--;
1653 }
1654 
start_rwlock_rdwait_v1(PSI_rwlock_locker * locker,const char * src_file,uint src_line)1655 static void start_rwlock_rdwait_v1(PSI_rwlock_locker* locker,
1656                                    const char *src_file, uint src_line)
1657 {
1658   PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
1659   DBUG_ASSERT(pfs_locker != NULL);
1660 
1661   PFS_events_waits *wait= &pfs_locker->m_waits_current;
1662   if (wait->m_timer_state == TIMER_STATE_STARTING)
1663   {
1664     wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name);
1665     wait->m_timer_state= TIMER_STATE_STARTED;
1666   }
1667   wait->m_source_file= src_file;
1668   wait->m_source_line= src_line;
1669 }
1670 
/**
  Implementation of the rwlock instrumentation interface.
  Mark the end of a wait for a read lock, record the completed
  event in the wait history, and aggregate statistics.
  @param locker the wait locker, cast to PFS_wait_locker
  @param rc the wait result: 0 means the read lock was granted
  @sa PSI_v1::end_rwlock_rdwait.
*/
static void end_rwlock_rdwait_v1(PSI_rwlock_locker* locker, int rc)
{
  PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
  DBUG_ASSERT(pfs_locker != NULL);
  PFS_events_waits *wait= &pfs_locker->m_waits_current;

  /* Close the timer window for this wait, if it was opened. */
  if (wait->m_timer_state == TIMER_STATE_STARTED)
  {
    wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name);
    wait->m_timer_state= TIMER_STATE_TIMED;
  }
  /* Archive the completed event, per the history settings. */
  if (flag_events_waits_history)
    insert_events_waits_history(wait->m_thread, wait);
  if (flag_events_waits_history_long)
    insert_events_waits_history_long(wait);

  if (rc == 0)
  {
    /*
      Warning:
      Multiple threads can execute this section concurrently
      (since multiple readers can execute in parallel).
      The statistics generated are not safe, which is why they are
      just statistics, not facts.
    */
    PFS_rwlock *rwlock= pfs_locker->m_target.m_rwlock;
    PFS_single_stat_chain *stat= find_per_thread_rwlock_class_wait_stat(wait->m_thread, rwlock->m_class);

    /* First reader in: remember when the read-locked period began. */
    if (rwlock->m_readers == 0)
      rwlock->m_last_read= wait->m_timer_end;
    rwlock->m_writer= NULL;
    rwlock->m_readers++;

    /* If timed then aggregate stats, else increment the value counts only */
    if (wait->m_timer_state == TIMER_STATE_TIMED)
    {
      ulonglong wait_time= wait->m_timer_end - wait->m_timer_start;
      aggregate_single_stat_chain(&rwlock->m_wait_stat, wait_time);
      aggregate_single_stat_chain(stat, wait_time);
    }
    else
    {
      increment_single_stat_chain(&rwlock->m_wait_stat);
      increment_single_stat_chain(stat);
    }
  }
  wait->m_thread->m_wait_locker_count--;
}
1719 
start_rwlock_wrwait_v1(PSI_rwlock_locker * locker,const char * src_file,uint src_line)1720 static void start_rwlock_wrwait_v1(PSI_rwlock_locker* locker,
1721                                    const char *src_file, uint src_line)
1722 {
1723   PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
1724   DBUG_ASSERT(pfs_locker != NULL);
1725 
1726   PFS_events_waits *wait= &pfs_locker->m_waits_current;
1727   if (wait->m_timer_state == TIMER_STATE_STARTING)
1728   {
1729     wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name);
1730     wait->m_timer_state= TIMER_STATE_STARTED;
1731   }
1732   wait->m_source_file= src_file;
1733   wait->m_source_line= src_line;
1734 }
1735 
end_rwlock_wrwait_v1(PSI_rwlock_locker * locker,int rc)1736 static void end_rwlock_wrwait_v1(PSI_rwlock_locker* locker, int rc)
1737 {
1738   PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
1739   DBUG_ASSERT(pfs_locker != NULL);
1740   PFS_events_waits *wait= &pfs_locker->m_waits_current;
1741 
1742   if (wait->m_timer_state == TIMER_STATE_STARTED)
1743   {
1744     wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name);
1745     wait->m_timer_state= TIMER_STATE_TIMED;
1746   }
1747   if (flag_events_waits_history)
1748     insert_events_waits_history(wait->m_thread, wait);
1749   if (flag_events_waits_history_long)
1750     insert_events_waits_history_long(wait);
1751 
1752   if (rc == 0)
1753   {
1754     /* Thread safe : we are protected by the instrumented rwlock */
1755     PFS_rwlock *rwlock= pfs_locker->m_target.m_rwlock;
1756     PFS_single_stat_chain *stat= find_per_thread_rwlock_class_wait_stat(wait->m_thread, rwlock->m_class);
1757     rwlock->m_writer= wait->m_thread;
1758     rwlock->m_last_written= wait->m_timer_end;
1759     /* Reset the readers stats, they could be off */
1760     rwlock->m_readers= 0;
1761     rwlock->m_last_read= 0;
1762 
1763     /* If timed then aggregate stats, else increment the value counts only */
1764     if (wait->m_timer_state == TIMER_STATE_TIMED)
1765     {
1766       ulonglong wait_time= wait->m_timer_end - wait->m_timer_start;
1767       aggregate_single_stat_chain(&rwlock->m_wait_stat, wait_time);
1768       aggregate_single_stat_chain(stat, wait_time);
1769     }
1770     else
1771     {
1772       increment_single_stat_chain(&rwlock->m_wait_stat);
1773       increment_single_stat_chain(stat);
1774     }
1775   }
1776   wait->m_thread->m_wait_locker_count--;
1777 }
1778 
start_cond_wait_v1(PSI_cond_locker * locker,const char * src_file,uint src_line)1779 static void start_cond_wait_v1(PSI_cond_locker* locker,
1780                                const char *src_file, uint src_line)
1781 {
1782   PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
1783   DBUG_ASSERT(pfs_locker != NULL);
1784 
1785   PFS_events_waits *wait= &pfs_locker->m_waits_current;
1786   if (wait->m_timer_state == TIMER_STATE_STARTING)
1787   {
1788     wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name);
1789     wait->m_timer_state= TIMER_STATE_STARTED;
1790   }
1791   wait->m_source_file= src_file;
1792   wait->m_source_line= src_line;
1793 }
1794 
end_cond_wait_v1(PSI_cond_locker * locker,int rc)1795 static void end_cond_wait_v1(PSI_cond_locker* locker, int rc)
1796 {
1797   PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
1798   DBUG_ASSERT(pfs_locker != NULL);
1799   PFS_events_waits *wait= &pfs_locker->m_waits_current;
1800 
1801   if (wait->m_timer_state == TIMER_STATE_STARTED)
1802   {
1803     wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name);
1804     wait->m_timer_state= TIMER_STATE_TIMED;
1805   }
1806   if (flag_events_waits_history)
1807     insert_events_waits_history(wait->m_thread, wait);
1808   if (flag_events_waits_history_long)
1809     insert_events_waits_history_long(wait);
1810 
1811   if (rc == 0)
1812   {
1813     /*
1814       Not thread safe, race conditions will occur.
1815       A first race condition is:
1816       - thread 1 waits on cond A
1817       - thread 2 waits on cond B
1818       threads 1 and 2 compete when updating the same cond A
1819       statistics, possibly missing a min / max / sum / count.
1820       A second race condition is:
1821       - thread 1 waits on cond A
1822       - thread 2 destroys cond A
1823       - thread 2 or 3 creates cond B in the same condition slot
1824       thread 1 will then aggregate statistics about defunct A
1825       in condition B.
1826       This is accepted, the data will be slightly inaccurate.
1827     */
1828     PFS_cond *cond= pfs_locker->m_target.m_cond;
1829     PFS_single_stat_chain *stat= find_per_thread_cond_class_wait_stat(wait->m_thread, cond->m_class);
1830 
1831     /* If timed then aggregate stats, else increment the value counts only */
1832     if (wait->m_timer_state == TIMER_STATE_TIMED)
1833     {
1834       ulonglong wait_time= wait->m_timer_end - wait->m_timer_start;
1835       aggregate_single_stat_chain(&cond->m_wait_stat, wait_time);
1836       aggregate_single_stat_chain(stat, wait_time);
1837     }
1838     else
1839     {
1840       increment_single_stat_chain(&cond->m_wait_stat);
1841       increment_single_stat_chain(stat);
1842     }
1843   }
1844   wait->m_thread->m_wait_locker_count--;
1845 }
1846 
start_table_wait_v1(PSI_table_locker * locker,const char * src_file,uint src_line)1847 static void start_table_wait_v1(PSI_table_locker* locker,
1848                                 const char *src_file, uint src_line)
1849 {
1850   PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
1851   DBUG_ASSERT(pfs_locker != NULL);
1852 
1853   PFS_events_waits *wait= &pfs_locker->m_waits_current;
1854   if (wait->m_timer_state == TIMER_STATE_STARTING)
1855   {
1856     wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name);
1857     wait->m_timer_state= TIMER_STATE_STARTED;
1858   }
1859   wait->m_source_file= src_file;
1860   wait->m_source_line= src_line;
1861   wait->m_operation= OPERATION_TYPE_LOCK;
1862   PFS_table_share *share= pfs_locker->m_target.m_table->m_share;
1863   wait->m_schema_name= share->m_schema_name;
1864   wait->m_schema_name_length= share->m_schema_name_length;
1865   wait->m_object_name= share->m_table_name;
1866   wait->m_object_name_length= share->m_table_name_length;
1867 }
1868 
end_table_wait_v1(PSI_table_locker * locker)1869 static void end_table_wait_v1(PSI_table_locker* locker)
1870 {
1871   PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
1872   DBUG_ASSERT(pfs_locker != NULL);
1873   PFS_events_waits *wait= &pfs_locker->m_waits_current;
1874 
1875   if (wait->m_timer_state == TIMER_STATE_STARTED)
1876   {
1877     wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name);
1878     wait->m_timer_state= TIMER_STATE_TIMED;
1879   }
1880   if (flag_events_waits_history)
1881     insert_events_waits_history(wait->m_thread, wait);
1882   if (flag_events_waits_history_long)
1883     insert_events_waits_history_long(wait);
1884 
1885   PFS_table *table= pfs_locker->m_target.m_table;
1886 
1887   /* If timed then aggregate stats, else increment the value counts only */
1888   if (wait->m_timer_state == TIMER_STATE_TIMED)
1889   {
1890     ulonglong wait_time= wait->m_timer_end - wait->m_timer_start;
1891     aggregate_single_stat_chain(&table->m_wait_stat, wait_time);
1892   }
1893   else
1894   {
1895     increment_single_stat_chain(&table->m_wait_stat);
1896   }
1897 
1898   /*
1899     There is currently no per table and per thread aggregation.
1900     The number of tables in the application is arbitrary, and may be high.
1901     The number of slots per thread to hold aggregates is fixed,
1902     and is constrained by memory.
1903     Implementing a per thread and per table aggregate has not been
1904     decided yet.
1905     If it's implemented, it's likely that the user will have to specify,
1906     per table name, if the aggregate per thread is to be computed or not.
1907     This will mean a SETUP_ table.
1908   */
1909   wait->m_thread->m_wait_locker_count--;
1910 }
1911 
/*
  Forward declarations: the file open instrumentation below
  (start_file_open_wait_v1 and friends) delegates to these,
  which are defined further down in this file.
*/
static void start_file_wait_v1(PSI_file_locker *locker,
                               size_t count,
                               const char *src_file,
                               uint src_line);

static void end_file_wait_v1(PSI_file_locker *locker,
                             size_t count);
1919 
start_file_open_wait_v1(PSI_file_locker * locker,const char * src_file,uint src_line)1920 static PSI_file* start_file_open_wait_v1(PSI_file_locker *locker,
1921                                          const char *src_file,
1922                                          uint src_line)
1923 {
1924   PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
1925   DBUG_ASSERT(pfs_locker != NULL);
1926 
1927   start_file_wait_v1(locker, 0, src_file, src_line);
1928 
1929   PFS_file *pfs_file= pfs_locker->m_target.m_file;
1930   return reinterpret_cast<PSI_file*> (pfs_file);
1931 }
1932 
/**
  Implementation of the file instrumentation interface.
  Mark the end of a file open wait.
  Delegates to end_file_wait_v1 with a byte count of zero,
  since opening a file transfers no data.
  @param locker the wait locker, cast to PFS_wait_locker
  @sa PSI_v1::end_file_open_wait.
*/
static void end_file_open_wait_v1(PSI_file_locker *locker)
{
  end_file_wait_v1(locker, 0);
}
1937 
end_file_open_wait_and_bind_to_descriptor_v1(PSI_file_locker * locker,File file)1938 static void end_file_open_wait_and_bind_to_descriptor_v1
1939   (PSI_file_locker *locker, File file)
1940 {
1941   int index= (int) file;
1942   PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
1943   DBUG_ASSERT(pfs_locker != NULL);
1944 
1945   end_file_wait_v1(locker, 0);
1946 
1947   PFS_file *pfs_file= pfs_locker->m_target.m_file;
1948   DBUG_ASSERT(pfs_file != NULL);
1949 
1950   if (likely(index >= 0))
1951   {
1952     if (likely(index < file_handle_max))
1953       file_handle_array[index]= pfs_file;
1954     else
1955       file_handle_lost++;
1956   }
1957   else
1958     release_file(pfs_file);
1959 }
1960 
start_file_wait_v1(PSI_file_locker * locker,size_t count,const char * src_file,uint src_line)1961 static void start_file_wait_v1(PSI_file_locker *locker,
1962                                size_t count,
1963                                const char *src_file,
1964                                uint src_line)
1965 {
1966   PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
1967   DBUG_ASSERT(pfs_locker != NULL);
1968 
1969   PFS_events_waits *wait= &pfs_locker->m_waits_current;
1970   if (wait->m_timer_state == TIMER_STATE_STARTING)
1971   {
1972     wait->m_timer_start= get_timer_value(pfs_locker->m_timer_name);
1973     wait->m_timer_state= TIMER_STATE_STARTED;
1974   }
1975   wait->m_source_file= src_file;
1976   wait->m_source_line= src_line;
1977   wait->m_number_of_bytes= count;
1978 }
1979 
/**
  Implementation of the file instrumentation interface.
  Mark the end of a wait on an instrumented file operation,
  record the completed event in the wait history, aggregate wait
  statistics, and update the per-file / per-class byte counters.
  For close, stream-close and stat operations the instrumented
  file is released; for delete operations it is destroyed.
  @param locker the wait locker, cast to PFS_wait_locker
  @param count the number of bytes actually transferred
  @sa PSI_v1::end_file_wait.
*/
static void end_file_wait_v1(PSI_file_locker *locker,
                             size_t count)
{
  PFS_wait_locker *pfs_locker= reinterpret_cast<PFS_wait_locker*> (locker);
  DBUG_ASSERT(pfs_locker != NULL);
  PFS_events_waits *wait= &pfs_locker->m_waits_current;

  /* Record the actual byte count; it may differ from the requested one. */
  wait->m_number_of_bytes= count;
  /* Close the timer window for this wait, if it was opened. */
  if (wait->m_timer_state == TIMER_STATE_STARTED)
  {
    wait->m_timer_end= get_timer_value(pfs_locker->m_timer_name);
    wait->m_timer_state= TIMER_STATE_TIMED;
  }
  /* Archive the completed event, per the history settings. */
  if (flag_events_waits_history)
    insert_events_waits_history(wait->m_thread, wait);
  if (flag_events_waits_history_long)
    insert_events_waits_history_long(wait);

  PFS_file *file= pfs_locker->m_target.m_file;
  PFS_single_stat_chain *stat= find_per_thread_file_class_wait_stat(wait->m_thread, file->m_class);

  /* If timed then aggregate stats, else increment the value counts only */
  if (wait->m_timer_state == TIMER_STATE_TIMED)
  {
    ulonglong wait_time= wait->m_timer_end - wait->m_timer_start;
    aggregate_single_stat_chain(&file->m_wait_stat, wait_time);
    aggregate_single_stat_chain(stat, wait_time);
  }
  else
  {
    increment_single_stat_chain(&file->m_wait_stat);
    increment_single_stat_chain(stat);
  }

  PFS_file_class *klass= file->m_class;

  /* Per-operation bookkeeping; must run after the stats above,
     since some branches release or destroy the PFS_file. */
  switch(wait->m_operation)
  {
  case OPERATION_TYPE_FILEREAD:
    file->m_file_stat.m_count_read++;
    file->m_file_stat.m_read_bytes+= count;
    klass->m_file_stat.m_count_read++;
    klass->m_file_stat.m_read_bytes+= count;
    break;
  case OPERATION_TYPE_FILEWRITE:
    file->m_file_stat.m_count_write++;
    file->m_file_stat.m_write_bytes+= count;
    klass->m_file_stat.m_count_write++;
    klass->m_file_stat.m_write_bytes+= count;
    break;
  case OPERATION_TYPE_FILECLOSE:
  case OPERATION_TYPE_FILESTREAMCLOSE:
  case OPERATION_TYPE_FILESTAT:
    /* The operation no longer holds the file: drop the reference. */
    release_file(pfs_locker->m_target.m_file);
    break;
  case OPERATION_TYPE_FILEDELETE:
    /* The file is gone: remove the instrumentation entirely. */
    destroy_file(wait->m_thread, pfs_locker->m_target.m_file);
    break;
  default:
    break;
  }

  wait->m_thread->m_wait_locker_count--;
}
2044 
/**
  Implementation of the instrumentation interface, version 1.
  This is a positional initializer: the order of the entries must
  match the member order of the PSI_v1 structure exactly.
  @sa PSI_v1
*/
PSI_v1 PFS_v1=
{
  /* Registration of instrument classes. */
  register_mutex_v1,
  register_rwlock_v1,
  register_cond_v1,
  register_thread_v1,
  register_file_v1,
  /* Instrument lifecycle. */
  init_mutex_v1,
  destroy_mutex_v1,
  init_rwlock_v1,
  destroy_rwlock_v1,
  init_cond_v1,
  destroy_cond_v1,
  /* Tables. */
  get_table_share_v1,
  release_table_share_v1,
  open_table_v1,
  close_table_v1,
  create_file_v1,
  /* Threads. */
  spawn_thread_v1,
  new_thread_v1,
  set_thread_id_v1,
  get_thread_v1,
  set_thread_v1,
  delete_current_thread_v1,
  delete_thread_v1,
  /* Locker acquisition. */
  get_thread_mutex_locker_v1,
  get_thread_rwlock_locker_v1,
  get_thread_cond_locker_v1,
  get_thread_table_locker_v1,
  get_thread_file_name_locker_v1,
  get_thread_file_stream_locker_v1,
  get_thread_file_descriptor_locker_v1,
  /* State notifications. */
  unlock_mutex_v1,
  unlock_rwlock_v1,
  signal_cond_v1,
  broadcast_cond_v1,
  /* Wait events. */
  start_mutex_wait_v1,
  end_mutex_wait_v1,
  start_rwlock_rdwait_v1,
  end_rwlock_rdwait_v1,
  start_rwlock_wrwait_v1,
  end_rwlock_wrwait_v1,
  start_cond_wait_v1,
  end_cond_wait_v1,
  start_table_wait_v1,
  end_table_wait_v1,
  start_file_open_wait_v1,
  end_file_open_wait_v1,
  end_file_open_wait_and_bind_to_descriptor_v1,
  start_file_wait_v1,
  end_file_wait_v1
};
2097 
get_interface(int version)2098 static void* get_interface(int version)
2099 {
2100   switch (version)
2101   {
2102   case PSI_VERSION_1:
2103     return &PFS_v1;
2104   default:
2105     return NULL;
2106   }
2107 }
2108 
2109 C_MODE_END
2110 
/**
  Performance schema bootstrap structure.
  Exposes get_interface so the server can negotiate which
  instrumentation interface version to use.
  @sa PSI_bootstrap
*/
struct PSI_bootstrap PFS_bootstrap=
{
  get_interface
};
2115