1 /*-------------------------------------------------------------------------
2 *
3 * pg_stat_statements.c
4 * Track statement planning and execution times as well as resource
5 * usage across a whole database cluster.
6 *
7 * Execution costs are totaled for each distinct source query, and kept in
8 * a shared hashtable. (We track only as many distinct queries as will fit
9 * in the designated amount of shared memory.)
10 *
11 * Starting in Postgres 9.2, this module normalized query entries. As of
12 * Postgres 14, the normalization is done by the core if compute_query_id is
13 * enabled, or optionally by third-party modules.
14 *
15 * To facilitate presenting entries to users, we create "representative" query
16 * strings in which constants are replaced with parameter symbols ($n), to
17 * make it clearer what a normalized entry can represent. To save on shared
18 * memory, and to avoid having to truncate oversized query strings, we store
19 * these strings in a temporary external query-texts file. Offsets into this
20 * file are kept in shared memory.
21 *
22 * Note about locking issues: to create or delete an entry in the shared
23 * hashtable, one must hold pgss->lock exclusively. Modifying any field
24 * in an entry except the counters requires the same. To look up an entry,
25 * one must hold the lock shared. To read or update the counters within
26 * an entry, one must hold the lock shared or exclusive (so the entry doesn't
27 * disappear!) and also take the entry's mutex spinlock.
28 * The shared state variable pgss->extent (the next free spot in the external
29 * query-text file) should be accessed only while holding either the
30 * pgss->mutex spinlock, or exclusive lock on pgss->lock. We use the mutex to
31 * allow reserving file space while holding only shared lock on pgss->lock.
32 * Rewriting the entire external query-text file, eg for garbage collection,
33 * requires holding pgss->lock exclusively; this allows individual entries
34 * in the file to be read or written while holding only shared lock.
35 *
36 *
37 * Copyright (c) 2008-2021, PostgreSQL Global Development Group
38 *
39 * IDENTIFICATION
40 * contrib/pg_stat_statements/pg_stat_statements.c
41 *
42 *-------------------------------------------------------------------------
43 */
44 #include "postgres.h"
45
46 #include <math.h>
47 #include <sys/stat.h>
48 #include <unistd.h>
49
50 #include "access/parallel.h"
51 #include "catalog/pg_authid.h"
52 #include "common/hashfn.h"
53 #include "executor/instrument.h"
54 #include "funcapi.h"
55 #include "mb/pg_wchar.h"
56 #include "miscadmin.h"
57 #include "optimizer/planner.h"
58 #include "parser/analyze.h"
59 #include "parser/parsetree.h"
60 #include "parser/scanner.h"
61 #include "parser/scansup.h"
62 #include "pgstat.h"
63 #include "storage/fd.h"
64 #include "storage/ipc.h"
65 #include "storage/lwlock.h"
66 #include "storage/shmem.h"
67 #include "storage/spin.h"
68 #include "tcop/utility.h"
69 #include "utils/acl.h"
70 #include "utils/builtins.h"
71 #include "utils/queryjumble.h"
72 #include "utils/memutils.h"
73 #include "utils/timestamp.h"
74
75 PG_MODULE_MAGIC;
76
/* Location of permanent stats file (valid when database is shut down) */
#define PGSS_DUMP_FILE	PGSTAT_STAT_PERMANENT_DIRECTORY "/pg_stat_statements.stat"

/*
 * Location of external query text file.  We don't keep it in the core
 * system's stats_temp_directory.  The core system can safely use that GUC
 * setting, because the statistics collector temp file paths are set only once
 * as part of changing the GUC, but pg_stat_statements has no way of avoiding
 * race conditions.  Besides, we only expect modest, infrequent I/O for query
 * strings, so placing the file on a faster filesystem is not compelling.
 */
#define PGSS_TEXT_FILE	PG_STAT_TMP_DIR "/pgss_query_texts.stat"

/*
 * Magic number identifying the stats file format.  Bump this whenever the
 * on-disk layout written by pgss_shmem_shutdown() changes.
 */
static const uint32 PGSS_FILE_HEADER = 0x20201227;

/* PostgreSQL major version number, changes in which invalidate all entries */
static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
95
/* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
#define USAGE_EXEC(duration)	(1.0)
#define USAGE_INIT				(1.0)	/* including initial planning */
#define ASSUMED_MEDIAN_INIT		(10.0)	/* initial assumed median usage */
#define ASSUMED_LENGTH_INIT		1024	/* initial assumed mean query length */
#define USAGE_DECREASE_FACTOR	(0.99)	/* decreased every entry_dealloc */
#define STICKY_DECREASE_FACTOR	(0.50)	/* factor for sticky entries */
#define USAGE_DEALLOC_PERCENT	5	/* free this % of entries at once */

/* An entry is "sticky" if it has been neither planned nor executed yet */
#define IS_STICKY(c)	((c.calls[PGSS_PLAN] + c.calls[PGSS_EXEC]) == 0)

/*
 * Utility statements that pgss_ProcessUtility and pgss_post_parse_analyze
 * ignore.
 */
#define PGSS_HANDLED_UTILITY(n)		(!IsA(n, ExecuteStmt) && \
									 !IsA(n, PrepareStmt) && \
									 !IsA(n, DeallocateStmt))
113
/*
 * Extension version number, for supporting older extension versions' objects
 */
typedef enum pgssVersion
{
	PGSS_V1_0 = 0,
	PGSS_V1_1,
	PGSS_V1_2,
	PGSS_V1_3,
	PGSS_V1_8,
	PGSS_V1_9
} pgssVersion;

/*
 * Which kind of statistics a given sample belongs to: planning or execution.
 */
typedef enum pgssStoreKind
{
	PGSS_INVALID = -1,			/* represents "no valid kind" */

	/*
	 * PGSS_PLAN and PGSS_EXEC must be respectively 0 and 1 as they're used to
	 * reference the underlying values in the arrays in the Counters struct,
	 * and this order is required in pg_stat_statements_internal().
	 */
	PGSS_PLAN = 0,
	PGSS_EXEC,

	PGSS_NUMKIND				/* Must be last value of this enum */
} pgssStoreKind;
141
/*
 * Hashtable key that defines the identity of a hashtable entry.  We separate
 * queries by user and by database even if they are otherwise identical.
 *
 * If you add a new key to this struct, make sure to teach pgss_store() to
 * zero the padding bytes.  Otherwise, things will break, because pgss_hash is
 * created using HASH_BLOBS, and thus tag_hash is used to hash this.
 */
typedef struct pgssHashKey
{
	Oid			userid;			/* user OID */
	Oid			dbid;			/* database OID */
	uint64		queryid;		/* query identifier */
	bool		toplevel;		/* query executed at top level */
} pgssHashKey;
158
/*
 * The actual stats counters kept within pgssEntry.
 *
 * Arrays dimensioned by PGSS_NUMKIND hold separate totals for planning
 * (index PGSS_PLAN) and execution (index PGSS_EXEC).  Per the file-header
 * locking notes, these fields are read/updated under the owning entry's
 * mutex spinlock.
 */
typedef struct Counters
{
	int64		calls[PGSS_NUMKIND];	/* # of times planned/executed */
	double		total_time[PGSS_NUMKIND];	/* total planning/execution time,
											 * in msec */
	double		min_time[PGSS_NUMKIND]; /* minimum planning/execution time in
										 * msec */
	double		max_time[PGSS_NUMKIND]; /* maximum planning/execution time in
										 * msec */
	double		mean_time[PGSS_NUMKIND];	/* mean planning/execution time in
											 * msec */
	double		sum_var_time[PGSS_NUMKIND]; /* sum of variances in
											 * planning/execution time in msec */
	int64		rows;			/* total # of retrieved or affected rows */
	int64		shared_blks_hit;	/* # of shared buffer hits */
	int64		shared_blks_read;	/* # of shared disk blocks read */
	int64		shared_blks_dirtied;	/* # of shared disk blocks dirtied */
	int64		shared_blks_written;	/* # of shared disk blocks written */
	int64		local_blks_hit; /* # of local buffer hits */
	int64		local_blks_read;	/* # of local disk blocks read */
	int64		local_blks_dirtied; /* # of local disk blocks dirtied */
	int64		local_blks_written; /* # of local disk blocks written */
	int64		temp_blks_read; /* # of temp blocks read */
	int64		temp_blks_written;	/* # of temp blocks written */
	double		blk_read_time;	/* time spent reading, in msec */
	double		blk_write_time; /* time spent writing, in msec */
	double		usage;			/* usage factor */
	int64		wal_records;	/* # of WAL records generated */
	int64		wal_fpi;		/* # of WAL full page images generated */
	uint64		wal_bytes;		/* total amount of WAL generated in bytes */
} Counters;
193
/*
 * Global statistics for pg_stat_statements
 */
typedef struct pgssGlobalStats
{
	int64		dealloc;		/* # of times entries were deallocated */
	TimestampTz stats_reset;	/* timestamp with all stats reset */
} pgssGlobalStats;

/*
 * Statistics per statement
 *
 * Note: in event of a failure in garbage collection of the query text file,
 * we reset query_offset to zero and query_len to -1.  This will be seen as
 * an invalid state by qtext_fetch().
 *
 * NOTE(review): pgss_shmem_shutdown() fwrite()s this struct verbatim, so its
 * layout doubles as the dump-file record format; changing it should go along
 * with a PGSS_FILE_HEADER bump.
 */
typedef struct pgssEntry
{
	pgssHashKey key;			/* hash key of entry - MUST BE FIRST */
	Counters	counters;		/* the statistics for this query */
	Size		query_offset;	/* query text offset in external file */
	int			query_len;		/* # of valid bytes in query string, or -1 */
	int			encoding;		/* query text encoding */
	slock_t		mutex;			/* protects the counters only */
} pgssEntry;

/*
 * Global shared state (one instance lives in shared memory; see
 * pgss_shmem_startup()).
 */
typedef struct pgssSharedState
{
	LWLock	   *lock;			/* protects hashtable search/modification */
	double		cur_median_usage;	/* current median usage in hashtable */
	Size		mean_query_len; /* current mean entry text length */
	slock_t		mutex;			/* protects following fields only: */
	Size		extent;			/* current extent of query file */
	int			n_writers;		/* number of active writers to query file */
	int			gc_count;		/* query file garbage collection cycle count */
	pgssGlobalStats stats;		/* global statistics for pgss */
} pgssSharedState;
234
/*---- Local variables ----*/

/* Current nesting depth of ExecutorRun+ProcessUtility calls */
static int	exec_nested_level = 0;

/* Current nesting depth of planner calls */
static int	plan_nested_level = 0;

/* Saved hook values in case of unload (restored by _PG_fini) */
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
static post_parse_analyze_hook_type prev_post_parse_analyze_hook = NULL;
static planner_hook_type prev_planner_hook = NULL;
static ExecutorStart_hook_type prev_ExecutorStart = NULL;
static ExecutorRun_hook_type prev_ExecutorRun = NULL;
static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
static ProcessUtility_hook_type prev_ProcessUtility = NULL;

/* Links to shared memory state (NULL until pgss_shmem_startup runs) */
static pgssSharedState *pgss = NULL;
static HTAB *pgss_hash = NULL;

/*---- GUC variables ----*/

typedef enum
{
	PGSS_TRACK_NONE,			/* track no statements */
	PGSS_TRACK_TOP,				/* only top level statements */
	PGSS_TRACK_ALL				/* all statements, including nested ones */
} PGSSTrackLevel;

/* Mapping of pg_stat_statements.track GUC strings to PGSSTrackLevel */
static const struct config_enum_entry track_options[] =
{
	{"none", PGSS_TRACK_NONE, false},
	{"top", PGSS_TRACK_TOP, false},
	{"all", PGSS_TRACK_ALL, false},
	{NULL, 0, false}
};

static int	pgss_max;			/* max # statements to track */
static int	pgss_track;			/* tracking level */
static bool pgss_track_utility; /* whether to track utility commands */
static bool pgss_track_planning;	/* whether to track planning duration */
static bool pgss_save;			/* whether to save stats across shutdown */


/*
 * True if we should track a statement at the given nesting level: never in
 * a parallel worker, and under "top" tracking only at level 0.
 */
#define pgss_enabled(level) \
	(!IsParallelWorker() && \
	(pgss_track == PGSS_TRACK_ALL || \
	(pgss_track == PGSS_TRACK_TOP && (level) == 0)))

/* Bump the query-text-file GC cycle counter, under the shared spinlock */
#define record_gc_qtexts() \
	do { \
		volatile pgssSharedState *s = (volatile pgssSharedState *) pgss; \
		SpinLockAcquire(&s->mutex); \
		s->gc_count++; \
		SpinLockRelease(&s->mutex); \
	} while(0)
293
/*---- Function declarations ----*/

void		_PG_init(void);
void		_PG_fini(void);

/* SQL-callable functions, one variant per supported extension version */
PG_FUNCTION_INFO_V1(pg_stat_statements_reset);
PG_FUNCTION_INFO_V1(pg_stat_statements_reset_1_7);
PG_FUNCTION_INFO_V1(pg_stat_statements_1_2);
PG_FUNCTION_INFO_V1(pg_stat_statements_1_3);
PG_FUNCTION_INFO_V1(pg_stat_statements_1_8);
PG_FUNCTION_INFO_V1(pg_stat_statements_1_9);
PG_FUNCTION_INFO_V1(pg_stat_statements);
PG_FUNCTION_INFO_V1(pg_stat_statements_info);

/* Hook implementations */
static void pgss_shmem_startup(void);
static void pgss_shmem_shutdown(int code, Datum arg);
static void pgss_post_parse_analyze(ParseState *pstate, Query *query,
									JumbleState *jstate);
static PlannedStmt *pgss_planner(Query *parse,
								 const char *query_string,
								 int cursorOptions,
								 ParamListInfo boundParams);
static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags);
static void pgss_ExecutorRun(QueryDesc *queryDesc,
							 ScanDirection direction,
							 uint64 count, bool execute_once);
static void pgss_ExecutorFinish(QueryDesc *queryDesc);
static void pgss_ExecutorEnd(QueryDesc *queryDesc);
static void pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
								bool readOnlyTree,
								ProcessUtilityContext context, ParamListInfo params,
								QueryEnvironment *queryEnv,
								DestReceiver *dest, QueryCompletion *qc);

/* Internal support routines */
static void pgss_store(const char *query, uint64 queryId,
					   int query_location, int query_len,
					   pgssStoreKind kind,
					   double total_time, uint64 rows,
					   const BufferUsage *bufusage,
					   const WalUsage *walusage,
					   JumbleState *jstate);
static void pg_stat_statements_internal(FunctionCallInfo fcinfo,
										pgssVersion api_version,
										bool showtext);
static Size pgss_memsize(void);
static pgssEntry *entry_alloc(pgssHashKey *key, Size query_offset, int query_len,
							  int encoding, bool sticky);
static void entry_dealloc(void);
static bool qtext_store(const char *query, int query_len,
						Size *query_offset, int *gc_count);
static char *qtext_load_file(Size *buffer_size);
static char *qtext_fetch(Size query_offset, int query_len,
						 char *buffer, Size buffer_size);
static bool need_gc_qtexts(void);
static void gc_qtexts(void);
static void entry_reset(Oid userid, Oid dbid, uint64 queryid);
static char *generate_normalized_query(JumbleState *jstate, const char *query,
									   int query_loc, int *query_len_p);
static void fill_in_constant_lengths(JumbleState *jstate, const char *query,
									 int query_loc);
static int	comp_location(const void *a, const void *b);
355
/*
 * Module load callback: register GUCs, reserve shared resources, and
 * install all of our hooks.  Only effective when loaded via
 * shared_preload_libraries.
 */
void
_PG_init(void)
{
	/*
	 * In order to create our shared memory area, we have to be loaded via
	 * shared_preload_libraries.  If not, fall out without hooking into any of
	 * the main system.  (We don't throw error here because it seems useful to
	 * allow the pg_stat_statements functions to be created even when the
	 * module isn't active.  The functions must protect themselves against
	 * being called then, however.)
	 */
	if (!process_shared_preload_libraries_in_progress)
		return;

	/*
	 * Inform the postmaster that we want to enable query_id calculation if
	 * compute_query_id is set to auto.
	 */
	EnableQueryId();

	/*
	 * Define (or redefine) custom GUC variables.
	 */
	DefineCustomIntVariable("pg_stat_statements.max",
							"Sets the maximum number of statements tracked by pg_stat_statements.",
							NULL,
							&pgss_max,
							5000,	/* boot value */
							100,	/* minimum */
							INT_MAX,	/* maximum */
							PGC_POSTMASTER, /* fixed at server start: sizes shmem */
							0,
							NULL,
							NULL,
							NULL);

	DefineCustomEnumVariable("pg_stat_statements.track",
							 "Selects which statements are tracked by pg_stat_statements.",
							 NULL,
							 &pgss_track,
							 PGSS_TRACK_TOP,	/* default: top-level only */
							 track_options,
							 PGC_SUSET,
							 0,
							 NULL,
							 NULL,
							 NULL);

	DefineCustomBoolVariable("pg_stat_statements.track_utility",
							 "Selects whether utility commands are tracked by pg_stat_statements.",
							 NULL,
							 &pgss_track_utility,
							 true,
							 PGC_SUSET,
							 0,
							 NULL,
							 NULL,
							 NULL);

	DefineCustomBoolVariable("pg_stat_statements.track_planning",
							 "Selects whether planning duration is tracked by pg_stat_statements.",
							 NULL,
							 &pgss_track_planning,
							 false,
							 PGC_SUSET,
							 0,
							 NULL,
							 NULL,
							 NULL);

	DefineCustomBoolVariable("pg_stat_statements.save",
							 "Save pg_stat_statements statistics across server shutdowns.",
							 NULL,
							 &pgss_save,
							 true,
							 PGC_SIGHUP,
							 0,
							 NULL,
							 NULL,
							 NULL);

	/* Warn about any unrecognized pg_stat_statements.* placeholder GUCs */
	EmitWarningsOnPlaceholders("pg_stat_statements");

	/*
	 * Request additional shared resources.  (These are no-ops if we're not in
	 * the postmaster process.)  We'll allocate or attach to the shared
	 * resources in pgss_shmem_startup().
	 */
	RequestAddinShmemSpace(pgss_memsize());
	RequestNamedLWLockTranche("pg_stat_statements", 1);

	/*
	 * Install hooks.  Previous values are saved so _PG_fini() can restore
	 * them.
	 */
	prev_shmem_startup_hook = shmem_startup_hook;
	shmem_startup_hook = pgss_shmem_startup;
	prev_post_parse_analyze_hook = post_parse_analyze_hook;
	post_parse_analyze_hook = pgss_post_parse_analyze;
	prev_planner_hook = planner_hook;
	planner_hook = pgss_planner;
	prev_ExecutorStart = ExecutorStart_hook;
	ExecutorStart_hook = pgss_ExecutorStart;
	prev_ExecutorRun = ExecutorRun_hook;
	ExecutorRun_hook = pgss_ExecutorRun;
	prev_ExecutorFinish = ExecutorFinish_hook;
	ExecutorFinish_hook = pgss_ExecutorFinish;
	prev_ExecutorEnd = ExecutorEnd_hook;
	ExecutorEnd_hook = pgss_ExecutorEnd;
	prev_ProcessUtility = ProcessUtility_hook;
	ProcessUtility_hook = pgss_ProcessUtility;
}
470
471 /*
472 * Module unload callback
473 */
474 void
_PG_fini(void)475 _PG_fini(void)
476 {
477 /* Uninstall hooks. */
478 shmem_startup_hook = prev_shmem_startup_hook;
479 post_parse_analyze_hook = prev_post_parse_analyze_hook;
480 planner_hook = prev_planner_hook;
481 ExecutorStart_hook = prev_ExecutorStart;
482 ExecutorRun_hook = prev_ExecutorRun;
483 ExecutorFinish_hook = prev_ExecutorFinish;
484 ExecutorEnd_hook = prev_ExecutorEnd;
485 ProcessUtility_hook = prev_ProcessUtility;
486 }
487
488 /*
489 * shmem_startup hook: allocate or attach to shared memory,
490 * then load any pre-existing statistics from file.
491 * Also create and load the query-texts file, which is expected to exist
492 * (even if empty) while the module is enabled.
493 */
494 static void
pgss_shmem_startup(void)495 pgss_shmem_startup(void)
496 {
497 bool found;
498 HASHCTL info;
499 FILE *file = NULL;
500 FILE *qfile = NULL;
501 uint32 header;
502 int32 num;
503 int32 pgver;
504 int32 i;
505 int buffer_size;
506 char *buffer = NULL;
507
508 if (prev_shmem_startup_hook)
509 prev_shmem_startup_hook();
510
511 /* reset in case this is a restart within the postmaster */
512 pgss = NULL;
513 pgss_hash = NULL;
514
515 /*
516 * Create or attach to the shared memory state, including hash table
517 */
518 LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
519
520 pgss = ShmemInitStruct("pg_stat_statements",
521 sizeof(pgssSharedState),
522 &found);
523
524 if (!found)
525 {
526 /* First time through ... */
527 pgss->lock = &(GetNamedLWLockTranche("pg_stat_statements"))->lock;
528 pgss->cur_median_usage = ASSUMED_MEDIAN_INIT;
529 pgss->mean_query_len = ASSUMED_LENGTH_INIT;
530 SpinLockInit(&pgss->mutex);
531 pgss->extent = 0;
532 pgss->n_writers = 0;
533 pgss->gc_count = 0;
534 pgss->stats.dealloc = 0;
535 pgss->stats.stats_reset = GetCurrentTimestamp();
536 }
537
538 info.keysize = sizeof(pgssHashKey);
539 info.entrysize = sizeof(pgssEntry);
540 pgss_hash = ShmemInitHash("pg_stat_statements hash",
541 pgss_max, pgss_max,
542 &info,
543 HASH_ELEM | HASH_BLOBS);
544
545 LWLockRelease(AddinShmemInitLock);
546
547 /*
548 * If we're in the postmaster (or a standalone backend...), set up a shmem
549 * exit hook to dump the statistics to disk.
550 */
551 if (!IsUnderPostmaster)
552 on_shmem_exit(pgss_shmem_shutdown, (Datum) 0);
553
554 /*
555 * Done if some other process already completed our initialization.
556 */
557 if (found)
558 return;
559
560 /*
561 * Note: we don't bother with locks here, because there should be no other
562 * processes running when this code is reached.
563 */
564
565 /* Unlink query text file possibly left over from crash */
566 unlink(PGSS_TEXT_FILE);
567
568 /* Allocate new query text temp file */
569 qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
570 if (qfile == NULL)
571 goto write_error;
572
573 /*
574 * If we were told not to load old statistics, we're done. (Note we do
575 * not try to unlink any old dump file in this case. This seems a bit
576 * questionable but it's the historical behavior.)
577 */
578 if (!pgss_save)
579 {
580 FreeFile(qfile);
581 return;
582 }
583
584 /*
585 * Attempt to load old statistics from the dump file.
586 */
587 file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_R);
588 if (file == NULL)
589 {
590 if (errno != ENOENT)
591 goto read_error;
592 /* No existing persisted stats file, so we're done */
593 FreeFile(qfile);
594 return;
595 }
596
597 buffer_size = 2048;
598 buffer = (char *) palloc(buffer_size);
599
600 if (fread(&header, sizeof(uint32), 1, file) != 1 ||
601 fread(&pgver, sizeof(uint32), 1, file) != 1 ||
602 fread(&num, sizeof(int32), 1, file) != 1)
603 goto read_error;
604
605 if (header != PGSS_FILE_HEADER ||
606 pgver != PGSS_PG_MAJOR_VERSION)
607 goto data_error;
608
609 for (i = 0; i < num; i++)
610 {
611 pgssEntry temp;
612 pgssEntry *entry;
613 Size query_offset;
614
615 if (fread(&temp, sizeof(pgssEntry), 1, file) != 1)
616 goto read_error;
617
618 /* Encoding is the only field we can easily sanity-check */
619 if (!PG_VALID_BE_ENCODING(temp.encoding))
620 goto data_error;
621
622 /* Resize buffer as needed */
623 if (temp.query_len >= buffer_size)
624 {
625 buffer_size = Max(buffer_size * 2, temp.query_len + 1);
626 buffer = repalloc(buffer, buffer_size);
627 }
628
629 if (fread(buffer, 1, temp.query_len + 1, file) != temp.query_len + 1)
630 goto read_error;
631
632 /* Should have a trailing null, but let's make sure */
633 buffer[temp.query_len] = '\0';
634
635 /* Skip loading "sticky" entries */
636 if (IS_STICKY(temp.counters))
637 continue;
638
639 /* Store the query text */
640 query_offset = pgss->extent;
641 if (fwrite(buffer, 1, temp.query_len + 1, qfile) != temp.query_len + 1)
642 goto write_error;
643 pgss->extent += temp.query_len + 1;
644
645 /* make the hashtable entry (discards old entries if too many) */
646 entry = entry_alloc(&temp.key, query_offset, temp.query_len,
647 temp.encoding,
648 false);
649
650 /* copy in the actual stats */
651 entry->counters = temp.counters;
652 }
653
654 /* Read global statistics for pg_stat_statements */
655 if (fread(&pgss->stats, sizeof(pgssGlobalStats), 1, file) != 1)
656 goto read_error;
657
658 pfree(buffer);
659 FreeFile(file);
660 FreeFile(qfile);
661
662 /*
663 * Remove the persisted stats file so it's not included in
664 * backups/replication standbys, etc. A new file will be written on next
665 * shutdown.
666 *
667 * Note: it's okay if the PGSS_TEXT_FILE is included in a basebackup,
668 * because we remove that file on startup; it acts inversely to
669 * PGSS_DUMP_FILE, in that it is only supposed to be around when the
670 * server is running, whereas PGSS_DUMP_FILE is only supposed to be around
671 * when the server is not running. Leaving the file creates no danger of
672 * a newly restored database having a spurious record of execution costs,
673 * which is what we're really concerned about here.
674 */
675 unlink(PGSS_DUMP_FILE);
676
677 return;
678
679 read_error:
680 ereport(LOG,
681 (errcode_for_file_access(),
682 errmsg("could not read file \"%s\": %m",
683 PGSS_DUMP_FILE)));
684 goto fail;
685 data_error:
686 ereport(LOG,
687 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
688 errmsg("ignoring invalid data in file \"%s\"",
689 PGSS_DUMP_FILE)));
690 goto fail;
691 write_error:
692 ereport(LOG,
693 (errcode_for_file_access(),
694 errmsg("could not write file \"%s\": %m",
695 PGSS_TEXT_FILE)));
696 fail:
697 if (buffer)
698 pfree(buffer);
699 if (file)
700 FreeFile(file);
701 if (qfile)
702 FreeFile(qfile);
703 /* If possible, throw away the bogus file; ignore any error */
704 unlink(PGSS_DUMP_FILE);
705
706 /*
707 * Don't unlink PGSS_TEXT_FILE here; it should always be around while the
708 * server is running with pg_stat_statements enabled
709 */
710 }
711
/*
 * shmem_shutdown hook: Dump statistics into file.
 *
 * Note: we don't bother with acquiring lock, because there should be no
 * other processes running when this is called.
 */
static void
pgss_shmem_shutdown(int code, Datum arg)
{
	FILE	   *file;
	char	   *qbuffer = NULL;
	Size		qbuffer_size = 0;
	HASH_SEQ_STATUS hash_seq;
	int32		num_entries;
	pgssEntry  *entry;

	/* Don't try to dump during a crash. */
	if (code)
		return;

	/* Safety check ... shouldn't get here unless shmem is set up. */
	if (!pgss || !pgss_hash)
		return;

	/* Don't dump if told not to. */
	if (!pgss_save)
		return;

	/* Write to a ".tmp" file first; it's renamed into place at the end. */
	file = AllocateFile(PGSS_DUMP_FILE ".tmp", PG_BINARY_W);
	if (file == NULL)
		goto error;

	/* File header: magic number, PG major version, then the entry count. */
	if (fwrite(&PGSS_FILE_HEADER, sizeof(uint32), 1, file) != 1)
		goto error;
	if (fwrite(&PGSS_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
		goto error;
	num_entries = hash_get_num_entries(pgss_hash);
	if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
		goto error;

	/* Load the whole external query-text file for random-access fetches. */
	qbuffer = qtext_load_file(&qbuffer_size);
	if (qbuffer == NULL)
		goto error;

	/*
	 * When serializing to disk, we store query texts immediately after their
	 * entry data.  Any orphaned query texts are thereby excluded.
	 */
	hash_seq_init(&hash_seq, pgss_hash);
	while ((entry = hash_seq_search(&hash_seq)) != NULL)
	{
		int			len = entry->query_len;
		char	   *qstr = qtext_fetch(entry->query_offset, len,
									   qbuffer, qbuffer_size);

		if (qstr == NULL)
			continue;			/* Ignore any entries with bogus texts */

		/* Write the entry struct, then its text including the trailing NUL */
		if (fwrite(entry, sizeof(pgssEntry), 1, file) != 1 ||
			fwrite(qstr, 1, len + 1, file) != len + 1)
		{
			/* note: we assume hash_seq_term won't change errno */
			hash_seq_term(&hash_seq);
			goto error;
		}
	}

	/* Dump global statistics for pg_stat_statements */
	if (fwrite(&pgss->stats, sizeof(pgssGlobalStats), 1, file) != 1)
		goto error;

	free(qbuffer);
	qbuffer = NULL;

	/* A FreeFile failure here is treated as a write error */
	if (FreeFile(file))
	{
		file = NULL;
		goto error;
	}

	/*
	 * Rename file into place, so we atomically replace any old one.
	 */
	(void) durable_rename(PGSS_DUMP_FILE ".tmp", PGSS_DUMP_FILE, LOG);

	/* Unlink query-texts file; it's not needed while shutdown */
	unlink(PGSS_TEXT_FILE);

	return;

error:
	ereport(LOG,
			(errcode_for_file_access(),
			 errmsg("could not write file \"%s\": %m",
					PGSS_DUMP_FILE ".tmp")));
	/* Clean up partial state; ignore secondary failures */
	if (qbuffer)
		free(qbuffer);
	if (file)
		FreeFile(file);
	unlink(PGSS_DUMP_FILE ".tmp");
	unlink(PGSS_TEXT_FILE);
}
814
815 /*
816 * Post-parse-analysis hook: mark query with a queryId
817 */
818 static void
pgss_post_parse_analyze(ParseState * pstate,Query * query,JumbleState * jstate)819 pgss_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate)
820 {
821 if (prev_post_parse_analyze_hook)
822 prev_post_parse_analyze_hook(pstate, query, jstate);
823
824 /* Safety check... */
825 if (!pgss || !pgss_hash || !pgss_enabled(exec_nested_level))
826 return;
827
828 /*
829 * Clear queryId for prepared statements related utility, as those will
830 * inherit from the underlying statement's one (except DEALLOCATE which is
831 * entirely untracked).
832 */
833 if (query->utilityStmt)
834 {
835 if (pgss_track_utility && !PGSS_HANDLED_UTILITY(query->utilityStmt))
836 query->queryId = UINT64CONST(0);
837 return;
838 }
839
840 /*
841 * If query jumbling were able to identify any ignorable constants, we
842 * immediately create a hash table entry for the query, so that we can
843 * record the normalized form of the query string. If there were no such
844 * constants, the normalized string would be the same as the query text
845 * anyway, so there's no need for an early entry.
846 */
847 if (jstate && jstate->clocations_count > 0)
848 pgss_store(pstate->p_sourcetext,
849 query->queryId,
850 query->stmt_location,
851 query->stmt_len,
852 PGSS_INVALID,
853 0,
854 0,
855 NULL,
856 NULL,
857 jstate);
858 }
859
/*
 * Planner hook: forward to regular planner, but measure planning time
 * if needed.
 */
static PlannedStmt *
pgss_planner(Query *parse,
			 const char *query_string,
			 int cursorOptions,
			 ParamListInfo boundParams)
{
	PlannedStmt *result;

	/*
	 * We can't process the query if no query_string is provided, as
	 * pgss_store needs it.  We also ignore query without queryid, as it would
	 * be treated as a utility statement, which may not be the case.
	 *
	 * Note that planner_hook can be called from the planner itself, so we
	 * have a specific nesting level for the planner.  However, utility
	 * commands containing optimizable statements can also call the planner,
	 * same for regular DML (for instance for underlying foreign key queries).
	 * So testing the planner nesting level only is not enough to detect real
	 * top level planner call.
	 */
	if (pgss_enabled(plan_nested_level + exec_nested_level)
		&& pgss_track_planning && query_string
		&& parse->queryId != UINT64CONST(0))
	{
		instr_time	start;
		instr_time	duration;
		BufferUsage bufusage_start,
					bufusage;
		WalUsage	walusage_start,
					walusage;

		/* We need to track buffer usage as the planner can access them. */
		bufusage_start = pgBufferUsage;

		/*
		 * Similarly the planner could write some WAL records in some cases
		 * (e.g. setting a hint bit with those being WAL-logged)
		 */
		walusage_start = pgWalUsage;
		INSTR_TIME_SET_CURRENT(start);

		plan_nested_level++;
		PG_TRY();
		{
			if (prev_planner_hook)
				result = prev_planner_hook(parse, query_string, cursorOptions,
										   boundParams);
			else
				result = standard_planner(parse, query_string, cursorOptions,
										  boundParams);
		}
		PG_FINALLY();
		{
			/* Undo the depth bump on both normal and error exit */
			plan_nested_level--;
		}
		PG_END_TRY();

		INSTR_TIME_SET_CURRENT(duration);
		INSTR_TIME_SUBTRACT(duration, start);

		/* calc differences of buffer counters. */
		memset(&bufusage, 0, sizeof(BufferUsage));
		BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);

		/* calc differences of WAL counters. */
		memset(&walusage, 0, sizeof(WalUsage));
		WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);

		/* Record the planning sample (kind PGSS_PLAN; rows is always 0). */
		pgss_store(query_string,
				   parse->queryId,
				   parse->stmt_location,
				   parse->stmt_len,
				   PGSS_PLAN,
				   INSTR_TIME_GET_MILLISEC(duration),
				   0,
				   &bufusage,
				   &walusage,
				   NULL);
	}
	else
	{
		/* Planning-time tracking not wanted: just chain to the planner. */
		if (prev_planner_hook)
			result = prev_planner_hook(parse, query_string, cursorOptions,
									   boundParams);
		else
			result = standard_planner(parse, query_string, cursorOptions,
									  boundParams);
	}

	return result;
}
955
956 /*
957 * ExecutorStart hook: start up tracking if needed
958 */
959 static void
pgss_ExecutorStart(QueryDesc * queryDesc,int eflags)960 pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
961 {
962 if (prev_ExecutorStart)
963 prev_ExecutorStart(queryDesc, eflags);
964 else
965 standard_ExecutorStart(queryDesc, eflags);
966
967 /*
968 * If query has queryId zero, don't track it. This prevents double
969 * counting of optimizable statements that are directly contained in
970 * utility statements.
971 */
972 if (pgss_enabled(exec_nested_level) && queryDesc->plannedstmt->queryId != UINT64CONST(0))
973 {
974 /*
975 * Set up to track total elapsed time in ExecutorRun. Make sure the
976 * space is allocated in the per-query context so it will go away at
977 * ExecutorEnd.
978 */
979 if (queryDesc->totaltime == NULL)
980 {
981 MemoryContext oldcxt;
982
983 oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
984 queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
985 MemoryContextSwitchTo(oldcxt);
986 }
987 }
988 }
989
990 /*
991 * ExecutorRun hook: all we need do is track nesting depth
992 */
993 static void
pgss_ExecutorRun(QueryDesc * queryDesc,ScanDirection direction,uint64 count,bool execute_once)994 pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
995 bool execute_once)
996 {
997 exec_nested_level++;
998 PG_TRY();
999 {
1000 if (prev_ExecutorRun)
1001 prev_ExecutorRun(queryDesc, direction, count, execute_once);
1002 else
1003 standard_ExecutorRun(queryDesc, direction, count, execute_once);
1004 }
1005 PG_FINALLY();
1006 {
1007 exec_nested_level--;
1008 }
1009 PG_END_TRY();
1010 }
1011
1012 /*
1013 * ExecutorFinish hook: all we need do is track nesting depth
1014 */
1015 static void
pgss_ExecutorFinish(QueryDesc * queryDesc)1016 pgss_ExecutorFinish(QueryDesc *queryDesc)
1017 {
1018 exec_nested_level++;
1019 PG_TRY();
1020 {
1021 if (prev_ExecutorFinish)
1022 prev_ExecutorFinish(queryDesc);
1023 else
1024 standard_ExecutorFinish(queryDesc);
1025 }
1026 PG_FINALLY();
1027 {
1028 exec_nested_level--;
1029 }
1030 PG_END_TRY();
1031 }
1032
1033 /*
1034 * ExecutorEnd hook: store results if needed
1035 */
1036 static void
pgss_ExecutorEnd(QueryDesc * queryDesc)1037 pgss_ExecutorEnd(QueryDesc *queryDesc)
1038 {
1039 uint64 queryId = queryDesc->plannedstmt->queryId;
1040
1041 if (queryId != UINT64CONST(0) && queryDesc->totaltime &&
1042 pgss_enabled(exec_nested_level))
1043 {
1044 /*
1045 * Make sure stats accumulation is done. (Note: it's okay if several
1046 * levels of hook all do this.)
1047 */
1048 InstrEndLoop(queryDesc->totaltime);
1049
1050 pgss_store(queryDesc->sourceText,
1051 queryId,
1052 queryDesc->plannedstmt->stmt_location,
1053 queryDesc->plannedstmt->stmt_len,
1054 PGSS_EXEC,
1055 queryDesc->totaltime->total * 1000.0, /* convert to msec */
1056 queryDesc->estate->es_processed,
1057 &queryDesc->totaltime->bufusage,
1058 &queryDesc->totaltime->walusage,
1059 NULL);
1060 }
1061
1062 if (prev_ExecutorEnd)
1063 prev_ExecutorEnd(queryDesc);
1064 else
1065 standard_ExecutorEnd(queryDesc);
1066 }
1067
1068 /*
1069 * ProcessUtility hook
1070 */
1071 static void
pgss_ProcessUtility(PlannedStmt * pstmt,const char * queryString,bool readOnlyTree,ProcessUtilityContext context,ParamListInfo params,QueryEnvironment * queryEnv,DestReceiver * dest,QueryCompletion * qc)1072 pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
1073 bool readOnlyTree,
1074 ProcessUtilityContext context,
1075 ParamListInfo params, QueryEnvironment *queryEnv,
1076 DestReceiver *dest, QueryCompletion *qc)
1077 {
1078 Node *parsetree = pstmt->utilityStmt;
1079 uint64 saved_queryId = pstmt->queryId;
1080
1081 /*
1082 * Force utility statements to get queryId zero. We do this even in cases
1083 * where the statement contains an optimizable statement for which a
1084 * queryId could be derived (such as EXPLAIN or DECLARE CURSOR). For such
1085 * cases, runtime control will first go through ProcessUtility and then
1086 * the executor, and we don't want the executor hooks to do anything,
1087 * since we are already measuring the statement's costs at the utility
1088 * level.
1089 *
1090 * Note that this is only done if pg_stat_statements is enabled and
1091 * configured to track utility statements, in the unlikely possibility
1092 * that user configured another extension to handle utility statements
1093 * only.
1094 */
1095 if (pgss_enabled(exec_nested_level) && pgss_track_utility)
1096 pstmt->queryId = UINT64CONST(0);
1097
1098 /*
1099 * If it's an EXECUTE statement, we don't track it and don't increment the
1100 * nesting level. This allows the cycles to be charged to the underlying
1101 * PREPARE instead (by the Executor hooks), which is much more useful.
1102 *
1103 * We also don't track execution of PREPARE. If we did, we would get one
1104 * hash table entry for the PREPARE (with hash calculated from the query
1105 * string), and then a different one with the same query string (but hash
1106 * calculated from the query tree) would be used to accumulate costs of
1107 * ensuing EXECUTEs. This would be confusing, and inconsistent with other
1108 * cases where planning time is not included at all.
1109 *
1110 * Likewise, we don't track execution of DEALLOCATE.
1111 */
1112 if (pgss_track_utility && pgss_enabled(exec_nested_level) &&
1113 PGSS_HANDLED_UTILITY(parsetree))
1114 {
1115 instr_time start;
1116 instr_time duration;
1117 uint64 rows;
1118 BufferUsage bufusage_start,
1119 bufusage;
1120 WalUsage walusage_start,
1121 walusage;
1122
1123 bufusage_start = pgBufferUsage;
1124 walusage_start = pgWalUsage;
1125 INSTR_TIME_SET_CURRENT(start);
1126
1127 exec_nested_level++;
1128 PG_TRY();
1129 {
1130 if (prev_ProcessUtility)
1131 prev_ProcessUtility(pstmt, queryString, readOnlyTree,
1132 context, params, queryEnv,
1133 dest, qc);
1134 else
1135 standard_ProcessUtility(pstmt, queryString, readOnlyTree,
1136 context, params, queryEnv,
1137 dest, qc);
1138 }
1139 PG_FINALLY();
1140 {
1141 exec_nested_level--;
1142 }
1143 PG_END_TRY();
1144
1145 INSTR_TIME_SET_CURRENT(duration);
1146 INSTR_TIME_SUBTRACT(duration, start);
1147
1148 /*
1149 * Track the total number of rows retrieved or affected by the utility
1150 * statements of COPY, FETCH, CREATE TABLE AS, CREATE MATERIALIZED
1151 * VIEW, REFRESH MATERIALIZED VIEW and SELECT INTO.
1152 */
1153 rows = (qc && (qc->commandTag == CMDTAG_COPY ||
1154 qc->commandTag == CMDTAG_FETCH ||
1155 qc->commandTag == CMDTAG_SELECT ||
1156 qc->commandTag == CMDTAG_REFRESH_MATERIALIZED_VIEW)) ?
1157 qc->nprocessed : 0;
1158
1159 /* calc differences of buffer counters. */
1160 memset(&bufusage, 0, sizeof(BufferUsage));
1161 BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
1162
1163 /* calc differences of WAL counters. */
1164 memset(&walusage, 0, sizeof(WalUsage));
1165 WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
1166
1167 pgss_store(queryString,
1168 saved_queryId,
1169 pstmt->stmt_location,
1170 pstmt->stmt_len,
1171 PGSS_EXEC,
1172 INSTR_TIME_GET_MILLISEC(duration),
1173 rows,
1174 &bufusage,
1175 &walusage,
1176 NULL);
1177 }
1178 else
1179 {
1180 if (prev_ProcessUtility)
1181 prev_ProcessUtility(pstmt, queryString, readOnlyTree,
1182 context, params, queryEnv,
1183 dest, qc);
1184 else
1185 standard_ProcessUtility(pstmt, queryString, readOnlyTree,
1186 context, params, queryEnv,
1187 dest, qc);
1188 }
1189 }
1190
1191 /*
1192 * Store some statistics for a statement.
1193 *
1194 * If queryId is 0 then this is a utility statement for which we couldn't
1195 * compute a queryId during parse analysis, and we should compute a suitable
1196 * queryId internally.
1197 *
1198 * If jstate is not NULL then we're trying to create an entry for which
1199 * we have no statistics as yet; we just want to record the normalized
1200 * query string. total_time, rows, bufusage and walusage are ignored in this
1201 * case.
1202 *
1203 * If kind is PGSS_PLAN or PGSS_EXEC, its value is used as the array position
1204 * for the arrays in the Counters field.
1205 */
static void
pgss_store(const char *query, uint64 queryId,
		   int query_location, int query_len,
		   pgssStoreKind kind,
		   double total_time, uint64 rows,
		   const BufferUsage *bufusage,
		   const WalUsage *walusage,
		   JumbleState *jstate)
{
	pgssHashKey key;
	pgssEntry  *entry;
	char	   *norm_query = NULL;
	int			encoding = GetDatabaseEncoding();

	Assert(query != NULL);

	/* Safety check... */
	if (!pgss || !pgss_hash)
		return;

	/*
	 * Nothing to do if compute_query_id isn't enabled and no other module
	 * computed a query identifier.
	 */
	if (queryId == UINT64CONST(0))
		return;

	/*
	 * Confine our attention to the relevant part of the string, if the query
	 * is a portion of a multi-statement source string, and update query
	 * location and length if needed.
	 */
	query = CleanQuerytext(query, &query_location, &query_len);

	/* Set up key for hashtable search */

	/* memset() is required when pgssHashKey is without padding only */
	memset(&key, 0, sizeof(pgssHashKey));

	key.userid = GetUserId();
	key.dbid = MyDatabaseId;
	key.queryid = queryId;
	key.toplevel = (exec_nested_level == 0);

	/* Lookup the hash table entry with shared lock. */
	LWLockAcquire(pgss->lock, LW_SHARED);

	entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);

	/* Create new entry, if not present */
	if (!entry)
	{
		Size		query_offset;
		int			gc_count;
		bool		stored;
		bool		do_gc;

		/*
		 * Create a new, normalized query string if caller asked.  We don't
		 * need to hold the lock while doing this work.  (Note: in any case,
		 * it's possible that someone else creates a duplicate hashtable entry
		 * in the interval where we don't hold the lock below.  That case is
		 * handled by entry_alloc.)
		 */
		if (jstate)
		{
			LWLockRelease(pgss->lock);
			norm_query = generate_normalized_query(jstate, query,
												   query_location,
												   &query_len);
			LWLockAcquire(pgss->lock, LW_SHARED);
		}

		/* Append new query text to file with only shared lock held */
		stored = qtext_store(norm_query ? norm_query : query, query_len,
							 &query_offset, &gc_count);

		/*
		 * Determine whether we need to garbage collect external query texts
		 * while the shared lock is still held.  This micro-optimization
		 * avoids taking the time to decide this while holding exclusive lock.
		 */
		do_gc = need_gc_qtexts();

		/* Need exclusive lock to make a new hashtable entry - promote */
		LWLockRelease(pgss->lock);
		LWLockAcquire(pgss->lock, LW_EXCLUSIVE);

		/*
		 * A garbage collection may have occurred while we weren't holding the
		 * lock.  In the unlikely event that this happens, the query text we
		 * stored above will have been garbage collected, so write it again.
		 * This should be infrequent enough that doing it while holding
		 * exclusive lock isn't a performance problem.
		 */
		if (!stored || pgss->gc_count != gc_count)
			stored = qtext_store(norm_query ? norm_query : query, query_len,
								 &query_offset, NULL);

		/* If we failed to write to the text file, give up */
		if (!stored)
			goto done;

		/* OK to create a new hashtable entry */
		entry = entry_alloc(&key, query_offset, query_len, encoding,
							jstate != NULL);

		/* If needed, perform garbage collection while exclusive lock held */
		if (do_gc)
			gc_qtexts();
	}

	/* Increment the counts, except when jstate is not NULL */
	if (!jstate)
	{
		/*
		 * Grab the spinlock while updating the counters (see comment about
		 * locking rules at the head of the file)
		 */
		volatile pgssEntry *e = (volatile pgssEntry *) entry;

		Assert(kind == PGSS_PLAN || kind == PGSS_EXEC);

		SpinLockAcquire(&e->mutex);

		/* "Unstick" entry if it was previously sticky */
		if (IS_STICKY(e->counters))
			e->counters.usage = USAGE_INIT;

		e->counters.calls[kind] += 1;
		e->counters.total_time[kind] += total_time;

		/* First call of this kind initializes min/max/mean directly */
		if (e->counters.calls[kind] == 1)
		{
			e->counters.min_time[kind] = total_time;
			e->counters.max_time[kind] = total_time;
			e->counters.mean_time[kind] = total_time;
		}
		else
		{
			/*
			 * Welford's method for accurately computing variance. See
			 * <http://www.johndcook.com/blog/standard_deviation/>
			 */
			double		old_mean = e->counters.mean_time[kind];

			e->counters.mean_time[kind] +=
				(total_time - old_mean) / e->counters.calls[kind];
			e->counters.sum_var_time[kind] +=
				(total_time - old_mean) * (total_time - e->counters.mean_time[kind]);

			/* calculate min and max time */
			if (e->counters.min_time[kind] > total_time)
				e->counters.min_time[kind] = total_time;
			if (e->counters.max_time[kind] < total_time)
				e->counters.max_time[kind] = total_time;
		}
		/* Accumulate row count and per-call resource usage deltas */
		e->counters.rows += rows;
		e->counters.shared_blks_hit += bufusage->shared_blks_hit;
		e->counters.shared_blks_read += bufusage->shared_blks_read;
		e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
		e->counters.shared_blks_written += bufusage->shared_blks_written;
		e->counters.local_blks_hit += bufusage->local_blks_hit;
		e->counters.local_blks_read += bufusage->local_blks_read;
		e->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
		e->counters.local_blks_written += bufusage->local_blks_written;
		e->counters.temp_blks_read += bufusage->temp_blks_read;
		e->counters.temp_blks_written += bufusage->temp_blks_written;
		e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time);
		e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time);
		e->counters.usage += USAGE_EXEC(total_time);
		e->counters.wal_records += walusage->wal_records;
		e->counters.wal_fpi += walusage->wal_fpi;
		e->counters.wal_bytes += walusage->wal_bytes;

		SpinLockRelease(&e->mutex);
	}

done:
	LWLockRelease(pgss->lock);

	/* We postpone this clean-up until we're out of the lock */
	if (norm_query)
		pfree(norm_query);
}
1391
1392 /*
1393 * Reset statement statistics corresponding to userid, dbid, and queryid.
1394 */
1395 Datum
pg_stat_statements_reset_1_7(PG_FUNCTION_ARGS)1396 pg_stat_statements_reset_1_7(PG_FUNCTION_ARGS)
1397 {
1398 Oid userid;
1399 Oid dbid;
1400 uint64 queryid;
1401
1402 userid = PG_GETARG_OID(0);
1403 dbid = PG_GETARG_OID(1);
1404 queryid = (uint64) PG_GETARG_INT64(2);
1405
1406 entry_reset(userid, dbid, queryid);
1407
1408 PG_RETURN_VOID();
1409 }
1410
1411 /*
1412 * Reset statement statistics.
1413 */
Datum
pg_stat_statements_reset(PG_FUNCTION_ARGS)
{
	/* All-zero arguments mean "match everything": discard every entry */
	entry_reset(0, 0, 0);

	PG_RETURN_VOID();
}
1421
1422 /* Number of output arguments (columns) for various API versions */
1423 #define PG_STAT_STATEMENTS_COLS_V1_0 14
1424 #define PG_STAT_STATEMENTS_COLS_V1_1 18
1425 #define PG_STAT_STATEMENTS_COLS_V1_2 19
1426 #define PG_STAT_STATEMENTS_COLS_V1_3 23
1427 #define PG_STAT_STATEMENTS_COLS_V1_8 32
1428 #define PG_STAT_STATEMENTS_COLS_V1_9 33
1429 #define PG_STAT_STATEMENTS_COLS 33 /* maximum of above */
1430
1431 /*
1432 * Retrieve statement statistics.
1433 *
1434 * The SQL API of this function has changed multiple times, and will likely
1435 * do so again in future. To support the case where a newer version of this
1436 * loadable module is being used with an old SQL declaration of the function,
1437 * we continue to support the older API versions. For 1.2 and later, the
1438 * expected API version is identified by embedding it in the C name of the
1439 * function. Unfortunately we weren't bright enough to do that for 1.1.
1440 */
1441 Datum
pg_stat_statements_1_9(PG_FUNCTION_ARGS)1442 pg_stat_statements_1_9(PG_FUNCTION_ARGS)
1443 {
1444 bool showtext = PG_GETARG_BOOL(0);
1445
1446 pg_stat_statements_internal(fcinfo, PGSS_V1_9, showtext);
1447
1448 return (Datum) 0;
1449 }
1450
1451 Datum
pg_stat_statements_1_8(PG_FUNCTION_ARGS)1452 pg_stat_statements_1_8(PG_FUNCTION_ARGS)
1453 {
1454 bool showtext = PG_GETARG_BOOL(0);
1455
1456 pg_stat_statements_internal(fcinfo, PGSS_V1_8, showtext);
1457
1458 return (Datum) 0;
1459 }
1460
1461 Datum
pg_stat_statements_1_3(PG_FUNCTION_ARGS)1462 pg_stat_statements_1_3(PG_FUNCTION_ARGS)
1463 {
1464 bool showtext = PG_GETARG_BOOL(0);
1465
1466 pg_stat_statements_internal(fcinfo, PGSS_V1_3, showtext);
1467
1468 return (Datum) 0;
1469 }
1470
1471 Datum
pg_stat_statements_1_2(PG_FUNCTION_ARGS)1472 pg_stat_statements_1_2(PG_FUNCTION_ARGS)
1473 {
1474 bool showtext = PG_GETARG_BOOL(0);
1475
1476 pg_stat_statements_internal(fcinfo, PGSS_V1_2, showtext);
1477
1478 return (Datum) 0;
1479 }
1480
1481 /*
1482 * Legacy entry point for pg_stat_statements() API versions 1.0 and 1.1.
1483 * This can be removed someday, perhaps.
1484 */
Datum
pg_stat_statements(PG_FUNCTION_ARGS)
{
	/* If it's really API 1.1, we'll figure that out below */
	/* The 1.0/1.1 API has no showtext argument; texts are always shown */
	pg_stat_statements_internal(fcinfo, PGSS_V1_0, true);

	return (Datum) 0;
}
1493
1494 /* Common code for all versions of pg_stat_statements() */
/* Common code for all versions of pg_stat_statements() */
static void
pg_stat_statements_internal(FunctionCallInfo fcinfo,
							pgssVersion api_version,
							bool showtext)
{
	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
	TupleDesc	tupdesc;
	Tuplestorestate *tupstore;
	MemoryContext per_query_ctx;
	MemoryContext oldcontext;
	Oid			userid = GetUserId();
	bool		is_allowed_role = false;
	char	   *qbuffer = NULL;		/* in-memory copy of the query-text file */
	Size		qbuffer_size = 0;
	Size		extent = 0;
	int			gc_count = 0;
	HASH_SEQ_STATUS hash_seq;
	pgssEntry  *entry;

	/* Superusers or members of pg_read_all_stats members are allowed */
	is_allowed_role = is_member_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS);

	/* hash table must exist already */
	if (!pgss || !pgss_hash)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));

	/* check to see if caller supports us returning a tuplestore */
	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("set-valued function called in context that cannot accept a set")));
	if (!(rsinfo->allowedModes & SFRM_Materialize))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("materialize mode required, but it is not allowed in this context")));

	/* Switch into long-lived context to construct returned data structures */
	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
	oldcontext = MemoryContextSwitchTo(per_query_ctx);

	/* Build a tuple descriptor for our result type */
	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
		elog(ERROR, "return type must be a row type");

	/*
	 * Check we have the expected number of output arguments.  Aside from
	 * being a good safety check, we need a kluge here to detect API version
	 * 1.1, which was wedged into the code in an ill-considered way.
	 */
	switch (tupdesc->natts)
	{
		case PG_STAT_STATEMENTS_COLS_V1_0:
			if (api_version != PGSS_V1_0)
				elog(ERROR, "incorrect number of output arguments");
			break;
		case PG_STAT_STATEMENTS_COLS_V1_1:
			/* pg_stat_statements() should have told us 1.0 */
			if (api_version != PGSS_V1_0)
				elog(ERROR, "incorrect number of output arguments");
			api_version = PGSS_V1_1;
			break;
		case PG_STAT_STATEMENTS_COLS_V1_2:
			if (api_version != PGSS_V1_2)
				elog(ERROR, "incorrect number of output arguments");
			break;
		case PG_STAT_STATEMENTS_COLS_V1_3:
			if (api_version != PGSS_V1_3)
				elog(ERROR, "incorrect number of output arguments");
			break;
		case PG_STAT_STATEMENTS_COLS_V1_8:
			if (api_version != PGSS_V1_8)
				elog(ERROR, "incorrect number of output arguments");
			break;
		case PG_STAT_STATEMENTS_COLS_V1_9:
			if (api_version != PGSS_V1_9)
				elog(ERROR, "incorrect number of output arguments");
			break;
		default:
			elog(ERROR, "incorrect number of output arguments");
	}

	/* Materialize the result set into a tuplestore */
	tupstore = tuplestore_begin_heap(true, false, work_mem);
	rsinfo->returnMode = SFRM_Materialize;
	rsinfo->setResult = tupstore;
	rsinfo->setDesc = tupdesc;

	MemoryContextSwitchTo(oldcontext);

	/*
	 * We'd like to load the query text file (if needed) while not holding any
	 * lock on pgss->lock.  In the worst case we'll have to do this again
	 * after we have the lock, but it's unlikely enough to make this a win
	 * despite occasional duplicated work.  We need to reload if anybody
	 * writes to the file (either a retail qtext_store(), or a garbage
	 * collection) between this point and where we've gotten shared lock.  If
	 * a qtext_store is actually in progress when we look, we might as well
	 * skip the speculative load entirely.
	 */
	if (showtext)
	{
		int			n_writers;

		/* Take the mutex so we can examine variables */
		{
			volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;

			SpinLockAcquire(&s->mutex);
			extent = s->extent;
			n_writers = s->n_writers;
			gc_count = s->gc_count;
			SpinLockRelease(&s->mutex);
		}

		/* No point in loading file now if there are active writers */
		if (n_writers == 0)
			qbuffer = qtext_load_file(&qbuffer_size);
	}

	/*
	 * Get shared lock, load or reload the query text file if we must, and
	 * iterate over the hashtable entries.
	 *
	 * With a large hash table, we might be holding the lock rather longer
	 * than one could wish.  However, this only blocks creation of new hash
	 * table entries, and the larger the hash table the less likely that is to
	 * be needed.  So we can hope this is okay.  Perhaps someday we'll decide
	 * we need to partition the hash table to limit the time spent holding any
	 * one lock.
	 */
	LWLockAcquire(pgss->lock, LW_SHARED);

	if (showtext)
	{
		/*
		 * Here it is safe to examine extent and gc_count without taking the
		 * mutex.  Note that although other processes might change
		 * pgss->extent just after we look at it, the strings they then write
		 * into the file cannot yet be referenced in the hashtable, so we
		 * don't care whether we see them or not.
		 *
		 * If qtext_load_file fails, we just press on; we'll return NULL for
		 * every query text.
		 */
		if (qbuffer == NULL ||
			pgss->extent != extent ||
			pgss->gc_count != gc_count)
		{
			if (qbuffer)
				free(qbuffer);
			qbuffer = qtext_load_file(&qbuffer_size);
		}
	}

	/* Walk every hashtable entry and emit one output row per entry */
	hash_seq_init(&hash_seq, pgss_hash);
	while ((entry = hash_seq_search(&hash_seq)) != NULL)
	{
		Datum		values[PG_STAT_STATEMENTS_COLS];
		bool		nulls[PG_STAT_STATEMENTS_COLS];
		int			i = 0;
		Counters	tmp;
		double		stddev;
		int64		queryid = entry->key.queryid;

		memset(values, 0, sizeof(values));
		memset(nulls, 0, sizeof(nulls));

		values[i++] = ObjectIdGetDatum(entry->key.userid);
		values[i++] = ObjectIdGetDatum(entry->key.dbid);
		if (api_version >= PGSS_V1_9)
			values[i++] = BoolGetDatum(entry->key.toplevel);

		/* queryid and query text are shown only to privileged callers */
		if (is_allowed_role || entry->key.userid == userid)
		{
			if (api_version >= PGSS_V1_2)
				values[i++] = Int64GetDatumFast(queryid);

			if (showtext)
			{
				char	   *qstr = qtext_fetch(entry->query_offset,
											   entry->query_len,
											   qbuffer,
											   qbuffer_size);

				if (qstr)
				{
					char	   *enc;

					/* Convert stored text to the server encoding if needed */
					enc = pg_any_to_server(qstr,
										   entry->query_len,
										   entry->encoding);

					values[i++] = CStringGetTextDatum(enc);

					/* pg_any_to_server may return its input unchanged */
					if (enc != qstr)
						pfree(enc);
				}
				else
				{
					/* Just return a null if we fail to find the text */
					nulls[i++] = true;
				}
			}
			else
			{
				/* Query text not requested */
				nulls[i++] = true;
			}
		}
		else
		{
			/* Don't show queryid */
			if (api_version >= PGSS_V1_2)
				nulls[i++] = true;

			/*
			 * Don't show query text, but hint as to the reason for not doing
			 * so if it was requested
			 */
			if (showtext)
				values[i++] = CStringGetTextDatum("<insufficient privilege>");
			else
				nulls[i++] = true;
		}

		/* copy counters to a local variable to keep locking time short */
		{
			volatile pgssEntry *e = (volatile pgssEntry *) entry;

			SpinLockAcquire(&e->mutex);
			tmp = e->counters;
			SpinLockRelease(&e->mutex);
		}

		/* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
		if (IS_STICKY(tmp))
			continue;

		/* Note that we rely on PGSS_PLAN being 0 and PGSS_EXEC being 1. */
		for (int kind = 0; kind < PGSS_NUMKIND; kind++)
		{
			/* Planning counters exist only from API 1.8 onwards */
			if (kind == PGSS_EXEC || api_version >= PGSS_V1_8)
			{
				values[i++] = Int64GetDatumFast(tmp.calls[kind]);
				values[i++] = Float8GetDatumFast(tmp.total_time[kind]);
			}

			if ((kind == PGSS_EXEC && api_version >= PGSS_V1_3) ||
				api_version >= PGSS_V1_8)
			{
				values[i++] = Float8GetDatumFast(tmp.min_time[kind]);
				values[i++] = Float8GetDatumFast(tmp.max_time[kind]);
				values[i++] = Float8GetDatumFast(tmp.mean_time[kind]);

				/*
				 * Note we are calculating the population variance here, not
				 * the sample variance, as we have data for the whole
				 * population, so Bessel's correction is not used, and we
				 * don't divide by tmp.calls - 1.
				 */
				if (tmp.calls[kind] > 1)
					stddev = sqrt(tmp.sum_var_time[kind] / tmp.calls[kind]);
				else
					stddev = 0.0;
				values[i++] = Float8GetDatumFast(stddev);
			}
		}
		values[i++] = Int64GetDatumFast(tmp.rows);
		values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
		values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
		if (api_version >= PGSS_V1_1)
			values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied);
		values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
		values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
		values[i++] = Int64GetDatumFast(tmp.local_blks_read);
		if (api_version >= PGSS_V1_1)
			values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied);
		values[i++] = Int64GetDatumFast(tmp.local_blks_written);
		values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
		values[i++] = Int64GetDatumFast(tmp.temp_blks_written);
		if (api_version >= PGSS_V1_1)
		{
			values[i++] = Float8GetDatumFast(tmp.blk_read_time);
			values[i++] = Float8GetDatumFast(tmp.blk_write_time);
		}
		if (api_version >= PGSS_V1_8)
		{
			char		buf[256];
			Datum		wal_bytes;

			values[i++] = Int64GetDatumFast(tmp.wal_records);
			values[i++] = Int64GetDatumFast(tmp.wal_fpi);

			/* wal_bytes is uint64; go through a string to avoid overflow */
			snprintf(buf, sizeof buf, UINT64_FORMAT, tmp.wal_bytes);

			/* Convert to numeric. */
			wal_bytes = DirectFunctionCall3(numeric_in,
											CStringGetDatum(buf),
											ObjectIdGetDatum(0),
											Int32GetDatum(-1));
			values[i++] = wal_bytes;
		}

		Assert(i == (api_version == PGSS_V1_0 ? PG_STAT_STATEMENTS_COLS_V1_0 :
					 api_version == PGSS_V1_1 ? PG_STAT_STATEMENTS_COLS_V1_1 :
					 api_version == PGSS_V1_2 ? PG_STAT_STATEMENTS_COLS_V1_2 :
					 api_version == PGSS_V1_3 ? PG_STAT_STATEMENTS_COLS_V1_3 :
					 api_version == PGSS_V1_8 ? PG_STAT_STATEMENTS_COLS_V1_8 :
					 api_version == PGSS_V1_9 ? PG_STAT_STATEMENTS_COLS_V1_9 :
					 -1 /* fail if you forget to update this assert */ ));

		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
	}

	/* clean up and return the tuplestore */
	LWLockRelease(pgss->lock);

	if (qbuffer)
		free(qbuffer);

	tuplestore_donestoring(tupstore);
}
1818
1819 /* Number of output arguments (columns) for pg_stat_statements_info */
1820 #define PG_STAT_STATEMENTS_INFO_COLS 2
1821
1822 /*
1823 * Return statistics of pg_stat_statements.
1824 */
Datum
pg_stat_statements_info(PG_FUNCTION_ARGS)
{
	pgssGlobalStats stats;
	TupleDesc	tupdesc;
	Datum		values[PG_STAT_STATEMENTS_INFO_COLS];
	bool		nulls[PG_STAT_STATEMENTS_INFO_COLS];

	/* Module must have been loaded at server start */
	if (!pgss || !pgss_hash)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));

	/* Build a tuple descriptor for our result type */
	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
		elog(ERROR, "return type must be a row type");

	MemSet(values, 0, sizeof(values));
	MemSet(nulls, 0, sizeof(nulls));

	/* Read global statistics for pg_stat_statements */
	/* Copy under the spinlock to get a consistent snapshot */
	{
		volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;

		SpinLockAcquire(&s->mutex);
		stats = s->stats;
		SpinLockRelease(&s->mutex);
	}

	values[0] = Int64GetDatum(stats.dealloc);
	values[1] = TimestampTzGetDatum(stats.stats_reset);

	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
}
1859
1860 /*
1861 * Estimate shared memory space needed.
1862 */
1863 static Size
pgss_memsize(void)1864 pgss_memsize(void)
1865 {
1866 Size size;
1867
1868 size = MAXALIGN(sizeof(pgssSharedState));
1869 size = add_size(size, hash_estimate_size(pgss_max, sizeof(pgssEntry)));
1870
1871 return size;
1872 }
1873
1874 /*
1875 * Allocate a new hashtable entry.
1876 * caller must hold an exclusive lock on pgss->lock
1877 *
1878 * "query" need not be null-terminated; we rely on query_len instead
1879 *
1880 * If "sticky" is true, make the new entry artificially sticky so that it will
1881 * probably still be there when the query finishes execution. We do this by
1882 * giving it a median usage value rather than the normal value. (Strictly
1883 * speaking, query strings are normalized on a best effort basis, though it
1884 * would be difficult to demonstrate this even under artificial conditions.)
1885 *
1886 * Note: despite needing exclusive lock, it's not an error for the target
1887 * entry to already exist. This is because pgss_store releases and
1888 * reacquires lock after failing to find a match; so someone else could
1889 * have made the entry while we waited to get exclusive lock.
1890 */
static pgssEntry *
entry_alloc(pgssHashKey *key, Size query_offset, int query_len, int encoding,
			bool sticky)
{
	pgssEntry  *entry;
	bool		found;

	/* Make space if needed */
	while (hash_get_num_entries(pgss_hash) >= pgss_max)
		entry_dealloc();

	/* Find or create an entry with desired hash code */
	entry = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER, &found);

	if (!found)
	{
		/* New entry, initialize it */

		/* reset the statistics */
		memset(&entry->counters, 0, sizeof(Counters));
		/* set the appropriate initial usage count */
		entry->counters.usage = sticky ? pgss->cur_median_usage : USAGE_INIT;
		/* re-initialize the mutex each time ... we assume no one using it */
		SpinLockInit(&entry->mutex);
		/* ... and don't forget the query text metadata */
		Assert(query_len >= 0);
		entry->query_offset = query_offset;
		entry->query_len = query_len;
		entry->encoding = encoding;
	}

	/* Either the freshly initialized entry or a pre-existing one */
	return entry;
}
1924
1925 /*
1926 * qsort comparator for sorting into increasing usage order
1927 */
1928 static int
entry_cmp(const void * lhs,const void * rhs)1929 entry_cmp(const void *lhs, const void *rhs)
1930 {
1931 double l_usage = (*(pgssEntry *const *) lhs)->counters.usage;
1932 double r_usage = (*(pgssEntry *const *) rhs)->counters.usage;
1933
1934 if (l_usage < r_usage)
1935 return -1;
1936 else if (l_usage > r_usage)
1937 return +1;
1938 else
1939 return 0;
1940 }
1941
/*
 * Deallocate least-used entries.
 *
 * Evicts USAGE_DEALLOC_PERCENT of the entries (at least 10), chosen by
 * lowest usage.  As a side effect, decays every entry's usage count and
 * refreshes pgss->cur_median_usage and pgss->mean_query_len.
 *
 * Caller must hold an exclusive lock on pgss->lock.
 */
static void
entry_dealloc(void)
{
	HASH_SEQ_STATUS hash_seq;
	pgssEntry **entries;
	pgssEntry  *entry;
	int			nvictims;
	int			i;
	Size		tottextlen;
	int			nvalidtexts;

	/*
	 * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
	 * While we're scanning the table, apply the decay factor to the usage
	 * values, and update the mean query length.
	 *
	 * Note that the mean query length is almost immediately obsolete, since
	 * we compute it before not after discarding the least-used entries.
	 * Hopefully, that doesn't affect the mean too much; it doesn't seem worth
	 * making two passes to get a more current result.  Likewise, the new
	 * cur_median_usage includes the entries we're about to zap.
	 */

	entries = palloc(hash_get_num_entries(pgss_hash) * sizeof(pgssEntry *));

	i = 0;
	tottextlen = 0;
	nvalidtexts = 0;

	hash_seq_init(&hash_seq, pgss_hash);
	while ((entry = hash_seq_search(&hash_seq)) != NULL)
	{
		entries[i++] = entry;
		/* "Sticky" entries get a different usage decay rate. */
		if (IS_STICKY(entry->counters))
			entry->counters.usage *= STICKY_DECREASE_FACTOR;
		else
			entry->counters.usage *= USAGE_DECREASE_FACTOR;
		/* In the mean length computation, ignore dropped texts. */
		/* (query_len < 0 marks an entry whose text was invalidated) */
		if (entry->query_len >= 0)
		{
			/* +1 accounts for the null terminator stored in the file */
			tottextlen += entry->query_len + 1;
			nvalidtexts++;
		}
	}

	/* Sort into increasing order by usage */
	qsort(entries, i, sizeof(pgssEntry *), entry_cmp);

	/* Record the (approximate) median usage */
	if (i > 0)
		pgss->cur_median_usage = entries[i / 2]->counters.usage;
	/* Record the mean query length */
	if (nvalidtexts > 0)
		pgss->mean_query_len = tottextlen / nvalidtexts;
	else
		pgss->mean_query_len = ASSUMED_LENGTH_INIT;

	/*
	 * Now zap an appropriate fraction of lowest-usage entries.  The floor of
	 * 10 guarantees forward progress even for very small hash tables.
	 */
	nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
	nvictims = Min(nvictims, i);

	for (i = 0; i < nvictims; i++)
	{
		hash_search(pgss_hash, &entries[i]->key, HASH_REMOVE, NULL);
	}

	pfree(entries);

	/* Increment the number of times entries are deallocated */
	{
		volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;

		SpinLockAcquire(&s->mutex);
		s->stats.dealloc += 1;
		SpinLockRelease(&s->mutex);
	}
}
2025
/*
 * Given a query string (not necessarily null-terminated), allocate a new
 * entry in the external query text file and store the string there.
 *
 * If successful, returns true, and stores the new entry's offset in the file
 * into *query_offset.  Also, if gc_count isn't NULL, *gc_count is set to the
 * number of garbage collections that have occurred so far.
 *
 * On failure, returns false.
 *
 * At least a shared lock on pgss->lock must be held by the caller, so as
 * to prevent a concurrent garbage collection.  Share-lock-holding callers
 * should pass a gc_count pointer to obtain the number of garbage collections,
 * so that they can recheck the count after obtaining exclusive lock to
 * detect whether a garbage collection occurred (and removed this entry).
 */
static bool
qtext_store(const char *query, int query_len,
			Size *query_offset, int *gc_count)
{
	Size		off;
	int			fd;

	/*
	 * We use a spinlock to protect extent/n_writers/gc_count, so that
	 * multiple processes may execute this function concurrently.
	 */
	{
		volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;

		SpinLockAcquire(&s->mutex);
		/* Reserve query_len bytes plus a trailing null terminator */
		off = s->extent;
		s->extent += query_len + 1;
		/* Advertise the in-progress write so GC readers know about us */
		s->n_writers++;
		if (gc_count)
			*gc_count = s->gc_count;
		SpinLockRelease(&s->mutex);
	}

	*query_offset = off;

	/* Now write the data into the successfully-reserved part of the file */
	fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
	if (fd < 0)
		goto error;

	/* Positioned writes let concurrent writers share the file safely */
	if (pg_pwrite(fd, query, query_len, off) != query_len)
		goto error;
	if (pg_pwrite(fd, "\0", 1, off + query_len) != 1)
		goto error;

	CloseTransientFile(fd);

	/* Mark our write complete */
	{
		volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;

		SpinLockAcquire(&s->mutex);
		s->n_writers--;
		SpinLockRelease(&s->mutex);
	}

	return true;

error:
	ereport(LOG,
			(errcode_for_file_access(),
			 errmsg("could not write file \"%s\": %m",
					PGSS_TEXT_FILE)));

	/* fd < 0 only if OpenTransientFile itself failed above */
	if (fd >= 0)
		CloseTransientFile(fd);

	/* Mark our write complete */
	{
		volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;

		SpinLockAcquire(&s->mutex);
		s->n_writers--;
		SpinLockRelease(&s->mutex);
	}

	return false;
}
2110
/*
 * Read the external query text file into a malloc'd buffer.
 *
 * Returns NULL (without throwing an error) if unable to read, eg
 * file not there or insufficient memory.
 *
 * On success, the buffer size is also returned into *buffer_size.
 * The caller is responsible for free()ing the buffer.
 *
 * This can be called without any lock on pgss->lock, but in that case
 * the caller is responsible for verifying that the result is sane.
 */
static char *
qtext_load_file(Size *buffer_size)
{
	char	   *buf;
	int			fd;
	struct stat stat;
	Size		nread;

	fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDONLY | PG_BINARY);
	if (fd < 0)
	{
		/* Absence of the file is an expected state, not worth logging */
		if (errno != ENOENT)
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							PGSS_TEXT_FILE)));
		return NULL;
	}

	/* Get file length */
	if (fstat(fd, &stat))
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not stat file \"%s\": %m",
						PGSS_TEXT_FILE)));
		CloseTransientFile(fd);
		return NULL;
	}

	/* Allocate buffer; beware that off_t might be wider than size_t */
	if (stat.st_size <= MaxAllocHugeSize)
		buf = (char *) malloc(stat.st_size);
	else
		buf = NULL;
	if (buf == NULL)
	{
		ereport(LOG,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory"),
				 errdetail("Could not allocate enough memory to read file \"%s\".",
						   PGSS_TEXT_FILE)));
		CloseTransientFile(fd);
		return NULL;
	}

	/*
	 * OK, slurp in the file.  Windows fails if we try to read more than
	 * INT_MAX bytes at once, and other platforms might not like that either,
	 * so read a very large file in 1GB segments.
	 */
	nread = 0;
	while (nread < stat.st_size)
	{
		int			toread = Min(1024 * 1024 * 1024, stat.st_size - nread);

		/*
		 * If we get a short read and errno doesn't get set, the reason is
		 * probably that garbage collection truncated the file since we did
		 * the fstat(), so we don't log a complaint --- but we don't return
		 * the data, either, since it's most likely corrupt due to concurrent
		 * writes from garbage collection.
		 */
		errno = 0;
		if (read(fd, buf + nread, toread) != toread)
		{
			if (errno)
				ereport(LOG,
						(errcode_for_file_access(),
						 errmsg("could not read file \"%s\": %m",
								PGSS_TEXT_FILE)));
			free(buf);
			CloseTransientFile(fd);
			return NULL;
		}
		nread += toread;
	}

	if (CloseTransientFile(fd) != 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", PGSS_TEXT_FILE)));

	*buffer_size = nread;
	return buf;
}
2208
2209 /*
2210 * Locate a query text in the file image previously read by qtext_load_file().
2211 *
2212 * We validate the given offset/length, and return NULL if bogus. Otherwise,
2213 * the result points to a null-terminated string within the buffer.
2214 */
2215 static char *
qtext_fetch(Size query_offset,int query_len,char * buffer,Size buffer_size)2216 qtext_fetch(Size query_offset, int query_len,
2217 char *buffer, Size buffer_size)
2218 {
2219 /* File read failed? */
2220 if (buffer == NULL)
2221 return NULL;
2222 /* Bogus offset/length? */
2223 if (query_len < 0 ||
2224 query_offset + query_len >= buffer_size)
2225 return NULL;
2226 /* As a further sanity check, make sure there's a trailing null */
2227 if (buffer[query_offset + query_len] != '\0')
2228 return NULL;
2229 /* Looks OK */
2230 return buffer + query_offset;
2231 }
2232
2233 /*
2234 * Do we need to garbage-collect the external query text file?
2235 *
2236 * Caller should hold at least a shared lock on pgss->lock.
2237 */
2238 static bool
need_gc_qtexts(void)2239 need_gc_qtexts(void)
2240 {
2241 Size extent;
2242
2243 /* Read shared extent pointer */
2244 {
2245 volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
2246
2247 SpinLockAcquire(&s->mutex);
2248 extent = s->extent;
2249 SpinLockRelease(&s->mutex);
2250 }
2251
2252 /* Don't proceed if file does not exceed 512 bytes per possible entry */
2253 if (extent < 512 * pgss_max)
2254 return false;
2255
2256 /*
2257 * Don't proceed if file is less than about 50% bloat. Nothing can or
2258 * should be done in the event of unusually large query texts accounting
2259 * for file's large size. We go to the trouble of maintaining the mean
2260 * query length in order to prevent garbage collection from thrashing
2261 * uselessly.
2262 */
2263 if (extent < pgss->mean_query_len * pgss_max * 2)
2264 return false;
2265
2266 return true;
2267 }
2268
/*
 * Garbage-collect orphaned query texts in external file.
 *
 * This won't be called often in the typical case, since it's likely that
 * there won't be too much churn, and besides, a similar compaction process
 * occurs when serializing to disk at shutdown or as part of resetting.
 * Despite this, it seems prudent to plan for the edge case where the file
 * becomes unreasonably large, with no other method of compaction likely to
 * occur in the foreseeable future.
 *
 * The caller must hold an exclusive lock on pgss->lock.
 *
 * At the first sign of trouble we unlink the query text file to get a clean
 * slate (although existing statistics are retained), rather than risk
 * thrashing by allowing the same problem case to recur indefinitely.
 */
static void
gc_qtexts(void)
{
	char	   *qbuffer;
	Size		qbuffer_size;
	FILE	   *qfile = NULL;
	HASH_SEQ_STATUS hash_seq;
	pgssEntry  *entry;
	Size		extent;
	int			nentries;

	/*
	 * When called from pgss_store, some other session might have proceeded
	 * with garbage collection in the no-lock-held interim of lock strength
	 * escalation.  Check once more that this is actually necessary.
	 */
	if (!need_gc_qtexts())
		return;

	/*
	 * Load the old texts file.  If we fail (out of memory, for instance),
	 * invalidate query texts.  Hopefully this is rare.  It might seem better
	 * to leave things alone on an OOM failure, but the problem is that the
	 * file is only going to get bigger; hoping for a future non-OOM result is
	 * risky and can easily lead to complete denial of service.
	 */
	qbuffer = qtext_load_file(&qbuffer_size);
	if (qbuffer == NULL)
		goto gc_fail;

	/*
	 * We overwrite the query texts file in place, so as to reduce the risk of
	 * an out-of-disk-space failure.  Since the file is guaranteed not to get
	 * larger, this should always work on traditional filesystems; though we
	 * could still lose on copy-on-write filesystems.
	 */
	qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
	if (qfile == NULL)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not write file \"%s\": %m",
						PGSS_TEXT_FILE)));
		goto gc_fail;
	}

	extent = 0;
	nentries = 0;

	/*
	 * Copy each hashtable entry's text (still valid in the old in-memory
	 * image) into the rewritten file, assigning fresh offsets as we go.
	 */
	hash_seq_init(&hash_seq, pgss_hash);
	while ((entry = hash_seq_search(&hash_seq)) != NULL)
	{
		int			query_len = entry->query_len;
		char	   *qry = qtext_fetch(entry->query_offset,
									  query_len,
									  qbuffer,
									  qbuffer_size);

		if (qry == NULL)
		{
			/* Trouble ... drop the text */
			entry->query_offset = 0;
			entry->query_len = -1;
			/* entry will not be counted in mean query length computation */
			continue;
		}

		/* Write text plus its null terminator */
		if (fwrite(qry, 1, query_len + 1, qfile) != query_len + 1)
		{
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not write file \"%s\": %m",
							PGSS_TEXT_FILE)));
			/* must terminate the seq scan before bailing out */
			hash_seq_term(&hash_seq);
			goto gc_fail;
		}

		entry->query_offset = extent;
		extent += query_len + 1;
		nentries++;
	}

	/*
	 * Truncate away any now-unused space.  If this fails for some odd reason,
	 * we log it, but there's no need to fail.
	 */
	if (ftruncate(fileno(qfile), extent) != 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not truncate file \"%s\": %m",
						PGSS_TEXT_FILE)));

	if (FreeFile(qfile))
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not write file \"%s\": %m",
						PGSS_TEXT_FILE)));
		/* already freed; prevent double-free in gc_fail path */
		qfile = NULL;
		goto gc_fail;
	}

	elog(DEBUG1, "pgss gc of queries file shrunk size from %zu to %zu",
		 pgss->extent, extent);

	/* Reset the shared extent pointer */
	pgss->extent = extent;

	/*
	 * Also update the mean query length, to be sure that need_gc_qtexts()
	 * won't still think we have a problem.
	 */
	if (nentries > 0)
		pgss->mean_query_len = extent / nentries;
	else
		pgss->mean_query_len = ASSUMED_LENGTH_INIT;

	free(qbuffer);

	/*
	 * OK, count a garbage collection cycle.  (Note: even though we have
	 * exclusive lock on pgss->lock, we must take pgss->mutex for this, since
	 * other processes may examine gc_count while holding only the mutex.
	 * Also, we have to advance the count *after* we've rewritten the file,
	 * else other processes might not realize they read a stale file.)
	 */
	record_gc_qtexts();

	return;

gc_fail:
	/* clean up resources */
	if (qfile)
		FreeFile(qfile);
	if (qbuffer)
		free(qbuffer);

	/*
	 * Since the contents of the external file are now uncertain, mark all
	 * hashtable entries as having invalid texts.
	 */
	hash_seq_init(&hash_seq, pgss_hash);
	while ((entry = hash_seq_search(&hash_seq)) != NULL)
	{
		entry->query_offset = 0;
		entry->query_len = -1;
	}

	/*
	 * Destroy the query text file and create a new, empty one
	 */
	(void) unlink(PGSS_TEXT_FILE);
	qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
	if (qfile == NULL)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not recreate file \"%s\": %m",
						PGSS_TEXT_FILE)));
	else
		FreeFile(qfile);

	/* Reset the shared extent pointer */
	pgss->extent = 0;

	/* Reset mean_query_len to match the new state */
	pgss->mean_query_len = ASSUMED_LENGTH_INIT;

	/*
	 * Bump the GC count even though we failed.
	 *
	 * This is needed to make concurrent readers of file without any lock on
	 * pgss->lock notice existence of new version of file.  Once readers
	 * subsequently observe a change in GC count with pgss->lock held, that
	 * forces a safe reopen of file.  Writers also require that we bump here,
	 * of course.  (As required by locking protocol, readers and writers don't
	 * trust earlier file contents until gc_count is found unchanged after
	 * pgss->lock acquired in shared or exclusive mode respectively.)
	 */
	record_gc_qtexts();
}
2465
/*
 * Release entries corresponding to parameters passed.
 *
 * A zero userid/dbid or zero queryid acts as a wildcard for that field.
 * If every entry ends up removed, the global stats and the external query
 * text file are reset as well.
 */
static void
entry_reset(Oid userid, Oid dbid, uint64 queryid)
{
	HASH_SEQ_STATUS hash_seq;
	pgssEntry  *entry;
	FILE	   *qfile;
	long		num_entries;
	long		num_remove = 0;
	pgssHashKey key;

	if (!pgss || !pgss_hash)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));

	LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
	num_entries = hash_get_num_entries(pgss_hash);

	if (userid != 0 && dbid != 0 && queryid != UINT64CONST(0))
	{
		/* If all the parameters are available, use the fast path. */
		memset(&key, 0, sizeof(pgssHashKey));
		key.userid = userid;
		key.dbid = dbid;
		key.queryid = queryid;

		/* Remove the key if it exists, starting with the top-level entry */
		key.toplevel = false;
		entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_REMOVE, NULL);
		if (entry)				/* found */
			num_remove++;

		/* Also remove entries for top level statements */
		key.toplevel = true;

		/* Remove the key if exists */
		entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_REMOVE, NULL);
		if (entry)				/* found */
			num_remove++;
	}
	else if (userid != 0 || dbid != 0 || queryid != UINT64CONST(0))
	{
		/* Remove entries corresponding to valid parameters. */
		/* (zero-valued parameters match anything) */
		hash_seq_init(&hash_seq, pgss_hash);
		while ((entry = hash_seq_search(&hash_seq)) != NULL)
		{
			if ((!userid || entry->key.userid == userid) &&
				(!dbid || entry->key.dbid == dbid) &&
				(!queryid || entry->key.queryid == queryid))
			{
				hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
				num_remove++;
			}
		}
	}
	else
	{
		/* Remove all entries. */
		hash_seq_init(&hash_seq, pgss_hash);
		while ((entry = hash_seq_search(&hash_seq)) != NULL)
		{
			hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
			num_remove++;
		}
	}

	/* All entries are removed? */
	/* If anything survives, skip the global reset and file truncation. */
	if (num_entries != num_remove)
		goto release_lock;

	/*
	 * Reset global statistics for pg_stat_statements since all entries are
	 * removed.
	 */
	{
		volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
		TimestampTz stats_reset = GetCurrentTimestamp();

		SpinLockAcquire(&s->mutex);
		s->stats.dealloc = 0;
		s->stats.stats_reset = stats_reset;
		SpinLockRelease(&s->mutex);
	}

	/*
	 * Write new empty query file, perhaps even creating a new one to recover
	 * if the file was missing.
	 */
	qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
	if (qfile == NULL)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m",
						PGSS_TEXT_FILE)));
		goto done;
	}

	/* If ftruncate fails, log it, but it's not a fatal problem */
	if (ftruncate(fileno(qfile), 0) != 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not truncate file \"%s\": %m",
						PGSS_TEXT_FILE)));

	FreeFile(qfile);

done:
	pgss->extent = 0;
	/* This counts as a query text garbage collection for our purposes */
	record_gc_qtexts();

release_lock:
	LWLockRelease(pgss->lock);
}
2584
/*
 * Generate a normalized version of the query string that will be used to
 * represent all similar queries.
 *
 * Each recorded constant in jstate is replaced by a parameter symbol ($n);
 * the text between constants is copied verbatim.
 *
 * Note that the normalized representation may well vary depending on
 * just which "equivalent" query is used to create the hashtable entry.
 * We assume this is OK.
 *
 * If query_loc > 0, then "query" has been advanced by that much compared to
 * the original string start, so we need to translate the provided locations
 * to compensate.  (This lets us avoid re-scanning statements before the one
 * of interest, so it's worth doing.)
 *
 * *query_len_p contains the input string length, and is updated with
 * the result string length on exit.  The resulting string might be longer
 * or shorter depending on what happens with replacement of constants.
 *
 * Returns a palloc'd string.
 */
static char *
generate_normalized_query(JumbleState *jstate, const char *query,
						  int query_loc, int *query_len_p)
{
	char	   *norm_query;
	int			query_len = *query_len_p;
	int			i,
				norm_query_buflen,	/* Space allowed for norm_query */
				len_to_wrt,		/* Length (in bytes) to write */
				quer_loc = 0,	/* Source query byte location */
				n_quer_loc = 0, /* Normalized query byte location */
				last_off = 0,	/* Offset from start for previous tok */
				last_tok_len = 0;	/* Length (in bytes) of that tok */

	/*
	 * Get constants' lengths (core system only gives us locations).  Note
	 * this also ensures the items are sorted by location.
	 */
	fill_in_constant_lengths(jstate, query, query_loc);

	/*
	 * Allow for $n symbols to be longer than the constants they replace.
	 * Constants must take at least one byte in text form, while a $n symbol
	 * certainly isn't more than 11 bytes, even if n reaches INT_MAX.  We
	 * could refine that limit based on the max value of n for the current
	 * query, but it hardly seems worth any extra effort to do so.
	 */
	norm_query_buflen = query_len + jstate->clocations_count * 10;

	/* Allocate result buffer (+1 for the trailing null byte) */
	norm_query = palloc(norm_query_buflen + 1);

	for (i = 0; i < jstate->clocations_count; i++)
	{
		int			off,		/* Offset from start for cur tok */
					tok_len;	/* Length (in bytes) of that tok */

		off = jstate->clocations[i].location;
		/* Adjust recorded location if we're dealing with partial string */
		off -= query_loc;

		tok_len = jstate->clocations[i].length;

		if (tok_len < 0)
			continue;			/* ignore any duplicates */

		/* Copy next chunk (what precedes the next constant) */
		len_to_wrt = off - last_off;
		len_to_wrt -= last_tok_len;

		Assert(len_to_wrt >= 0);
		memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
		n_quer_loc += len_to_wrt;

		/* And insert a param symbol in place of the constant token */
		/* ($n numbering continues after any externally-supplied params) */
		n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d",
							  i + 1 + jstate->highest_extern_param_id);

		/* Resume copying just past the replaced constant */
		quer_loc = off + tok_len;
		last_off = off;
		last_tok_len = tok_len;
	}

	/*
	 * We've copied up until the last ignorable constant.  Copy over the
	 * remaining bytes of the original query string.
	 */
	len_to_wrt = query_len - quer_loc;

	Assert(len_to_wrt >= 0);
	memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
	n_quer_loc += len_to_wrt;

	Assert(n_quer_loc <= norm_query_buflen);
	norm_query[n_quer_loc] = '\0';

	*query_len_p = n_quer_loc;
	return norm_query;
}
2683
/*
 * Given a valid SQL string and an array of constant-location records,
 * fill in the textual lengths of those constants.
 *
 * The constants may use any allowed constant syntax, such as float literals,
 * bit-strings, single-quoted strings and dollar-quoted strings.  This is
 * accomplished by using the public API for the core scanner.
 *
 * It is the caller's job to ensure that the string is a valid SQL statement
 * with constants at the indicated locations.  Since in practice the string
 * has already been parsed, and the locations that the caller provides will
 * have originated from within the authoritative parser, this should not be
 * a problem.
 *
 * Duplicate constant pointers are possible, and will have their lengths
 * marked as '-1', so that they are later ignored.  (Actually, we assume the
 * lengths were initialized as -1 to start with, and don't change them here.)
 *
 * If query_loc > 0, then "query" has been advanced by that much compared to
 * the original string start, so we need to translate the provided locations
 * to compensate.  (This lets us avoid re-scanning statements before the one
 * of interest, so it's worth doing.)
 *
 * N.B. There is an assumption that a '-' character at a Const location begins
 * a negative numeric constant.  This precludes there ever being another
 * reason for a constant to start with a '-'.
 */
static void
fill_in_constant_lengths(JumbleState *jstate, const char *query,
						 int query_loc)
{
	LocationLen *locs;
	core_yyscan_t yyscanner;
	core_yy_extra_type yyextra;
	core_YYSTYPE yylval;
	YYLTYPE		yylloc;
	int			last_loc = -1;
	int			i;

	/*
	 * Sort the records by location so that we can process them in order while
	 * scanning the query text.
	 */
	if (jstate->clocations_count > 1)
		qsort(jstate->clocations, jstate->clocations_count,
			  sizeof(LocationLen), comp_location);
	locs = jstate->clocations;

	/* initialize the flex scanner --- should match raw_parser() */
	yyscanner = scanner_init(query,
							 &yyextra,
							 &ScanKeywords,
							 ScanKeywordTokens);

	/* we don't want to re-emit any escape string warnings */
	yyextra.escape_string_warning = false;

	/* Search for each constant, in sequence */
	for (i = 0; i < jstate->clocations_count; i++)
	{
		int			loc = locs[i].location;
		int			tok;

		/* Adjust recorded location if we're dealing with partial string */
		loc -= query_loc;

		Assert(loc >= 0);

		if (loc <= last_loc)
			continue;			/* Duplicate constant, ignore */

		/* Lex tokens until we find the desired constant */
		for (;;)
		{
			tok = core_yylex(&yylval, &yylloc, yyscanner);

			/* We should not hit end-of-string, but if we do, behave sanely */
			if (tok == 0)
				break;			/* out of inner for-loop */

			/*
			 * We should find the token position exactly, but if we somehow
			 * run past it, work with that.
			 */
			if (yylloc >= loc)
			{
				if (query[loc] == '-')
				{
					/*
					 * It's a negative value - this is the one and only case
					 * where we replace more than a single token.
					 *
					 * Do not compensate for the core system's special-case
					 * adjustment of location to that of the leading '-'
					 * operator in the event of a negative constant.  It is
					 * also useful for our purposes to start from the minus
					 * symbol.  In this way, queries like "select * from foo
					 * where bar = 1" and "select * from foo where bar = -2"
					 * will have identical normalized query strings.
					 */
					tok = core_yylex(&yylval, &yylloc, yyscanner);
					if (tok == 0)
						break;	/* out of inner for-loop */
				}

				/*
				 * We now rely on the assumption that flex has placed a zero
				 * byte after the text of the current token in scanbuf.
				 */
				locs[i].length = strlen(yyextra.scanbuf + loc);
				break;			/* out of inner for-loop */
			}
		}

		/* If we hit end-of-string, give up, leaving remaining lengths -1 */
		if (tok == 0)
			break;

		last_loc = loc;
	}

	scanner_finish(yyscanner);
}
2807
2808 /*
2809 * comp_location: comparator for qsorting LocationLen structs by location
2810 */
2811 static int
comp_location(const void * a,const void * b)2812 comp_location(const void *a, const void *b)
2813 {
2814 int l = ((const LocationLen *) a)->location;
2815 int r = ((const LocationLen *) b)->location;
2816
2817 if (l < r)
2818 return -1;
2819 else if (l > r)
2820 return +1;
2821 else
2822 return 0;
2823 }
2824