1 /*-------------------------------------------------------------------------
2 *
3 * pg_stat_statements.c
4 * Track statement execution times across a whole database cluster.
5 *
6 * Execution costs are totalled for each distinct source query, and kept in
7 * a shared hashtable. (We track only as many distinct queries as will fit
8 * in the designated amount of shared memory.)
9 *
10 * As of Postgres 9.2, this module normalizes query entries. Normalization
11 * is a process whereby similar queries, typically differing only in their
12 * constants (though the exact rules are somewhat more subtle than that) are
13 * recognized as equivalent, and are tracked as a single entry. This is
14 * particularly useful for non-prepared queries.
15 *
16 * Normalization is implemented by fingerprinting queries, selectively
17 * serializing those fields of each query tree's nodes that are judged to be
18 * essential to the query. This is referred to as a query jumble. This is
19 * distinct from a regular serialization in that various extraneous
20 * information is ignored as irrelevant or not essential to the query, such
21 * as the collations of Vars and, most notably, the values of constants.
22 *
23 * This jumble is acquired at the end of parse analysis of each query, and
24 * a 64-bit hash of it is stored into the query's Query.queryId field.
25 * The server then copies this value around, making it available in plan
26 * tree(s) generated from the query. The executor can then use this value
27 * to blame query costs on the proper queryId.
28 *
29 * To facilitate presenting entries to users, we create "representative" query
30 * strings in which constants are replaced with parameter symbols ($n), to
31 * make it clearer what a normalized entry can represent. To save on shared
32 * memory, and to avoid having to truncate oversized query strings, we store
33 * these strings in a temporary external query-texts file. Offsets into this
34 * file are kept in shared memory.
35 *
36 * Note about locking issues: to create or delete an entry in the shared
37 * hashtable, one must hold pgss->lock exclusively. Modifying any field
38 * in an entry except the counters requires the same. To look up an entry,
39 * one must hold the lock shared. To read or update the counters within
40 * an entry, one must hold the lock shared or exclusive (so the entry doesn't
41 * disappear!) and also take the entry's mutex spinlock.
42 * The shared state variable pgss->extent (the next free spot in the external
43 * query-text file) should be accessed only while holding either the
44 * pgss->mutex spinlock, or exclusive lock on pgss->lock. We use the mutex to
45 * allow reserving file space while holding only shared lock on pgss->lock.
46 * Rewriting the entire external query-text file, eg for garbage collection,
47 * requires holding pgss->lock exclusively; this allows individual entries
48 * in the file to be read or written while holding only shared lock.
49 *
50 *
51 * Copyright (c) 2008-2018, PostgreSQL Global Development Group
52 *
53 * IDENTIFICATION
54 * contrib/pg_stat_statements/pg_stat_statements.c
55 *
56 *-------------------------------------------------------------------------
57 */
58 #include "postgres.h"
59
60 #include <math.h>
61 #include <sys/stat.h>
62 #include <unistd.h>
63
64 #include "access/hash.h"
65 #include "catalog/pg_authid.h"
66 #include "executor/instrument.h"
67 #include "funcapi.h"
68 #include "mb/pg_wchar.h"
69 #include "miscadmin.h"
70 #include "parser/analyze.h"
71 #include "parser/parsetree.h"
72 #include "parser/scanner.h"
73 #include "parser/scansup.h"
74 #include "pgstat.h"
75 #include "storage/fd.h"
76 #include "storage/ipc.h"
77 #include "storage/spin.h"
78 #include "tcop/utility.h"
79 #include "utils/acl.h"
80 #include "utils/builtins.h"
81 #include "utils/memutils.h"
82
PG_MODULE_MAGIC;

/* Location of permanent stats file (valid when database is shut down) */
#define PGSS_DUMP_FILE	PGSTAT_STAT_PERMANENT_DIRECTORY "/pg_stat_statements.stat"

/*
 * Location of external query text file.  We don't keep it in the core
 * system's stats_temp_directory.  The core system can safely use that GUC
 * setting, because the statistics collector temp file paths are set only once
 * as part of changing the GUC, but pg_stat_statements has no way of avoiding
 * race conditions.  Besides, we only expect modest, infrequent I/O for query
 * strings, so placing the file on a faster filesystem is not compelling.
 */
#define PGSS_TEXT_FILE	PG_STAT_TMP_DIR "/pgss_query_texts.stat"

/*
 * Magic number identifying the stats file format.  Both this and the major
 * version below are verified when reloading a dump at startup; a mismatch
 * causes the old statistics to be discarded.
 */
static const uint32 PGSS_FILE_HEADER = 0x20171004;

/* PostgreSQL major version number, changes in which invalidate all entries */
static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;

/* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
#define USAGE_EXEC(duration)	(1.0)
#define USAGE_INIT				(1.0)	/* including initial planning */
#define ASSUMED_MEDIAN_INIT		(10.0)	/* initial assumed median usage */
#define ASSUMED_LENGTH_INIT		1024	/* initial assumed mean query length */
#define USAGE_DECREASE_FACTOR	(0.99)	/* decreased every entry_dealloc */
#define STICKY_DECREASE_FACTOR	(0.50)	/* factor for sticky entries */
#define USAGE_DEALLOC_PERCENT	5	/* free this % of entries at once */

#define JUMBLE_SIZE				1024	/* query serialization buffer size */
114
115 /*
116 * Extension version number, for supporting older extension versions' objects
117 */
118 typedef enum pgssVersion
119 {
120 PGSS_V1_0 = 0,
121 PGSS_V1_1,
122 PGSS_V1_2,
123 PGSS_V1_3
124 } pgssVersion;
125
126 /*
127 * Hashtable key that defines the identity of a hashtable entry. We separate
128 * queries by user and by database even if they are otherwise identical.
129 *
130 * Right now, this structure contains no padding. If you add any, make sure
131 * to teach pgss_store() to zero the padding bytes. Otherwise, things will
132 * break, because pgss_hash is created using HASH_BLOBS, and thus tag_hash
133 * is used to hash this.
134 */
135 typedef struct pgssHashKey
136 {
137 Oid userid; /* user OID */
138 Oid dbid; /* database OID */
139 uint64 queryid; /* query identifier */
140 } pgssHashKey;
141
142 /*
143 * The actual stats counters kept within pgssEntry.
144 */
145 typedef struct Counters
146 {
147 int64 calls; /* # of times executed */
148 double total_time; /* total execution time, in msec */
149 double min_time; /* minimum execution time in msec */
150 double max_time; /* maximum execution time in msec */
151 double mean_time; /* mean execution time in msec */
152 double sum_var_time; /* sum of variances in execution time in msec */
153 int64 rows; /* total # of retrieved or affected rows */
154 int64 shared_blks_hit; /* # of shared buffer hits */
155 int64 shared_blks_read; /* # of shared disk blocks read */
156 int64 shared_blks_dirtied; /* # of shared disk blocks dirtied */
157 int64 shared_blks_written; /* # of shared disk blocks written */
158 int64 local_blks_hit; /* # of local buffer hits */
159 int64 local_blks_read; /* # of local disk blocks read */
160 int64 local_blks_dirtied; /* # of local disk blocks dirtied */
161 int64 local_blks_written; /* # of local disk blocks written */
162 int64 temp_blks_read; /* # of temp blocks read */
163 int64 temp_blks_written; /* # of temp blocks written */
164 double blk_read_time; /* time spent reading, in msec */
165 double blk_write_time; /* time spent writing, in msec */
166 double usage; /* usage factor */
167 } Counters;
168
169 /*
170 * Statistics per statement
171 *
172 * Note: in event of a failure in garbage collection of the query text file,
173 * we reset query_offset to zero and query_len to -1. This will be seen as
174 * an invalid state by qtext_fetch().
175 */
176 typedef struct pgssEntry
177 {
178 pgssHashKey key; /* hash key of entry - MUST BE FIRST */
179 Counters counters; /* the statistics for this query */
180 Size query_offset; /* query text offset in external file */
181 int query_len; /* # of valid bytes in query string, or -1 */
182 int encoding; /* query text encoding */
183 slock_t mutex; /* protects the counters only */
184 } pgssEntry;
185
186 /*
187 * Global shared state
188 */
189 typedef struct pgssSharedState
190 {
191 LWLock *lock; /* protects hashtable search/modification */
192 double cur_median_usage; /* current median usage in hashtable */
193 Size mean_query_len; /* current mean entry text length */
194 slock_t mutex; /* protects following fields only: */
195 Size extent; /* current extent of query file */
196 int n_writers; /* number of active writers to query file */
197 int gc_count; /* query file garbage collection cycle count */
198 } pgssSharedState;
199
200 /*
201 * Struct for tracking locations/lengths of constants during normalization
202 */
203 typedef struct pgssLocationLen
204 {
205 int location; /* start offset in query text */
206 int length; /* length in bytes, or -1 to ignore */
207 } pgssLocationLen;
208
209 /*
210 * Working state for computing a query jumble and producing a normalized
211 * query string
212 */
213 typedef struct pgssJumbleState
214 {
215 /* Jumble of current query tree */
216 unsigned char *jumble;
217
218 /* Number of bytes used in jumble[] */
219 Size jumble_len;
220
221 /* Array of locations of constants that should be removed */
222 pgssLocationLen *clocations;
223
224 /* Allocated length of clocations array */
225 int clocations_buf_size;
226
227 /* Current number of valid entries in clocations array */
228 int clocations_count;
229
230 /* highest Param id we've seen, in order to start normalization correctly */
231 int highest_extern_param_id;
232 } pgssJumbleState;
233
/*---- Local variables ----*/

/* Current nesting depth of ExecutorRun+ProcessUtility calls */
static int	nested_level = 0;

/* Saved hook values in case of unload; each is restored by _PG_fini() */
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
static post_parse_analyze_hook_type prev_post_parse_analyze_hook = NULL;
static ExecutorStart_hook_type prev_ExecutorStart = NULL;
static ExecutorRun_hook_type prev_ExecutorRun = NULL;
static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
static ProcessUtility_hook_type prev_ProcessUtility = NULL;

/*
 * Links to shared memory state.  These stay NULL if the library was not
 * loaded via shared_preload_libraries; code paths test for that.
 */
static pgssSharedState *pgss = NULL;
static HTAB *pgss_hash = NULL;
251
/*---- GUC variables ----*/

typedef enum
{
	PGSS_TRACK_NONE,			/* track no statements */
	PGSS_TRACK_TOP,				/* only top level statements */
	PGSS_TRACK_ALL				/* all statements, including nested ones */
}			PGSSTrackLevel;

/* Value table for the pg_stat_statements.track enum GUC (see _PG_init) */
static const struct config_enum_entry track_options[] =
{
	{"none", PGSS_TRACK_NONE, false},
	{"top", PGSS_TRACK_TOP, false},
	{"all", PGSS_TRACK_ALL, false},
	{NULL, 0, false}
};
268
static int	pgss_max;			/* max # statements to track */
static int	pgss_track;			/* tracking level; holds a PGSSTrackLevel but
								 * declared int for DefineCustomEnumVariable */
static bool pgss_track_utility; /* whether to track utility commands */
static bool pgss_save;			/* whether to save stats across shutdown */


/* Should the current statement be tracked, given level and nesting depth? */
#define pgss_enabled() \
	(pgss_track == PGSS_TRACK_ALL || \
	 (pgss_track == PGSS_TRACK_TOP && nested_level == 0))

/*
 * Bump the query-file garbage collection cycle counter, under the spinlock
 * protecting pgss's mutable fields; gc_count snapshots let other backends
 * notice that a GC cycle intervened.
 */
#define record_gc_qtexts() \
	do { \
		volatile pgssSharedState *s = (volatile pgssSharedState *) pgss; \
		SpinLockAcquire(&s->mutex); \
		s->gc_count++; \
		SpinLockRelease(&s->mutex); \
	} while(0)
286
287 /*---- Function declarations ----*/
288
289 void _PG_init(void);
290 void _PG_fini(void);
291
292 PG_FUNCTION_INFO_V1(pg_stat_statements_reset);
293 PG_FUNCTION_INFO_V1(pg_stat_statements_1_2);
294 PG_FUNCTION_INFO_V1(pg_stat_statements_1_3);
295 PG_FUNCTION_INFO_V1(pg_stat_statements);
296
297 static void pgss_shmem_startup(void);
298 static void pgss_shmem_shutdown(int code, Datum arg);
299 static void pgss_post_parse_analyze(ParseState *pstate, Query *query);
300 static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags);
301 static void pgss_ExecutorRun(QueryDesc *queryDesc,
302 ScanDirection direction,
303 uint64 count, bool execute_once);
304 static void pgss_ExecutorFinish(QueryDesc *queryDesc);
305 static void pgss_ExecutorEnd(QueryDesc *queryDesc);
306 static void pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
307 ProcessUtilityContext context, ParamListInfo params,
308 QueryEnvironment *queryEnv,
309 DestReceiver *dest, char *completionTag);
310 static uint64 pgss_hash_string(const char *str, int len);
311 static void pgss_store(const char *query, uint64 queryId,
312 int query_location, int query_len,
313 double total_time, uint64 rows,
314 const BufferUsage *bufusage,
315 pgssJumbleState *jstate);
316 static void pg_stat_statements_internal(FunctionCallInfo fcinfo,
317 pgssVersion api_version,
318 bool showtext);
319 static Size pgss_memsize(void);
320 static pgssEntry *entry_alloc(pgssHashKey *key, Size query_offset, int query_len,
321 int encoding, bool sticky);
322 static void entry_dealloc(void);
323 static bool qtext_store(const char *query, int query_len,
324 Size *query_offset, int *gc_count);
325 static char *qtext_load_file(Size *buffer_size);
326 static char *qtext_fetch(Size query_offset, int query_len,
327 char *buffer, Size buffer_size);
328 static bool need_gc_qtexts(void);
329 static void gc_qtexts(void);
330 static void entry_reset(void);
331 static void AppendJumble(pgssJumbleState *jstate,
332 const unsigned char *item, Size size);
333 static void JumbleQuery(pgssJumbleState *jstate, Query *query);
334 static void JumbleRangeTable(pgssJumbleState *jstate, List *rtable);
335 static void JumbleExpr(pgssJumbleState *jstate, Node *node);
336 static void RecordConstLocation(pgssJumbleState *jstate, int location);
337 static char *generate_normalized_query(pgssJumbleState *jstate, const char *query,
338 int query_loc, int *query_len_p, int encoding);
339 static void fill_in_constant_lengths(pgssJumbleState *jstate, const char *query,
340 int query_loc);
341 static int comp_location(const void *a, const void *b);
342
343
344 /*
345 * Module load callback
346 */
347 void
_PG_init(void)348 _PG_init(void)
349 {
350 /*
351 * In order to create our shared memory area, we have to be loaded via
352 * shared_preload_libraries. If not, fall out without hooking into any of
353 * the main system. (We don't throw error here because it seems useful to
354 * allow the pg_stat_statements functions to be created even when the
355 * module isn't active. The functions must protect themselves against
356 * being called then, however.)
357 */
358 if (!process_shared_preload_libraries_in_progress)
359 return;
360
361 /*
362 * Define (or redefine) custom GUC variables.
363 */
364 DefineCustomIntVariable("pg_stat_statements.max",
365 "Sets the maximum number of statements tracked by pg_stat_statements.",
366 NULL,
367 &pgss_max,
368 5000,
369 100,
370 INT_MAX,
371 PGC_POSTMASTER,
372 0,
373 NULL,
374 NULL,
375 NULL);
376
377 DefineCustomEnumVariable("pg_stat_statements.track",
378 "Selects which statements are tracked by pg_stat_statements.",
379 NULL,
380 &pgss_track,
381 PGSS_TRACK_TOP,
382 track_options,
383 PGC_SUSET,
384 0,
385 NULL,
386 NULL,
387 NULL);
388
389 DefineCustomBoolVariable("pg_stat_statements.track_utility",
390 "Selects whether utility commands are tracked by pg_stat_statements.",
391 NULL,
392 &pgss_track_utility,
393 true,
394 PGC_SUSET,
395 0,
396 NULL,
397 NULL,
398 NULL);
399
400 DefineCustomBoolVariable("pg_stat_statements.save",
401 "Save pg_stat_statements statistics across server shutdowns.",
402 NULL,
403 &pgss_save,
404 true,
405 PGC_SIGHUP,
406 0,
407 NULL,
408 NULL,
409 NULL);
410
411 EmitWarningsOnPlaceholders("pg_stat_statements");
412
413 /*
414 * Request additional shared resources. (These are no-ops if we're not in
415 * the postmaster process.) We'll allocate or attach to the shared
416 * resources in pgss_shmem_startup().
417 */
418 RequestAddinShmemSpace(pgss_memsize());
419 RequestNamedLWLockTranche("pg_stat_statements", 1);
420
421 /*
422 * Install hooks.
423 */
424 prev_shmem_startup_hook = shmem_startup_hook;
425 shmem_startup_hook = pgss_shmem_startup;
426 prev_post_parse_analyze_hook = post_parse_analyze_hook;
427 post_parse_analyze_hook = pgss_post_parse_analyze;
428 prev_ExecutorStart = ExecutorStart_hook;
429 ExecutorStart_hook = pgss_ExecutorStart;
430 prev_ExecutorRun = ExecutorRun_hook;
431 ExecutorRun_hook = pgss_ExecutorRun;
432 prev_ExecutorFinish = ExecutorFinish_hook;
433 ExecutorFinish_hook = pgss_ExecutorFinish;
434 prev_ExecutorEnd = ExecutorEnd_hook;
435 ExecutorEnd_hook = pgss_ExecutorEnd;
436 prev_ProcessUtility = ProcessUtility_hook;
437 ProcessUtility_hook = pgss_ProcessUtility;
438 }
439
440 /*
441 * Module unload callback
442 */
443 void
_PG_fini(void)444 _PG_fini(void)
445 {
446 /* Uninstall hooks. */
447 shmem_startup_hook = prev_shmem_startup_hook;
448 post_parse_analyze_hook = prev_post_parse_analyze_hook;
449 ExecutorStart_hook = prev_ExecutorStart;
450 ExecutorRun_hook = prev_ExecutorRun;
451 ExecutorFinish_hook = prev_ExecutorFinish;
452 ExecutorEnd_hook = prev_ExecutorEnd;
453 ProcessUtility_hook = prev_ProcessUtility;
454 }
455
456 /*
457 * shmem_startup hook: allocate or attach to shared memory,
458 * then load any pre-existing statistics from file.
459 * Also create and load the query-texts file, which is expected to exist
460 * (even if empty) while the module is enabled.
461 */
462 static void
pgss_shmem_startup(void)463 pgss_shmem_startup(void)
464 {
465 bool found;
466 HASHCTL info;
467 FILE *file = NULL;
468 FILE *qfile = NULL;
469 uint32 header;
470 int32 num;
471 int32 pgver;
472 int32 i;
473 int buffer_size;
474 char *buffer = NULL;
475
476 if (prev_shmem_startup_hook)
477 prev_shmem_startup_hook();
478
479 /* reset in case this is a restart within the postmaster */
480 pgss = NULL;
481 pgss_hash = NULL;
482
483 /*
484 * Create or attach to the shared memory state, including hash table
485 */
486 LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
487
488 pgss = ShmemInitStruct("pg_stat_statements",
489 sizeof(pgssSharedState),
490 &found);
491
492 if (!found)
493 {
494 /* First time through ... */
495 pgss->lock = &(GetNamedLWLockTranche("pg_stat_statements"))->lock;
496 pgss->cur_median_usage = ASSUMED_MEDIAN_INIT;
497 pgss->mean_query_len = ASSUMED_LENGTH_INIT;
498 SpinLockInit(&pgss->mutex);
499 pgss->extent = 0;
500 pgss->n_writers = 0;
501 pgss->gc_count = 0;
502 }
503
504 memset(&info, 0, sizeof(info));
505 info.keysize = sizeof(pgssHashKey);
506 info.entrysize = sizeof(pgssEntry);
507 pgss_hash = ShmemInitHash("pg_stat_statements hash",
508 pgss_max, pgss_max,
509 &info,
510 HASH_ELEM | HASH_BLOBS);
511
512 LWLockRelease(AddinShmemInitLock);
513
514 /*
515 * If we're in the postmaster (or a standalone backend...), set up a shmem
516 * exit hook to dump the statistics to disk.
517 */
518 if (!IsUnderPostmaster)
519 on_shmem_exit(pgss_shmem_shutdown, (Datum) 0);
520
521 /*
522 * Done if some other process already completed our initialization.
523 */
524 if (found)
525 return;
526
527 /*
528 * Note: we don't bother with locks here, because there should be no other
529 * processes running when this code is reached.
530 */
531
532 /* Unlink query text file possibly left over from crash */
533 unlink(PGSS_TEXT_FILE);
534
535 /* Allocate new query text temp file */
536 qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
537 if (qfile == NULL)
538 goto write_error;
539
540 /*
541 * If we were told not to load old statistics, we're done. (Note we do
542 * not try to unlink any old dump file in this case. This seems a bit
543 * questionable but it's the historical behavior.)
544 */
545 if (!pgss_save)
546 {
547 FreeFile(qfile);
548 return;
549 }
550
551 /*
552 * Attempt to load old statistics from the dump file.
553 */
554 file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_R);
555 if (file == NULL)
556 {
557 if (errno != ENOENT)
558 goto read_error;
559 /* No existing persisted stats file, so we're done */
560 FreeFile(qfile);
561 return;
562 }
563
564 buffer_size = 2048;
565 buffer = (char *) palloc(buffer_size);
566
567 if (fread(&header, sizeof(uint32), 1, file) != 1 ||
568 fread(&pgver, sizeof(uint32), 1, file) != 1 ||
569 fread(&num, sizeof(int32), 1, file) != 1)
570 goto read_error;
571
572 if (header != PGSS_FILE_HEADER ||
573 pgver != PGSS_PG_MAJOR_VERSION)
574 goto data_error;
575
576 for (i = 0; i < num; i++)
577 {
578 pgssEntry temp;
579 pgssEntry *entry;
580 Size query_offset;
581
582 if (fread(&temp, sizeof(pgssEntry), 1, file) != 1)
583 goto read_error;
584
585 /* Encoding is the only field we can easily sanity-check */
586 if (!PG_VALID_BE_ENCODING(temp.encoding))
587 goto data_error;
588
589 /* Resize buffer as needed */
590 if (temp.query_len >= buffer_size)
591 {
592 buffer_size = Max(buffer_size * 2, temp.query_len + 1);
593 buffer = repalloc(buffer, buffer_size);
594 }
595
596 if (fread(buffer, 1, temp.query_len + 1, file) != temp.query_len + 1)
597 goto read_error;
598
599 /* Should have a trailing null, but let's make sure */
600 buffer[temp.query_len] = '\0';
601
602 /* Skip loading "sticky" entries */
603 if (temp.counters.calls == 0)
604 continue;
605
606 /* Store the query text */
607 query_offset = pgss->extent;
608 if (fwrite(buffer, 1, temp.query_len + 1, qfile) != temp.query_len + 1)
609 goto write_error;
610 pgss->extent += temp.query_len + 1;
611
612 /* make the hashtable entry (discards old entries if too many) */
613 entry = entry_alloc(&temp.key, query_offset, temp.query_len,
614 temp.encoding,
615 false);
616
617 /* copy in the actual stats */
618 entry->counters = temp.counters;
619 }
620
621 pfree(buffer);
622 FreeFile(file);
623 FreeFile(qfile);
624
625 /*
626 * Remove the persisted stats file so it's not included in
627 * backups/replication slaves, etc. A new file will be written on next
628 * shutdown.
629 *
630 * Note: it's okay if the PGSS_TEXT_FILE is included in a basebackup,
631 * because we remove that file on startup; it acts inversely to
632 * PGSS_DUMP_FILE, in that it is only supposed to be around when the
633 * server is running, whereas PGSS_DUMP_FILE is only supposed to be around
634 * when the server is not running. Leaving the file creates no danger of
635 * a newly restored database having a spurious record of execution costs,
636 * which is what we're really concerned about here.
637 */
638 unlink(PGSS_DUMP_FILE);
639
640 return;
641
642 read_error:
643 ereport(LOG,
644 (errcode_for_file_access(),
645 errmsg("could not read pg_stat_statement file \"%s\": %m",
646 PGSS_DUMP_FILE)));
647 goto fail;
648 data_error:
649 ereport(LOG,
650 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
651 errmsg("ignoring invalid data in pg_stat_statement file \"%s\"",
652 PGSS_DUMP_FILE)));
653 goto fail;
654 write_error:
655 ereport(LOG,
656 (errcode_for_file_access(),
657 errmsg("could not write pg_stat_statement file \"%s\": %m",
658 PGSS_TEXT_FILE)));
659 fail:
660 if (buffer)
661 pfree(buffer);
662 if (file)
663 FreeFile(file);
664 if (qfile)
665 FreeFile(qfile);
666 /* If possible, throw away the bogus file; ignore any error */
667 unlink(PGSS_DUMP_FILE);
668
669 /*
670 * Don't unlink PGSS_TEXT_FILE here; it should always be around while the
671 * server is running with pg_stat_statements enabled
672 */
673 }
674
675 /*
676 * shmem_shutdown hook: Dump statistics into file.
677 *
678 * Note: we don't bother with acquiring lock, because there should be no
679 * other processes running when this is called.
680 */
681 static void
pgss_shmem_shutdown(int code,Datum arg)682 pgss_shmem_shutdown(int code, Datum arg)
683 {
684 FILE *file;
685 char *qbuffer = NULL;
686 Size qbuffer_size = 0;
687 HASH_SEQ_STATUS hash_seq;
688 int32 num_entries;
689 pgssEntry *entry;
690
691 /* Don't try to dump during a crash. */
692 if (code)
693 return;
694
695 /* Safety check ... shouldn't get here unless shmem is set up. */
696 if (!pgss || !pgss_hash)
697 return;
698
699 /* Don't dump if told not to. */
700 if (!pgss_save)
701 return;
702
703 file = AllocateFile(PGSS_DUMP_FILE ".tmp", PG_BINARY_W);
704 if (file == NULL)
705 goto error;
706
707 if (fwrite(&PGSS_FILE_HEADER, sizeof(uint32), 1, file) != 1)
708 goto error;
709 if (fwrite(&PGSS_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
710 goto error;
711 num_entries = hash_get_num_entries(pgss_hash);
712 if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
713 goto error;
714
715 qbuffer = qtext_load_file(&qbuffer_size);
716 if (qbuffer == NULL)
717 goto error;
718
719 /*
720 * When serializing to disk, we store query texts immediately after their
721 * entry data. Any orphaned query texts are thereby excluded.
722 */
723 hash_seq_init(&hash_seq, pgss_hash);
724 while ((entry = hash_seq_search(&hash_seq)) != NULL)
725 {
726 int len = entry->query_len;
727 char *qstr = qtext_fetch(entry->query_offset, len,
728 qbuffer, qbuffer_size);
729
730 if (qstr == NULL)
731 continue; /* Ignore any entries with bogus texts */
732
733 if (fwrite(entry, sizeof(pgssEntry), 1, file) != 1 ||
734 fwrite(qstr, 1, len + 1, file) != len + 1)
735 {
736 /* note: we assume hash_seq_term won't change errno */
737 hash_seq_term(&hash_seq);
738 goto error;
739 }
740 }
741
742 free(qbuffer);
743 qbuffer = NULL;
744
745 if (FreeFile(file))
746 {
747 file = NULL;
748 goto error;
749 }
750
751 /*
752 * Rename file into place, so we atomically replace any old one.
753 */
754 (void) durable_rename(PGSS_DUMP_FILE ".tmp", PGSS_DUMP_FILE, LOG);
755
756 /* Unlink query-texts file; it's not needed while shutdown */
757 unlink(PGSS_TEXT_FILE);
758
759 return;
760
761 error:
762 ereport(LOG,
763 (errcode_for_file_access(),
764 errmsg("could not write pg_stat_statement file \"%s\": %m",
765 PGSS_DUMP_FILE ".tmp")));
766 if (qbuffer)
767 free(qbuffer);
768 if (file)
769 FreeFile(file);
770 unlink(PGSS_DUMP_FILE ".tmp");
771 unlink(PGSS_TEXT_FILE);
772 }
773
774 /*
775 * Post-parse-analysis hook: mark query with a queryId
776 */
777 static void
pgss_post_parse_analyze(ParseState * pstate,Query * query)778 pgss_post_parse_analyze(ParseState *pstate, Query *query)
779 {
780 pgssJumbleState jstate;
781
782 if (prev_post_parse_analyze_hook)
783 prev_post_parse_analyze_hook(pstate, query);
784
785 /* Assert we didn't do this already */
786 Assert(query->queryId == UINT64CONST(0));
787
788 /* Safety check... */
789 if (!pgss || !pgss_hash)
790 return;
791
792 /*
793 * Utility statements get queryId zero. We do this even in cases where
794 * the statement contains an optimizable statement for which a queryId
795 * could be derived (such as EXPLAIN or DECLARE CURSOR). For such cases,
796 * runtime control will first go through ProcessUtility and then the
797 * executor, and we don't want the executor hooks to do anything, since we
798 * are already measuring the statement's costs at the utility level.
799 */
800 if (query->utilityStmt)
801 {
802 query->queryId = UINT64CONST(0);
803 return;
804 }
805
806 /* Set up workspace for query jumbling */
807 jstate.jumble = (unsigned char *) palloc(JUMBLE_SIZE);
808 jstate.jumble_len = 0;
809 jstate.clocations_buf_size = 32;
810 jstate.clocations = (pgssLocationLen *)
811 palloc(jstate.clocations_buf_size * sizeof(pgssLocationLen));
812 jstate.clocations_count = 0;
813 jstate.highest_extern_param_id = 0;
814
815 /* Compute query ID and mark the Query node with it */
816 JumbleQuery(&jstate, query);
817 query->queryId =
818 DatumGetUInt64(hash_any_extended(jstate.jumble, jstate.jumble_len, 0));
819
820 /*
821 * If we are unlucky enough to get a hash of zero, use 1 instead, to
822 * prevent confusion with the utility-statement case.
823 */
824 if (query->queryId == UINT64CONST(0))
825 query->queryId = UINT64CONST(1);
826
827 /*
828 * If we were able to identify any ignorable constants, we immediately
829 * create a hash table entry for the query, so that we can record the
830 * normalized form of the query string. If there were no such constants,
831 * the normalized string would be the same as the query text anyway, so
832 * there's no need for an early entry.
833 */
834 if (jstate.clocations_count > 0)
835 pgss_store(pstate->p_sourcetext,
836 query->queryId,
837 query->stmt_location,
838 query->stmt_len,
839 0,
840 0,
841 NULL,
842 &jstate);
843 }
844
845 /*
846 * ExecutorStart hook: start up tracking if needed
847 */
848 static void
pgss_ExecutorStart(QueryDesc * queryDesc,int eflags)849 pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
850 {
851 if (prev_ExecutorStart)
852 prev_ExecutorStart(queryDesc, eflags);
853 else
854 standard_ExecutorStart(queryDesc, eflags);
855
856 /*
857 * If query has queryId zero, don't track it. This prevents double
858 * counting of optimizable statements that are directly contained in
859 * utility statements.
860 */
861 if (pgss_enabled() && queryDesc->plannedstmt->queryId != UINT64CONST(0))
862 {
863 /*
864 * Set up to track total elapsed time in ExecutorRun. Make sure the
865 * space is allocated in the per-query context so it will go away at
866 * ExecutorEnd.
867 */
868 if (queryDesc->totaltime == NULL)
869 {
870 MemoryContext oldcxt;
871
872 oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
873 queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL);
874 MemoryContextSwitchTo(oldcxt);
875 }
876 }
877 }
878
879 /*
880 * ExecutorRun hook: all we need do is track nesting depth
881 */
882 static void
pgss_ExecutorRun(QueryDesc * queryDesc,ScanDirection direction,uint64 count,bool execute_once)883 pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
884 bool execute_once)
885 {
886 nested_level++;
887 PG_TRY();
888 {
889 if (prev_ExecutorRun)
890 prev_ExecutorRun(queryDesc, direction, count, execute_once);
891 else
892 standard_ExecutorRun(queryDesc, direction, count, execute_once);
893 nested_level--;
894 }
895 PG_CATCH();
896 {
897 nested_level--;
898 PG_RE_THROW();
899 }
900 PG_END_TRY();
901 }
902
903 /*
904 * ExecutorFinish hook: all we need do is track nesting depth
905 */
906 static void
pgss_ExecutorFinish(QueryDesc * queryDesc)907 pgss_ExecutorFinish(QueryDesc *queryDesc)
908 {
909 nested_level++;
910 PG_TRY();
911 {
912 if (prev_ExecutorFinish)
913 prev_ExecutorFinish(queryDesc);
914 else
915 standard_ExecutorFinish(queryDesc);
916 nested_level--;
917 }
918 PG_CATCH();
919 {
920 nested_level--;
921 PG_RE_THROW();
922 }
923 PG_END_TRY();
924 }
925
926 /*
927 * ExecutorEnd hook: store results if needed
928 */
929 static void
pgss_ExecutorEnd(QueryDesc * queryDesc)930 pgss_ExecutorEnd(QueryDesc *queryDesc)
931 {
932 uint64 queryId = queryDesc->plannedstmt->queryId;
933
934 if (queryId != UINT64CONST(0) && queryDesc->totaltime && pgss_enabled())
935 {
936 /*
937 * Make sure stats accumulation is done. (Note: it's okay if several
938 * levels of hook all do this.)
939 */
940 InstrEndLoop(queryDesc->totaltime);
941
942 pgss_store(queryDesc->sourceText,
943 queryId,
944 queryDesc->plannedstmt->stmt_location,
945 queryDesc->plannedstmt->stmt_len,
946 queryDesc->totaltime->total * 1000.0, /* convert to msec */
947 queryDesc->estate->es_processed,
948 &queryDesc->totaltime->bufusage,
949 NULL);
950 }
951
952 if (prev_ExecutorEnd)
953 prev_ExecutorEnd(queryDesc);
954 else
955 standard_ExecutorEnd(queryDesc);
956 }
957
958 /*
959 * ProcessUtility hook
960 */
961 static void
pgss_ProcessUtility(PlannedStmt * pstmt,const char * queryString,ProcessUtilityContext context,ParamListInfo params,QueryEnvironment * queryEnv,DestReceiver * dest,char * completionTag)962 pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
963 ProcessUtilityContext context,
964 ParamListInfo params, QueryEnvironment *queryEnv,
965 DestReceiver *dest, char *completionTag)
966 {
967 Node *parsetree = pstmt->utilityStmt;
968
969 /*
970 * If it's an EXECUTE statement, we don't track it and don't increment the
971 * nesting level. This allows the cycles to be charged to the underlying
972 * PREPARE instead (by the Executor hooks), which is much more useful.
973 *
974 * We also don't track execution of PREPARE. If we did, we would get one
975 * hash table entry for the PREPARE (with hash calculated from the query
976 * string), and then a different one with the same query string (but hash
977 * calculated from the query tree) would be used to accumulate costs of
978 * ensuing EXECUTEs. This would be confusing, and inconsistent with other
979 * cases where planning time is not included at all.
980 *
981 * Likewise, we don't track execution of DEALLOCATE.
982 */
983 if (pgss_track_utility && pgss_enabled() &&
984 !IsA(parsetree, ExecuteStmt) &&
985 !IsA(parsetree, PrepareStmt) &&
986 !IsA(parsetree, DeallocateStmt))
987 {
988 instr_time start;
989 instr_time duration;
990 uint64 rows;
991 BufferUsage bufusage_start,
992 bufusage;
993
994 bufusage_start = pgBufferUsage;
995 INSTR_TIME_SET_CURRENT(start);
996
997 nested_level++;
998 PG_TRY();
999 {
1000 if (prev_ProcessUtility)
1001 prev_ProcessUtility(pstmt, queryString,
1002 context, params, queryEnv,
1003 dest, completionTag);
1004 else
1005 standard_ProcessUtility(pstmt, queryString,
1006 context, params, queryEnv,
1007 dest, completionTag);
1008 nested_level--;
1009 }
1010 PG_CATCH();
1011 {
1012 nested_level--;
1013 PG_RE_THROW();
1014 }
1015 PG_END_TRY();
1016
1017 INSTR_TIME_SET_CURRENT(duration);
1018 INSTR_TIME_SUBTRACT(duration, start);
1019
1020 /* parse command tag to retrieve the number of affected rows. */
1021 if (completionTag &&
1022 strncmp(completionTag, "COPY ", 5) == 0)
1023 rows = pg_strtouint64(completionTag + 5, NULL, 10);
1024 else
1025 rows = 0;
1026
1027 /* calc differences of buffer counters. */
1028 bufusage.shared_blks_hit =
1029 pgBufferUsage.shared_blks_hit - bufusage_start.shared_blks_hit;
1030 bufusage.shared_blks_read =
1031 pgBufferUsage.shared_blks_read - bufusage_start.shared_blks_read;
1032 bufusage.shared_blks_dirtied =
1033 pgBufferUsage.shared_blks_dirtied - bufusage_start.shared_blks_dirtied;
1034 bufusage.shared_blks_written =
1035 pgBufferUsage.shared_blks_written - bufusage_start.shared_blks_written;
1036 bufusage.local_blks_hit =
1037 pgBufferUsage.local_blks_hit - bufusage_start.local_blks_hit;
1038 bufusage.local_blks_read =
1039 pgBufferUsage.local_blks_read - bufusage_start.local_blks_read;
1040 bufusage.local_blks_dirtied =
1041 pgBufferUsage.local_blks_dirtied - bufusage_start.local_blks_dirtied;
1042 bufusage.local_blks_written =
1043 pgBufferUsage.local_blks_written - bufusage_start.local_blks_written;
1044 bufusage.temp_blks_read =
1045 pgBufferUsage.temp_blks_read - bufusage_start.temp_blks_read;
1046 bufusage.temp_blks_written =
1047 pgBufferUsage.temp_blks_written - bufusage_start.temp_blks_written;
1048 bufusage.blk_read_time = pgBufferUsage.blk_read_time;
1049 INSTR_TIME_SUBTRACT(bufusage.blk_read_time, bufusage_start.blk_read_time);
1050 bufusage.blk_write_time = pgBufferUsage.blk_write_time;
1051 INSTR_TIME_SUBTRACT(bufusage.blk_write_time, bufusage_start.blk_write_time);
1052
1053 pgss_store(queryString,
1054 0, /* signal that it's a utility stmt */
1055 pstmt->stmt_location,
1056 pstmt->stmt_len,
1057 INSTR_TIME_GET_MILLISEC(duration),
1058 rows,
1059 &bufusage,
1060 NULL);
1061 }
1062 else
1063 {
1064 if (prev_ProcessUtility)
1065 prev_ProcessUtility(pstmt, queryString,
1066 context, params, queryEnv,
1067 dest, completionTag);
1068 else
1069 standard_ProcessUtility(pstmt, queryString,
1070 context, params, queryEnv,
1071 dest, completionTag);
1072 }
1073 }
1074
1075 /*
1076 * Given an arbitrarily long query string, produce a hash for the purposes of
1077 * identifying the query, without normalizing constants. Used when hashing
1078 * utility statements.
1079 */
1080 static uint64
pgss_hash_string(const char * str,int len)1081 pgss_hash_string(const char *str, int len)
1082 {
1083 return DatumGetUInt64(hash_any_extended((const unsigned char *) str,
1084 len, 0));
1085 }
1086
/*
 * Store some statistics for a statement.
 *
 * If queryId is 0 then this is a utility statement and we should compute
 * a suitable queryId internally.
 *
 * If jstate is not NULL then we're trying to create an entry for which
 * we have no statistics as yet; we just want to record the normalized
 * query string.  total_time, rows, bufusage are ignored in this case.
 *
 * Locking: we first search under shared lock; only if we must create a new
 * entry do we release and reacquire in exclusive mode.  Because of that
 * lock gap, all state collected under the shared lock (text offset,
 * gc_count) is re-validated after promotion.
 */
static void
pgss_store(const char *query, uint64 queryId,
		   int query_location, int query_len,
		   double total_time, uint64 rows,
		   const BufferUsage *bufusage,
		   pgssJumbleState *jstate)
{
	pgssHashKey key;
	pgssEntry  *entry;
	char	   *norm_query = NULL;
	int			encoding = GetDatabaseEncoding();

	Assert(query != NULL);

	/* Safety check: do nothing unless shared state was set up at load time */
	if (!pgss || !pgss_hash)
		return;

	/*
	 * Confine our attention to the relevant part of the string, if the query
	 * is a portion of a multi-statement source string.
	 *
	 * First apply starting offset, unless it's -1 (unknown).
	 */
	if (query_location >= 0)
	{
		Assert(query_location <= strlen(query));
		query += query_location;
		/* Length of 0 (or -1) means "rest of string" */
		if (query_len <= 0)
			query_len = strlen(query);
		else
			Assert(query_len <= strlen(query));
	}
	else
	{
		/* If query location is unknown, distrust query_len as well */
		query_location = 0;
		query_len = strlen(query);
	}

	/*
	 * Discard leading and trailing whitespace, too.  Use scanner_isspace()
	 * not libc's isspace(), because we want to match the lexer's behavior.
	 */
	while (query_len > 0 && scanner_isspace(query[0]))
		query++, query_location++, query_len--;
	while (query_len > 0 && scanner_isspace(query[query_len - 1]))
		query_len--;

	/*
	 * For utility statements, we just hash the query string to get an ID.
	 */
	if (queryId == UINT64CONST(0))
		queryId = pgss_hash_string(query, query_len);

	/* Set up key for hashtable search: (user, database, queryid) triple */
	key.userid = GetUserId();
	key.dbid = MyDatabaseId;
	key.queryid = queryId;

	/* Lookup the hash table entry with shared lock. */
	LWLockAcquire(pgss->lock, LW_SHARED);

	entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);

	/* Create new entry, if not present */
	if (!entry)
	{
		Size		query_offset;	/* where the text landed in the file */
		int			gc_count;		/* GC generation at time of store */
		bool		stored;
		bool		do_gc;

		/*
		 * Create a new, normalized query string if caller asked.  We don't
		 * need to hold the lock while doing this work.  (Note: in any case,
		 * it's possible that someone else creates a duplicate hashtable entry
		 * in the interval where we don't hold the lock below.  That case is
		 * handled by entry_alloc.)
		 */
		if (jstate)
		{
			LWLockRelease(pgss->lock);
			norm_query = generate_normalized_query(jstate, query,
												   query_location,
												   &query_len,
												   encoding);
			LWLockAcquire(pgss->lock, LW_SHARED);
		}

		/* Append new query text to file with only shared lock held */
		stored = qtext_store(norm_query ? norm_query : query, query_len,
							 &query_offset, &gc_count);

		/*
		 * Determine whether we need to garbage collect external query texts
		 * while the shared lock is still held.  This micro-optimization
		 * avoids taking the time to decide this while holding exclusive lock.
		 */
		do_gc = need_gc_qtexts();

		/* Need exclusive lock to make a new hashtable entry - promote */
		LWLockRelease(pgss->lock);
		LWLockAcquire(pgss->lock, LW_EXCLUSIVE);

		/*
		 * A garbage collection may have occurred while we weren't holding the
		 * lock.  In the unlikely event that this happens, the query text we
		 * stored above will have been garbage collected, so write it again.
		 * This should be infrequent enough that doing it while holding
		 * exclusive lock isn't a performance problem.
		 */
		if (!stored || pgss->gc_count != gc_count)
			stored = qtext_store(norm_query ? norm_query : query, query_len,
								 &query_offset, NULL);

		/* If we failed to write to the text file, give up */
		if (!stored)
			goto done;

		/* OK to create a new hashtable entry */
		entry = entry_alloc(&key, query_offset, query_len, encoding,
							jstate != NULL);

		/* If needed, perform garbage collection while exclusive lock held */
		if (do_gc)
			gc_qtexts();
	}

	/* Increment the counts, except when jstate is not NULL */
	if (!jstate)
	{
		/*
		 * Grab the spinlock while updating the counters (see comment about
		 * locking rules at the head of the file)
		 */
		volatile pgssEntry *e = (volatile pgssEntry *) entry;

		SpinLockAcquire(&e->mutex);

		/* "Unstick" entry if it was previously sticky */
		if (e->counters.calls == 0)
			e->counters.usage = USAGE_INIT;

		e->counters.calls += 1;
		e->counters.total_time += total_time;
		if (e->counters.calls == 1)
		{
			/* First real execution: seed min/max/mean from this sample */
			e->counters.min_time = total_time;
			e->counters.max_time = total_time;
			e->counters.mean_time = total_time;
		}
		else
		{
			/*
			 * Welford's method for accurately computing variance. See
			 * <http://www.johndcook.com/blog/standard_deviation/>
			 */
			double		old_mean = e->counters.mean_time;

			e->counters.mean_time +=
				(total_time - old_mean) / e->counters.calls;
			e->counters.sum_var_time +=
				(total_time - old_mean) * (total_time - e->counters.mean_time);

			/* calculate min and max time */
			if (e->counters.min_time > total_time)
				e->counters.min_time = total_time;
			if (e->counters.max_time < total_time)
				e->counters.max_time = total_time;
		}
		e->counters.rows += rows;
		e->counters.shared_blks_hit += bufusage->shared_blks_hit;
		e->counters.shared_blks_read += bufusage->shared_blks_read;
		e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
		e->counters.shared_blks_written += bufusage->shared_blks_written;
		e->counters.local_blks_hit += bufusage->local_blks_hit;
		e->counters.local_blks_read += bufusage->local_blks_read;
		e->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
		e->counters.local_blks_written += bufusage->local_blks_written;
		e->counters.temp_blks_read += bufusage->temp_blks_read;
		e->counters.temp_blks_written += bufusage->temp_blks_written;
		e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time);
		e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time);
		e->counters.usage += USAGE_EXEC(total_time);

		SpinLockRelease(&e->mutex);
	}

done:
	LWLockRelease(pgss->lock);

	/* We postpone this clean-up until we're out of the lock */
	if (norm_query)
		pfree(norm_query);
}
1294
1295 /*
1296 * Reset all statement statistics.
1297 */
1298 Datum
pg_stat_statements_reset(PG_FUNCTION_ARGS)1299 pg_stat_statements_reset(PG_FUNCTION_ARGS)
1300 {
1301 if (!pgss || !pgss_hash)
1302 ereport(ERROR,
1303 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1304 errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
1305 entry_reset();
1306 PG_RETURN_VOID();
1307 }
1308
/* Number of output arguments (columns) for various API versions */
#define PG_STAT_STATEMENTS_COLS_V1_0	14
#define PG_STAT_STATEMENTS_COLS_V1_1	18	/* adds *_blks_dirtied and blk read/write times */
#define PG_STAT_STATEMENTS_COLS_V1_2	19	/* adds queryid */
#define PG_STAT_STATEMENTS_COLS_V1_3	23	/* adds min/max/mean/stddev time */
#define PG_STAT_STATEMENTS_COLS			23	/* maximum of above */
1315
1316 /*
1317 * Retrieve statement statistics.
1318 *
1319 * The SQL API of this function has changed multiple times, and will likely
1320 * do so again in future. To support the case where a newer version of this
1321 * loadable module is being used with an old SQL declaration of the function,
1322 * we continue to support the older API versions. For 1.2 and later, the
1323 * expected API version is identified by embedding it in the C name of the
1324 * function. Unfortunately we weren't bright enough to do that for 1.1.
1325 */
1326 Datum
pg_stat_statements_1_3(PG_FUNCTION_ARGS)1327 pg_stat_statements_1_3(PG_FUNCTION_ARGS)
1328 {
1329 bool showtext = PG_GETARG_BOOL(0);
1330
1331 pg_stat_statements_internal(fcinfo, PGSS_V1_3, showtext);
1332
1333 return (Datum) 0;
1334 }
1335
1336 Datum
pg_stat_statements_1_2(PG_FUNCTION_ARGS)1337 pg_stat_statements_1_2(PG_FUNCTION_ARGS)
1338 {
1339 bool showtext = PG_GETARG_BOOL(0);
1340
1341 pg_stat_statements_internal(fcinfo, PGSS_V1_2, showtext);
1342
1343 return (Datum) 0;
1344 }
1345
1346 /*
1347 * Legacy entry point for pg_stat_statements() API versions 1.0 and 1.1.
1348 * This can be removed someday, perhaps.
1349 */
1350 Datum
pg_stat_statements(PG_FUNCTION_ARGS)1351 pg_stat_statements(PG_FUNCTION_ARGS)
1352 {
1353 /* If it's really API 1.1, we'll figure that out below */
1354 pg_stat_statements_internal(fcinfo, PGSS_V1_0, true);
1355
1356 return (Datum) 0;
1357 }
1358
/*
 * Common code for all versions of pg_stat_statements().
 *
 * Materializes one result row per hashtable entry into a tuplestore.  Query
 * texts come from the external text file; callers lacking the required role
 * see nulls (or a privilege hint) in place of queryid and query text for
 * other users' entries.  The set of columns emitted depends on api_version.
 */
static void
pg_stat_statements_internal(FunctionCallInfo fcinfo,
							pgssVersion api_version,
							bool showtext)
{
	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
	TupleDesc	tupdesc;
	Tuplestorestate *tupstore;
	MemoryContext per_query_ctx;
	MemoryContext oldcontext;
	Oid			userid = GetUserId();
	bool		is_allowed_role = false;
	char	   *qbuffer = NULL;		/* malloc'd image of the query-text file */
	Size		qbuffer_size = 0;
	Size		extent = 0;
	int			gc_count = 0;
	HASH_SEQ_STATUS hash_seq;
	pgssEntry  *entry;

	/* Superusers or members of pg_read_all_stats members are allowed */
	is_allowed_role = is_member_of_role(GetUserId(), DEFAULT_ROLE_READ_ALL_STATS);

	/* hash table must exist already */
	if (!pgss || !pgss_hash)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));

	/* check to see if caller supports us returning a tuplestore */
	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("set-valued function called in context that cannot accept a set")));
	if (!(rsinfo->allowedModes & SFRM_Materialize))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("materialize mode required, but it is not " \
						"allowed in this context")));

	/* Switch into long-lived context to construct returned data structures */
	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
	oldcontext = MemoryContextSwitchTo(per_query_ctx);

	/* Build a tuple descriptor for our result type */
	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
		elog(ERROR, "return type must be a row type");

	/*
	 * Check we have the expected number of output arguments.  Aside from
	 * being a good safety check, we need a kluge here to detect API version
	 * 1.1, which was wedged into the code in an ill-considered way.
	 */
	switch (tupdesc->natts)
	{
		case PG_STAT_STATEMENTS_COLS_V1_0:
			if (api_version != PGSS_V1_0)
				elog(ERROR, "incorrect number of output arguments");
			break;
		case PG_STAT_STATEMENTS_COLS_V1_1:
			/* pg_stat_statements() should have told us 1.0 */
			if (api_version != PGSS_V1_0)
				elog(ERROR, "incorrect number of output arguments");
			api_version = PGSS_V1_1;
			break;
		case PG_STAT_STATEMENTS_COLS_V1_2:
			if (api_version != PGSS_V1_2)
				elog(ERROR, "incorrect number of output arguments");
			break;
		case PG_STAT_STATEMENTS_COLS_V1_3:
			if (api_version != PGSS_V1_3)
				elog(ERROR, "incorrect number of output arguments");
			break;
		default:
			elog(ERROR, "incorrect number of output arguments");
	}

	tupstore = tuplestore_begin_heap(true, false, work_mem);
	rsinfo->returnMode = SFRM_Materialize;
	rsinfo->setResult = tupstore;
	rsinfo->setDesc = tupdesc;

	MemoryContextSwitchTo(oldcontext);

	/*
	 * We'd like to load the query text file (if needed) while not holding any
	 * lock on pgss->lock.  In the worst case we'll have to do this again
	 * after we have the lock, but it's unlikely enough to make this a win
	 * despite occasional duplicated work.  We need to reload if anybody
	 * writes to the file (either a retail qtext_store(), or a garbage
	 * collection) between this point and where we've gotten shared lock.  If
	 * a qtext_store is actually in progress when we look, we might as well
	 * skip the speculative load entirely.
	 */
	if (showtext)
	{
		int			n_writers;

		/* Take the mutex so we can examine variables */
		{
			volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;

			SpinLockAcquire(&s->mutex);
			extent = s->extent;
			n_writers = s->n_writers;
			gc_count = s->gc_count;
			SpinLockRelease(&s->mutex);
		}

		/* No point in loading file now if there are active writers */
		if (n_writers == 0)
			qbuffer = qtext_load_file(&qbuffer_size);
	}

	/*
	 * Get shared lock, load or reload the query text file if we must, and
	 * iterate over the hashtable entries.
	 *
	 * With a large hash table, we might be holding the lock rather longer
	 * than one could wish.  However, this only blocks creation of new hash
	 * table entries, and the larger the hash table the less likely that is to
	 * be needed.  So we can hope this is okay.  Perhaps someday we'll decide
	 * we need to partition the hash table to limit the time spent holding any
	 * one lock.
	 */
	LWLockAcquire(pgss->lock, LW_SHARED);

	if (showtext)
	{
		/*
		 * Here it is safe to examine extent and gc_count without taking the
		 * mutex.  Note that although other processes might change
		 * pgss->extent just after we look at it, the strings they then write
		 * into the file cannot yet be referenced in the hashtable, so we
		 * don't care whether we see them or not.
		 *
		 * If qtext_load_file fails, we just press on; we'll return NULL for
		 * every query text.
		 */
		if (qbuffer == NULL ||
			pgss->extent != extent ||
			pgss->gc_count != gc_count)
		{
			if (qbuffer)
				free(qbuffer);
			qbuffer = qtext_load_file(&qbuffer_size);
		}
	}

	hash_seq_init(&hash_seq, pgss_hash);
	while ((entry = hash_seq_search(&hash_seq)) != NULL)
	{
		Datum		values[PG_STAT_STATEMENTS_COLS];
		bool		nulls[PG_STAT_STATEMENTS_COLS];
		int			i = 0;		/* next column index to fill */
		Counters	tmp;
		double		stddev;
		int64		queryid = entry->key.queryid;

		memset(values, 0, sizeof(values));
		memset(nulls, 0, sizeof(nulls));

		values[i++] = ObjectIdGetDatum(entry->key.userid);
		values[i++] = ObjectIdGetDatum(entry->key.dbid);

		if (is_allowed_role || entry->key.userid == userid)
		{
			/* queryid column only exists from API 1.2 onward */
			if (api_version >= PGSS_V1_2)
				values[i++] = Int64GetDatumFast(queryid);

			if (showtext)
			{
				char	   *qstr = qtext_fetch(entry->query_offset,
											   entry->query_len,
											   qbuffer,
											   qbuffer_size);

				if (qstr)
				{
					char	   *enc;

					/* Convert stored text to the server encoding if needed */
					enc = pg_any_to_server(qstr,
										   entry->query_len,
										   entry->encoding);

					values[i++] = CStringGetTextDatum(enc);

					/* pg_any_to_server returns its input if no conversion */
					if (enc != qstr)
						pfree(enc);
				}
				else
				{
					/* Just return a null if we fail to find the text */
					nulls[i++] = true;
				}
			}
			else
			{
				/* Query text not requested */
				nulls[i++] = true;
			}
		}
		else
		{
			/* Don't show queryid */
			if (api_version >= PGSS_V1_2)
				nulls[i++] = true;

			/*
			 * Don't show query text, but hint as to the reason for not doing
			 * so if it was requested
			 */
			if (showtext)
				values[i++] = CStringGetTextDatum("<insufficient privilege>");
			else
				nulls[i++] = true;
		}

		/* copy counters to a local variable to keep locking time short */
		{
			volatile pgssEntry *e = (volatile pgssEntry *) entry;

			SpinLockAcquire(&e->mutex);
			tmp = e->counters;
			SpinLockRelease(&e->mutex);
		}

		/* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
		if (tmp.calls == 0)
			continue;

		values[i++] = Int64GetDatumFast(tmp.calls);
		values[i++] = Float8GetDatumFast(tmp.total_time);
		if (api_version >= PGSS_V1_3)
		{
			values[i++] = Float8GetDatumFast(tmp.min_time);
			values[i++] = Float8GetDatumFast(tmp.max_time);
			values[i++] = Float8GetDatumFast(tmp.mean_time);

			/*
			 * Note we are calculating the population variance here, not the
			 * sample variance, as we have data for the whole population, so
			 * Bessel's correction is not used, and we don't divide by
			 * tmp.calls - 1.
			 */
			if (tmp.calls > 1)
				stddev = sqrt(tmp.sum_var_time / tmp.calls);
			else
				stddev = 0.0;
			values[i++] = Float8GetDatumFast(stddev);
		}
		values[i++] = Int64GetDatumFast(tmp.rows);
		values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
		values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
		if (api_version >= PGSS_V1_1)
			values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied);
		values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
		values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
		values[i++] = Int64GetDatumFast(tmp.local_blks_read);
		if (api_version >= PGSS_V1_1)
			values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied);
		values[i++] = Int64GetDatumFast(tmp.local_blks_written);
		values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
		values[i++] = Int64GetDatumFast(tmp.temp_blks_written);
		if (api_version >= PGSS_V1_1)
		{
			values[i++] = Float8GetDatumFast(tmp.blk_read_time);
			values[i++] = Float8GetDatumFast(tmp.blk_write_time);
		}

		Assert(i == (api_version == PGSS_V1_0 ? PG_STAT_STATEMENTS_COLS_V1_0 :
					 api_version == PGSS_V1_1 ? PG_STAT_STATEMENTS_COLS_V1_1 :
					 api_version == PGSS_V1_2 ? PG_STAT_STATEMENTS_COLS_V1_2 :
					 api_version == PGSS_V1_3 ? PG_STAT_STATEMENTS_COLS_V1_3 :
					 -1 /* fail if you forget to update this assert */ ));

		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
	}

	/* clean up and return the tuplestore */
	LWLockRelease(pgss->lock);

	if (qbuffer)
		free(qbuffer);

	tuplestore_donestoring(tupstore);
}
1646
1647 /*
1648 * Estimate shared memory space needed.
1649 */
1650 static Size
pgss_memsize(void)1651 pgss_memsize(void)
1652 {
1653 Size size;
1654
1655 size = MAXALIGN(sizeof(pgssSharedState));
1656 size = add_size(size, hash_estimate_size(pgss_max, sizeof(pgssEntry)));
1657
1658 return size;
1659 }
1660
1661 /*
1662 * Allocate a new hashtable entry.
1663 * caller must hold an exclusive lock on pgss->lock
1664 *
1665 * "query" need not be null-terminated; we rely on query_len instead
1666 *
1667 * If "sticky" is true, make the new entry artificially sticky so that it will
1668 * probably still be there when the query finishes execution. We do this by
1669 * giving it a median usage value rather than the normal value. (Strictly
1670 * speaking, query strings are normalized on a best effort basis, though it
1671 * would be difficult to demonstrate this even under artificial conditions.)
1672 *
1673 * Note: despite needing exclusive lock, it's not an error for the target
1674 * entry to already exist. This is because pgss_store releases and
1675 * reacquires lock after failing to find a match; so someone else could
1676 * have made the entry while we waited to get exclusive lock.
1677 */
1678 static pgssEntry *
entry_alloc(pgssHashKey * key,Size query_offset,int query_len,int encoding,bool sticky)1679 entry_alloc(pgssHashKey *key, Size query_offset, int query_len, int encoding,
1680 bool sticky)
1681 {
1682 pgssEntry *entry;
1683 bool found;
1684
1685 /* Make space if needed */
1686 while (hash_get_num_entries(pgss_hash) >= pgss_max)
1687 entry_dealloc();
1688
1689 /* Find or create an entry with desired hash code */
1690 entry = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER, &found);
1691
1692 if (!found)
1693 {
1694 /* New entry, initialize it */
1695
1696 /* reset the statistics */
1697 memset(&entry->counters, 0, sizeof(Counters));
1698 /* set the appropriate initial usage count */
1699 entry->counters.usage = sticky ? pgss->cur_median_usage : USAGE_INIT;
1700 /* re-initialize the mutex each time ... we assume no one using it */
1701 SpinLockInit(&entry->mutex);
1702 /* ... and don't forget the query text metadata */
1703 Assert(query_len >= 0);
1704 entry->query_offset = query_offset;
1705 entry->query_len = query_len;
1706 entry->encoding = encoding;
1707 }
1708
1709 return entry;
1710 }
1711
1712 /*
1713 * qsort comparator for sorting into increasing usage order
1714 */
1715 static int
entry_cmp(const void * lhs,const void * rhs)1716 entry_cmp(const void *lhs, const void *rhs)
1717 {
1718 double l_usage = (*(pgssEntry *const *) lhs)->counters.usage;
1719 double r_usage = (*(pgssEntry *const *) rhs)->counters.usage;
1720
1721 if (l_usage < r_usage)
1722 return -1;
1723 else if (l_usage > r_usage)
1724 return +1;
1725 else
1726 return 0;
1727 }
1728
1729 /*
1730 * Deallocate least-used entries.
1731 *
1732 * Caller must hold an exclusive lock on pgss->lock.
1733 */
1734 static void
entry_dealloc(void)1735 entry_dealloc(void)
1736 {
1737 HASH_SEQ_STATUS hash_seq;
1738 pgssEntry **entries;
1739 pgssEntry *entry;
1740 int nvictims;
1741 int i;
1742 Size tottextlen;
1743 int nvalidtexts;
1744
1745 /*
1746 * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
1747 * While we're scanning the table, apply the decay factor to the usage
1748 * values, and update the mean query length.
1749 *
1750 * Note that the mean query length is almost immediately obsolete, since
1751 * we compute it before not after discarding the least-used entries.
1752 * Hopefully, that doesn't affect the mean too much; it doesn't seem worth
1753 * making two passes to get a more current result. Likewise, the new
1754 * cur_median_usage includes the entries we're about to zap.
1755 */
1756
1757 entries = palloc(hash_get_num_entries(pgss_hash) * sizeof(pgssEntry *));
1758
1759 i = 0;
1760 tottextlen = 0;
1761 nvalidtexts = 0;
1762
1763 hash_seq_init(&hash_seq, pgss_hash);
1764 while ((entry = hash_seq_search(&hash_seq)) != NULL)
1765 {
1766 entries[i++] = entry;
1767 /* "Sticky" entries get a different usage decay rate. */
1768 if (entry->counters.calls == 0)
1769 entry->counters.usage *= STICKY_DECREASE_FACTOR;
1770 else
1771 entry->counters.usage *= USAGE_DECREASE_FACTOR;
1772 /* In the mean length computation, ignore dropped texts. */
1773 if (entry->query_len >= 0)
1774 {
1775 tottextlen += entry->query_len + 1;
1776 nvalidtexts++;
1777 }
1778 }
1779
1780 /* Sort into increasing order by usage */
1781 qsort(entries, i, sizeof(pgssEntry *), entry_cmp);
1782
1783 /* Record the (approximate) median usage */
1784 if (i > 0)
1785 pgss->cur_median_usage = entries[i / 2]->counters.usage;
1786 /* Record the mean query length */
1787 if (nvalidtexts > 0)
1788 pgss->mean_query_len = tottextlen / nvalidtexts;
1789 else
1790 pgss->mean_query_len = ASSUMED_LENGTH_INIT;
1791
1792 /* Now zap an appropriate fraction of lowest-usage entries */
1793 nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
1794 nvictims = Min(nvictims, i);
1795
1796 for (i = 0; i < nvictims; i++)
1797 {
1798 hash_search(pgss_hash, &entries[i]->key, HASH_REMOVE, NULL);
1799 }
1800
1801 pfree(entries);
1802 }
1803
/*
 * Given a query string (not necessarily null-terminated), allocate a new
 * entry in the external query text file and store the string there.
 *
 * If successful, returns true, and stores the new entry's offset in the file
 * into *query_offset.  Also, if gc_count isn't NULL, *gc_count is set to the
 * number of garbage collections that have occurred so far.
 *
 * On failure, returns false.
 *
 * At least a shared lock on pgss->lock must be held by the caller, so as
 * to prevent a concurrent garbage collection.  Share-lock-holding callers
 * should pass a gc_count pointer to obtain the number of garbage collections,
 * so that they can recheck the count after obtaining exclusive lock to
 * detect whether a garbage collection occurred (and removed this entry).
 */
static bool
qtext_store(const char *query, int query_len,
			Size *query_offset, int *gc_count)
{
	Size		off;
	int			fd;

	/*
	 * We use a spinlock to protect extent/n_writers/gc_count, so that
	 * multiple processes may execute this function concurrently.
	 *
	 * Reserving our slice of the file (extent bump) under the spinlock and
	 * then writing outside it is what makes concurrent appends safe: each
	 * writer owns a disjoint byte range.
	 */
	{
		volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;

		SpinLockAcquire(&s->mutex);
		off = s->extent;
		s->extent += query_len + 1;	/* +1 for the terminating NUL */
		s->n_writers++;
		if (gc_count)
			*gc_count = s->gc_count;
		SpinLockRelease(&s->mutex);
	}

	*query_offset = off;

	/* Now write the data into the successfully-reserved part of the file */
	fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
	if (fd < 0)
		goto error;

	if (lseek(fd, off, SEEK_SET) != off)
		goto error;

	if (write(fd, query, query_len) != query_len)
		goto error;
	/* Text plus explicit NUL terminator, so readers can use it as a C string */
	if (write(fd, "\0", 1) != 1)
		goto error;

	CloseTransientFile(fd);

	/* Mark our write complete */
	{
		volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;

		SpinLockAcquire(&s->mutex);
		s->n_writers--;
		SpinLockRelease(&s->mutex);
	}

	return true;

error:
	ereport(LOG,
			(errcode_for_file_access(),
			 errmsg("could not write pg_stat_statement file \"%s\": %m",
					PGSS_TEXT_FILE)));

	if (fd >= 0)
		CloseTransientFile(fd);

	/* Mark our write complete (error path must balance n_writers too) */
	{
		volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;

		SpinLockAcquire(&s->mutex);
		s->n_writers--;
		SpinLockRelease(&s->mutex);
	}

	return false;
}
1891
/*
 * Read the external query text file into a malloc'd buffer.
 *
 * Returns NULL (without throwing an error) if unable to read, eg
 * file not there or insufficient memory.
 *
 * On success, the buffer size is also returned into *buffer_size.
 *
 * This can be called without any lock on pgss->lock, but in that case
 * the caller is responsible for verifying that the result is sane.
 *
 * Note: the buffer is allocated with malloc, not palloc, so the caller
 * must release it with free().
 */
static char *
qtext_load_file(Size *buffer_size)
{
	char	   *buf;
	int			fd;
	struct stat stat;			/* local name shadows stat(2); only fstat is used */
	Size		nread;

	fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDONLY | PG_BINARY);
	if (fd < 0)
	{
		/* Absent file is an expected case (nothing stored yet); stay quiet */
		if (errno != ENOENT)
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not read pg_stat_statement file \"%s\": %m",
							PGSS_TEXT_FILE)));
		return NULL;
	}

	/* Get file length */
	if (fstat(fd, &stat))
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not stat pg_stat_statement file \"%s\": %m",
						PGSS_TEXT_FILE)));
		CloseTransientFile(fd);
		return NULL;
	}

	/* Allocate buffer; beware that off_t might be wider than size_t */
	if (stat.st_size <= MaxAllocHugeSize)
		buf = (char *) malloc(stat.st_size);
	else
		buf = NULL;
	if (buf == NULL)
	{
		ereport(LOG,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory"),
				 errdetail("Could not allocate enough memory to read pg_stat_statement file \"%s\".",
						   PGSS_TEXT_FILE)));
		CloseTransientFile(fd);
		return NULL;
	}

	/*
	 * OK, slurp in the file.  Windows fails if we try to read more than
	 * INT_MAX bytes at once, and other platforms might not like that either,
	 * so read a very large file in 1GB segments.
	 *
	 * NOTE(review): nread (Size, unsigned) is compared against st_size
	 * (off_t, signed); the MaxAllocHugeSize check above should keep st_size
	 * non-negative and in range, but confirm on platforms with 32-bit Size.
	 */
	nread = 0;
	while (nread < stat.st_size)
	{
		int			toread = Min(1024 * 1024 * 1024, stat.st_size - nread);

		/*
		 * If we get a short read and errno doesn't get set, the reason is
		 * probably that garbage collection truncated the file since we did
		 * the fstat(), so we don't log a complaint --- but we don't return
		 * the data, either, since it's most likely corrupt due to concurrent
		 * writes from garbage collection.
		 */
		errno = 0;
		if (read(fd, buf + nread, toread) != toread)
		{
			if (errno)
				ereport(LOG,
						(errcode_for_file_access(),
						 errmsg("could not read pg_stat_statement file \"%s\": %m",
								PGSS_TEXT_FILE)));
			free(buf);
			CloseTransientFile(fd);
			return NULL;
		}
		nread += toread;
	}

	CloseTransientFile(fd);

	*buffer_size = nread;
	return buf;
}
1986
1987 /*
1988 * Locate a query text in the file image previously read by qtext_load_file().
1989 *
1990 * We validate the given offset/length, and return NULL if bogus. Otherwise,
1991 * the result points to a null-terminated string within the buffer.
1992 */
1993 static char *
qtext_fetch(Size query_offset,int query_len,char * buffer,Size buffer_size)1994 qtext_fetch(Size query_offset, int query_len,
1995 char *buffer, Size buffer_size)
1996 {
1997 /* File read failed? */
1998 if (buffer == NULL)
1999 return NULL;
2000 /* Bogus offset/length? */
2001 if (query_len < 0 ||
2002 query_offset + query_len >= buffer_size)
2003 return NULL;
2004 /* As a further sanity check, make sure there's a trailing null */
2005 if (buffer[query_offset + query_len] != '\0')
2006 return NULL;
2007 /* Looks OK */
2008 return buffer + query_offset;
2009 }
2010
2011 /*
2012 * Do we need to garbage-collect the external query text file?
2013 *
2014 * Caller should hold at least a shared lock on pgss->lock.
2015 */
2016 static bool
need_gc_qtexts(void)2017 need_gc_qtexts(void)
2018 {
2019 Size extent;
2020
2021 /* Read shared extent pointer */
2022 {
2023 volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
2024
2025 SpinLockAcquire(&s->mutex);
2026 extent = s->extent;
2027 SpinLockRelease(&s->mutex);
2028 }
2029
2030 /* Don't proceed if file does not exceed 512 bytes per possible entry */
2031 if (extent < 512 * pgss_max)
2032 return false;
2033
2034 /*
2035 * Don't proceed if file is less than about 50% bloat. Nothing can or
2036 * should be done in the event of unusually large query texts accounting
2037 * for file's large size. We go to the trouble of maintaining the mean
2038 * query length in order to prevent garbage collection from thrashing
2039 * uselessly.
2040 */
2041 if (extent < pgss->mean_query_len * pgss_max * 2)
2042 return false;
2043
2044 return true;
2045 }
2046
2047 /*
2048 * Garbage-collect orphaned query texts in external file.
2049 *
2050 * This won't be called often in the typical case, since it's likely that
2051 * there won't be too much churn, and besides, a similar compaction process
2052 * occurs when serializing to disk at shutdown or as part of resetting.
2053 * Despite this, it seems prudent to plan for the edge case where the file
2054 * becomes unreasonably large, with no other method of compaction likely to
2055 * occur in the foreseeable future.
2056 *
2057 * The caller must hold an exclusive lock on pgss->lock.
2058 *
2059 * At the first sign of trouble we unlink the query text file to get a clean
2060 * slate (although existing statistics are retained), rather than risk
2061 * thrashing by allowing the same problem case to recur indefinitely.
2062 */
static void
gc_qtexts(void)
{
	char	   *qbuffer;
	Size		qbuffer_size;
	FILE	   *qfile = NULL;
	HASH_SEQ_STATUS hash_seq;
	pgssEntry  *entry;
	Size		extent;
	int			nentries;

	/*
	 * When called from pgss_store, some other session might have proceeded
	 * with garbage collection in the no-lock-held interim of lock strength
	 * escalation.  Check once more that this is actually necessary.
	 */
	if (!need_gc_qtexts())
		return;

	/*
	 * Load the old texts file.  If we fail (out of memory, for instance),
	 * invalidate query texts.  Hopefully this is rare.  It might seem better
	 * to leave things alone on an OOM failure, but the problem is that the
	 * file is only going to get bigger; hoping for a future non-OOM result is
	 * risky and can easily lead to complete denial of service.
	 */
	qbuffer = qtext_load_file(&qbuffer_size);
	if (qbuffer == NULL)
		goto gc_fail;

	/*
	 * We overwrite the query texts file in place, so as to reduce the risk of
	 * an out-of-disk-space failure.  Since the file is guaranteed not to get
	 * larger, this should always work on traditional filesystems; though we
	 * could still lose on copy-on-write filesystems.
	 */
	qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
	if (qfile == NULL)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not write pg_stat_statement file \"%s\": %m",
						PGSS_TEXT_FILE)));
		goto gc_fail;
	}

	extent = 0;
	nentries = 0;

	/*
	 * Copy each live entry's text from the in-memory image to the start of
	 * the rewritten file, compacting out texts not referenced by any entry.
	 */
	hash_seq_init(&hash_seq, pgss_hash);
	while ((entry = hash_seq_search(&hash_seq)) != NULL)
	{
		int			query_len = entry->query_len;
		char	   *qry = qtext_fetch(entry->query_offset,
									  query_len,
									  qbuffer,
									  qbuffer_size);

		if (qry == NULL)
		{
			/* Trouble ... drop the text, but keep the entry's stats */
			entry->query_offset = 0;
			entry->query_len = -1;
			/* entry will not be counted in mean query length computation */
			continue;
		}

		/* write the text plus its trailing null byte */
		if (fwrite(qry, 1, query_len + 1, qfile) != query_len + 1)
		{
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not write pg_stat_statement file \"%s\": %m",
							PGSS_TEXT_FILE)));
			/* must terminate the seq scan before bailing out of the loop */
			hash_seq_term(&hash_seq);
			goto gc_fail;
		}

		entry->query_offset = extent;
		extent += query_len + 1;
		nentries++;
	}

	/*
	 * Truncate away any now-unused space.  If this fails for some odd reason,
	 * we log it, but there's no need to fail.
	 */
	if (ftruncate(fileno(qfile), extent) != 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not truncate pg_stat_statement file \"%s\": %m",
						PGSS_TEXT_FILE)));

	if (FreeFile(qfile))
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not write pg_stat_statement file \"%s\": %m",
						PGSS_TEXT_FILE)));
		/* FreeFile already closed it; clear qfile so gc_fail won't re-close */
		qfile = NULL;
		goto gc_fail;
	}

	elog(DEBUG1, "pgss gc of queries file shrunk size from %zu to %zu",
		 pgss->extent, extent);

	/* Reset the shared extent pointer */
	pgss->extent = extent;

	/*
	 * Also update the mean query length, to be sure that need_gc_qtexts()
	 * won't still think we have a problem.
	 */
	if (nentries > 0)
		pgss->mean_query_len = extent / nentries;
	else
		pgss->mean_query_len = ASSUMED_LENGTH_INIT;

	free(qbuffer);

	/*
	 * OK, count a garbage collection cycle.  (Note: even though we have
	 * exclusive lock on pgss->lock, we must take pgss->mutex for this, since
	 * other processes may examine gc_count while holding only the mutex.
	 * Also, we have to advance the count *after* we've rewritten the file,
	 * else other processes might not realize they read a stale file.)
	 */
	record_gc_qtexts();

	return;

gc_fail:
	/* clean up resources */
	if (qfile)
		FreeFile(qfile);
	if (qbuffer)
		free(qbuffer);

	/*
	 * Since the contents of the external file are now uncertain, mark all
	 * hashtable entries as having invalid texts.
	 */
	hash_seq_init(&hash_seq, pgss_hash);
	while ((entry = hash_seq_search(&hash_seq)) != NULL)
	{
		entry->query_offset = 0;
		entry->query_len = -1;
	}

	/*
	 * Destroy the query text file and create a new, empty one
	 */
	(void) unlink(PGSS_TEXT_FILE);
	qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
	if (qfile == NULL)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not write new pg_stat_statement file \"%s\": %m",
						PGSS_TEXT_FILE)));
	else
		FreeFile(qfile);

	/* Reset the shared extent pointer */
	pgss->extent = 0;

	/* Reset mean_query_len to match the new state */
	pgss->mean_query_len = ASSUMED_LENGTH_INIT;

	/*
	 * Bump the GC count even though we failed.
	 *
	 * This is needed to make concurrent readers of file without any lock on
	 * pgss->lock notice existence of new version of file.  Once readers
	 * subsequently observe a change in GC count with pgss->lock held, that
	 * forces a safe reopen of file.  Writers also require that we bump here,
	 * of course.  (As required by locking protocol, readers and writers don't
	 * trust earlier file contents until gc_count is found unchanged after
	 * pgss->lock acquired in shared or exclusive mode respectively.)
	 */
	record_gc_qtexts();
}
2243
2244 /*
2245 * Release all entries.
2246 */
2247 static void
entry_reset(void)2248 entry_reset(void)
2249 {
2250 HASH_SEQ_STATUS hash_seq;
2251 pgssEntry *entry;
2252 FILE *qfile;
2253
2254 LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
2255
2256 hash_seq_init(&hash_seq, pgss_hash);
2257 while ((entry = hash_seq_search(&hash_seq)) != NULL)
2258 {
2259 hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
2260 }
2261
2262 /*
2263 * Write new empty query file, perhaps even creating a new one to recover
2264 * if the file was missing.
2265 */
2266 qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
2267 if (qfile == NULL)
2268 {
2269 ereport(LOG,
2270 (errcode_for_file_access(),
2271 errmsg("could not create pg_stat_statement file \"%s\": %m",
2272 PGSS_TEXT_FILE)));
2273 goto done;
2274 }
2275
2276 /* If ftruncate fails, log it, but it's not a fatal problem */
2277 if (ftruncate(fileno(qfile), 0) != 0)
2278 ereport(LOG,
2279 (errcode_for_file_access(),
2280 errmsg("could not truncate pg_stat_statement file \"%s\": %m",
2281 PGSS_TEXT_FILE)));
2282
2283 FreeFile(qfile);
2284
2285 done:
2286 pgss->extent = 0;
2287 /* This counts as a query text garbage collection for our purposes */
2288 record_gc_qtexts();
2289
2290 LWLockRelease(pgss->lock);
2291 }
2292
2293 /*
2294 * AppendJumble: Append a value that is substantive in a given query to
2295 * the current jumble.
2296 */
2297 static void
AppendJumble(pgssJumbleState * jstate,const unsigned char * item,Size size)2298 AppendJumble(pgssJumbleState *jstate, const unsigned char *item, Size size)
2299 {
2300 unsigned char *jumble = jstate->jumble;
2301 Size jumble_len = jstate->jumble_len;
2302
2303 /*
2304 * Whenever the jumble buffer is full, we hash the current contents and
2305 * reset the buffer to contain just that hash value, thus relying on the
2306 * hash to summarize everything so far.
2307 */
2308 while (size > 0)
2309 {
2310 Size part_size;
2311
2312 if (jumble_len >= JUMBLE_SIZE)
2313 {
2314 uint64 start_hash;
2315
2316 start_hash = DatumGetUInt64(hash_any_extended(jumble,
2317 JUMBLE_SIZE, 0));
2318 memcpy(jumble, &start_hash, sizeof(start_hash));
2319 jumble_len = sizeof(start_hash);
2320 }
2321 part_size = Min(size, JUMBLE_SIZE - jumble_len);
2322 memcpy(jumble + jumble_len, item, part_size);
2323 jumble_len += part_size;
2324 item += part_size;
2325 size -= part_size;
2326 }
2327 jstate->jumble_len = jumble_len;
2328 }
2329
2330 /*
2331 * Wrappers around AppendJumble to encapsulate details of serialization
2332 * of individual local variable elements.
2333 */
2334 #define APP_JUMB(item) \
2335 AppendJumble(jstate, (const unsigned char *) &(item), sizeof(item))
2336 #define APP_JUMB_STRING(str) \
2337 AppendJumble(jstate, (const unsigned char *) (str), strlen(str) + 1)
2338
2339 /*
2340 * JumbleQuery: Selectively serialize the query tree, appending significant
2341 * data to the "query jumble" while ignoring nonsignificant data.
2342 *
2343 * Rule of thumb for what to include is that we should ignore anything not
2344 * semantically significant (such as alias names) as well as anything that can
2345 * be deduced from child nodes (else we'd just be double-hashing that piece
2346 * of information).
2347 */
static void
JumbleQuery(pgssJumbleState *jstate, Query *query)
{
	/*
	 * NB: the order of the appends below is significant -- it feeds the
	 * jumble hash directly, so reordering would change computed queryIds.
	 * Utility statements are fingerprinted elsewhere, hence the Assert.
	 */
	Assert(IsA(query, Query));
	Assert(query->utilityStmt == NULL);

	APP_JUMB(query->commandType);
	/* resultRelation is usually predictable from commandType */
	JumbleExpr(jstate, (Node *) query->cteList);
	JumbleRangeTable(jstate, query->rtable);
	JumbleExpr(jstate, (Node *) query->jointree);
	JumbleExpr(jstate, (Node *) query->targetList);
	JumbleExpr(jstate, (Node *) query->onConflict);
	JumbleExpr(jstate, (Node *) query->returningList);
	JumbleExpr(jstate, (Node *) query->groupClause);
	JumbleExpr(jstate, (Node *) query->groupingSets);
	JumbleExpr(jstate, query->havingQual);
	JumbleExpr(jstate, (Node *) query->windowClause);
	JumbleExpr(jstate, (Node *) query->distinctClause);
	JumbleExpr(jstate, (Node *) query->sortClause);
	JumbleExpr(jstate, query->limitOffset);
	JumbleExpr(jstate, query->limitCount);
	/* we ignore rowMarks */
	JumbleExpr(jstate, query->setOperations);
}
2373
2374 /*
2375 * Jumble a range table
2376 */
/*
 * Jumble a range table
 *
 * For each RTE we append its kind plus whichever fields identify the
 * underlying object: relation OID, subquery contents, join type, etc.
 * Alias names and column lists are deliberately ignored as insignificant.
 */
static void
JumbleRangeTable(pgssJumbleState *jstate, List *rtable)
{
	ListCell   *lc;

	foreach(lc, rtable)
	{
		RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc);

		APP_JUMB(rte->rtekind);
		switch (rte->rtekind)
		{
			case RTE_RELATION:
				APP_JUMB(rte->relid);
				JumbleExpr(jstate, (Node *) rte->tablesample);
				break;
			case RTE_SUBQUERY:
				/* recurse into the sub-Query's full jumble */
				JumbleQuery(jstate, rte->subquery);
				break;
			case RTE_JOIN:
				APP_JUMB(rte->jointype);
				break;
			case RTE_FUNCTION:
				JumbleExpr(jstate, (Node *) rte->functions);
				break;
			case RTE_TABLEFUNC:
				JumbleExpr(jstate, (Node *) rte->tablefunc);
				break;
			case RTE_VALUES:
				JumbleExpr(jstate, (Node *) rte->values_lists);
				break;
			case RTE_CTE:

				/*
				 * Depending on the CTE name here isn't ideal, but it's the
				 * only info we have to identify the referenced WITH item.
				 */
				APP_JUMB_STRING(rte->ctename);
				APP_JUMB(rte->ctelevelsup);
				break;
			case RTE_NAMEDTUPLESTORE:
				APP_JUMB_STRING(rte->enrname);
				break;
			default:
				elog(ERROR, "unrecognized RTE kind: %d", (int) rte->rtekind);
				break;
		}
	}
}
2426
2427 /*
2428 * Jumble an expression tree
2429 *
2430 * In general this function should handle all the same node types that
2431 * expression_tree_walker() does, and therefore it's coded to be as parallel
2432 * to that function as possible. However, since we are only invoked on
2433 * queries immediately post-parse-analysis, we need not handle node types
2434 * that only appear in planning.
2435 *
2436 * Note: the reason we don't simply use expression_tree_walker() is that the
2437 * point of that function is to support tree walkers that don't care about
2438 * most tree node types, but here we care about all types. We should complain
2439 * about any unrecognized node type.
2440 */
static void
JumbleExpr(pgssJumbleState *jstate, Node *node)
{
	ListCell   *temp;

	/* NULL subtrees contribute nothing to the jumble */
	if (node == NULL)
		return;

	/* Guard against stack overflow due to overly complex expressions */
	check_stack_depth();

	/*
	 * We always emit the node's NodeTag, then any additional fields that are
	 * considered significant, and then we recurse to any child nodes.
	 */
	APP_JUMB(node->type);

	switch (nodeTag(node))
	{
		case T_Var:
			{
				Var		   *var = (Var *) node;

				/* identify the referenced column, but not its collation */
				APP_JUMB(var->varno);
				APP_JUMB(var->varattno);
				APP_JUMB(var->varlevelsup);
			}
			break;
		case T_Const:
			{
				Const	   *c = (Const *) node;

				/* We jumble only the constant's type, not its value */
				APP_JUMB(c->consttype);
				/* Also, record its parse location for query normalization */
				RecordConstLocation(jstate, c->location);
			}
			break;
		case T_Param:
			{
				Param	   *p = (Param *) node;

				APP_JUMB(p->paramkind);
				APP_JUMB(p->paramid);
				APP_JUMB(p->paramtype);
				/* Also, track the highest external Param id */
				if (p->paramkind == PARAM_EXTERN &&
					p->paramid > jstate->highest_extern_param_id)
					jstate->highest_extern_param_id = p->paramid;
			}
			break;
		case T_Aggref:
			{
				Aggref	   *expr = (Aggref *) node;

				APP_JUMB(expr->aggfnoid);
				JumbleExpr(jstate, (Node *) expr->aggdirectargs);
				JumbleExpr(jstate, (Node *) expr->args);
				JumbleExpr(jstate, (Node *) expr->aggorder);
				JumbleExpr(jstate, (Node *) expr->aggdistinct);
				JumbleExpr(jstate, (Node *) expr->aggfilter);
			}
			break;
		case T_GroupingFunc:
			{
				GroupingFunc *grpnode = (GroupingFunc *) node;

				JumbleExpr(jstate, (Node *) grpnode->refs);
			}
			break;
		case T_WindowFunc:
			{
				WindowFunc *expr = (WindowFunc *) node;

				APP_JUMB(expr->winfnoid);
				APP_JUMB(expr->winref);
				JumbleExpr(jstate, (Node *) expr->args);
				JumbleExpr(jstate, (Node *) expr->aggfilter);
			}
			break;
		case T_ArrayRef:
			{
				ArrayRef   *aref = (ArrayRef *) node;

				JumbleExpr(jstate, (Node *) aref->refupperindexpr);
				JumbleExpr(jstate, (Node *) aref->reflowerindexpr);
				JumbleExpr(jstate, (Node *) aref->refexpr);
				JumbleExpr(jstate, (Node *) aref->refassgnexpr);
			}
			break;
		case T_FuncExpr:
			{
				FuncExpr   *expr = (FuncExpr *) node;

				APP_JUMB(expr->funcid);
				JumbleExpr(jstate, (Node *) expr->args);
			}
			break;
		case T_NamedArgExpr:
			{
				NamedArgExpr *nae = (NamedArgExpr *) node;

				APP_JUMB(nae->argnumber);
				JumbleExpr(jstate, (Node *) nae->arg);
			}
			break;
		case T_OpExpr:
		case T_DistinctExpr:	/* struct-equivalent to OpExpr */
		case T_NullIfExpr:		/* struct-equivalent to OpExpr */
			{
				OpExpr	   *expr = (OpExpr *) node;

				APP_JUMB(expr->opno);
				JumbleExpr(jstate, (Node *) expr->args);
			}
			break;
		case T_ScalarArrayOpExpr:
			{
				ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) node;

				APP_JUMB(expr->opno);
				APP_JUMB(expr->useOr);
				JumbleExpr(jstate, (Node *) expr->args);
			}
			break;
		case T_BoolExpr:
			{
				BoolExpr   *expr = (BoolExpr *) node;

				APP_JUMB(expr->boolop);
				JumbleExpr(jstate, (Node *) expr->args);
			}
			break;
		case T_SubLink:
			{
				SubLink    *sublink = (SubLink *) node;

				APP_JUMB(sublink->subLinkType);
				APP_JUMB(sublink->subLinkId);
				JumbleExpr(jstate, (Node *) sublink->testexpr);
				JumbleQuery(jstate, castNode(Query, sublink->subselect));
			}
			break;
		case T_FieldSelect:
			{
				FieldSelect *fs = (FieldSelect *) node;

				APP_JUMB(fs->fieldnum);
				JumbleExpr(jstate, (Node *) fs->arg);
			}
			break;
		case T_FieldStore:
			{
				FieldStore *fstore = (FieldStore *) node;

				JumbleExpr(jstate, (Node *) fstore->arg);
				JumbleExpr(jstate, (Node *) fstore->newvals);
			}
			break;
		case T_RelabelType:
			{
				RelabelType *rt = (RelabelType *) node;

				APP_JUMB(rt->resulttype);
				JumbleExpr(jstate, (Node *) rt->arg);
			}
			break;
		case T_CoerceViaIO:
			{
				CoerceViaIO *cio = (CoerceViaIO *) node;

				APP_JUMB(cio->resulttype);
				JumbleExpr(jstate, (Node *) cio->arg);
			}
			break;
		case T_ArrayCoerceExpr:
			{
				ArrayCoerceExpr *acexpr = (ArrayCoerceExpr *) node;

				APP_JUMB(acexpr->resulttype);
				JumbleExpr(jstate, (Node *) acexpr->arg);
				JumbleExpr(jstate, (Node *) acexpr->elemexpr);
			}
			break;
		case T_ConvertRowtypeExpr:
			{
				ConvertRowtypeExpr *crexpr = (ConvertRowtypeExpr *) node;

				APP_JUMB(crexpr->resulttype);
				JumbleExpr(jstate, (Node *) crexpr->arg);
			}
			break;
		case T_CollateExpr:
			{
				CollateExpr *ce = (CollateExpr *) node;

				/* an explicit COLLATE clause is semantically significant */
				APP_JUMB(ce->collOid);
				JumbleExpr(jstate, (Node *) ce->arg);
			}
			break;
		case T_CaseExpr:
			{
				CaseExpr   *caseexpr = (CaseExpr *) node;

				JumbleExpr(jstate, (Node *) caseexpr->arg);
				foreach(temp, caseexpr->args)
				{
					CaseWhen   *when = lfirst_node(CaseWhen, temp);

					JumbleExpr(jstate, (Node *) when->expr);
					JumbleExpr(jstate, (Node *) when->result);
				}
				JumbleExpr(jstate, (Node *) caseexpr->defresult);
			}
			break;
		case T_CaseTestExpr:
			{
				CaseTestExpr *ct = (CaseTestExpr *) node;

				APP_JUMB(ct->typeId);
			}
			break;
		case T_ArrayExpr:
			JumbleExpr(jstate, (Node *) ((ArrayExpr *) node)->elements);
			break;
		case T_RowExpr:
			JumbleExpr(jstate, (Node *) ((RowExpr *) node)->args);
			break;
		case T_RowCompareExpr:
			{
				RowCompareExpr *rcexpr = (RowCompareExpr *) node;

				APP_JUMB(rcexpr->rctype);
				JumbleExpr(jstate, (Node *) rcexpr->largs);
				JumbleExpr(jstate, (Node *) rcexpr->rargs);
			}
			break;
		case T_CoalesceExpr:
			JumbleExpr(jstate, (Node *) ((CoalesceExpr *) node)->args);
			break;
		case T_MinMaxExpr:
			{
				MinMaxExpr *mmexpr = (MinMaxExpr *) node;

				APP_JUMB(mmexpr->op);
				JumbleExpr(jstate, (Node *) mmexpr->args);
			}
			break;
		case T_SQLValueFunction:
			{
				SQLValueFunction *svf = (SQLValueFunction *) node;

				APP_JUMB(svf->op);
				/* type is fully determined by op */
				APP_JUMB(svf->typmod);
			}
			break;
		case T_XmlExpr:
			{
				XmlExpr    *xexpr = (XmlExpr *) node;

				APP_JUMB(xexpr->op);
				JumbleExpr(jstate, (Node *) xexpr->named_args);
				JumbleExpr(jstate, (Node *) xexpr->args);
			}
			break;
		case T_NullTest:
			{
				NullTest   *nt = (NullTest *) node;

				APP_JUMB(nt->nulltesttype);
				JumbleExpr(jstate, (Node *) nt->arg);
			}
			break;
		case T_BooleanTest:
			{
				BooleanTest *bt = (BooleanTest *) node;

				APP_JUMB(bt->booltesttype);
				JumbleExpr(jstate, (Node *) bt->arg);
			}
			break;
		case T_CoerceToDomain:
			{
				CoerceToDomain *cd = (CoerceToDomain *) node;

				APP_JUMB(cd->resulttype);
				JumbleExpr(jstate, (Node *) cd->arg);
			}
			break;
		case T_CoerceToDomainValue:
			{
				CoerceToDomainValue *cdv = (CoerceToDomainValue *) node;

				APP_JUMB(cdv->typeId);
			}
			break;
		case T_SetToDefault:
			{
				SetToDefault *sd = (SetToDefault *) node;

				APP_JUMB(sd->typeId);
			}
			break;
		case T_CurrentOfExpr:
			{
				CurrentOfExpr *ce = (CurrentOfExpr *) node;

				APP_JUMB(ce->cvarno);
				/* cursor_name can be NULL when a parameter is used instead */
				if (ce->cursor_name)
					APP_JUMB_STRING(ce->cursor_name);
				APP_JUMB(ce->cursor_param);
			}
			break;
		case T_NextValueExpr:
			{
				NextValueExpr *nve = (NextValueExpr *) node;

				APP_JUMB(nve->seqid);
				APP_JUMB(nve->typeId);
			}
			break;
		case T_InferenceElem:
			{
				InferenceElem *ie = (InferenceElem *) node;

				APP_JUMB(ie->infercollid);
				APP_JUMB(ie->inferopclass);
				JumbleExpr(jstate, ie->expr);
			}
			break;
		case T_TargetEntry:
			{
				TargetEntry *tle = (TargetEntry *) node;

				APP_JUMB(tle->resno);
				APP_JUMB(tle->ressortgroupref);
				JumbleExpr(jstate, (Node *) tle->expr);
			}
			break;
		case T_RangeTblRef:
			{
				RangeTblRef *rtr = (RangeTblRef *) node;

				APP_JUMB(rtr->rtindex);
			}
			break;
		case T_JoinExpr:
			{
				JoinExpr   *join = (JoinExpr *) node;

				APP_JUMB(join->jointype);
				APP_JUMB(join->isNatural);
				APP_JUMB(join->rtindex);
				JumbleExpr(jstate, join->larg);
				JumbleExpr(jstate, join->rarg);
				JumbleExpr(jstate, join->quals);
			}
			break;
		case T_FromExpr:
			{
				FromExpr   *from = (FromExpr *) node;

				JumbleExpr(jstate, (Node *) from->fromlist);
				JumbleExpr(jstate, from->quals);
			}
			break;
		case T_OnConflictExpr:
			{
				OnConflictExpr *conf = (OnConflictExpr *) node;

				APP_JUMB(conf->action);
				JumbleExpr(jstate, (Node *) conf->arbiterElems);
				JumbleExpr(jstate, conf->arbiterWhere);
				JumbleExpr(jstate, (Node *) conf->onConflictSet);
				JumbleExpr(jstate, conf->onConflictWhere);
				APP_JUMB(conf->constraint);
				APP_JUMB(conf->exclRelIndex);
				JumbleExpr(jstate, (Node *) conf->exclRelTlist);
			}
			break;
		case T_List:
			foreach(temp, (List *) node)
			{
				JumbleExpr(jstate, (Node *) lfirst(temp));
			}
			break;
		case T_IntList:
			foreach(temp, (List *) node)
			{
				APP_JUMB(lfirst_int(temp));
			}
			break;
		case T_SortGroupClause:
			{
				SortGroupClause *sgc = (SortGroupClause *) node;

				APP_JUMB(sgc->tleSortGroupRef);
				APP_JUMB(sgc->eqop);
				APP_JUMB(sgc->sortop);
				APP_JUMB(sgc->nulls_first);
			}
			break;
		case T_GroupingSet:
			{
				GroupingSet *gsnode = (GroupingSet *) node;

				JumbleExpr(jstate, (Node *) gsnode->content);
			}
			break;
		case T_WindowClause:
			{
				WindowClause *wc = (WindowClause *) node;

				APP_JUMB(wc->winref);
				APP_JUMB(wc->frameOptions);
				JumbleExpr(jstate, (Node *) wc->partitionClause);
				JumbleExpr(jstate, (Node *) wc->orderClause);
				JumbleExpr(jstate, wc->startOffset);
				JumbleExpr(jstate, wc->endOffset);
			}
			break;
		case T_CommonTableExpr:
			{
				CommonTableExpr *cte = (CommonTableExpr *) node;

				/* we store the string name because RTE_CTE RTEs need it */
				APP_JUMB_STRING(cte->ctename);
				JumbleQuery(jstate, castNode(Query, cte->ctequery));
			}
			break;
		case T_SetOperationStmt:
			{
				SetOperationStmt *setop = (SetOperationStmt *) node;

				APP_JUMB(setop->op);
				APP_JUMB(setop->all);
				JumbleExpr(jstate, setop->larg);
				JumbleExpr(jstate, setop->rarg);
			}
			break;
		case T_RangeTblFunction:
			{
				RangeTblFunction *rtfunc = (RangeTblFunction *) node;

				JumbleExpr(jstate, rtfunc->funcexpr);
			}
			break;
		case T_TableFunc:
			{
				TableFunc  *tablefunc = (TableFunc *) node;

				JumbleExpr(jstate, tablefunc->docexpr);
				JumbleExpr(jstate, tablefunc->rowexpr);
				JumbleExpr(jstate, (Node *) tablefunc->colexprs);
			}
			break;
		case T_TableSampleClause:
			{
				TableSampleClause *tsc = (TableSampleClause *) node;

				APP_JUMB(tsc->tsmhandler);
				JumbleExpr(jstate, (Node *) tsc->args);
				JumbleExpr(jstate, (Node *) tsc->repeatable);
			}
			break;
		default:
			/* Only a warning, since we can stumble along anyway */
			elog(WARNING, "unrecognized node type: %d",
				 (int) nodeTag(node));
			break;
	}
}
2914
2915 /*
2916 * Record location of constant within query string of query tree
2917 * that is currently being walked.
2918 */
2919 static void
RecordConstLocation(pgssJumbleState * jstate,int location)2920 RecordConstLocation(pgssJumbleState *jstate, int location)
2921 {
2922 /* -1 indicates unknown or undefined location */
2923 if (location >= 0)
2924 {
2925 /* enlarge array if needed */
2926 if (jstate->clocations_count >= jstate->clocations_buf_size)
2927 {
2928 jstate->clocations_buf_size *= 2;
2929 jstate->clocations = (pgssLocationLen *)
2930 repalloc(jstate->clocations,
2931 jstate->clocations_buf_size *
2932 sizeof(pgssLocationLen));
2933 }
2934 jstate->clocations[jstate->clocations_count].location = location;
2935 /* initialize lengths to -1 to simplify fill_in_constant_lengths */
2936 jstate->clocations[jstate->clocations_count].length = -1;
2937 jstate->clocations_count++;
2938 }
2939 }
2940
2941 /*
2942 * Generate a normalized version of the query string that will be used to
2943 * represent all similar queries.
2944 *
2945 * Note that the normalized representation may well vary depending on
2946 * just which "equivalent" query is used to create the hashtable entry.
2947 * We assume this is OK.
2948 *
2949 * If query_loc > 0, then "query" has been advanced by that much compared to
2950 * the original string start, so we need to translate the provided locations
2951 * to compensate. (This lets us avoid re-scanning statements before the one
2952 * of interest, so it's worth doing.)
2953 *
2954 * *query_len_p contains the input string length, and is updated with
2955 * the result string length on exit. The resulting string might be longer
2956 * or shorter depending on what happens with replacement of constants.
2957 *
2958 * Returns a palloc'd string.
2959 */
static char *
generate_normalized_query(pgssJumbleState *jstate, const char *query,
						  int query_loc, int *query_len_p, int encoding)
{
	char	   *norm_query;
	int			query_len = *query_len_p;
	int			i,
				norm_query_buflen,	/* Space allowed for norm_query */
				len_to_wrt,		/* Length (in bytes) to write */
				quer_loc = 0,	/* Source query byte location */
				n_quer_loc = 0, /* Normalized query byte location */
				last_off = 0,	/* Offset from start for previous tok */
				last_tok_len = 0;	/* Length (in bytes) of that tok */

	/*
	 * Get constants' lengths (core system only gives us locations).  Note
	 * this also ensures the items are sorted by location.
	 */
	fill_in_constant_lengths(jstate, query, query_loc);

	/*
	 * Allow for $n symbols to be longer than the constants they replace.
	 * Constants must take at least one byte in text form, while a $n symbol
	 * certainly isn't more than 11 bytes, even if n reaches INT_MAX.  We
	 * could refine that limit based on the max value of n for the current
	 * query, but it hardly seems worth any extra effort to do so.
	 */
	norm_query_buflen = query_len + jstate->clocations_count * 10;

	/* Allocate result buffer (+1 for the trailing null added at the end) */
	norm_query = palloc(norm_query_buflen + 1);

	for (i = 0; i < jstate->clocations_count; i++)
	{
		int			off,		/* Offset from start for cur tok */
					tok_len;	/* Length (in bytes) of that tok */

		off = jstate->clocations[i].location;
		/* Adjust recorded location if we're dealing with partial string */
		off -= query_loc;

		tok_len = jstate->clocations[i].length;

		/* fill_in_constant_lengths leaves duplicates' lengths at -1 */
		if (tok_len < 0)
			continue;			/* ignore any duplicates */

		/*
		 * Copy next chunk (what precedes the next constant).  Note we must
		 * start after the previous constant, not merely after last_off,
		 * hence the subtraction of last_tok_len.
		 */
		len_to_wrt = off - last_off;
		len_to_wrt -= last_tok_len;

		Assert(len_to_wrt >= 0);
		memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
		n_quer_loc += len_to_wrt;

		/*
		 * And insert a param symbol in place of the constant token; $n
		 * numbering starts after any real external parameters the query
		 * already had, to avoid colliding with them.
		 */
		n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d",
							  i + 1 + jstate->highest_extern_param_id);

		quer_loc = off + tok_len;
		last_off = off;
		last_tok_len = tok_len;
	}

	/*
	 * We've copied up until the last ignorable constant.  Copy over the
	 * remaining bytes of the original query string.
	 */
	len_to_wrt = query_len - quer_loc;

	Assert(len_to_wrt >= 0);
	memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
	n_quer_loc += len_to_wrt;

	Assert(n_quer_loc <= norm_query_buflen);
	norm_query[n_quer_loc] = '\0';

	*query_len_p = n_quer_loc;
	return norm_query;
}
3039
3040 /*
3041 * Given a valid SQL string and an array of constant-location records,
3042 * fill in the textual lengths of those constants.
3043 *
3044 * The constants may use any allowed constant syntax, such as float literals,
3045 * bit-strings, single-quoted strings and dollar-quoted strings. This is
3046 * accomplished by using the public API for the core scanner.
3047 *
3048 * It is the caller's job to ensure that the string is a valid SQL statement
3049 * with constants at the indicated locations. Since in practice the string
3050 * has already been parsed, and the locations that the caller provides will
3051 * have originated from within the authoritative parser, this should not be
3052 * a problem.
3053 *
3054 * Duplicate constant pointers are possible, and will have their lengths
3055 * marked as '-1', so that they are later ignored. (Actually, we assume the
3056 * lengths were initialized as -1 to start with, and don't change them here.)
3057 *
3058 * If query_loc > 0, then "query" has been advanced by that much compared to
3059 * the original string start, so we need to translate the provided locations
3060 * to compensate. (This lets us avoid re-scanning statements before the one
3061 * of interest, so it's worth doing.)
3062 *
3063 * N.B. There is an assumption that a '-' character at a Const location begins
3064 * a negative numeric constant. This precludes there ever being another
3065 * reason for a constant to start with a '-'.
3066 */
static void
fill_in_constant_lengths(pgssJumbleState *jstate, const char *query,
						 int query_loc)
{
	pgssLocationLen *locs;
	core_yyscan_t yyscanner;
	core_yy_extra_type yyextra;
	core_YYSTYPE yylval;
	YYLTYPE		yylloc;
	int			last_loc = -1;	/* location of previous constant handled */
	int			i;

	/*
	 * Sort the records by location so that we can process them in order while
	 * scanning the query text.
	 */
	if (jstate->clocations_count > 1)
		qsort(jstate->clocations, jstate->clocations_count,
			  sizeof(pgssLocationLen), comp_location);
	locs = jstate->clocations;

	/* initialize the flex scanner --- should match raw_parser() */
	yyscanner = scanner_init(query,
							 &yyextra,
							 ScanKeywords,
							 NumScanKeywords);

	/* we don't want to re-emit any escape string warnings */
	yyextra.escape_string_warning = false;

	/*
	 * Search for each constant, in sequence.  Since locs is sorted and the
	 * scanner only moves forward, one pass over the query text suffices.
	 */
	for (i = 0; i < jstate->clocations_count; i++)
	{
		int			loc = locs[i].location;
		int			tok;

		/* Adjust recorded location if we're dealing with partial string */
		loc -= query_loc;

		Assert(loc >= 0);

		/* locs[i].length stays -1, as the header comment promises */
		if (loc <= last_loc)
			continue;			/* Duplicate constant, ignore */

		/* Lex tokens until we find the desired constant */
		for (;;)
		{
			tok = core_yylex(&yylval, &yylloc, yyscanner);

			/* We should not hit end-of-string, but if we do, behave sanely */
			if (tok == 0)
				break;			/* out of inner for-loop */

			/*
			 * We should find the token position exactly, but if we somehow
			 * run past it, work with that.
			 */
			if (yylloc >= loc)
			{
				if (query[loc] == '-')
				{
					/*
					 * It's a negative value - this is the one and only case
					 * where we replace more than a single token.
					 *
					 * Do not compensate for the core system's special-case
					 * adjustment of location to that of the leading '-'
					 * operator in the event of a negative constant.  It is
					 * also useful for our purposes to start from the minus
					 * symbol.  In this way, queries like "select * from foo
					 * where bar = 1" and "select * from foo where bar = -2"
					 * will have identical normalized query strings.
					 */
					tok = core_yylex(&yylval, &yylloc, yyscanner);
					if (tok == 0)
						break;	/* out of inner for-loop */
				}

				/*
				 * We now rely on the assumption that flex has placed a zero
				 * byte after the text of the current token in scanbuf.  The
				 * measured length thus runs from the constant's start (which
				 * may be the '-' sign) to the end of the just-lexed token.
				 */
				locs[i].length = strlen(yyextra.scanbuf + loc);
				break;			/* out of inner for-loop */
			}
		}

		/* If we hit end-of-string, give up, leaving remaining lengths -1 */
		if (tok == 0)
			break;

		last_loc = loc;
	}

	scanner_finish(yyscanner);
}
3163
3164 /*
3165 * comp_location: comparator for qsorting pgssLocationLen structs by location
3166 */
3167 static int
comp_location(const void * a,const void * b)3168 comp_location(const void *a, const void *b)
3169 {
3170 int l = ((const pgssLocationLen *) a)->location;
3171 int r = ((const pgssLocationLen *) b)->location;
3172
3173 if (l < r)
3174 return -1;
3175 else if (l > r)
3176 return +1;
3177 else
3178 return 0;
3179 }
3180