1 /*-------------------------------------------------------------------------
2 *
3 * pg_stat_statements.c
4 * Track statement planning and execution times as well as resource
5 * usage across a whole database cluster.
6 *
7 * Execution costs are totaled for each distinct source query, and kept in
8 * a shared hashtable. (We track only as many distinct queries as will fit
9 * in the designated amount of shared memory.)
10 *
11 * As of Postgres 9.2, this module normalizes query entries. Normalization
12 * is a process whereby similar queries, typically differing only in their
13 * constants (though the exact rules are somewhat more subtle than that) are
14 * recognized as equivalent, and are tracked as a single entry. This is
15 * particularly useful for non-prepared queries.
16 *
17 * Normalization is implemented by fingerprinting queries, selectively
18 * serializing those fields of each query tree's nodes that are judged to be
19 * essential to the query. This is referred to as a query jumble. This is
20 * distinct from a regular serialization in that various extraneous
21 * information is ignored as irrelevant or not essential to the query, such
22 * as the collations of Vars and, most notably, the values of constants.
23 *
24 * This jumble is acquired at the end of parse analysis of each query, and
25 * a 64-bit hash of it is stored into the query's Query.queryId field.
26 * The server then copies this value around, making it available in plan
27 * tree(s) generated from the query. The executor can then use this value
28 * to blame query costs on the proper queryId.
29 *
30 * To facilitate presenting entries to users, we create "representative" query
31 * strings in which constants are replaced with parameter symbols ($n), to
32 * make it clearer what a normalized entry can represent. To save on shared
33 * memory, and to avoid having to truncate oversized query strings, we store
34 * these strings in a temporary external query-texts file. Offsets into this
35 * file are kept in shared memory.
36 *
37 * Note about locking issues: to create or delete an entry in the shared
38 * hashtable, one must hold pgss->lock exclusively. Modifying any field
39 * in an entry except the counters requires the same. To look up an entry,
40 * one must hold the lock shared. To read or update the counters within
41 * an entry, one must hold the lock shared or exclusive (so the entry doesn't
42 * disappear!) and also take the entry's mutex spinlock.
43 * The shared state variable pgss->extent (the next free spot in the external
44 * query-text file) should be accessed only while holding either the
45 * pgss->mutex spinlock, or exclusive lock on pgss->lock. We use the mutex to
46 * allow reserving file space while holding only shared lock on pgss->lock.
47 * Rewriting the entire external query-text file, eg for garbage collection,
48 * requires holding pgss->lock exclusively; this allows individual entries
49 * in the file to be read or written while holding only shared lock.
50 *
51 *
52 * Copyright (c) 2008-2020, PostgreSQL Global Development Group
53 *
54 * IDENTIFICATION
55 * contrib/pg_stat_statements/pg_stat_statements.c
56 *
57 *-------------------------------------------------------------------------
58 */
59 #include "postgres.h"
60
61 #include <math.h>
62 #include <sys/stat.h>
63 #include <unistd.h>
64
65 #include "catalog/pg_authid.h"
66 #include "common/hashfn.h"
67 #include "executor/instrument.h"
68 #include "funcapi.h"
69 #include "mb/pg_wchar.h"
70 #include "miscadmin.h"
71 #include "optimizer/planner.h"
72 #include "parser/analyze.h"
73 #include "parser/parsetree.h"
74 #include "parser/scanner.h"
75 #include "parser/scansup.h"
76 #include "pgstat.h"
77 #include "storage/fd.h"
78 #include "storage/ipc.h"
79 #include "storage/spin.h"
80 #include "tcop/utility.h"
81 #include "utils/acl.h"
82 #include "utils/builtins.h"
83 #include "utils/memutils.h"
84
PG_MODULE_MAGIC;

/* Location of permanent stats file (valid when database is shut down) */
#define PGSS_DUMP_FILE	PGSTAT_STAT_PERMANENT_DIRECTORY "/pg_stat_statements.stat"

/*
 * Location of external query text file.  We don't keep it in the core
 * system's stats_temp_directory.  The core system can safely use that GUC
 * setting, because the statistics collector temp file paths are set only once
 * as part of changing the GUC, but pg_stat_statements has no way of avoiding
 * race conditions.  Besides, we only expect modest, infrequent I/O for query
 * strings, so placing the file on a faster filesystem is not compelling.
 */
#define PGSS_TEXT_FILE	PG_STAT_TMP_DIR "/pgss_query_texts.stat"

/*
 * Magic number identifying the stats file format; a mismatch at load time
 * makes pgss_shmem_startup() discard the dump file as invalid data.
 */
static const uint32 PGSS_FILE_HEADER = 0x20171004;

/* PostgreSQL major version number, changes in which invalidate all entries */
static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
105
/* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
#define USAGE_EXEC(duration)	(1.0)
#define USAGE_INIT				(1.0)	/* including initial planning */
#define ASSUMED_MEDIAN_INIT		(10.0)	/* initial assumed median usage */
#define ASSUMED_LENGTH_INIT		1024	/* initial assumed mean query length */
#define USAGE_DECREASE_FACTOR	(0.99)	/* decreased every entry_dealloc */
#define STICKY_DECREASE_FACTOR	(0.50)	/* factor for sticky entries */
#define USAGE_DEALLOC_PERCENT	5		/* free this % of entries at once */

/*
 * A "sticky" entry is one that has recorded neither planned nor executed
 * calls yet (see entry_alloc's "sticky" flag).  Parenthesize the macro
 * argument so that any lvalue expression (e.g. *ptr) expands correctly.
 */
#define IS_STICKY(c)	(((c).calls[PGSS_PLAN] + (c).calls[PGSS_EXEC]) == 0)

#define JUMBLE_SIZE				1024	/* query serialization buffer size */
117
/*
 * Extension version number, for supporting older extension versions' objects.
 * Passed as api_version to pg_stat_statements_internal() to select which
 * result columns are emitted.
 */
typedef enum pgssVersion
{
	PGSS_V1_0 = 0,
	PGSS_V1_1,
	PGSS_V1_2,
	PGSS_V1_3,
	PGSS_V1_8
} pgssVersion;
129
typedef enum pgssStoreKind
{
	PGSS_INVALID = -1,			/* used when storing a normalized text only,
								 * without accumulating any counters */

	/*
	 * PGSS_PLAN and PGSS_EXEC must be respectively 0 and 1 as they're used to
	 * reference the underlying values in the arrays in the Counters struct,
	 * and this order is required in pg_stat_statements_internal().
	 */
	PGSS_PLAN = 0,
	PGSS_EXEC,

	PGSS_NUMKIND				/* Must be last value of this enum */
} pgssStoreKind;
144
/*
 * Hashtable key that defines the identity of a hashtable entry.  We separate
 * queries by user and by database even if they are otherwise identical.
 *
 * Right now, this structure contains no padding.  If you add any, make sure
 * to teach pgss_store() to zero the padding bytes.  Otherwise, things will
 * break, because pgss_hash is created using HASH_BLOBS, and thus tag_hash
 * is used to hash this.
 */
typedef struct pgssHashKey
{
	Oid			userid;			/* user OID */
	Oid			dbid;			/* database OID */
	uint64		queryid;		/* query identifier */
} pgssHashKey;
160
/*
 * The actual stats counters kept within pgssEntry.
 *
 * Per the locking rules above, these fields are read and updated while
 * holding the entry's mutex spinlock (plus at least shared pgss->lock).
 * Note this struct is dumped to disk verbatim by pgss_shmem_shutdown(),
 * so layout changes require bumping PGSS_FILE_HEADER.
 */
typedef struct Counters
{
	int64		calls[PGSS_NUMKIND];	/* # of times planned/executed */
	double		total_time[PGSS_NUMKIND];	/* total planning/execution time,
											 * in msec */
	double		min_time[PGSS_NUMKIND]; /* minimum planning/execution time in
										 * msec */
	double		max_time[PGSS_NUMKIND]; /* maximum planning/execution time in
										 * msec */
	double		mean_time[PGSS_NUMKIND];	/* mean planning/execution time in
											 * msec */
	double		sum_var_time[PGSS_NUMKIND]; /* sum of variances in
											 * planning/execution time in msec */
	int64		rows;			/* total # of retrieved or affected rows */
	int64		shared_blks_hit;	/* # of shared buffer hits */
	int64		shared_blks_read;	/* # of shared disk blocks read */
	int64		shared_blks_dirtied;	/* # of shared disk blocks dirtied */
	int64		shared_blks_written;	/* # of shared disk blocks written */
	int64		local_blks_hit; /* # of local buffer hits */
	int64		local_blks_read;	/* # of local disk blocks read */
	int64		local_blks_dirtied; /* # of local disk blocks dirtied */
	int64		local_blks_written; /* # of local disk blocks written */
	int64		temp_blks_read; /* # of temp blocks read */
	int64		temp_blks_written;	/* # of temp blocks written */
	double		blk_read_time;	/* time spent reading, in msec */
	double		blk_write_time; /* time spent writing, in msec */
	double		usage;			/* usage factor */
	int64		wal_records;	/* # of WAL records generated */
	int64		wal_fpi;		/* # of WAL full page images generated */
	uint64		wal_bytes;		/* total amount of WAL bytes generated */
} Counters;
195
/*
 * Statistics per statement
 *
 * Note: in event of a failure in garbage collection of the query text file,
 * we reset query_offset to zero and query_len to -1.  This will be seen as
 * an invalid state by qtext_fetch().
 *
 * The whole struct is written verbatim to the dump file by
 * pgss_shmem_shutdown(), so layout changes invalidate old dump files.
 */
typedef struct pgssEntry
{
	pgssHashKey key;			/* hash key of entry - MUST BE FIRST */
	Counters	counters;		/* the statistics for this query */
	Size		query_offset;	/* query text offset in external file */
	int			query_len;		/* # of valid bytes in query string, or -1 */
	int			encoding;		/* query text encoding */
	slock_t		mutex;			/* protects the counters only */
} pgssEntry;
212
/*
 * Global shared state
 *
 * See the locking discussion in the file header comment: "lock" guards the
 * hashtable, while "mutex" guards only the fields listed after it.
 */
typedef struct pgssSharedState
{
	LWLock	   *lock;			/* protects hashtable search/modification */
	double		cur_median_usage;	/* current median usage in hashtable */
	Size		mean_query_len; /* current mean entry text length */
	slock_t		mutex;			/* protects following fields only: */
	Size		extent;			/* current extent of query file */
	int			n_writers;		/* number of active writers to query file */
	int			gc_count;		/* query file garbage collection cycle count */
} pgssSharedState;
226
/*
 * Struct for tracking locations/lengths of constants during normalization
 */
typedef struct pgssLocationLen
{
	int			location;		/* start offset in query text */
	int			length;			/* length in bytes, or -1 to ignore */
} pgssLocationLen;
235
/*
 * Working state for computing a query jumble and producing a normalized
 * query string
 */
typedef struct pgssJumbleState
{
	/* Jumble of current query tree */
	unsigned char *jumble;

	/* Number of bytes used in jumble[] (capacity is JUMBLE_SIZE) */
	Size		jumble_len;

	/* Array of locations of constants that should be removed */
	pgssLocationLen *clocations;

	/* Allocated length of clocations array */
	int			clocations_buf_size;

	/* Current number of valid entries in clocations array */
	int			clocations_count;

	/* highest Param id we've seen, in order to start normalization correctly */
	int			highest_extern_param_id;
} pgssJumbleState;
260
/*---- Local variables ----*/

/* Current nesting depth of ExecutorRun+ProcessUtility calls */
static int	exec_nested_level = 0;

/* Current nesting depth of planner calls */
static int	plan_nested_level = 0;

/* Saved hook values in case of unload */
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
static post_parse_analyze_hook_type prev_post_parse_analyze_hook = NULL;
static planner_hook_type prev_planner_hook = NULL;
static ExecutorStart_hook_type prev_ExecutorStart = NULL;
static ExecutorRun_hook_type prev_ExecutorRun = NULL;
static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
static ProcessUtility_hook_type prev_ProcessUtility = NULL;

/* Links to shared memory state */
static pgssSharedState *pgss = NULL;
static HTAB *pgss_hash = NULL;

/*---- GUC variables ----*/

typedef enum
{
	PGSS_TRACK_NONE,			/* track no statements */
	PGSS_TRACK_TOP,				/* only top level statements */
	PGSS_TRACK_ALL				/* all statements, including nested ones */
}			PGSSTrackLevel;

static const struct config_enum_entry track_options[] =
{
	{"none", PGSS_TRACK_NONE, false},
	{"top", PGSS_TRACK_TOP, false},
	{"all", PGSS_TRACK_ALL, false},
	{NULL, 0, false}
};

static int	pgss_max;			/* max # statements to track */
static int	pgss_track;			/* tracking level (PGSSTrackLevel); declared
								 * int as required by
								 * DefineCustomEnumVariable */
static bool pgss_track_utility; /* whether to track utility commands */
static bool pgss_track_planning;	/* whether to track planning duration */
static bool pgss_save;			/* whether to save stats across shutdown */


/* Whether statements at the given nesting level should be tracked */
#define pgss_enabled(level) \
	(pgss_track == PGSS_TRACK_ALL || \
	 (pgss_track == PGSS_TRACK_TOP && (level) == 0))

/* Bump the query-text-file GC cycle counter, under the stats spinlock */
#define record_gc_qtexts() \
	do { \
		volatile pgssSharedState *s = (volatile pgssSharedState *) pgss; \
		SpinLockAcquire(&s->mutex); \
		s->gc_count++; \
		SpinLockRelease(&s->mutex); \
	} while(0)
318
/*---- Function declarations ----*/

void		_PG_init(void);
void		_PG_fini(void);

/* SQL-callable functions, one per supported extension version */
PG_FUNCTION_INFO_V1(pg_stat_statements_reset);
PG_FUNCTION_INFO_V1(pg_stat_statements_reset_1_7);
PG_FUNCTION_INFO_V1(pg_stat_statements_1_2);
PG_FUNCTION_INFO_V1(pg_stat_statements_1_3);
PG_FUNCTION_INFO_V1(pg_stat_statements_1_8);
PG_FUNCTION_INFO_V1(pg_stat_statements);

/* Hook implementations */
static void pgss_shmem_startup(void);
static void pgss_shmem_shutdown(int code, Datum arg);
static void pgss_post_parse_analyze(ParseState *pstate, Query *query);
static PlannedStmt *pgss_planner(Query *parse,
								 const char *query_string,
								 int cursorOptions,
								 ParamListInfo boundParams);
static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags);
static void pgss_ExecutorRun(QueryDesc *queryDesc,
							 ScanDirection direction,
							 uint64 count, bool execute_once);
static void pgss_ExecutorFinish(QueryDesc *queryDesc);
static void pgss_ExecutorEnd(QueryDesc *queryDesc);
static void pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
								ProcessUtilityContext context, ParamListInfo params,
								QueryEnvironment *queryEnv,
								DestReceiver *dest, QueryCompletion *qc);

/* Entry storage and retrieval */
static uint64 pgss_hash_string(const char *str, int len);
static void pgss_store(const char *query, uint64 queryId,
					   int query_location, int query_len,
					   pgssStoreKind kind,
					   double total_time, uint64 rows,
					   const BufferUsage *bufusage,
					   const WalUsage *walusage,
					   pgssJumbleState *jstate);
static void pg_stat_statements_internal(FunctionCallInfo fcinfo,
										pgssVersion api_version,
										bool showtext);
static Size pgss_memsize(void);
static pgssEntry *entry_alloc(pgssHashKey *key, Size query_offset, int query_len,
							  int encoding, bool sticky);
static void entry_dealloc(void);

/* External query-text file management */
static bool qtext_store(const char *query, int query_len,
						Size *query_offset, int *gc_count);
static char *qtext_load_file(Size *buffer_size);
static char *qtext_fetch(Size query_offset, int query_len,
						 char *buffer, Size buffer_size);
static bool need_gc_qtexts(void);
static void gc_qtexts(void);
static void entry_reset(Oid userid, Oid dbid, uint64 queryid);

/* Query fingerprinting (jumbling) and normalization */
static void AppendJumble(pgssJumbleState *jstate,
						 const unsigned char *item, Size size);
static void JumbleQuery(pgssJumbleState *jstate, Query *query);
static void JumbleRangeTable(pgssJumbleState *jstate, List *rtable);
static void JumbleRowMarks(pgssJumbleState *jstate, List *rowMarks);
static void JumbleExpr(pgssJumbleState *jstate, Node *node);
static void RecordConstLocation(pgssJumbleState *jstate, int location);
static char *generate_normalized_query(pgssJumbleState *jstate, const char *query,
									   int query_loc, int *query_len_p, int encoding);
static void fill_in_constant_lengths(pgssJumbleState *jstate, const char *query,
									 int query_loc);
static int	comp_location(const void *a, const void *b);
384
/*
 * Module load callback
 *
 * Registers the module's GUCs, reserves shared memory and an LWLock, and
 * installs all hooks, saving the previous hook values for _PG_fini().
 */
void
_PG_init(void)
{
	/*
	 * In order to create our shared memory area, we have to be loaded via
	 * shared_preload_libraries.  If not, fall out without hooking into any of
	 * the main system.  (We don't throw error here because it seems useful to
	 * allow the pg_stat_statements functions to be created even when the
	 * module isn't active.  The functions must protect themselves against
	 * being called then, however.)
	 */
	if (!process_shared_preload_libraries_in_progress)
		return;

	/*
	 * Define (or redefine) custom GUC variables.
	 */

	/* default 5000 entries, minimum 100; fixed at server start */
	DefineCustomIntVariable("pg_stat_statements.max",
							"Sets the maximum number of statements tracked by pg_stat_statements.",
							NULL,
							&pgss_max,
							5000,
							100,
							INT_MAX,
							PGC_POSTMASTER,
							0,
							NULL,
							NULL,
							NULL);

	/* default "top"; superuser-settable */
	DefineCustomEnumVariable("pg_stat_statements.track",
							 "Selects which statements are tracked by pg_stat_statements.",
							 NULL,
							 &pgss_track,
							 PGSS_TRACK_TOP,
							 track_options,
							 PGC_SUSET,
							 0,
							 NULL,
							 NULL,
							 NULL);

	/* default on; superuser-settable */
	DefineCustomBoolVariable("pg_stat_statements.track_utility",
							 "Selects whether utility commands are tracked by pg_stat_statements.",
							 NULL,
							 &pgss_track_utility,
							 true,
							 PGC_SUSET,
							 0,
							 NULL,
							 NULL,
							 NULL);

	/* default off; superuser-settable */
	DefineCustomBoolVariable("pg_stat_statements.track_planning",
							 "Selects whether planning duration is tracked by pg_stat_statements.",
							 NULL,
							 &pgss_track_planning,
							 false,
							 PGC_SUSET,
							 0,
							 NULL,
							 NULL,
							 NULL);

	/* default on; changeable via SIGHUP/reload */
	DefineCustomBoolVariable("pg_stat_statements.save",
							 "Save pg_stat_statements statistics across server shutdowns.",
							 NULL,
							 &pgss_save,
							 true,
							 PGC_SIGHUP,
							 0,
							 NULL,
							 NULL,
							 NULL);

	EmitWarningsOnPlaceholders("pg_stat_statements");

	/*
	 * Request additional shared resources.  (These are no-ops if we're not in
	 * the postmaster process.)  We'll allocate or attach to the shared
	 * resources in pgss_shmem_startup().
	 */
	RequestAddinShmemSpace(pgss_memsize());
	RequestNamedLWLockTranche("pg_stat_statements", 1);

	/*
	 * Install hooks.
	 */
	prev_shmem_startup_hook = shmem_startup_hook;
	shmem_startup_hook = pgss_shmem_startup;
	prev_post_parse_analyze_hook = post_parse_analyze_hook;
	post_parse_analyze_hook = pgss_post_parse_analyze;
	prev_planner_hook = planner_hook;
	planner_hook = pgss_planner;
	prev_ExecutorStart = ExecutorStart_hook;
	ExecutorStart_hook = pgss_ExecutorStart;
	prev_ExecutorRun = ExecutorRun_hook;
	ExecutorRun_hook = pgss_ExecutorRun;
	prev_ExecutorFinish = ExecutorFinish_hook;
	ExecutorFinish_hook = pgss_ExecutorFinish;
	prev_ExecutorEnd = ExecutorEnd_hook;
	ExecutorEnd_hook = pgss_ExecutorEnd;
	prev_ProcessUtility = ProcessUtility_hook;
	ProcessUtility_hook = pgss_ProcessUtility;
}
493
494 /*
495 * Module unload callback
496 */
497 void
_PG_fini(void)498 _PG_fini(void)
499 {
500 /* Uninstall hooks. */
501 shmem_startup_hook = prev_shmem_startup_hook;
502 post_parse_analyze_hook = prev_post_parse_analyze_hook;
503 planner_hook = prev_planner_hook;
504 ExecutorStart_hook = prev_ExecutorStart;
505 ExecutorRun_hook = prev_ExecutorRun;
506 ExecutorFinish_hook = prev_ExecutorFinish;
507 ExecutorEnd_hook = prev_ExecutorEnd;
508 ProcessUtility_hook = prev_ProcessUtility;
509 }
510
/*
 * shmem_startup hook: allocate or attach to shared memory,
 * then load any pre-existing statistics from file.
 * Also create and load the query-texts file, which is expected to exist
 * (even if empty) while the module is enabled.
 */
static void
pgss_shmem_startup(void)
{
	bool		found;
	HASHCTL		info;
	FILE	   *file = NULL;
	FILE	   *qfile = NULL;
	uint32		header;
	int32		num;
	int32		pgver;
	int32		i;
	int			buffer_size;
	char	   *buffer = NULL;

	if (prev_shmem_startup_hook)
		prev_shmem_startup_hook();

	/* reset in case this is a restart within the postmaster */
	pgss = NULL;
	pgss_hash = NULL;

	/*
	 * Create or attach to the shared memory state, including hash table
	 */
	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);

	pgss = ShmemInitStruct("pg_stat_statements",
						   sizeof(pgssSharedState),
						   &found);

	if (!found)
	{
		/* First time through ... initialize the shared state */
		pgss->lock = &(GetNamedLWLockTranche("pg_stat_statements"))->lock;
		pgss->cur_median_usage = ASSUMED_MEDIAN_INIT;
		pgss->mean_query_len = ASSUMED_LENGTH_INIT;
		SpinLockInit(&pgss->mutex);
		pgss->extent = 0;
		pgss->n_writers = 0;
		pgss->gc_count = 0;
	}

	/* HASH_BLOBS: pgssHashKey is hashed as raw bytes (see its comment) */
	memset(&info, 0, sizeof(info));
	info.keysize = sizeof(pgssHashKey);
	info.entrysize = sizeof(pgssEntry);
	pgss_hash = ShmemInitHash("pg_stat_statements hash",
							  pgss_max, pgss_max,
							  &info,
							  HASH_ELEM | HASH_BLOBS);

	LWLockRelease(AddinShmemInitLock);

	/*
	 * If we're in the postmaster (or a standalone backend...), set up a shmem
	 * exit hook to dump the statistics to disk.
	 */
	if (!IsUnderPostmaster)
		on_shmem_exit(pgss_shmem_shutdown, (Datum) 0);

	/*
	 * Done if some other process already completed our initialization.
	 */
	if (found)
		return;

	/*
	 * Note: we don't bother with locks here, because there should be no other
	 * processes running when this code is reached.
	 */

	/* Unlink query text file possibly left over from crash */
	unlink(PGSS_TEXT_FILE);

	/* Allocate new query text temp file */
	qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
	if (qfile == NULL)
		goto write_error;

	/*
	 * If we were told not to load old statistics, we're done.  (Note we do
	 * not try to unlink any old dump file in this case.  This seems a bit
	 * questionable but it's the historical behavior.)
	 */
	if (!pgss_save)
	{
		FreeFile(qfile);
		return;
	}

	/*
	 * Attempt to load old statistics from the dump file.
	 */
	file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_R);
	if (file == NULL)
	{
		if (errno != ENOENT)
			goto read_error;
		/* No existing persisted stats file, so we're done */
		FreeFile(qfile);
		return;
	}

	buffer_size = 2048;
	buffer = (char *) palloc(buffer_size);

	/*
	 * Dump-file layout (see pgss_shmem_shutdown): magic header, PG major
	 * version, entry count, then per entry a raw pgssEntry followed by its
	 * NUL-terminated query text.
	 */
	if (fread(&header, sizeof(uint32), 1, file) != 1 ||
		fread(&pgver, sizeof(uint32), 1, file) != 1 ||
		fread(&num, sizeof(int32), 1, file) != 1)
		goto read_error;

	if (header != PGSS_FILE_HEADER ||
		pgver != PGSS_PG_MAJOR_VERSION)
		goto data_error;

	for (i = 0; i < num; i++)
	{
		pgssEntry	temp;
		pgssEntry  *entry;
		Size		query_offset;

		if (fread(&temp, sizeof(pgssEntry), 1, file) != 1)
			goto read_error;

		/* Encoding is the only field we can easily sanity-check */
		if (!PG_VALID_BE_ENCODING(temp.encoding))
			goto data_error;

		/* Resize buffer as needed */
		if (temp.query_len >= buffer_size)
		{
			buffer_size = Max(buffer_size * 2, temp.query_len + 1);
			buffer = repalloc(buffer, buffer_size);
		}

		/* query_len + 1 includes the trailing NUL stored in the file */
		if (fread(buffer, 1, temp.query_len + 1, file) != temp.query_len + 1)
			goto read_error;

		/* Should have a trailing null, but let's make sure */
		buffer[temp.query_len] = '\0';

		/* Skip loading "sticky" entries (never planned/executed) */
		if (IS_STICKY(temp.counters))
			continue;

		/* Store the query text into the (freshly created) text file */
		query_offset = pgss->extent;
		if (fwrite(buffer, 1, temp.query_len + 1, qfile) != temp.query_len + 1)
			goto write_error;
		pgss->extent += temp.query_len + 1;

		/* make the hashtable entry (discards old entries if too many) */
		entry = entry_alloc(&temp.key, query_offset, temp.query_len,
							temp.encoding,
							false);

		/* copy in the actual stats */
		entry->counters = temp.counters;
	}

	pfree(buffer);
	FreeFile(file);
	FreeFile(qfile);

	/*
	 * Remove the persisted stats file so it's not included in
	 * backups/replication standbys, etc.  A new file will be written on next
	 * shutdown.
	 *
	 * Note: it's okay if the PGSS_TEXT_FILE is included in a basebackup,
	 * because we remove that file on startup; it acts inversely to
	 * PGSS_DUMP_FILE, in that it is only supposed to be around when the
	 * server is running, whereas PGSS_DUMP_FILE is only supposed to be around
	 * when the server is not running.  Leaving the file creates no danger of
	 * a newly restored database having a spurious record of execution costs,
	 * which is what we're really concerned about here.
	 */
	unlink(PGSS_DUMP_FILE);

	return;

	/* Error paths: log, then clean up whatever was allocated so far */
read_error:
	ereport(LOG,
			(errcode_for_file_access(),
			 errmsg("could not read file \"%s\": %m",
					PGSS_DUMP_FILE)));
	goto fail;
data_error:
	ereport(LOG,
			(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
			 errmsg("ignoring invalid data in file \"%s\"",
					PGSS_DUMP_FILE)));
	goto fail;
write_error:
	ereport(LOG,
			(errcode_for_file_access(),
			 errmsg("could not write file \"%s\": %m",
					PGSS_TEXT_FILE)));
fail:
	if (buffer)
		pfree(buffer);
	if (file)
		FreeFile(file);
	if (qfile)
		FreeFile(qfile);
	/* If possible, throw away the bogus file; ignore any error */
	unlink(PGSS_DUMP_FILE);

	/*
	 * Don't unlink PGSS_TEXT_FILE here; it should always be around while the
	 * server is running with pg_stat_statements enabled
	 */
}
729
/*
 * shmem_shutdown hook: Dump statistics into file.
 *
 * Note: we don't bother with acquiring lock, because there should be no
 * other processes running when this is called.
 */
static void
pgss_shmem_shutdown(int code, Datum arg)
{
	FILE	   *file;
	char	   *qbuffer = NULL;
	Size		qbuffer_size = 0;
	HASH_SEQ_STATUS hash_seq;
	int32		num_entries;
	pgssEntry  *entry;

	/* Don't try to dump during a crash. */
	if (code)
		return;

	/* Safety check ... shouldn't get here unless shmem is set up. */
	if (!pgss || !pgss_hash)
		return;

	/* Don't dump if told not to. */
	if (!pgss_save)
		return;

	/* Write to a temp file first, then atomically rename into place below */
	file = AllocateFile(PGSS_DUMP_FILE ".tmp", PG_BINARY_W);
	if (file == NULL)
		goto error;

	/* File format: magic header, PG major version, entry count, entries */
	if (fwrite(&PGSS_FILE_HEADER, sizeof(uint32), 1, file) != 1)
		goto error;
	if (fwrite(&PGSS_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
		goto error;
	num_entries = hash_get_num_entries(pgss_hash);
	if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
		goto error;

	/* Load the whole external query-text file so we can copy texts out */
	qbuffer = qtext_load_file(&qbuffer_size);
	if (qbuffer == NULL)
		goto error;

	/*
	 * When serializing to disk, we store query texts immediately after their
	 * entry data.  Any orphaned query texts are thereby excluded.
	 */
	hash_seq_init(&hash_seq, pgss_hash);
	while ((entry = hash_seq_search(&hash_seq)) != NULL)
	{
		int			len = entry->query_len;
		char	   *qstr = qtext_fetch(entry->query_offset, len,
									   qbuffer, qbuffer_size);

		if (qstr == NULL)
			continue;			/* Ignore any entries with bogus texts */

		/* Raw entry struct, then its text including the trailing NUL */
		if (fwrite(entry, sizeof(pgssEntry), 1, file) != 1 ||
			fwrite(qstr, 1, len + 1, file) != len + 1)
		{
			/* note: we assume hash_seq_term won't change errno */
			hash_seq_term(&hash_seq);
			goto error;
		}
	}

	/* qtext_load_file's buffer is malloc'd, not palloc'd */
	free(qbuffer);
	qbuffer = NULL;

	/* FreeFile flushes; a nonzero result means the write failed */
	if (FreeFile(file))
	{
		file = NULL;
		goto error;
	}

	/*
	 * Rename file into place, so we atomically replace any old one.
	 */
	(void) durable_rename(PGSS_DUMP_FILE ".tmp", PGSS_DUMP_FILE, LOG);

	/* Unlink query-texts file; it's not needed while shutdown */
	unlink(PGSS_TEXT_FILE);

	return;

error:
	ereport(LOG,
			(errcode_for_file_access(),
			 errmsg("could not write file \"%s\": %m",
					PGSS_DUMP_FILE ".tmp")));
	if (qbuffer)
		free(qbuffer);
	if (file)
		FreeFile(file);
	unlink(PGSS_DUMP_FILE ".tmp");
	unlink(PGSS_TEXT_FILE);
}
828
829 /*
830 * Post-parse-analysis hook: mark query with a queryId
831 */
832 static void
pgss_post_parse_analyze(ParseState * pstate,Query * query)833 pgss_post_parse_analyze(ParseState *pstate, Query *query)
834 {
835 pgssJumbleState jstate;
836
837 if (prev_post_parse_analyze_hook)
838 prev_post_parse_analyze_hook(pstate, query);
839
840 /* Assert we didn't do this already */
841 Assert(query->queryId == UINT64CONST(0));
842
843 /* Safety check... */
844 if (!pgss || !pgss_hash || !pgss_enabled(exec_nested_level))
845 return;
846
847 /*
848 * Utility statements get queryId zero. We do this even in cases where
849 * the statement contains an optimizable statement for which a queryId
850 * could be derived (such as EXPLAIN or DECLARE CURSOR). For such cases,
851 * runtime control will first go through ProcessUtility and then the
852 * executor, and we don't want the executor hooks to do anything, since we
853 * are already measuring the statement's costs at the utility level.
854 */
855 if (query->utilityStmt)
856 {
857 query->queryId = UINT64CONST(0);
858 return;
859 }
860
861 /* Set up workspace for query jumbling */
862 jstate.jumble = (unsigned char *) palloc(JUMBLE_SIZE);
863 jstate.jumble_len = 0;
864 jstate.clocations_buf_size = 32;
865 jstate.clocations = (pgssLocationLen *)
866 palloc(jstate.clocations_buf_size * sizeof(pgssLocationLen));
867 jstate.clocations_count = 0;
868 jstate.highest_extern_param_id = 0;
869
870 /* Compute query ID and mark the Query node with it */
871 JumbleQuery(&jstate, query);
872 query->queryId =
873 DatumGetUInt64(hash_any_extended(jstate.jumble, jstate.jumble_len, 0));
874
875 /*
876 * If we are unlucky enough to get a hash of zero, use 1 instead, to
877 * prevent confusion with the utility-statement case.
878 */
879 if (query->queryId == UINT64CONST(0))
880 query->queryId = UINT64CONST(1);
881
882 /*
883 * If we were able to identify any ignorable constants, we immediately
884 * create a hash table entry for the query, so that we can record the
885 * normalized form of the query string. If there were no such constants,
886 * the normalized string would be the same as the query text anyway, so
887 * there's no need for an early entry.
888 */
889 if (jstate.clocations_count > 0)
890 pgss_store(pstate->p_sourcetext,
891 query->queryId,
892 query->stmt_location,
893 query->stmt_len,
894 PGSS_INVALID,
895 0,
896 0,
897 NULL,
898 NULL,
899 &jstate);
900 }
901
/*
 * Planner hook: forward to regular planner, but measure planning time
 * if needed.
 */
static PlannedStmt *
pgss_planner(Query *parse,
			 const char *query_string,
			 int cursorOptions,
			 ParamListInfo boundParams)
{
	PlannedStmt *result;

	/*
	 * We can't process the query if no query_string is provided, as
	 * pgss_store needs it.  We also ignore query without queryid, as it would
	 * be treated as a utility statement, which may not be the case.
	 *
	 * Note that planner_hook can be called from the planner itself, so we
	 * have a specific nesting level for the planner.  However, utility
	 * commands containing optimizable statements can also call the planner,
	 * same for regular DML (for instance for underlying foreign key queries).
	 * So testing the planner nesting level only is not enough to detect real
	 * top level planner call.
	 */
	if (pgss_enabled(plan_nested_level + exec_nested_level)
		&& pgss_track_planning && query_string
		&& parse->queryId != UINT64CONST(0))
	{
		instr_time	start;
		instr_time	duration;
		BufferUsage bufusage_start,
					bufusage;
		WalUsage	walusage_start,
					walusage;

		/* We need to track buffer usage as the planner can access them. */
		bufusage_start = pgBufferUsage;

		/*
		 * Similarly the planner could write some WAL records in some cases
		 * (e.g. setting a hint bit with those being WAL-logged)
		 */
		walusage_start = pgWalUsage;
		INSTR_TIME_SET_CURRENT(start);

		/*
		 * Bump the planner nesting level around the call; PG_FINALLY
		 * guarantees the decrement even if the planner throws an error.
		 */
		plan_nested_level++;
		PG_TRY();
		{
			if (prev_planner_hook)
				result = prev_planner_hook(parse, query_string, cursorOptions,
										   boundParams);
			else
				result = standard_planner(parse, query_string, cursorOptions,
										  boundParams);
		}
		PG_FINALLY();
		{
			plan_nested_level--;
		}
		PG_END_TRY();

		INSTR_TIME_SET_CURRENT(duration);
		INSTR_TIME_SUBTRACT(duration, start);

		/* calc differences of buffer counters. */
		memset(&bufusage, 0, sizeof(BufferUsage));
		BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);

		/* calc differences of WAL counters. */
		memset(&walusage, 0, sizeof(WalUsage));
		WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);

		/* Accumulate the planning costs under the PGSS_PLAN bucket */
		pgss_store(query_string,
				   parse->queryId,
				   parse->stmt_location,
				   parse->stmt_len,
				   PGSS_PLAN,
				   INSTR_TIME_GET_MILLISEC(duration),
				   0,
				   &bufusage,
				   &walusage,
				   NULL);
	}
	else
	{
		/* Not tracking planning: just pass through to the regular planner */
		if (prev_planner_hook)
			result = prev_planner_hook(parse, query_string, cursorOptions,
									   boundParams);
		else
			result = standard_planner(parse, query_string, cursorOptions,
									  boundParams);
	}

	return result;
}
997
998 /*
999 * ExecutorStart hook: start up tracking if needed
1000 */
1001 static void
pgss_ExecutorStart(QueryDesc * queryDesc,int eflags)1002 pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
1003 {
1004 if (prev_ExecutorStart)
1005 prev_ExecutorStart(queryDesc, eflags);
1006 else
1007 standard_ExecutorStart(queryDesc, eflags);
1008
1009 /*
1010 * If query has queryId zero, don't track it. This prevents double
1011 * counting of optimizable statements that are directly contained in
1012 * utility statements.
1013 */
1014 if (pgss_enabled(exec_nested_level) && queryDesc->plannedstmt->queryId != UINT64CONST(0))
1015 {
1016 /*
1017 * Set up to track total elapsed time in ExecutorRun. Make sure the
1018 * space is allocated in the per-query context so it will go away at
1019 * ExecutorEnd.
1020 */
1021 if (queryDesc->totaltime == NULL)
1022 {
1023 MemoryContext oldcxt;
1024
1025 oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
1026 queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL);
1027 MemoryContextSwitchTo(oldcxt);
1028 }
1029 }
1030 }
1031
/*
 * ExecutorRun hook: all we need do is track nesting depth
 */
static void
pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
				 bool execute_once)
{
	/*
	 * Bump the executor nesting depth for the duration of the run, so that
	 * pgss_enabled() can distinguish top-level from nested statements.
	 * PG_FINALLY guarantees the decrement even if execution errors out.
	 */
	exec_nested_level++;
	PG_TRY();
	{
		if (prev_ExecutorRun)
			prev_ExecutorRun(queryDesc, direction, count, execute_once);
		else
			standard_ExecutorRun(queryDesc, direction, count, execute_once);
	}
	PG_FINALLY();
	{
		exec_nested_level--;
	}
	PG_END_TRY();
}
1053
/*
 * ExecutorFinish hook: all we need do is track nesting depth
 */
static void
pgss_ExecutorFinish(QueryDesc *queryDesc)
{
	/*
	 * Same protocol as pgss_ExecutorRun: maintain the nesting depth across
	 * the call, restoring it on error via PG_FINALLY.
	 */
	exec_nested_level++;
	PG_TRY();
	{
		if (prev_ExecutorFinish)
			prev_ExecutorFinish(queryDesc);
		else
			standard_ExecutorFinish(queryDesc);
	}
	PG_FINALLY();
	{
		exec_nested_level--;
	}
	PG_END_TRY();
}
1074
1075 /*
1076 * ExecutorEnd hook: store results if needed
1077 */
1078 static void
pgss_ExecutorEnd(QueryDesc * queryDesc)1079 pgss_ExecutorEnd(QueryDesc *queryDesc)
1080 {
1081 uint64 queryId = queryDesc->plannedstmt->queryId;
1082
1083 if (queryId != UINT64CONST(0) && queryDesc->totaltime &&
1084 pgss_enabled(exec_nested_level))
1085 {
1086 /*
1087 * Make sure stats accumulation is done. (Note: it's okay if several
1088 * levels of hook all do this.)
1089 */
1090 InstrEndLoop(queryDesc->totaltime);
1091
1092 pgss_store(queryDesc->sourceText,
1093 queryId,
1094 queryDesc->plannedstmt->stmt_location,
1095 queryDesc->plannedstmt->stmt_len,
1096 PGSS_EXEC,
1097 queryDesc->totaltime->total * 1000.0, /* convert to msec */
1098 queryDesc->estate->es_processed,
1099 &queryDesc->totaltime->bufusage,
1100 &queryDesc->totaltime->walusage,
1101 NULL);
1102 }
1103
1104 if (prev_ExecutorEnd)
1105 prev_ExecutorEnd(queryDesc);
1106 else
1107 standard_ExecutorEnd(queryDesc);
1108 }
1109
/*
 * ProcessUtility hook: track execution of utility statements.
 */
static void
pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
					ProcessUtilityContext context,
					ParamListInfo params, QueryEnvironment *queryEnv,
					DestReceiver *dest, QueryCompletion *qc)
{
	Node	   *parsetree = pstmt->utilityStmt;

	/*
	 * If it's an EXECUTE statement, we don't track it and don't increment the
	 * nesting level.  This allows the cycles to be charged to the underlying
	 * PREPARE instead (by the Executor hooks), which is much more useful.
	 *
	 * We also don't track execution of PREPARE.  If we did, we would get one
	 * hash table entry for the PREPARE (with hash calculated from the query
	 * string), and then a different one with the same query string (but hash
	 * calculated from the query tree) would be used to accumulate costs of
	 * ensuing EXECUTEs.  This would be confusing, and inconsistent with other
	 * cases where planning time is not included at all.
	 *
	 * Likewise, we don't track execution of DEALLOCATE.
	 */
	if (pgss_track_utility && pgss_enabled(exec_nested_level) &&
		!IsA(parsetree, ExecuteStmt) &&
		!IsA(parsetree, PrepareStmt) &&
		!IsA(parsetree, DeallocateStmt))
	{
		instr_time	start;
		instr_time	duration;
		uint64		rows;
		BufferUsage bufusage_start,
					bufusage;
		WalUsage	walusage_start,
					walusage;

		/* Snapshot resource counters so we can compute deltas afterwards. */
		bufusage_start = pgBufferUsage;
		walusage_start = pgWalUsage;
		INSTR_TIME_SET_CURRENT(start);

		/*
		 * Increment the nesting depth around the call so that optimizable
		 * statements run by this utility command are seen as nested;
		 * PG_FINALLY restores the depth even on error.
		 */
		exec_nested_level++;
		PG_TRY();
		{
			if (prev_ProcessUtility)
				prev_ProcessUtility(pstmt, queryString,
									context, params, queryEnv,
									dest, qc);
			else
				standard_ProcessUtility(pstmt, queryString,
										context, params, queryEnv,
										dest, qc);
		}
		PG_FINALLY();
		{
			exec_nested_level--;
		}
		PG_END_TRY();

		INSTR_TIME_SET_CURRENT(duration);
		INSTR_TIME_SUBTRACT(duration, start);

		/* Only COPY reports a meaningful processed-rows count here. */
		rows = (qc && qc->commandTag == CMDTAG_COPY) ? qc->nprocessed : 0;

		/* calc differences of buffer counters. */
		memset(&bufusage, 0, sizeof(BufferUsage));
		BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);

		/* calc differences of WAL counters. */
		memset(&walusage, 0, sizeof(WalUsage));
		WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);

		pgss_store(queryString,
				   0,			/* signal that it's a utility stmt */
				   pstmt->stmt_location,
				   pstmt->stmt_len,
				   PGSS_EXEC,
				   INSTR_TIME_GET_MILLISEC(duration),
				   rows,
				   &bufusage,
				   &walusage,
				   NULL);
	}
	else
	{
		/* Not tracking: just chain to the previous hook or standard code. */
		if (prev_ProcessUtility)
			prev_ProcessUtility(pstmt, queryString,
								context, params, queryEnv,
								dest, qc);
		else
			standard_ProcessUtility(pstmt, queryString,
									context, params, queryEnv,
									dest, qc);
	}
}
1206
1207 /*
1208 * Given an arbitrarily long query string, produce a hash for the purposes of
1209 * identifying the query, without normalizing constants. Used when hashing
1210 * utility statements.
1211 */
1212 static uint64
pgss_hash_string(const char * str,int len)1213 pgss_hash_string(const char *str, int len)
1214 {
1215 return DatumGetUInt64(hash_any_extended((const unsigned char *) str,
1216 len, 0));
1217 }
1218
/*
 * Store some statistics for a statement.
 *
 * If queryId is 0 then this is a utility statement and we should compute
 * a suitable queryId internally.
 *
 * If jstate is not NULL then we're trying to create an entry for which
 * we have no statistics as yet; we just want to record the normalized
 * query string.  total_time, rows, bufusage and walusage are ignored in this
 * case.
 *
 * If kind is PGSS_PLAN or PGSS_EXEC, its value is used as the array position
 * for the arrays in the Counters field.
 */
static void
pgss_store(const char *query, uint64 queryId,
		   int query_location, int query_len,
		   pgssStoreKind kind,
		   double total_time, uint64 rows,
		   const BufferUsage *bufusage,
		   const WalUsage *walusage,
		   pgssJumbleState *jstate)
{
	pgssHashKey key;
	pgssEntry  *entry;
	char	   *norm_query = NULL;
	int			encoding = GetDatabaseEncoding();

	Assert(query != NULL);

	/* Safety check: bail out if the module wasn't loaded at startup. */
	if (!pgss || !pgss_hash)
		return;

	/*
	 * Confine our attention to the relevant part of the string, if the query
	 * is a portion of a multi-statement source string.
	 *
	 * First apply starting offset, unless it's -1 (unknown).
	 */
	if (query_location >= 0)
	{
		Assert(query_location <= strlen(query));
		query += query_location;
		/* Length of 0 (or -1) means "rest of string" */
		if (query_len <= 0)
			query_len = strlen(query);
		else
			Assert(query_len <= strlen(query));
	}
	else
	{
		/* If query location is unknown, distrust query_len as well */
		query_location = 0;
		query_len = strlen(query);
	}

	/*
	 * Discard leading and trailing whitespace, too.  Use scanner_isspace()
	 * not libc's isspace(), because we want to match the lexer's behavior.
	 */
	while (query_len > 0 && scanner_isspace(query[0]))
		query++, query_location++, query_len--;
	while (query_len > 0 && scanner_isspace(query[query_len - 1]))
		query_len--;

	/*
	 * For utility statements, we just hash the query string to get an ID.
	 */
	if (queryId == UINT64CONST(0))
	{
		queryId = pgss_hash_string(query, query_len);

		/*
		 * If we are unlucky enough to get a hash of zero (invalid), use
		 * queryID as 2 instead; queryID 1 is already in use for normal
		 * statements.
		 */
		if (queryId == UINT64CONST(0))
			queryId = UINT64CONST(2);
	}

	/* Set up key for hashtable search */
	key.userid = GetUserId();
	key.dbid = MyDatabaseId;
	key.queryid = queryId;

	/* Lookup the hash table entry with shared lock. */
	LWLockAcquire(pgss->lock, LW_SHARED);

	entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);

	/* Create new entry, if not present */
	if (!entry)
	{
		Size		query_offset;
		int			gc_count;
		bool		stored;
		bool		do_gc;

		/*
		 * Create a new, normalized query string if caller asked.  We don't
		 * need to hold the lock while doing this work.  (Note: in any case,
		 * it's possible that someone else creates a duplicate hashtable entry
		 * in the interval where we don't hold the lock below.  That case is
		 * handled by entry_alloc.)
		 */
		if (jstate)
		{
			LWLockRelease(pgss->lock);
			norm_query = generate_normalized_query(jstate, query,
												   query_location,
												   &query_len,
												   encoding);
			LWLockAcquire(pgss->lock, LW_SHARED);
		}

		/* Append new query text to file with only shared lock held */
		stored = qtext_store(norm_query ? norm_query : query, query_len,
							 &query_offset, &gc_count);

		/*
		 * Determine whether we need to garbage collect external query texts
		 * while the shared lock is still held.  This micro-optimization
		 * avoids taking the time to decide this while holding exclusive lock.
		 */
		do_gc = need_gc_qtexts();

		/* Need exclusive lock to make a new hashtable entry - promote */
		LWLockRelease(pgss->lock);
		LWLockAcquire(pgss->lock, LW_EXCLUSIVE);

		/*
		 * A garbage collection may have occurred while we weren't holding the
		 * lock.  In the unlikely event that this happens, the query text we
		 * stored above will have been garbage collected, so write it again.
		 * This should be infrequent enough that doing it while holding
		 * exclusive lock isn't a performance problem.
		 */
		if (!stored || pgss->gc_count != gc_count)
			stored = qtext_store(norm_query ? norm_query : query, query_len,
								 &query_offset, NULL);

		/* If we failed to write to the text file, give up */
		if (!stored)
			goto done;

		/* OK to create a new hashtable entry */
		entry = entry_alloc(&key, query_offset, query_len, encoding,
							jstate != NULL);

		/* If needed, perform garbage collection while exclusive lock held */
		if (do_gc)
			gc_qtexts();
	}

	/* Increment the counts, except when jstate is not NULL */
	if (!jstate)
	{
		/*
		 * Grab the spinlock while updating the counters (see comment about
		 * locking rules at the head of the file)
		 */
		volatile pgssEntry *e = (volatile pgssEntry *) entry;

		Assert(kind == PGSS_PLAN || kind == PGSS_EXEC);

		SpinLockAcquire(&e->mutex);

		/* "Unstick" entry if it was previously sticky */
		if (IS_STICKY(e->counters))
			e->counters.usage = USAGE_INIT;

		e->counters.calls[kind] += 1;
		e->counters.total_time[kind] += total_time;

		if (e->counters.calls[kind] == 1)
		{
			/* First call: min, max and mean are all just this one value. */
			e->counters.min_time[kind] = total_time;
			e->counters.max_time[kind] = total_time;
			e->counters.mean_time[kind] = total_time;
		}
		else
		{
			/*
			 * Welford's method for accurately computing variance. See
			 * <http://www.johndcook.com/blog/standard_deviation/>
			 */
			double		old_mean = e->counters.mean_time[kind];

			e->counters.mean_time[kind] +=
				(total_time - old_mean) / e->counters.calls[kind];
			e->counters.sum_var_time[kind] +=
				(total_time - old_mean) * (total_time - e->counters.mean_time[kind]);

			/* calculate min and max time */
			if (e->counters.min_time[kind] > total_time)
				e->counters.min_time[kind] = total_time;
			if (e->counters.max_time[kind] < total_time)
				e->counters.max_time[kind] = total_time;
		}
		e->counters.rows += rows;
		e->counters.shared_blks_hit += bufusage->shared_blks_hit;
		e->counters.shared_blks_read += bufusage->shared_blks_read;
		e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
		e->counters.shared_blks_written += bufusage->shared_blks_written;
		e->counters.local_blks_hit += bufusage->local_blks_hit;
		e->counters.local_blks_read += bufusage->local_blks_read;
		e->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
		e->counters.local_blks_written += bufusage->local_blks_written;
		e->counters.temp_blks_read += bufusage->temp_blks_read;
		e->counters.temp_blks_written += bufusage->temp_blks_written;
		e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time);
		e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time);
		e->counters.usage += USAGE_EXEC(total_time);
		e->counters.wal_records += walusage->wal_records;
		e->counters.wal_fpi += walusage->wal_fpi;
		e->counters.wal_bytes += walusage->wal_bytes;

		SpinLockRelease(&e->mutex);
	}

done:
	LWLockRelease(pgss->lock);

	/* We postpone this clean-up until we're out of the lock */
	if (norm_query)
		pfree(norm_query);
}
1448
1449 /*
1450 * Reset statement statistics corresponding to userid, dbid, and queryid.
1451 */
1452 Datum
pg_stat_statements_reset_1_7(PG_FUNCTION_ARGS)1453 pg_stat_statements_reset_1_7(PG_FUNCTION_ARGS)
1454 {
1455 Oid userid;
1456 Oid dbid;
1457 uint64 queryid;
1458
1459 userid = PG_GETARG_OID(0);
1460 dbid = PG_GETARG_OID(1);
1461 queryid = (uint64) PG_GETARG_INT64(2);
1462
1463 entry_reset(userid, dbid, queryid);
1464
1465 PG_RETURN_VOID();
1466 }
1467
1468 /*
1469 * Reset statement statistics.
1470 */
1471 Datum
pg_stat_statements_reset(PG_FUNCTION_ARGS)1472 pg_stat_statements_reset(PG_FUNCTION_ARGS)
1473 {
1474 entry_reset(0, 0, 0);
1475
1476 PG_RETURN_VOID();
1477 }
1478
1479 /* Number of output arguments (columns) for various API versions */
1480 #define PG_STAT_STATEMENTS_COLS_V1_0 14
1481 #define PG_STAT_STATEMENTS_COLS_V1_1 18
1482 #define PG_STAT_STATEMENTS_COLS_V1_2 19
1483 #define PG_STAT_STATEMENTS_COLS_V1_3 23
1484 #define PG_STAT_STATEMENTS_COLS_V1_8 32
1485 #define PG_STAT_STATEMENTS_COLS 32 /* maximum of above */
1486
1487 /*
1488 * Retrieve statement statistics.
1489 *
1490 * The SQL API of this function has changed multiple times, and will likely
1491 * do so again in future. To support the case where a newer version of this
1492 * loadable module is being used with an old SQL declaration of the function,
1493 * we continue to support the older API versions. For 1.2 and later, the
1494 * expected API version is identified by embedding it in the C name of the
1495 * function. Unfortunately we weren't bright enough to do that for 1.1.
1496 */
1497 Datum
pg_stat_statements_1_8(PG_FUNCTION_ARGS)1498 pg_stat_statements_1_8(PG_FUNCTION_ARGS)
1499 {
1500 bool showtext = PG_GETARG_BOOL(0);
1501
1502 pg_stat_statements_internal(fcinfo, PGSS_V1_8, showtext);
1503
1504 return (Datum) 0;
1505 }
1506
1507 Datum
pg_stat_statements_1_3(PG_FUNCTION_ARGS)1508 pg_stat_statements_1_3(PG_FUNCTION_ARGS)
1509 {
1510 bool showtext = PG_GETARG_BOOL(0);
1511
1512 pg_stat_statements_internal(fcinfo, PGSS_V1_3, showtext);
1513
1514 return (Datum) 0;
1515 }
1516
1517 Datum
pg_stat_statements_1_2(PG_FUNCTION_ARGS)1518 pg_stat_statements_1_2(PG_FUNCTION_ARGS)
1519 {
1520 bool showtext = PG_GETARG_BOOL(0);
1521
1522 pg_stat_statements_internal(fcinfo, PGSS_V1_2, showtext);
1523
1524 return (Datum) 0;
1525 }
1526
1527 /*
1528 * Legacy entry point for pg_stat_statements() API versions 1.0 and 1.1.
1529 * This can be removed someday, perhaps.
1530 */
1531 Datum
pg_stat_statements(PG_FUNCTION_ARGS)1532 pg_stat_statements(PG_FUNCTION_ARGS)
1533 {
1534 /* If it's really API 1.1, we'll figure that out below */
1535 pg_stat_statements_internal(fcinfo, PGSS_V1_0, true);
1536
1537 return (Datum) 0;
1538 }
1539
/*
 * Common code for all versions of pg_stat_statements().
 *
 * Materializes one row per hashtable entry into a tuplestore, adapting the
 * output column set to the caller's declared API version.
 */
static void
pg_stat_statements_internal(FunctionCallInfo fcinfo,
							pgssVersion api_version,
							bool showtext)
{
	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
	TupleDesc	tupdesc;
	Tuplestorestate *tupstore;
	MemoryContext per_query_ctx;
	MemoryContext oldcontext;
	Oid			userid = GetUserId();
	bool		is_allowed_role = false;
	char	   *qbuffer = NULL;
	Size		qbuffer_size = 0;
	Size		extent = 0;
	int			gc_count = 0;
	HASH_SEQ_STATUS hash_seq;
	pgssEntry  *entry;

	/*
	 * Superusers or members of pg_read_all_stats are allowed to see all
	 * query texts and queryids; others only see their own.
	 */
	is_allowed_role = is_member_of_role(GetUserId(), DEFAULT_ROLE_READ_ALL_STATS);

	/* hash table must exist already */
	if (!pgss || !pgss_hash)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));

	/* check to see if caller supports us returning a tuplestore */
	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("set-valued function called in context that cannot accept a set")));
	if (!(rsinfo->allowedModes & SFRM_Materialize))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("materialize mode required, but it is not allowed in this context")));

	/* Switch into long-lived context to construct returned data structures */
	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
	oldcontext = MemoryContextSwitchTo(per_query_ctx);

	/* Build a tuple descriptor for our result type */
	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
		elog(ERROR, "return type must be a row type");

	/*
	 * Check we have the expected number of output arguments.  Aside from
	 * being a good safety check, we need a kluge here to detect API version
	 * 1.1, which was wedged into the code in an ill-considered way.
	 */
	switch (tupdesc->natts)
	{
		case PG_STAT_STATEMENTS_COLS_V1_0:
			if (api_version != PGSS_V1_0)
				elog(ERROR, "incorrect number of output arguments");
			break;
		case PG_STAT_STATEMENTS_COLS_V1_1:
			/* pg_stat_statements() should have told us 1.0 */
			if (api_version != PGSS_V1_0)
				elog(ERROR, "incorrect number of output arguments");
			api_version = PGSS_V1_1;
			break;
		case PG_STAT_STATEMENTS_COLS_V1_2:
			if (api_version != PGSS_V1_2)
				elog(ERROR, "incorrect number of output arguments");
			break;
		case PG_STAT_STATEMENTS_COLS_V1_3:
			if (api_version != PGSS_V1_3)
				elog(ERROR, "incorrect number of output arguments");
			break;
		case PG_STAT_STATEMENTS_COLS_V1_8:
			if (api_version != PGSS_V1_8)
				elog(ERROR, "incorrect number of output arguments");
			break;
		default:
			elog(ERROR, "incorrect number of output arguments");
	}

	tupstore = tuplestore_begin_heap(true, false, work_mem);
	rsinfo->returnMode = SFRM_Materialize;
	rsinfo->setResult = tupstore;
	rsinfo->setDesc = tupdesc;

	MemoryContextSwitchTo(oldcontext);

	/*
	 * We'd like to load the query text file (if needed) while not holding any
	 * lock on pgss->lock.  In the worst case we'll have to do this again
	 * after we have the lock, but it's unlikely enough to make this a win
	 * despite occasional duplicated work.  We need to reload if anybody
	 * writes to the file (either a retail qtext_store(), or a garbage
	 * collection) between this point and where we've gotten shared lock.  If
	 * a qtext_store is actually in progress when we look, we might as well
	 * skip the speculative load entirely.
	 */
	if (showtext)
	{
		int			n_writers;

		/* Take the mutex so we can examine variables */
		{
			volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;

			SpinLockAcquire(&s->mutex);
			extent = s->extent;
			n_writers = s->n_writers;
			gc_count = s->gc_count;
			SpinLockRelease(&s->mutex);
		}

		/* No point in loading file now if there are active writers */
		if (n_writers == 0)
			qbuffer = qtext_load_file(&qbuffer_size);
	}

	/*
	 * Get shared lock, load or reload the query text file if we must, and
	 * iterate over the hashtable entries.
	 *
	 * With a large hash table, we might be holding the lock rather longer
	 * than one could wish.  However, this only blocks creation of new hash
	 * table entries, and the larger the hash table the less likely that is to
	 * be needed.  So we can hope this is okay.  Perhaps someday we'll decide
	 * we need to partition the hash table to limit the time spent holding any
	 * one lock.
	 */
	LWLockAcquire(pgss->lock, LW_SHARED);

	if (showtext)
	{
		/*
		 * Here it is safe to examine extent and gc_count without taking the
		 * mutex.  Note that although other processes might change
		 * pgss->extent just after we look at it, the strings they then write
		 * into the file cannot yet be referenced in the hashtable, so we
		 * don't care whether we see them or not.
		 *
		 * If qtext_load_file fails, we just press on; we'll return NULL for
		 * every query text.
		 */
		if (qbuffer == NULL ||
			pgss->extent != extent ||
			pgss->gc_count != gc_count)
		{
			if (qbuffer)
				free(qbuffer);
			qbuffer = qtext_load_file(&qbuffer_size);
		}
	}

	hash_seq_init(&hash_seq, pgss_hash);
	while ((entry = hash_seq_search(&hash_seq)) != NULL)
	{
		Datum		values[PG_STAT_STATEMENTS_COLS];
		bool		nulls[PG_STAT_STATEMENTS_COLS];
		int			i = 0;
		Counters	tmp;
		double		stddev;
		int64		queryid = entry->key.queryid;

		memset(values, 0, sizeof(values));
		memset(nulls, 0, sizeof(nulls));

		values[i++] = ObjectIdGetDatum(entry->key.userid);
		values[i++] = ObjectIdGetDatum(entry->key.dbid);

		if (is_allowed_role || entry->key.userid == userid)
		{
			/* queryid column exists only from API 1.2 on */
			if (api_version >= PGSS_V1_2)
				values[i++] = Int64GetDatumFast(queryid);

			if (showtext)
			{
				char	   *qstr = qtext_fetch(entry->query_offset,
											   entry->query_len,
											   qbuffer,
											   qbuffer_size);

				if (qstr)
				{
					char	   *enc;

					/* Convert the stored text to the server encoding */
					enc = pg_any_to_server(qstr,
										   entry->query_len,
										   entry->encoding);

					values[i++] = CStringGetTextDatum(enc);

					/* pg_any_to_server may return its input unchanged */
					if (enc != qstr)
						pfree(enc);
				}
				else
				{
					/* Just return a null if we fail to find the text */
					nulls[i++] = true;
				}
			}
			else
			{
				/* Query text not requested */
				nulls[i++] = true;
			}
		}
		else
		{
			/* Don't show queryid */
			if (api_version >= PGSS_V1_2)
				nulls[i++] = true;

			/*
			 * Don't show query text, but hint as to the reason for not doing
			 * so if it was requested
			 */
			if (showtext)
				values[i++] = CStringGetTextDatum("<insufficient privilege>");
			else
				nulls[i++] = true;
		}

		/* copy counters to a local variable to keep locking time short */
		{
			volatile pgssEntry *e = (volatile pgssEntry *) entry;

			SpinLockAcquire(&e->mutex);
			tmp = e->counters;
			SpinLockRelease(&e->mutex);
		}

		/* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
		if (IS_STICKY(tmp))
			continue;

		/* Note that we rely on PGSS_PLAN being 0 and PGSS_EXEC being 1. */
		for (int kind = 0; kind < PGSS_NUMKIND; kind++)
		{
			/* Plan-time columns exist only from API 1.8 on */
			if (kind == PGSS_EXEC || api_version >= PGSS_V1_8)
			{
				values[i++] = Int64GetDatumFast(tmp.calls[kind]);
				values[i++] = Float8GetDatumFast(tmp.total_time[kind]);
			}

			if ((kind == PGSS_EXEC && api_version >= PGSS_V1_3) ||
				api_version >= PGSS_V1_8)
			{
				values[i++] = Float8GetDatumFast(tmp.min_time[kind]);
				values[i++] = Float8GetDatumFast(tmp.max_time[kind]);
				values[i++] = Float8GetDatumFast(tmp.mean_time[kind]);

				/*
				 * Note we are calculating the population variance here, not
				 * the sample variance, as we have data for the whole
				 * population, so Bessel's correction is not used, and we
				 * don't divide by tmp.calls - 1.
				 */
				if (tmp.calls[kind] > 1)
					stddev = sqrt(tmp.sum_var_time[kind] / tmp.calls[kind]);
				else
					stddev = 0.0;
				values[i++] = Float8GetDatumFast(stddev);
			}
		}
		values[i++] = Int64GetDatumFast(tmp.rows);
		values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
		values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
		if (api_version >= PGSS_V1_1)
			values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied);
		values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
		values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
		values[i++] = Int64GetDatumFast(tmp.local_blks_read);
		if (api_version >= PGSS_V1_1)
			values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied);
		values[i++] = Int64GetDatumFast(tmp.local_blks_written);
		values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
		values[i++] = Int64GetDatumFast(tmp.temp_blks_written);
		if (api_version >= PGSS_V1_1)
		{
			values[i++] = Float8GetDatumFast(tmp.blk_read_time);
			values[i++] = Float8GetDatumFast(tmp.blk_write_time);
		}
		if (api_version >= PGSS_V1_8)
		{
			char		buf[256];
			Datum		wal_bytes;

			values[i++] = Int64GetDatumFast(tmp.wal_records);
			values[i++] = Int64GetDatumFast(tmp.wal_fpi);

			/* wal_bytes is uint64; go through text to build a numeric */
			snprintf(buf, sizeof buf, UINT64_FORMAT, tmp.wal_bytes);

			/* Convert to numeric. */
			wal_bytes = DirectFunctionCall3(numeric_in,
											CStringGetDatum(buf),
											ObjectIdGetDatum(0),
											Int32GetDatum(-1));
			values[i++] = wal_bytes;
		}

		Assert(i == (api_version == PGSS_V1_0 ? PG_STAT_STATEMENTS_COLS_V1_0 :
					 api_version == PGSS_V1_1 ? PG_STAT_STATEMENTS_COLS_V1_1 :
					 api_version == PGSS_V1_2 ? PG_STAT_STATEMENTS_COLS_V1_2 :
					 api_version == PGSS_V1_3 ? PG_STAT_STATEMENTS_COLS_V1_3 :
					 api_version == PGSS_V1_8 ? PG_STAT_STATEMENTS_COLS_V1_8 :
					 -1 /* fail if you forget to update this assert */ ));

		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
	}

	/* clean up and return the tuplestore */
	LWLockRelease(pgss->lock);

	if (qbuffer)
		free(qbuffer);

	tuplestore_donestoring(tupstore);
}
1857
1858 /*
1859 * Estimate shared memory space needed.
1860 */
1861 static Size
pgss_memsize(void)1862 pgss_memsize(void)
1863 {
1864 Size size;
1865
1866 size = MAXALIGN(sizeof(pgssSharedState));
1867 size = add_size(size, hash_estimate_size(pgss_max, sizeof(pgssEntry)));
1868
1869 return size;
1870 }
1871
/*
 * Allocate a new hashtable entry.
 * caller must hold an exclusive lock on pgss->lock
 *
 * "query" need not be null-terminated; we rely on query_len instead
 *
 * If "sticky" is true, make the new entry artificially sticky so that it will
 * probably still be there when the query finishes execution.  We do this by
 * giving it a median usage value rather than the normal value.  (Strictly
 * speaking, query strings are normalized on a best effort basis, though it
 * would be difficult to demonstrate this even under artificial conditions.)
 *
 * Note: despite needing exclusive lock, it's not an error for the target
 * entry to already exist.  This is because pgss_store releases and
 * reacquires lock after failing to find a match; so someone else could
 * have made the entry while we waited to get exclusive lock.
 */
static pgssEntry *
entry_alloc(pgssHashKey *key, Size query_offset, int query_len, int encoding,
			bool sticky)
{
	pgssEntry  *entry;
	bool		found;

	/* Make space if needed; each pass evicts the least-used entries */
	while (hash_get_num_entries(pgss_hash) >= pgss_max)
		entry_dealloc();

	/* Find or create an entry with desired hash code */
	entry = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER, &found);

	if (!found)
	{
		/* New entry, initialize it */

		/* reset the statistics */
		memset(&entry->counters, 0, sizeof(Counters));
		/* set the appropriate initial usage count */
		entry->counters.usage = sticky ? pgss->cur_median_usage : USAGE_INIT;
		/* re-initialize the mutex each time ... we assume no one using it */
		SpinLockInit(&entry->mutex);
		/* ... and don't forget the query text metadata */
		Assert(query_len >= 0);
		entry->query_offset = query_offset;
		entry->query_len = query_len;
		entry->encoding = encoding;
	}

	return entry;
}
1922
1923 /*
1924 * qsort comparator for sorting into increasing usage order
1925 */
1926 static int
entry_cmp(const void * lhs,const void * rhs)1927 entry_cmp(const void *lhs, const void *rhs)
1928 {
1929 double l_usage = (*(pgssEntry *const *) lhs)->counters.usage;
1930 double r_usage = (*(pgssEntry *const *) rhs)->counters.usage;
1931
1932 if (l_usage < r_usage)
1933 return -1;
1934 else if (l_usage > r_usage)
1935 return +1;
1936 else
1937 return 0;
1938 }
1939
1940 /*
1941 * Deallocate least-used entries.
1942 *
1943 * Caller must hold an exclusive lock on pgss->lock.
1944 */
1945 static void
entry_dealloc(void)1946 entry_dealloc(void)
1947 {
1948 HASH_SEQ_STATUS hash_seq;
1949 pgssEntry **entries;
1950 pgssEntry *entry;
1951 int nvictims;
1952 int i;
1953 Size tottextlen;
1954 int nvalidtexts;
1955
1956 /*
1957 * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
1958 * While we're scanning the table, apply the decay factor to the usage
1959 * values, and update the mean query length.
1960 *
1961 * Note that the mean query length is almost immediately obsolete, since
1962 * we compute it before not after discarding the least-used entries.
1963 * Hopefully, that doesn't affect the mean too much; it doesn't seem worth
1964 * making two passes to get a more current result. Likewise, the new
1965 * cur_median_usage includes the entries we're about to zap.
1966 */
1967
1968 entries = palloc(hash_get_num_entries(pgss_hash) * sizeof(pgssEntry *));
1969
1970 i = 0;
1971 tottextlen = 0;
1972 nvalidtexts = 0;
1973
1974 hash_seq_init(&hash_seq, pgss_hash);
1975 while ((entry = hash_seq_search(&hash_seq)) != NULL)
1976 {
1977 entries[i++] = entry;
1978 /* "Sticky" entries get a different usage decay rate. */
1979 if (IS_STICKY(entry->counters))
1980 entry->counters.usage *= STICKY_DECREASE_FACTOR;
1981 else
1982 entry->counters.usage *= USAGE_DECREASE_FACTOR;
1983 /* In the mean length computation, ignore dropped texts. */
1984 if (entry->query_len >= 0)
1985 {
1986 tottextlen += entry->query_len + 1;
1987 nvalidtexts++;
1988 }
1989 }
1990
1991 /* Sort into increasing order by usage */
1992 qsort(entries, i, sizeof(pgssEntry *), entry_cmp);
1993
1994 /* Record the (approximate) median usage */
1995 if (i > 0)
1996 pgss->cur_median_usage = entries[i / 2]->counters.usage;
1997 /* Record the mean query length */
1998 if (nvalidtexts > 0)
1999 pgss->mean_query_len = tottextlen / nvalidtexts;
2000 else
2001 pgss->mean_query_len = ASSUMED_LENGTH_INIT;
2002
2003 /* Now zap an appropriate fraction of lowest-usage entries */
2004 nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
2005 nvictims = Min(nvictims, i);
2006
2007 for (i = 0; i < nvictims; i++)
2008 {
2009 hash_search(pgss_hash, &entries[i]->key, HASH_REMOVE, NULL);
2010 }
2011
2012 pfree(entries);
2013 }
2014
2015 /*
2016 * Given a query string (not necessarily null-terminated), allocate a new
2017 * entry in the external query text file and store the string there.
2018 *
2019 * If successful, returns true, and stores the new entry's offset in the file
2020 * into *query_offset. Also, if gc_count isn't NULL, *gc_count is set to the
2021 * number of garbage collections that have occurred so far.
2022 *
2023 * On failure, returns false.
2024 *
2025 * At least a shared lock on pgss->lock must be held by the caller, so as
2026 * to prevent a concurrent garbage collection. Share-lock-holding callers
2027 * should pass a gc_count pointer to obtain the number of garbage collections,
2028 * so that they can recheck the count after obtaining exclusive lock to
2029 * detect whether a garbage collection occurred (and removed this entry).
2030 */
2031 static bool
qtext_store(const char * query,int query_len,Size * query_offset,int * gc_count)2032 qtext_store(const char *query, int query_len,
2033 Size *query_offset, int *gc_count)
2034 {
2035 Size off;
2036 int fd;
2037
2038 /*
2039 * We use a spinlock to protect extent/n_writers/gc_count, so that
2040 * multiple processes may execute this function concurrently.
2041 */
2042 {
2043 volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
2044
2045 SpinLockAcquire(&s->mutex);
2046 off = s->extent;
2047 s->extent += query_len + 1;
2048 s->n_writers++;
2049 if (gc_count)
2050 *gc_count = s->gc_count;
2051 SpinLockRelease(&s->mutex);
2052 }
2053
2054 *query_offset = off;
2055
2056 /* Now write the data into the successfully-reserved part of the file */
2057 fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
2058 if (fd < 0)
2059 goto error;
2060
2061 if (pg_pwrite(fd, query, query_len, off) != query_len)
2062 goto error;
2063 if (pg_pwrite(fd, "\0", 1, off + query_len) != 1)
2064 goto error;
2065
2066 CloseTransientFile(fd);
2067
2068 /* Mark our write complete */
2069 {
2070 volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
2071
2072 SpinLockAcquire(&s->mutex);
2073 s->n_writers--;
2074 SpinLockRelease(&s->mutex);
2075 }
2076
2077 return true;
2078
2079 error:
2080 ereport(LOG,
2081 (errcode_for_file_access(),
2082 errmsg("could not write file \"%s\": %m",
2083 PGSS_TEXT_FILE)));
2084
2085 if (fd >= 0)
2086 CloseTransientFile(fd);
2087
2088 /* Mark our write complete */
2089 {
2090 volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
2091
2092 SpinLockAcquire(&s->mutex);
2093 s->n_writers--;
2094 SpinLockRelease(&s->mutex);
2095 }
2096
2097 return false;
2098 }
2099
2100 /*
2101 * Read the external query text file into a malloc'd buffer.
2102 *
2103 * Returns NULL (without throwing an error) if unable to read, eg
2104 * file not there or insufficient memory.
2105 *
2106 * On success, the buffer size is also returned into *buffer_size.
2107 *
2108 * This can be called without any lock on pgss->lock, but in that case
2109 * the caller is responsible for verifying that the result is sane.
2110 */
2111 static char *
qtext_load_file(Size * buffer_size)2112 qtext_load_file(Size *buffer_size)
2113 {
2114 char *buf;
2115 int fd;
2116 struct stat stat;
2117 Size nread;
2118
2119 fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDONLY | PG_BINARY);
2120 if (fd < 0)
2121 {
2122 if (errno != ENOENT)
2123 ereport(LOG,
2124 (errcode_for_file_access(),
2125 errmsg("could not read file \"%s\": %m",
2126 PGSS_TEXT_FILE)));
2127 return NULL;
2128 }
2129
2130 /* Get file length */
2131 if (fstat(fd, &stat))
2132 {
2133 ereport(LOG,
2134 (errcode_for_file_access(),
2135 errmsg("could not stat file \"%s\": %m",
2136 PGSS_TEXT_FILE)));
2137 CloseTransientFile(fd);
2138 return NULL;
2139 }
2140
2141 /* Allocate buffer; beware that off_t might be wider than size_t */
2142 if (stat.st_size <= MaxAllocHugeSize)
2143 buf = (char *) malloc(stat.st_size);
2144 else
2145 buf = NULL;
2146 if (buf == NULL)
2147 {
2148 ereport(LOG,
2149 (errcode(ERRCODE_OUT_OF_MEMORY),
2150 errmsg("out of memory"),
2151 errdetail("Could not allocate enough memory to read file \"%s\".",
2152 PGSS_TEXT_FILE)));
2153 CloseTransientFile(fd);
2154 return NULL;
2155 }
2156
2157 /*
2158 * OK, slurp in the file. Windows fails if we try to read more than
2159 * INT_MAX bytes at once, and other platforms might not like that either,
2160 * so read a very large file in 1GB segments.
2161 */
2162 nread = 0;
2163 while (nread < stat.st_size)
2164 {
2165 int toread = Min(1024 * 1024 * 1024, stat.st_size - nread);
2166
2167 /*
2168 * If we get a short read and errno doesn't get set, the reason is
2169 * probably that garbage collection truncated the file since we did
2170 * the fstat(), so we don't log a complaint --- but we don't return
2171 * the data, either, since it's most likely corrupt due to concurrent
2172 * writes from garbage collection.
2173 */
2174 errno = 0;
2175 if (read(fd, buf + nread, toread) != toread)
2176 {
2177 if (errno)
2178 ereport(LOG,
2179 (errcode_for_file_access(),
2180 errmsg("could not read file \"%s\": %m",
2181 PGSS_TEXT_FILE)));
2182 free(buf);
2183 CloseTransientFile(fd);
2184 return NULL;
2185 }
2186 nread += toread;
2187 }
2188
2189 if (CloseTransientFile(fd) != 0)
2190 ereport(LOG,
2191 (errcode_for_file_access(),
2192 errmsg("could not close file \"%s\": %m", PGSS_TEXT_FILE)));
2193
2194 *buffer_size = nread;
2195 return buf;
2196 }
2197
2198 /*
2199 * Locate a query text in the file image previously read by qtext_load_file().
2200 *
2201 * We validate the given offset/length, and return NULL if bogus. Otherwise,
2202 * the result points to a null-terminated string within the buffer.
2203 */
2204 static char *
qtext_fetch(Size query_offset,int query_len,char * buffer,Size buffer_size)2205 qtext_fetch(Size query_offset, int query_len,
2206 char *buffer, Size buffer_size)
2207 {
2208 /* File read failed? */
2209 if (buffer == NULL)
2210 return NULL;
2211 /* Bogus offset/length? */
2212 if (query_len < 0 ||
2213 query_offset + query_len >= buffer_size)
2214 return NULL;
2215 /* As a further sanity check, make sure there's a trailing null */
2216 if (buffer[query_offset + query_len] != '\0')
2217 return NULL;
2218 /* Looks OK */
2219 return buffer + query_offset;
2220 }
2221
2222 /*
2223 * Do we need to garbage-collect the external query text file?
2224 *
2225 * Caller should hold at least a shared lock on pgss->lock.
2226 */
2227 static bool
need_gc_qtexts(void)2228 need_gc_qtexts(void)
2229 {
2230 Size extent;
2231
2232 /* Read shared extent pointer */
2233 {
2234 volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
2235
2236 SpinLockAcquire(&s->mutex);
2237 extent = s->extent;
2238 SpinLockRelease(&s->mutex);
2239 }
2240
2241 /* Don't proceed if file does not exceed 512 bytes per possible entry */
2242 if (extent < 512 * pgss_max)
2243 return false;
2244
2245 /*
2246 * Don't proceed if file is less than about 50% bloat. Nothing can or
2247 * should be done in the event of unusually large query texts accounting
2248 * for file's large size. We go to the trouble of maintaining the mean
2249 * query length in order to prevent garbage collection from thrashing
2250 * uselessly.
2251 */
2252 if (extent < pgss->mean_query_len * pgss_max * 2)
2253 return false;
2254
2255 return true;
2256 }
2257
2258 /*
2259 * Garbage-collect orphaned query texts in external file.
2260 *
2261 * This won't be called often in the typical case, since it's likely that
2262 * there won't be too much churn, and besides, a similar compaction process
2263 * occurs when serializing to disk at shutdown or as part of resetting.
2264 * Despite this, it seems prudent to plan for the edge case where the file
2265 * becomes unreasonably large, with no other method of compaction likely to
2266 * occur in the foreseeable future.
2267 *
2268 * The caller must hold an exclusive lock on pgss->lock.
2269 *
2270 * At the first sign of trouble we unlink the query text file to get a clean
2271 * slate (although existing statistics are retained), rather than risk
2272 * thrashing by allowing the same problem case to recur indefinitely.
2273 */
2274 static void
gc_qtexts(void)2275 gc_qtexts(void)
2276 {
2277 char *qbuffer;
2278 Size qbuffer_size;
2279 FILE *qfile = NULL;
2280 HASH_SEQ_STATUS hash_seq;
2281 pgssEntry *entry;
2282 Size extent;
2283 int nentries;
2284
2285 /*
2286 * When called from pgss_store, some other session might have proceeded
2287 * with garbage collection in the no-lock-held interim of lock strength
2288 * escalation. Check once more that this is actually necessary.
2289 */
2290 if (!need_gc_qtexts())
2291 return;
2292
2293 /*
2294 * Load the old texts file. If we fail (out of memory, for instance),
2295 * invalidate query texts. Hopefully this is rare. It might seem better
2296 * to leave things alone on an OOM failure, but the problem is that the
2297 * file is only going to get bigger; hoping for a future non-OOM result is
2298 * risky and can easily lead to complete denial of service.
2299 */
2300 qbuffer = qtext_load_file(&qbuffer_size);
2301 if (qbuffer == NULL)
2302 goto gc_fail;
2303
2304 /*
2305 * We overwrite the query texts file in place, so as to reduce the risk of
2306 * an out-of-disk-space failure. Since the file is guaranteed not to get
2307 * larger, this should always work on traditional filesystems; though we
2308 * could still lose on copy-on-write filesystems.
2309 */
2310 qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
2311 if (qfile == NULL)
2312 {
2313 ereport(LOG,
2314 (errcode_for_file_access(),
2315 errmsg("could not write file \"%s\": %m",
2316 PGSS_TEXT_FILE)));
2317 goto gc_fail;
2318 }
2319
2320 extent = 0;
2321 nentries = 0;
2322
2323 hash_seq_init(&hash_seq, pgss_hash);
2324 while ((entry = hash_seq_search(&hash_seq)) != NULL)
2325 {
2326 int query_len = entry->query_len;
2327 char *qry = qtext_fetch(entry->query_offset,
2328 query_len,
2329 qbuffer,
2330 qbuffer_size);
2331
2332 if (qry == NULL)
2333 {
2334 /* Trouble ... drop the text */
2335 entry->query_offset = 0;
2336 entry->query_len = -1;
2337 /* entry will not be counted in mean query length computation */
2338 continue;
2339 }
2340
2341 if (fwrite(qry, 1, query_len + 1, qfile) != query_len + 1)
2342 {
2343 ereport(LOG,
2344 (errcode_for_file_access(),
2345 errmsg("could not write file \"%s\": %m",
2346 PGSS_TEXT_FILE)));
2347 hash_seq_term(&hash_seq);
2348 goto gc_fail;
2349 }
2350
2351 entry->query_offset = extent;
2352 extent += query_len + 1;
2353 nentries++;
2354 }
2355
2356 /*
2357 * Truncate away any now-unused space. If this fails for some odd reason,
2358 * we log it, but there's no need to fail.
2359 */
2360 if (ftruncate(fileno(qfile), extent) != 0)
2361 ereport(LOG,
2362 (errcode_for_file_access(),
2363 errmsg("could not truncate file \"%s\": %m",
2364 PGSS_TEXT_FILE)));
2365
2366 if (FreeFile(qfile))
2367 {
2368 ereport(LOG,
2369 (errcode_for_file_access(),
2370 errmsg("could not write file \"%s\": %m",
2371 PGSS_TEXT_FILE)));
2372 qfile = NULL;
2373 goto gc_fail;
2374 }
2375
2376 elog(DEBUG1, "pgss gc of queries file shrunk size from %zu to %zu",
2377 pgss->extent, extent);
2378
2379 /* Reset the shared extent pointer */
2380 pgss->extent = extent;
2381
2382 /*
2383 * Also update the mean query length, to be sure that need_gc_qtexts()
2384 * won't still think we have a problem.
2385 */
2386 if (nentries > 0)
2387 pgss->mean_query_len = extent / nentries;
2388 else
2389 pgss->mean_query_len = ASSUMED_LENGTH_INIT;
2390
2391 free(qbuffer);
2392
2393 /*
2394 * OK, count a garbage collection cycle. (Note: even though we have
2395 * exclusive lock on pgss->lock, we must take pgss->mutex for this, since
2396 * other processes may examine gc_count while holding only the mutex.
2397 * Also, we have to advance the count *after* we've rewritten the file,
2398 * else other processes might not realize they read a stale file.)
2399 */
2400 record_gc_qtexts();
2401
2402 return;
2403
2404 gc_fail:
2405 /* clean up resources */
2406 if (qfile)
2407 FreeFile(qfile);
2408 if (qbuffer)
2409 free(qbuffer);
2410
2411 /*
2412 * Since the contents of the external file are now uncertain, mark all
2413 * hashtable entries as having invalid texts.
2414 */
2415 hash_seq_init(&hash_seq, pgss_hash);
2416 while ((entry = hash_seq_search(&hash_seq)) != NULL)
2417 {
2418 entry->query_offset = 0;
2419 entry->query_len = -1;
2420 }
2421
2422 /*
2423 * Destroy the query text file and create a new, empty one
2424 */
2425 (void) unlink(PGSS_TEXT_FILE);
2426 qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
2427 if (qfile == NULL)
2428 ereport(LOG,
2429 (errcode_for_file_access(),
2430 errmsg("could not recreate file \"%s\": %m",
2431 PGSS_TEXT_FILE)));
2432 else
2433 FreeFile(qfile);
2434
2435 /* Reset the shared extent pointer */
2436 pgss->extent = 0;
2437
2438 /* Reset mean_query_len to match the new state */
2439 pgss->mean_query_len = ASSUMED_LENGTH_INIT;
2440
2441 /*
2442 * Bump the GC count even though we failed.
2443 *
2444 * This is needed to make concurrent readers of file without any lock on
2445 * pgss->lock notice existence of new version of file. Once readers
2446 * subsequently observe a change in GC count with pgss->lock held, that
2447 * forces a safe reopen of file. Writers also require that we bump here,
2448 * of course. (As required by locking protocol, readers and writers don't
2449 * trust earlier file contents until gc_count is found unchanged after
2450 * pgss->lock acquired in shared or exclusive mode respectively.)
2451 */
2452 record_gc_qtexts();
2453 }
2454
2455 /*
2456 * Release entries corresponding to parameters passed.
2457 */
2458 static void
entry_reset(Oid userid,Oid dbid,uint64 queryid)2459 entry_reset(Oid userid, Oid dbid, uint64 queryid)
2460 {
2461 HASH_SEQ_STATUS hash_seq;
2462 pgssEntry *entry;
2463 FILE *qfile;
2464 long num_entries;
2465 long num_remove = 0;
2466 pgssHashKey key;
2467
2468 if (!pgss || !pgss_hash)
2469 ereport(ERROR,
2470 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2471 errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
2472
2473 LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
2474 num_entries = hash_get_num_entries(pgss_hash);
2475
2476 if (userid != 0 && dbid != 0 && queryid != UINT64CONST(0))
2477 {
2478 /* If all the parameters are available, use the fast path. */
2479 key.userid = userid;
2480 key.dbid = dbid;
2481 key.queryid = queryid;
2482
2483 /* Remove the key if exists */
2484 entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_REMOVE, NULL);
2485 if (entry) /* found */
2486 num_remove++;
2487 }
2488 else if (userid != 0 || dbid != 0 || queryid != UINT64CONST(0))
2489 {
2490 /* Remove entries corresponding to valid parameters. */
2491 hash_seq_init(&hash_seq, pgss_hash);
2492 while ((entry = hash_seq_search(&hash_seq)) != NULL)
2493 {
2494 if ((!userid || entry->key.userid == userid) &&
2495 (!dbid || entry->key.dbid == dbid) &&
2496 (!queryid || entry->key.queryid == queryid))
2497 {
2498 hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
2499 num_remove++;
2500 }
2501 }
2502 }
2503 else
2504 {
2505 /* Remove all entries. */
2506 hash_seq_init(&hash_seq, pgss_hash);
2507 while ((entry = hash_seq_search(&hash_seq)) != NULL)
2508 {
2509 hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
2510 num_remove++;
2511 }
2512 }
2513
2514 /* All entries are removed? */
2515 if (num_entries != num_remove)
2516 goto release_lock;
2517
2518 /*
2519 * Write new empty query file, perhaps even creating a new one to recover
2520 * if the file was missing.
2521 */
2522 qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
2523 if (qfile == NULL)
2524 {
2525 ereport(LOG,
2526 (errcode_for_file_access(),
2527 errmsg("could not create file \"%s\": %m",
2528 PGSS_TEXT_FILE)));
2529 goto done;
2530 }
2531
2532 /* If ftruncate fails, log it, but it's not a fatal problem */
2533 if (ftruncate(fileno(qfile), 0) != 0)
2534 ereport(LOG,
2535 (errcode_for_file_access(),
2536 errmsg("could not truncate file \"%s\": %m",
2537 PGSS_TEXT_FILE)));
2538
2539 FreeFile(qfile);
2540
2541 done:
2542 pgss->extent = 0;
2543 /* This counts as a query text garbage collection for our purposes */
2544 record_gc_qtexts();
2545
2546 release_lock:
2547 LWLockRelease(pgss->lock);
2548 }
2549
2550 /*
2551 * AppendJumble: Append a value that is substantive in a given query to
2552 * the current jumble.
2553 */
2554 static void
AppendJumble(pgssJumbleState * jstate,const unsigned char * item,Size size)2555 AppendJumble(pgssJumbleState *jstate, const unsigned char *item, Size size)
2556 {
2557 unsigned char *jumble = jstate->jumble;
2558 Size jumble_len = jstate->jumble_len;
2559
2560 /*
2561 * Whenever the jumble buffer is full, we hash the current contents and
2562 * reset the buffer to contain just that hash value, thus relying on the
2563 * hash to summarize everything so far.
2564 */
2565 while (size > 0)
2566 {
2567 Size part_size;
2568
2569 if (jumble_len >= JUMBLE_SIZE)
2570 {
2571 uint64 start_hash;
2572
2573 start_hash = DatumGetUInt64(hash_any_extended(jumble,
2574 JUMBLE_SIZE, 0));
2575 memcpy(jumble, &start_hash, sizeof(start_hash));
2576 jumble_len = sizeof(start_hash);
2577 }
2578 part_size = Min(size, JUMBLE_SIZE - jumble_len);
2579 memcpy(jumble + jumble_len, item, part_size);
2580 jumble_len += part_size;
2581 item += part_size;
2582 size -= part_size;
2583 }
2584 jstate->jumble_len = jumble_len;
2585 }
2586
/*
 * Wrappers around AppendJumble to encapsulate details of serialization
 * of individual local variable elements.
 */
/* Append a scalar field's raw bytes (a "jstate" variable must be in scope) */
#define APP_JUMB(item) \
    AppendJumble(jstate, (const unsigned char *) &(item), sizeof(item))
/* Append a NUL-terminated string, including its terminator byte */
#define APP_JUMB_STRING(str) \
    AppendJumble(jstate, (const unsigned char *) (str), strlen(str) + 1)
2595
2596 /*
2597 * JumbleQuery: Selectively serialize the query tree, appending significant
2598 * data to the "query jumble" while ignoring nonsignificant data.
2599 *
2600 * Rule of thumb for what to include is that we should ignore anything not
2601 * semantically significant (such as alias names) as well as anything that can
2602 * be deduced from child nodes (else we'd just be double-hashing that piece
2603 * of information).
2604 */
2605 static void
JumbleQuery(pgssJumbleState * jstate,Query * query)2606 JumbleQuery(pgssJumbleState *jstate, Query *query)
2607 {
2608 Assert(IsA(query, Query));
2609 Assert(query->utilityStmt == NULL);
2610
2611 APP_JUMB(query->commandType);
2612 /* resultRelation is usually predictable from commandType */
2613 JumbleExpr(jstate, (Node *) query->cteList);
2614 JumbleRangeTable(jstate, query->rtable);
2615 JumbleExpr(jstate, (Node *) query->jointree);
2616 JumbleExpr(jstate, (Node *) query->targetList);
2617 JumbleExpr(jstate, (Node *) query->onConflict);
2618 JumbleExpr(jstate, (Node *) query->returningList);
2619 JumbleExpr(jstate, (Node *) query->groupClause);
2620 JumbleExpr(jstate, (Node *) query->groupingSets);
2621 JumbleExpr(jstate, query->havingQual);
2622 JumbleExpr(jstate, (Node *) query->windowClause);
2623 JumbleExpr(jstate, (Node *) query->distinctClause);
2624 JumbleExpr(jstate, (Node *) query->sortClause);
2625 JumbleExpr(jstate, query->limitOffset);
2626 JumbleExpr(jstate, query->limitCount);
2627 JumbleRowMarks(jstate, query->rowMarks);
2628 JumbleExpr(jstate, query->setOperations);
2629 }
2630
2631 /*
2632 * Jumble a range table
2633 */
2634 static void
JumbleRangeTable(pgssJumbleState * jstate,List * rtable)2635 JumbleRangeTable(pgssJumbleState *jstate, List *rtable)
2636 {
2637 ListCell *lc;
2638
2639 foreach(lc, rtable)
2640 {
2641 RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc);
2642
2643 APP_JUMB(rte->rtekind);
2644 switch (rte->rtekind)
2645 {
2646 case RTE_RELATION:
2647 APP_JUMB(rte->relid);
2648 JumbleExpr(jstate, (Node *) rte->tablesample);
2649 break;
2650 case RTE_SUBQUERY:
2651 JumbleQuery(jstate, rte->subquery);
2652 break;
2653 case RTE_JOIN:
2654 APP_JUMB(rte->jointype);
2655 break;
2656 case RTE_FUNCTION:
2657 JumbleExpr(jstate, (Node *) rte->functions);
2658 break;
2659 case RTE_TABLEFUNC:
2660 JumbleExpr(jstate, (Node *) rte->tablefunc);
2661 break;
2662 case RTE_VALUES:
2663 JumbleExpr(jstate, (Node *) rte->values_lists);
2664 break;
2665 case RTE_CTE:
2666
2667 /*
2668 * Depending on the CTE name here isn't ideal, but it's the
2669 * only info we have to identify the referenced WITH item.
2670 */
2671 APP_JUMB_STRING(rte->ctename);
2672 APP_JUMB(rte->ctelevelsup);
2673 break;
2674 case RTE_NAMEDTUPLESTORE:
2675 APP_JUMB_STRING(rte->enrname);
2676 break;
2677 case RTE_RESULT:
2678 break;
2679 default:
2680 elog(ERROR, "unrecognized RTE kind: %d", (int) rte->rtekind);
2681 break;
2682 }
2683 }
2684 }
2685
2686 /*
2687 * Jumble a rowMarks list
2688 */
2689 static void
JumbleRowMarks(pgssJumbleState * jstate,List * rowMarks)2690 JumbleRowMarks(pgssJumbleState *jstate, List *rowMarks)
2691 {
2692 ListCell *lc;
2693
2694 foreach(lc, rowMarks)
2695 {
2696 RowMarkClause *rowmark = lfirst_node(RowMarkClause, lc);
2697
2698 if (!rowmark->pushedDown)
2699 {
2700 APP_JUMB(rowmark->rti);
2701 APP_JUMB(rowmark->strength);
2702 APP_JUMB(rowmark->waitPolicy);
2703 }
2704 }
2705 }
2706
2707 /*
2708 * Jumble an expression tree
2709 *
2710 * In general this function should handle all the same node types that
2711 * expression_tree_walker() does, and therefore it's coded to be as parallel
2712 * to that function as possible. However, since we are only invoked on
2713 * queries immediately post-parse-analysis, we need not handle node types
2714 * that only appear in planning.
2715 *
2716 * Note: the reason we don't simply use expression_tree_walker() is that the
2717 * point of that function is to support tree walkers that don't care about
2718 * most tree node types, but here we care about all types. We should complain
2719 * about any unrecognized node type.
2720 */
2721 static void
JumbleExpr(pgssJumbleState * jstate,Node * node)2722 JumbleExpr(pgssJumbleState *jstate, Node *node)
2723 {
2724 ListCell *temp;
2725
2726 if (node == NULL)
2727 return;
2728
2729 /* Guard against stack overflow due to overly complex expressions */
2730 check_stack_depth();
2731
2732 /*
2733 * We always emit the node's NodeTag, then any additional fields that are
2734 * considered significant, and then we recurse to any child nodes.
2735 */
2736 APP_JUMB(node->type);
2737
2738 switch (nodeTag(node))
2739 {
2740 case T_Var:
2741 {
2742 Var *var = (Var *) node;
2743
2744 APP_JUMB(var->varno);
2745 APP_JUMB(var->varattno);
2746 APP_JUMB(var->varlevelsup);
2747 }
2748 break;
2749 case T_Const:
2750 {
2751 Const *c = (Const *) node;
2752
2753 /* We jumble only the constant's type, not its value */
2754 APP_JUMB(c->consttype);
2755 /* Also, record its parse location for query normalization */
2756 RecordConstLocation(jstate, c->location);
2757 }
2758 break;
2759 case T_Param:
2760 {
2761 Param *p = (Param *) node;
2762
2763 APP_JUMB(p->paramkind);
2764 APP_JUMB(p->paramid);
2765 APP_JUMB(p->paramtype);
2766 /* Also, track the highest external Param id */
2767 if (p->paramkind == PARAM_EXTERN &&
2768 p->paramid > jstate->highest_extern_param_id)
2769 jstate->highest_extern_param_id = p->paramid;
2770 }
2771 break;
2772 case T_Aggref:
2773 {
2774 Aggref *expr = (Aggref *) node;
2775
2776 APP_JUMB(expr->aggfnoid);
2777 JumbleExpr(jstate, (Node *) expr->aggdirectargs);
2778 JumbleExpr(jstate, (Node *) expr->args);
2779 JumbleExpr(jstate, (Node *) expr->aggorder);
2780 JumbleExpr(jstate, (Node *) expr->aggdistinct);
2781 JumbleExpr(jstate, (Node *) expr->aggfilter);
2782 }
2783 break;
2784 case T_GroupingFunc:
2785 {
2786 GroupingFunc *grpnode = (GroupingFunc *) node;
2787
2788 JumbleExpr(jstate, (Node *) grpnode->refs);
2789 }
2790 break;
2791 case T_WindowFunc:
2792 {
2793 WindowFunc *expr = (WindowFunc *) node;
2794
2795 APP_JUMB(expr->winfnoid);
2796 APP_JUMB(expr->winref);
2797 JumbleExpr(jstate, (Node *) expr->args);
2798 JumbleExpr(jstate, (Node *) expr->aggfilter);
2799 }
2800 break;
2801 case T_SubscriptingRef:
2802 {
2803 SubscriptingRef *sbsref = (SubscriptingRef *) node;
2804
2805 JumbleExpr(jstate, (Node *) sbsref->refupperindexpr);
2806 JumbleExpr(jstate, (Node *) sbsref->reflowerindexpr);
2807 JumbleExpr(jstate, (Node *) sbsref->refexpr);
2808 JumbleExpr(jstate, (Node *) sbsref->refassgnexpr);
2809 }
2810 break;
2811 case T_FuncExpr:
2812 {
2813 FuncExpr *expr = (FuncExpr *) node;
2814
2815 APP_JUMB(expr->funcid);
2816 JumbleExpr(jstate, (Node *) expr->args);
2817 }
2818 break;
2819 case T_NamedArgExpr:
2820 {
2821 NamedArgExpr *nae = (NamedArgExpr *) node;
2822
2823 APP_JUMB(nae->argnumber);
2824 JumbleExpr(jstate, (Node *) nae->arg);
2825 }
2826 break;
2827 case T_OpExpr:
2828 case T_DistinctExpr: /* struct-equivalent to OpExpr */
2829 case T_NullIfExpr: /* struct-equivalent to OpExpr */
2830 {
2831 OpExpr *expr = (OpExpr *) node;
2832
2833 APP_JUMB(expr->opno);
2834 JumbleExpr(jstate, (Node *) expr->args);
2835 }
2836 break;
2837 case T_ScalarArrayOpExpr:
2838 {
2839 ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) node;
2840
2841 APP_JUMB(expr->opno);
2842 APP_JUMB(expr->useOr);
2843 JumbleExpr(jstate, (Node *) expr->args);
2844 }
2845 break;
2846 case T_BoolExpr:
2847 {
2848 BoolExpr *expr = (BoolExpr *) node;
2849
2850 APP_JUMB(expr->boolop);
2851 JumbleExpr(jstate, (Node *) expr->args);
2852 }
2853 break;
2854 case T_SubLink:
2855 {
2856 SubLink *sublink = (SubLink *) node;
2857
2858 APP_JUMB(sublink->subLinkType);
2859 APP_JUMB(sublink->subLinkId);
2860 JumbleExpr(jstate, (Node *) sublink->testexpr);
2861 JumbleQuery(jstate, castNode(Query, sublink->subselect));
2862 }
2863 break;
2864 case T_FieldSelect:
2865 {
2866 FieldSelect *fs = (FieldSelect *) node;
2867
2868 APP_JUMB(fs->fieldnum);
2869 JumbleExpr(jstate, (Node *) fs->arg);
2870 }
2871 break;
2872 case T_FieldStore:
2873 {
2874 FieldStore *fstore = (FieldStore *) node;
2875
2876 JumbleExpr(jstate, (Node *) fstore->arg);
2877 JumbleExpr(jstate, (Node *) fstore->newvals);
2878 }
2879 break;
2880 case T_RelabelType:
2881 {
2882 RelabelType *rt = (RelabelType *) node;
2883
2884 APP_JUMB(rt->resulttype);
2885 JumbleExpr(jstate, (Node *) rt->arg);
2886 }
2887 break;
2888 case T_CoerceViaIO:
2889 {
2890 CoerceViaIO *cio = (CoerceViaIO *) node;
2891
2892 APP_JUMB(cio->resulttype);
2893 JumbleExpr(jstate, (Node *) cio->arg);
2894 }
2895 break;
2896 case T_ArrayCoerceExpr:
2897 {
2898 ArrayCoerceExpr *acexpr = (ArrayCoerceExpr *) node;
2899
2900 APP_JUMB(acexpr->resulttype);
2901 JumbleExpr(jstate, (Node *) acexpr->arg);
2902 JumbleExpr(jstate, (Node *) acexpr->elemexpr);
2903 }
2904 break;
2905 case T_ConvertRowtypeExpr:
2906 {
2907 ConvertRowtypeExpr *crexpr = (ConvertRowtypeExpr *) node;
2908
2909 APP_JUMB(crexpr->resulttype);
2910 JumbleExpr(jstate, (Node *) crexpr->arg);
2911 }
2912 break;
2913 case T_CollateExpr:
2914 {
2915 CollateExpr *ce = (CollateExpr *) node;
2916
2917 APP_JUMB(ce->collOid);
2918 JumbleExpr(jstate, (Node *) ce->arg);
2919 }
2920 break;
2921 case T_CaseExpr:
2922 {
2923 CaseExpr *caseexpr = (CaseExpr *) node;
2924
2925 JumbleExpr(jstate, (Node *) caseexpr->arg);
2926 foreach(temp, caseexpr->args)
2927 {
2928 CaseWhen *when = lfirst_node(CaseWhen, temp);
2929
2930 JumbleExpr(jstate, (Node *) when->expr);
2931 JumbleExpr(jstate, (Node *) when->result);
2932 }
2933 JumbleExpr(jstate, (Node *) caseexpr->defresult);
2934 }
2935 break;
2936 case T_CaseTestExpr:
2937 {
2938 CaseTestExpr *ct = (CaseTestExpr *) node;
2939
2940 APP_JUMB(ct->typeId);
2941 }
2942 break;
2943 case T_ArrayExpr:
2944 JumbleExpr(jstate, (Node *) ((ArrayExpr *) node)->elements);
2945 break;
2946 case T_RowExpr:
2947 JumbleExpr(jstate, (Node *) ((RowExpr *) node)->args);
2948 break;
2949 case T_RowCompareExpr:
2950 {
2951 RowCompareExpr *rcexpr = (RowCompareExpr *) node;
2952
2953 APP_JUMB(rcexpr->rctype);
2954 JumbleExpr(jstate, (Node *) rcexpr->largs);
2955 JumbleExpr(jstate, (Node *) rcexpr->rargs);
2956 }
2957 break;
2958 case T_CoalesceExpr:
2959 JumbleExpr(jstate, (Node *) ((CoalesceExpr *) node)->args);
2960 break;
2961 case T_MinMaxExpr:
2962 {
2963 MinMaxExpr *mmexpr = (MinMaxExpr *) node;
2964
2965 APP_JUMB(mmexpr->op);
2966 JumbleExpr(jstate, (Node *) mmexpr->args);
2967 }
2968 break;
2969 case T_SQLValueFunction:
2970 {
2971 SQLValueFunction *svf = (SQLValueFunction *) node;
2972
2973 APP_JUMB(svf->op);
2974 /* type is fully determined by op */
2975 APP_JUMB(svf->typmod);
2976 }
2977 break;
2978 case T_XmlExpr:
2979 {
2980 XmlExpr *xexpr = (XmlExpr *) node;
2981
2982 APP_JUMB(xexpr->op);
2983 JumbleExpr(jstate, (Node *) xexpr->named_args);
2984 JumbleExpr(jstate, (Node *) xexpr->args);
2985 }
2986 break;
2987 case T_NullTest:
2988 {
2989 NullTest *nt = (NullTest *) node;
2990
2991 APP_JUMB(nt->nulltesttype);
2992 JumbleExpr(jstate, (Node *) nt->arg);
2993 }
2994 break;
2995 case T_BooleanTest:
2996 {
2997 BooleanTest *bt = (BooleanTest *) node;
2998
2999 APP_JUMB(bt->booltesttype);
3000 JumbleExpr(jstate, (Node *) bt->arg);
3001 }
3002 break;
3003 case T_CoerceToDomain:
3004 {
3005 CoerceToDomain *cd = (CoerceToDomain *) node;
3006
3007 APP_JUMB(cd->resulttype);
3008 JumbleExpr(jstate, (Node *) cd->arg);
3009 }
3010 break;
3011 case T_CoerceToDomainValue:
3012 {
3013 CoerceToDomainValue *cdv = (CoerceToDomainValue *) node;
3014
3015 APP_JUMB(cdv->typeId);
3016 }
3017 break;
3018 case T_SetToDefault:
3019 {
3020 SetToDefault *sd = (SetToDefault *) node;
3021
3022 APP_JUMB(sd->typeId);
3023 }
3024 break;
3025 case T_CurrentOfExpr:
3026 {
3027 CurrentOfExpr *ce = (CurrentOfExpr *) node;
3028
3029 APP_JUMB(ce->cvarno);
3030 if (ce->cursor_name)
3031 APP_JUMB_STRING(ce->cursor_name);
3032 APP_JUMB(ce->cursor_param);
3033 }
3034 break;
3035 case T_NextValueExpr:
3036 {
3037 NextValueExpr *nve = (NextValueExpr *) node;
3038
3039 APP_JUMB(nve->seqid);
3040 APP_JUMB(nve->typeId);
3041 }
3042 break;
3043 case T_InferenceElem:
3044 {
3045 InferenceElem *ie = (InferenceElem *) node;
3046
3047 APP_JUMB(ie->infercollid);
3048 APP_JUMB(ie->inferopclass);
3049 JumbleExpr(jstate, ie->expr);
3050 }
3051 break;
3052 case T_TargetEntry:
3053 {
3054 TargetEntry *tle = (TargetEntry *) node;
3055
3056 APP_JUMB(tle->resno);
3057 APP_JUMB(tle->ressortgroupref);
3058 JumbleExpr(jstate, (Node *) tle->expr);
3059 }
3060 break;
3061 case T_RangeTblRef:
3062 {
3063 RangeTblRef *rtr = (RangeTblRef *) node;
3064
3065 APP_JUMB(rtr->rtindex);
3066 }
3067 break;
3068 case T_JoinExpr:
3069 {
3070 JoinExpr *join = (JoinExpr *) node;
3071
3072 APP_JUMB(join->jointype);
3073 APP_JUMB(join->isNatural);
3074 APP_JUMB(join->rtindex);
3075 JumbleExpr(jstate, join->larg);
3076 JumbleExpr(jstate, join->rarg);
3077 JumbleExpr(jstate, join->quals);
3078 }
3079 break;
3080 case T_FromExpr:
3081 {
3082 FromExpr *from = (FromExpr *) node;
3083
3084 JumbleExpr(jstate, (Node *) from->fromlist);
3085 JumbleExpr(jstate, from->quals);
3086 }
3087 break;
3088 case T_OnConflictExpr:
3089 {
3090 OnConflictExpr *conf = (OnConflictExpr *) node;
3091
3092 APP_JUMB(conf->action);
3093 JumbleExpr(jstate, (Node *) conf->arbiterElems);
3094 JumbleExpr(jstate, conf->arbiterWhere);
3095 JumbleExpr(jstate, (Node *) conf->onConflictSet);
3096 JumbleExpr(jstate, conf->onConflictWhere);
3097 APP_JUMB(conf->constraint);
3098 APP_JUMB(conf->exclRelIndex);
3099 JumbleExpr(jstate, (Node *) conf->exclRelTlist);
3100 }
3101 break;
3102 case T_List:
3103 foreach(temp, (List *) node)
3104 {
3105 JumbleExpr(jstate, (Node *) lfirst(temp));
3106 }
3107 break;
3108 case T_IntList:
3109 foreach(temp, (List *) node)
3110 {
3111 APP_JUMB(lfirst_int(temp));
3112 }
3113 break;
3114 case T_SortGroupClause:
3115 {
3116 SortGroupClause *sgc = (SortGroupClause *) node;
3117
3118 APP_JUMB(sgc->tleSortGroupRef);
3119 APP_JUMB(sgc->eqop);
3120 APP_JUMB(sgc->sortop);
3121 APP_JUMB(sgc->nulls_first);
3122 }
3123 break;
3124 case T_GroupingSet:
3125 {
3126 GroupingSet *gsnode = (GroupingSet *) node;
3127
3128 JumbleExpr(jstate, (Node *) gsnode->content);
3129 }
3130 break;
3131 case T_WindowClause:
3132 {
3133 WindowClause *wc = (WindowClause *) node;
3134
3135 APP_JUMB(wc->winref);
3136 APP_JUMB(wc->frameOptions);
3137 JumbleExpr(jstate, (Node *) wc->partitionClause);
3138 JumbleExpr(jstate, (Node *) wc->orderClause);
3139 JumbleExpr(jstate, wc->startOffset);
3140 JumbleExpr(jstate, wc->endOffset);
3141 }
3142 break;
3143 case T_CommonTableExpr:
3144 {
3145 CommonTableExpr *cte = (CommonTableExpr *) node;
3146
3147 /* we store the string name because RTE_CTE RTEs need it */
3148 APP_JUMB_STRING(cte->ctename);
3149 APP_JUMB(cte->ctematerialized);
3150 JumbleQuery(jstate, castNode(Query, cte->ctequery));
3151 }
3152 break;
3153 case T_SetOperationStmt:
3154 {
3155 SetOperationStmt *setop = (SetOperationStmt *) node;
3156
3157 APP_JUMB(setop->op);
3158 APP_JUMB(setop->all);
3159 JumbleExpr(jstate, setop->larg);
3160 JumbleExpr(jstate, setop->rarg);
3161 }
3162 break;
3163 case T_RangeTblFunction:
3164 {
3165 RangeTblFunction *rtfunc = (RangeTblFunction *) node;
3166
3167 JumbleExpr(jstate, rtfunc->funcexpr);
3168 }
3169 break;
3170 case T_TableFunc:
3171 {
3172 TableFunc *tablefunc = (TableFunc *) node;
3173
3174 JumbleExpr(jstate, tablefunc->docexpr);
3175 JumbleExpr(jstate, tablefunc->rowexpr);
3176 JumbleExpr(jstate, (Node *) tablefunc->colexprs);
3177 }
3178 break;
3179 case T_TableSampleClause:
3180 {
3181 TableSampleClause *tsc = (TableSampleClause *) node;
3182
3183 APP_JUMB(tsc->tsmhandler);
3184 JumbleExpr(jstate, (Node *) tsc->args);
3185 JumbleExpr(jstate, (Node *) tsc->repeatable);
3186 }
3187 break;
3188 default:
3189 /* Only a warning, since we can stumble along anyway */
3190 elog(WARNING, "unrecognized node type: %d",
3191 (int) nodeTag(node));
3192 break;
3193 }
3194 }
3195
3196 /*
3197 * Record location of constant within query string of query tree
3198 * that is currently being walked.
3199 */
3200 static void
RecordConstLocation(pgssJumbleState * jstate,int location)3201 RecordConstLocation(pgssJumbleState *jstate, int location)
3202 {
3203 /* -1 indicates unknown or undefined location */
3204 if (location >= 0)
3205 {
3206 /* enlarge array if needed */
3207 if (jstate->clocations_count >= jstate->clocations_buf_size)
3208 {
3209 jstate->clocations_buf_size *= 2;
3210 jstate->clocations = (pgssLocationLen *)
3211 repalloc(jstate->clocations,
3212 jstate->clocations_buf_size *
3213 sizeof(pgssLocationLen));
3214 }
3215 jstate->clocations[jstate->clocations_count].location = location;
3216 /* initialize lengths to -1 to simplify fill_in_constant_lengths */
3217 jstate->clocations[jstate->clocations_count].length = -1;
3218 jstate->clocations_count++;
3219 }
3220 }
3221
3222 /*
3223 * Generate a normalized version of the query string that will be used to
3224 * represent all similar queries.
3225 *
3226 * Note that the normalized representation may well vary depending on
3227 * just which "equivalent" query is used to create the hashtable entry.
3228 * We assume this is OK.
3229 *
3230 * If query_loc > 0, then "query" has been advanced by that much compared to
3231 * the original string start, so we need to translate the provided locations
3232 * to compensate. (This lets us avoid re-scanning statements before the one
3233 * of interest, so it's worth doing.)
3234 *
3235 * *query_len_p contains the input string length, and is updated with
3236 * the result string length on exit. The resulting string might be longer
3237 * or shorter depending on what happens with replacement of constants.
3238 *
3239 * Returns a palloc'd string.
3240 */
3241 static char *
generate_normalized_query(pgssJumbleState * jstate,const char * query,int query_loc,int * query_len_p,int encoding)3242 generate_normalized_query(pgssJumbleState *jstate, const char *query,
3243 int query_loc, int *query_len_p, int encoding)
3244 {
3245 char *norm_query;
3246 int query_len = *query_len_p;
3247 int i,
3248 norm_query_buflen, /* Space allowed for norm_query */
3249 len_to_wrt, /* Length (in bytes) to write */
3250 quer_loc = 0, /* Source query byte location */
3251 n_quer_loc = 0, /* Normalized query byte location */
3252 last_off = 0, /* Offset from start for previous tok */
3253 last_tok_len = 0; /* Length (in bytes) of that tok */
3254
3255 /*
3256 * Get constants' lengths (core system only gives us locations). Note
3257 * this also ensures the items are sorted by location.
3258 */
3259 fill_in_constant_lengths(jstate, query, query_loc);
3260
3261 /*
3262 * Allow for $n symbols to be longer than the constants they replace.
3263 * Constants must take at least one byte in text form, while a $n symbol
3264 * certainly isn't more than 11 bytes, even if n reaches INT_MAX. We
3265 * could refine that limit based on the max value of n for the current
3266 * query, but it hardly seems worth any extra effort to do so.
3267 */
3268 norm_query_buflen = query_len + jstate->clocations_count * 10;
3269
3270 /* Allocate result buffer */
3271 norm_query = palloc(norm_query_buflen + 1);
3272
3273 for (i = 0; i < jstate->clocations_count; i++)
3274 {
3275 int off, /* Offset from start for cur tok */
3276 tok_len; /* Length (in bytes) of that tok */
3277
3278 off = jstate->clocations[i].location;
3279 /* Adjust recorded location if we're dealing with partial string */
3280 off -= query_loc;
3281
3282 tok_len = jstate->clocations[i].length;
3283
3284 if (tok_len < 0)
3285 continue; /* ignore any duplicates */
3286
3287 /* Copy next chunk (what precedes the next constant) */
3288 len_to_wrt = off - last_off;
3289 len_to_wrt -= last_tok_len;
3290
3291 Assert(len_to_wrt >= 0);
3292 memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
3293 n_quer_loc += len_to_wrt;
3294
3295 /* And insert a param symbol in place of the constant token */
3296 n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d",
3297 i + 1 + jstate->highest_extern_param_id);
3298
3299 quer_loc = off + tok_len;
3300 last_off = off;
3301 last_tok_len = tok_len;
3302 }
3303
3304 /*
3305 * We've copied up until the last ignorable constant. Copy over the
3306 * remaining bytes of the original query string.
3307 */
3308 len_to_wrt = query_len - quer_loc;
3309
3310 Assert(len_to_wrt >= 0);
3311 memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
3312 n_quer_loc += len_to_wrt;
3313
3314 Assert(n_quer_loc <= norm_query_buflen);
3315 norm_query[n_quer_loc] = '\0';
3316
3317 *query_len_p = n_quer_loc;
3318 return norm_query;
3319 }
3320
3321 /*
3322 * Given a valid SQL string and an array of constant-location records,
3323 * fill in the textual lengths of those constants.
3324 *
3325 * The constants may use any allowed constant syntax, such as float literals,
3326 * bit-strings, single-quoted strings and dollar-quoted strings. This is
3327 * accomplished by using the public API for the core scanner.
3328 *
3329 * It is the caller's job to ensure that the string is a valid SQL statement
3330 * with constants at the indicated locations. Since in practice the string
3331 * has already been parsed, and the locations that the caller provides will
3332 * have originated from within the authoritative parser, this should not be
3333 * a problem.
3334 *
3335 * Duplicate constant pointers are possible, and will have their lengths
3336 * marked as '-1', so that they are later ignored. (Actually, we assume the
3337 * lengths were initialized as -1 to start with, and don't change them here.)
3338 *
3339 * If query_loc > 0, then "query" has been advanced by that much compared to
3340 * the original string start, so we need to translate the provided locations
3341 * to compensate. (This lets us avoid re-scanning statements before the one
3342 * of interest, so it's worth doing.)
3343 *
3344 * N.B. There is an assumption that a '-' character at a Const location begins
3345 * a negative numeric constant. This precludes there ever being another
3346 * reason for a constant to start with a '-'.
3347 */
3348 static void
fill_in_constant_lengths(pgssJumbleState * jstate,const char * query,int query_loc)3349 fill_in_constant_lengths(pgssJumbleState *jstate, const char *query,
3350 int query_loc)
3351 {
3352 pgssLocationLen *locs;
3353 core_yyscan_t yyscanner;
3354 core_yy_extra_type yyextra;
3355 core_YYSTYPE yylval;
3356 YYLTYPE yylloc;
3357 int last_loc = -1;
3358 int i;
3359
3360 /*
3361 * Sort the records by location so that we can process them in order while
3362 * scanning the query text.
3363 */
3364 if (jstate->clocations_count > 1)
3365 qsort(jstate->clocations, jstate->clocations_count,
3366 sizeof(pgssLocationLen), comp_location);
3367 locs = jstate->clocations;
3368
3369 /* initialize the flex scanner --- should match raw_parser() */
3370 yyscanner = scanner_init(query,
3371 &yyextra,
3372 &ScanKeywords,
3373 ScanKeywordTokens);
3374
3375 /* we don't want to re-emit any escape string warnings */
3376 yyextra.escape_string_warning = false;
3377
3378 /* Search for each constant, in sequence */
3379 for (i = 0; i < jstate->clocations_count; i++)
3380 {
3381 int loc = locs[i].location;
3382 int tok;
3383
3384 /* Adjust recorded location if we're dealing with partial string */
3385 loc -= query_loc;
3386
3387 Assert(loc >= 0);
3388
3389 if (loc <= last_loc)
3390 continue; /* Duplicate constant, ignore */
3391
3392 /* Lex tokens until we find the desired constant */
3393 for (;;)
3394 {
3395 tok = core_yylex(&yylval, &yylloc, yyscanner);
3396
3397 /* We should not hit end-of-string, but if we do, behave sanely */
3398 if (tok == 0)
3399 break; /* out of inner for-loop */
3400
3401 /*
3402 * We should find the token position exactly, but if we somehow
3403 * run past it, work with that.
3404 */
3405 if (yylloc >= loc)
3406 {
3407 if (query[loc] == '-')
3408 {
3409 /*
3410 * It's a negative value - this is the one and only case
3411 * where we replace more than a single token.
3412 *
3413 * Do not compensate for the core system's special-case
3414 * adjustment of location to that of the leading '-'
3415 * operator in the event of a negative constant. It is
3416 * also useful for our purposes to start from the minus
3417 * symbol. In this way, queries like "select * from foo
3418 * where bar = 1" and "select * from foo where bar = -2"
3419 * will have identical normalized query strings.
3420 */
3421 tok = core_yylex(&yylval, &yylloc, yyscanner);
3422 if (tok == 0)
3423 break; /* out of inner for-loop */
3424 }
3425
3426 /*
3427 * We now rely on the assumption that flex has placed a zero
3428 * byte after the text of the current token in scanbuf.
3429 */
3430 locs[i].length = strlen(yyextra.scanbuf + loc);
3431 break; /* out of inner for-loop */
3432 }
3433 }
3434
3435 /* If we hit end-of-string, give up, leaving remaining lengths -1 */
3436 if (tok == 0)
3437 break;
3438
3439 last_loc = loc;
3440 }
3441
3442 scanner_finish(yyscanner);
3443 }
3444
3445 /*
3446 * comp_location: comparator for qsorting pgssLocationLen structs by location
3447 */
3448 static int
comp_location(const void * a,const void * b)3449 comp_location(const void *a, const void *b)
3450 {
3451 int l = ((const pgssLocationLen *) a)->location;
3452 int r = ((const pgssLocationLen *) b)->location;
3453
3454 if (l < r)
3455 return -1;
3456 else if (l > r)
3457 return +1;
3458 else
3459 return 0;
3460 }
3461