/*-------------------------------------------------------------------------
 *
 * pg_stat_statements.c
 *		Track statement execution times across a whole database cluster.
 *
 * Execution costs are totalled for each distinct source query, and kept in
 * a shared hashtable.  (We track only as many distinct queries as will fit
 * in the designated amount of shared memory.)
 *
 * As of Postgres 9.2, this module normalizes query entries.  Normalization
 * is a process whereby similar queries, typically differing only in their
 * constants (though the exact rules are somewhat more subtle than that) are
 * recognized as equivalent, and are tracked as a single entry.  This is
 * particularly useful for non-prepared queries.
 *
 * Normalization is implemented by fingerprinting queries, selectively
 * serializing those fields of each query tree's nodes that are judged to be
 * essential to the query.  This is referred to as a query jumble.  This is
 * distinct from a regular serialization in that various extraneous
 * information is ignored as irrelevant or not essential to the query, such
 * as the collations of Vars and, most notably, the values of constants.
 *
 * This jumble is acquired at the end of parse analysis of each query, and
 * a 64-bit hash of it is stored into the query's Query.queryId field.
 * The server then copies this value around, making it available in plan
 * tree(s) generated from the query.  The executor can then use this value
 * to blame query costs on the proper queryId.
 *
 * To facilitate presenting entries to users, we create "representative" query
 * strings in which constants are replaced with parameter symbols ($n), to
 * make it clearer what a normalized entry can represent.  To save on shared
 * memory, and to avoid having to truncate oversized query strings, we store
 * these strings in a temporary external query-texts file.  Offsets into this
 * file are kept in shared memory.
 *
 * Note about locking issues: to create or delete an entry in the shared
 * hashtable, one must hold pgss->lock exclusively.  Modifying any field
 * in an entry except the counters requires the same.  To look up an entry,
 * one must hold the lock shared.  To read or update the counters within
 * an entry, one must hold the lock shared or exclusive (so the entry doesn't
 * disappear!) and also take the entry's mutex spinlock.
 * The shared state variable pgss->extent (the next free spot in the external
 * query-text file) should be accessed only while holding either the
 * pgss->mutex spinlock, or exclusive lock on pgss->lock.  We use the mutex to
 * allow reserving file space while holding only shared lock on pgss->lock.
 * Rewriting the entire external query-text file, eg for garbage collection,
 * requires holding pgss->lock exclusively; this allows individual entries
 * in the file to be read or written while holding only shared lock.
 *
 *
 * Copyright (c) 2008-2018, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  contrib/pg_stat_statements/pg_stat_statements.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>
#include <sys/stat.h>
#include <unistd.h>

#include "access/hash.h"
#include "catalog/pg_authid.h"
#include "executor/instrument.h"
#include "funcapi.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "parser/analyze.h"
#include "parser/parsetree.h"
#include "parser/scanner.h"
#include "parser/scansup.h"
#include "pgstat.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/spin.h"
#include "tcop/utility.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/memutils.h"

/* Marks this shared library as a PostgreSQL loadable module */
PG_MODULE_MAGIC;

/* Location of permanent stats file (valid when database is shut down) */
#define PGSS_DUMP_FILE	PGSTAT_STAT_PERMANENT_DIRECTORY "/pg_stat_statements.stat"

/*
 * Location of external query text file.  We don't keep it in the core
 * system's stats_temp_directory.  The core system can safely use that GUC
 * setting, because the statistics collector temp file paths are set only once
 * as part of changing the GUC, but pg_stat_statements has no way of avoiding
 * race conditions.  Besides, we only expect modest, infrequent I/O for query
 * strings, so placing the file on a faster filesystem is not compelling.
 */
#define PGSS_TEXT_FILE	PG_STAT_TMP_DIR "/pgss_query_texts.stat"

/* Magic number identifying the stats file format */
static const uint32 PGSS_FILE_HEADER = 0x20171004;

/* PostgreSQL major version number, changes in which invalidate all entries */
static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;

/* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
#define USAGE_EXEC(duration)	(1.0)
#define USAGE_INIT				(1.0)	/* including initial planning */
#define ASSUMED_MEDIAN_INIT		(10.0)	/* initial assumed median usage */
#define ASSUMED_LENGTH_INIT		1024	/* initial assumed mean query length */
#define USAGE_DECREASE_FACTOR	(0.99)	/* decreased every entry_dealloc */
#define STICKY_DECREASE_FACTOR	(0.50)	/* factor for sticky entries */
#define USAGE_DEALLOC_PERCENT	5	/* free this % of entries at once */

#define JUMBLE_SIZE				1024	/* query serialization buffer size */

/*
 * Extension version number, for supporting older extension versions' objects
 */
typedef enum pgssVersion
{
	PGSS_V1_0 = 0,
	PGSS_V1_1,
	PGSS_V1_2,
	PGSS_V1_3
} pgssVersion;

/*
 * Hashtable key that defines the identity of a hashtable entry.  We separate
 * queries by user and by database even if they are otherwise identical.
 *
 * Right now, this structure contains no padding.
 If you add any, make sure
 * to teach pgss_store() to zero the padding bytes.  Otherwise, things will
 * break, because pgss_hash is created using HASH_BLOBS, and thus tag_hash
 * is used to hash this.
 */
typedef struct pgssHashKey
{
	Oid			userid;			/* user OID */
	Oid			dbid;			/* database OID */
	uint64		queryid;		/* query identifier */
} pgssHashKey;

/*
 * The actual stats counters kept within pgssEntry.
 */
typedef struct Counters
{
	int64		calls;			/* # of times executed */
	double		total_time;		/* total execution time, in msec */
	double		min_time;		/* minimum execution time in msec */
	double		max_time;		/* maximum execution time in msec */
	double		mean_time;		/* mean execution time in msec */
	double		sum_var_time;	/* sum of variances in execution time in msec */
	int64		rows;			/* total # of retrieved or affected rows */
	int64		shared_blks_hit;	/* # of shared buffer hits */
	int64		shared_blks_read;	/* # of shared disk blocks read */
	int64		shared_blks_dirtied;	/* # of shared disk blocks dirtied */
	int64		shared_blks_written;	/* # of shared disk blocks written */
	int64		local_blks_hit; /* # of local buffer hits */
	int64		local_blks_read;	/* # of local disk blocks read */
	int64		local_blks_dirtied; /* # of local disk blocks dirtied */
	int64		local_blks_written; /* # of local disk blocks written */
	int64		temp_blks_read; /* # of temp blocks read */
	int64		temp_blks_written;	/* # of temp blocks written */
	double		blk_read_time;	/* time spent reading, in msec */
	double		blk_write_time; /* time spent writing, in msec */
	double		usage;			/* usage factor */
} Counters;

/*
 * Statistics per statement
 *
 * Note: in event of a failure in garbage collection of the query text file,
 * we reset query_offset to zero and query_len to -1.  This will be seen as
 * an invalid state by qtext_fetch().
 */
typedef struct pgssEntry
{
	pgssHashKey key;			/* hash key of entry - MUST BE FIRST */
	Counters	counters;		/* the statistics for this query */
	Size		query_offset;	/* query text offset in external file */
	int			query_len;		/* # of valid bytes in query string, or -1 */
	int			encoding;		/* query text encoding */
	slock_t		mutex;			/* protects the counters only */
} pgssEntry;

/*
 * Global shared state
 */
typedef struct pgssSharedState
{
	LWLock	   *lock;			/* protects hashtable search/modification */
	double		cur_median_usage;	/* current median usage in hashtable */
	Size		mean_query_len; /* current mean entry text length */
	slock_t		mutex;			/* protects following fields only: */
	Size		extent;			/* current extent of query file */
	int			n_writers;		/* number of active writers to query file */
	int			gc_count;		/* query file garbage collection cycle count */
} pgssSharedState;

/*
 * Struct for tracking locations/lengths of constants during normalization
 */
typedef struct pgssLocationLen
{
	int			location;		/* start offset in query text */
	int			length;			/* length in bytes, or -1 to ignore */
} pgssLocationLen;

/*
 * Working state for computing a query jumble and producing a normalized
 * query string
 */
typedef struct pgssJumbleState
{
	/* Jumble of current query tree */
	unsigned char *jumble;

	/* Number of bytes used in jumble[] */
	Size		jumble_len;

	/* Array of locations of constants that should be removed */
	pgssLocationLen *clocations;

	/* Allocated length of clocations array */
	int			clocations_buf_size;

	/* Current number of valid entries in clocations array */
	int			clocations_count;

	/* highest Param id we've seen, in order to start normalization correctly */
	int			highest_extern_param_id;
} pgssJumbleState;

/*---- Local variables ----*/

/* Current nesting depth of ExecutorRun+ProcessUtility calls */
static int	nested_level = 0;

/* Saved hook values in case of unload */
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
static post_parse_analyze_hook_type prev_post_parse_analyze_hook = NULL;
static ExecutorStart_hook_type prev_ExecutorStart = NULL;
static ExecutorRun_hook_type prev_ExecutorRun = NULL;
static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
static ProcessUtility_hook_type prev_ProcessUtility = NULL;

/* Links to shared memory state */
static pgssSharedState *pgss = NULL;
static HTAB *pgss_hash = NULL;

/*---- GUC variables ----*/

typedef enum
{
	PGSS_TRACK_NONE,			/* track no statements */
	PGSS_TRACK_TOP,				/* only top level statements */
	PGSS_TRACK_ALL				/* all statements, including nested ones */
} PGSSTrackLevel;

/* Mapping of pg_stat_statements.track GUC values to PGSSTrackLevel */
static const struct config_enum_entry track_options[] =
{
	{"none", PGSS_TRACK_NONE, false},
	{"top", PGSS_TRACK_TOP, false},
	{"all", PGSS_TRACK_ALL, false},
	{NULL, 0, false}
};

static int	pgss_max;			/* max # statements to track */
static int	pgss_track;			/* tracking level */
static bool pgss_track_utility; /* whether to track utility commands */
static bool pgss_save;			/* whether to save stats across shutdown */


/*
 * True if the current statement should be tracked, given the tracking level
 * and the current ExecutorRun/ProcessUtility nesting depth.
 */
#define pgss_enabled() \
	(pgss_track == PGSS_TRACK_ALL || \
	 (pgss_track == PGSS_TRACK_TOP && nested_level == 0))

/* Bump the query-text-file GC cycle counter, under the shared spinlock */
#define record_gc_qtexts() \
	do { \
		volatile pgssSharedState *s = (volatile pgssSharedState *) pgss; \
		SpinLockAcquire(&s->mutex); \
		s->gc_count++; \
		SpinLockRelease(&s->mutex); \
	} while(0)

/*---- Function declarations ----*/

void		_PG_init(void);
void		_PG_fini(void);

PG_FUNCTION_INFO_V1(pg_stat_statements_reset);
PG_FUNCTION_INFO_V1(pg_stat_statements_1_2);
PG_FUNCTION_INFO_V1(pg_stat_statements_1_3);
PG_FUNCTION_INFO_V1(pg_stat_statements);

static void pgss_shmem_startup(void);
static void pgss_shmem_shutdown(int code, Datum arg);
static void pgss_post_parse_analyze(ParseState *pstate, Query *query);
static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags);
static void pgss_ExecutorRun(QueryDesc *queryDesc,
				 ScanDirection direction,
				 uint64 count, bool execute_once);
static void pgss_ExecutorFinish(QueryDesc *queryDesc);
static void pgss_ExecutorEnd(QueryDesc *queryDesc);
static void pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
					ProcessUtilityContext context, ParamListInfo params,
					QueryEnvironment *queryEnv,
					DestReceiver *dest, char *completionTag);
static uint64 pgss_hash_string(const char *str, int len);
static void pgss_store(const char *query, uint64 queryId,
		   int query_location, int query_len,
		   double total_time, uint64 rows,
		   const BufferUsage *bufusage,
		   pgssJumbleState *jstate);
static void pg_stat_statements_internal(FunctionCallInfo fcinfo,
							pgssVersion api_version,
							bool showtext);
static Size pgss_memsize(void);
static pgssEntry *entry_alloc(pgssHashKey *key, Size query_offset, int query_len,
			int encoding, bool sticky);
static void entry_dealloc(void);
static bool qtext_store(const char *query, int query_len,
			Size *query_offset, int *gc_count);
static char *qtext_load_file(Size *buffer_size);
static char *qtext_fetch(Size query_offset, int query_len,
			char *buffer, Size buffer_size);
static bool need_gc_qtexts(void);
static void gc_qtexts(void);
static void entry_reset(void);
static void AppendJumble(pgssJumbleState *jstate,
			 const unsigned char *item, Size size);
static void JumbleQuery(pgssJumbleState *jstate, Query *query);
static void JumbleRangeTable(pgssJumbleState *jstate, List *rtable);
static void JumbleExpr(pgssJumbleState *jstate, Node *node);
static void RecordConstLocation(pgssJumbleState *jstate, int location);
static char *generate_normalized_query(pgssJumbleState *jstate, const char *query,
						  int query_loc, int *query_len_p, int encoding);
static void fill_in_constant_lengths(pgssJumbleState *jstate, const char *query,
						 int query_loc);
static int	comp_location(const void *a, const void *b);


/*
 * Module load callback
 *
 * Registers this module's GUCs, requests shared resources, and installs the
 * hooks that do all the actual work.  Hooks are installed last, after all
 * GUC and shared-resource registration has succeeded.
 */
void
_PG_init(void)
{
	/*
	 * In order to create our shared memory area, we have to be loaded via
	 * shared_preload_libraries.  If not, fall out without hooking into any of
	 * the main system.  (We don't throw error here because it seems useful to
	 * allow the pg_stat_statements functions to be created even when the
	 * module isn't active.  The functions must protect themselves against
	 * being called then, however.)
	 */
	if (!process_shared_preload_libraries_in_progress)
		return;

	/*
	 * Define (or redefine) custom GUC variables.
	 */
	DefineCustomIntVariable("pg_stat_statements.max",
	  "Sets the maximum number of statements tracked by pg_stat_statements.",
							NULL,
							&pgss_max,
							5000,
							100,
							INT_MAX,
							PGC_POSTMASTER,
							0,
							NULL,
							NULL,
							NULL);

	DefineCustomEnumVariable("pg_stat_statements.track",
			   "Selects which statements are tracked by pg_stat_statements.",
							 NULL,
							 &pgss_track,
							 PGSS_TRACK_TOP,
							 track_options,
							 PGC_SUSET,
							 0,
							 NULL,
							 NULL,
							 NULL);

	DefineCustomBoolVariable("pg_stat_statements.track_utility",
	   "Selects whether utility commands are tracked by pg_stat_statements.",
							 NULL,
							 &pgss_track_utility,
							 true,
							 PGC_SUSET,
							 0,
							 NULL,
							 NULL,
							 NULL);

	DefineCustomBoolVariable("pg_stat_statements.save",
			   "Save pg_stat_statements statistics across server shutdowns.",
							 NULL,
							 &pgss_save,
							 true,
							 PGC_SIGHUP,
							 0,
							 NULL,
							 NULL,
							 NULL);

	EmitWarningsOnPlaceholders("pg_stat_statements");

	/*
	 * Request additional shared resources.  (These are no-ops if we're not in
	 * the postmaster process.)  We'll allocate or attach to the shared
	 * resources in pgss_shmem_startup().
	 */
	RequestAddinShmemSpace(pgss_memsize());
	RequestNamedLWLockTranche("pg_stat_statements", 1);

	/*
	 * Install hooks.  Each previous hook value is saved so our replacements
	 * can chain to it, and so _PG_fini() can restore it.
	 */
	prev_shmem_startup_hook = shmem_startup_hook;
	shmem_startup_hook = pgss_shmem_startup;
	prev_post_parse_analyze_hook = post_parse_analyze_hook;
	post_parse_analyze_hook = pgss_post_parse_analyze;
	prev_ExecutorStart = ExecutorStart_hook;
	ExecutorStart_hook = pgss_ExecutorStart;
	prev_ExecutorRun = ExecutorRun_hook;
	ExecutorRun_hook = pgss_ExecutorRun;
	prev_ExecutorFinish = ExecutorFinish_hook;
	ExecutorFinish_hook = pgss_ExecutorFinish;
	prev_ExecutorEnd = ExecutorEnd_hook;
	ExecutorEnd_hook = pgss_ExecutorEnd;
	prev_ProcessUtility = ProcessUtility_hook;
	ProcessUtility_hook = pgss_ProcessUtility;
}

/*
 * Module unload callback
 */
void
_PG_fini(void)
{
	/* Uninstall hooks by restoring the values saved in _PG_init(). */
	shmem_startup_hook = prev_shmem_startup_hook;
	post_parse_analyze_hook = prev_post_parse_analyze_hook;
	ExecutorStart_hook = prev_ExecutorStart;
	ExecutorRun_hook = prev_ExecutorRun;
	ExecutorFinish_hook = prev_ExecutorFinish;
	ExecutorEnd_hook = prev_ExecutorEnd;
	ProcessUtility_hook = prev_ProcessUtility;
}

/*
 * shmem_startup hook: allocate or attach to shared memory,
 * then load any pre-existing statistics from file.
 * Also create and load the query-texts file, which is expected to exist
 * (even if empty) while the module is enabled.
461 */ 462 static void 463 pgss_shmem_startup(void) 464 { 465 bool found; 466 HASHCTL info; 467 FILE *file = NULL; 468 FILE *qfile = NULL; 469 uint32 header; 470 int32 num; 471 int32 pgver; 472 int32 i; 473 int buffer_size; 474 char *buffer = NULL; 475 476 if (prev_shmem_startup_hook) 477 prev_shmem_startup_hook(); 478 479 /* reset in case this is a restart within the postmaster */ 480 pgss = NULL; 481 pgss_hash = NULL; 482 483 /* 484 * Create or attach to the shared memory state, including hash table 485 */ 486 LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); 487 488 pgss = ShmemInitStruct("pg_stat_statements", 489 sizeof(pgssSharedState), 490 &found); 491 492 if (!found) 493 { 494 /* First time through ... */ 495 pgss->lock = &(GetNamedLWLockTranche("pg_stat_statements"))->lock; 496 pgss->cur_median_usage = ASSUMED_MEDIAN_INIT; 497 pgss->mean_query_len = ASSUMED_LENGTH_INIT; 498 SpinLockInit(&pgss->mutex); 499 pgss->extent = 0; 500 pgss->n_writers = 0; 501 pgss->gc_count = 0; 502 } 503 504 memset(&info, 0, sizeof(info)); 505 info.keysize = sizeof(pgssHashKey); 506 info.entrysize = sizeof(pgssEntry); 507 pgss_hash = ShmemInitHash("pg_stat_statements hash", 508 pgss_max, pgss_max, 509 &info, 510 HASH_ELEM | HASH_BLOBS); 511 512 LWLockRelease(AddinShmemInitLock); 513 514 /* 515 * If we're in the postmaster (or a standalone backend...), set up a shmem 516 * exit hook to dump the statistics to disk. 517 */ 518 if (!IsUnderPostmaster) 519 on_shmem_exit(pgss_shmem_shutdown, (Datum) 0); 520 521 /* 522 * Done if some other process already completed our initialization. 523 */ 524 if (found) 525 return; 526 527 /* 528 * Note: we don't bother with locks here, because there should be no other 529 * processes running when this code is reached. 
530 */ 531 532 /* Unlink query text file possibly left over from crash */ 533 unlink(PGSS_TEXT_FILE); 534 535 /* Allocate new query text temp file */ 536 qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W); 537 if (qfile == NULL) 538 goto write_error; 539 540 /* 541 * If we were told not to load old statistics, we're done. (Note we do 542 * not try to unlink any old dump file in this case. This seems a bit 543 * questionable but it's the historical behavior.) 544 */ 545 if (!pgss_save) 546 { 547 FreeFile(qfile); 548 return; 549 } 550 551 /* 552 * Attempt to load old statistics from the dump file. 553 */ 554 file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_R); 555 if (file == NULL) 556 { 557 if (errno != ENOENT) 558 goto read_error; 559 /* No existing persisted stats file, so we're done */ 560 FreeFile(qfile); 561 return; 562 } 563 564 buffer_size = 2048; 565 buffer = (char *) palloc(buffer_size); 566 567 if (fread(&header, sizeof(uint32), 1, file) != 1 || 568 fread(&pgver, sizeof(uint32), 1, file) != 1 || 569 fread(&num, sizeof(int32), 1, file) != 1) 570 goto read_error; 571 572 if (header != PGSS_FILE_HEADER || 573 pgver != PGSS_PG_MAJOR_VERSION) 574 goto data_error; 575 576 for (i = 0; i < num; i++) 577 { 578 pgssEntry temp; 579 pgssEntry *entry; 580 Size query_offset; 581 582 if (fread(&temp, sizeof(pgssEntry), 1, file) != 1) 583 goto read_error; 584 585 /* Encoding is the only field we can easily sanity-check */ 586 if (!PG_VALID_BE_ENCODING(temp.encoding)) 587 goto data_error; 588 589 /* Resize buffer as needed */ 590 if (temp.query_len >= buffer_size) 591 { 592 buffer_size = Max(buffer_size * 2, temp.query_len + 1); 593 buffer = repalloc(buffer, buffer_size); 594 } 595 596 if (fread(buffer, 1, temp.query_len + 1, file) != temp.query_len + 1) 597 goto read_error; 598 599 /* Should have a trailing null, but let's make sure */ 600 buffer[temp.query_len] = '\0'; 601 602 /* Skip loading "sticky" entries */ 603 if (temp.counters.calls == 0) 604 continue; 605 606 /* 
Store the query text */ 607 query_offset = pgss->extent; 608 if (fwrite(buffer, 1, temp.query_len + 1, qfile) != temp.query_len + 1) 609 goto write_error; 610 pgss->extent += temp.query_len + 1; 611 612 /* make the hashtable entry (discards old entries if too many) */ 613 entry = entry_alloc(&temp.key, query_offset, temp.query_len, 614 temp.encoding, 615 false); 616 617 /* copy in the actual stats */ 618 entry->counters = temp.counters; 619 } 620 621 pfree(buffer); 622 FreeFile(file); 623 FreeFile(qfile); 624 625 /* 626 * Remove the persisted stats file so it's not included in 627 * backups/replication slaves, etc. A new file will be written on next 628 * shutdown. 629 * 630 * Note: it's okay if the PGSS_TEXT_FILE is included in a basebackup, 631 * because we remove that file on startup; it acts inversely to 632 * PGSS_DUMP_FILE, in that it is only supposed to be around when the 633 * server is running, whereas PGSS_DUMP_FILE is only supposed to be around 634 * when the server is not running. Leaving the file creates no danger of 635 * a newly restored database having a spurious record of execution costs, 636 * which is what we're really concerned about here. 
637 */ 638 unlink(PGSS_DUMP_FILE); 639 640 return; 641 642 read_error: 643 ereport(LOG, 644 (errcode_for_file_access(), 645 errmsg("could not read pg_stat_statement file \"%s\": %m", 646 PGSS_DUMP_FILE))); 647 goto fail; 648 data_error: 649 ereport(LOG, 650 (errcode(ERRCODE_INVALID_PARAMETER_VALUE), 651 errmsg("ignoring invalid data in pg_stat_statement file \"%s\"", 652 PGSS_DUMP_FILE))); 653 goto fail; 654 write_error: 655 ereport(LOG, 656 (errcode_for_file_access(), 657 errmsg("could not write pg_stat_statement file \"%s\": %m", 658 PGSS_TEXT_FILE))); 659 fail: 660 if (buffer) 661 pfree(buffer); 662 if (file) 663 FreeFile(file); 664 if (qfile) 665 FreeFile(qfile); 666 /* If possible, throw away the bogus file; ignore any error */ 667 unlink(PGSS_DUMP_FILE); 668 669 /* 670 * Don't unlink PGSS_TEXT_FILE here; it should always be around while the 671 * server is running with pg_stat_statements enabled 672 */ 673 } 674 675 /* 676 * shmem_shutdown hook: Dump statistics into file. 677 * 678 * Note: we don't bother with acquiring lock, because there should be no 679 * other processes running when this is called. 680 */ 681 static void 682 pgss_shmem_shutdown(int code, Datum arg) 683 { 684 FILE *file; 685 char *qbuffer = NULL; 686 Size qbuffer_size = 0; 687 HASH_SEQ_STATUS hash_seq; 688 int32 num_entries; 689 pgssEntry *entry; 690 691 /* Don't try to dump during a crash. */ 692 if (code) 693 return; 694 695 /* Safety check ... shouldn't get here unless shmem is set up. */ 696 if (!pgss || !pgss_hash) 697 return; 698 699 /* Don't dump if told not to. 
 */
	if (!pgss_save)
		return;

	file = AllocateFile(PGSS_DUMP_FILE ".tmp", PG_BINARY_W);
	if (file == NULL)
		goto error;

	if (fwrite(&PGSS_FILE_HEADER, sizeof(uint32), 1, file) != 1)
		goto error;
	if (fwrite(&PGSS_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
		goto error;
	num_entries = hash_get_num_entries(pgss_hash);
	if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
		goto error;

	qbuffer = qtext_load_file(&qbuffer_size);
	if (qbuffer == NULL)
		goto error;

	/*
	 * When serializing to disk, we store query texts immediately after their
	 * entry data.  Any orphaned query texts are thereby excluded.
	 */
	hash_seq_init(&hash_seq, pgss_hash);
	while ((entry = hash_seq_search(&hash_seq)) != NULL)
	{
		int			len = entry->query_len;
		char	   *qstr = qtext_fetch(entry->query_offset, len,
									   qbuffer, qbuffer_size);

		if (qstr == NULL)
			continue;			/* Ignore any entries with bogus texts */

		if (fwrite(entry, sizeof(pgssEntry), 1, file) != 1 ||
			fwrite(qstr, 1, len + 1, file) != len + 1)
		{
			/* note: we assume hash_seq_term won't change errno */
			hash_seq_term(&hash_seq);
			goto error;
		}
	}

	/*
	 * qbuffer is released with free(), not pfree() -- presumably matching
	 * how qtext_load_file() allocated it (its body is not visible in this
	 * chunk; verify before changing).
	 */
	free(qbuffer);
	qbuffer = NULL;

	if (FreeFile(file))
	{
		file = NULL;
		goto error;
	}

	/*
	 * Rename file into place, so we atomically replace any old one.
	 */
	(void) durable_rename(PGSS_DUMP_FILE ".tmp", PGSS_DUMP_FILE, LOG);

	/* Unlink query-texts file; it's not needed while shutdown */
	unlink(PGSS_TEXT_FILE);

	return;

error:
	ereport(LOG,
			(errcode_for_file_access(),
			 errmsg("could not write pg_stat_statement file \"%s\": %m",
					PGSS_DUMP_FILE ".tmp")));
	if (qbuffer)
		free(qbuffer);
	if (file)
		FreeFile(file);
	unlink(PGSS_DUMP_FILE ".tmp");
	unlink(PGSS_TEXT_FILE);
}

/*
 * Post-parse-analysis hook: mark query with a queryId
 */
static void
pgss_post_parse_analyze(ParseState *pstate, Query *query)
{
	pgssJumbleState jstate;

	if (prev_post_parse_analyze_hook)
		prev_post_parse_analyze_hook(pstate, query);

	/* Assert we didn't do this already */
	Assert(query->queryId == UINT64CONST(0));

	/* Safety check... */
	if (!pgss || !pgss_hash)
		return;

	/*
	 * Utility statements get queryId zero.  We do this even in cases where
	 * the statement contains an optimizable statement for which a queryId
	 * could be derived (such as EXPLAIN or DECLARE CURSOR).  For such cases,
	 * runtime control will first go through ProcessUtility and then the
	 * executor, and we don't want the executor hooks to do anything, since we
	 * are already measuring the statement's costs at the utility level.
799 */ 800 if (query->utilityStmt) 801 { 802 query->queryId = UINT64CONST(0); 803 return; 804 } 805 806 /* Set up workspace for query jumbling */ 807 jstate.jumble = (unsigned char *) palloc(JUMBLE_SIZE); 808 jstate.jumble_len = 0; 809 jstate.clocations_buf_size = 32; 810 jstate.clocations = (pgssLocationLen *) 811 palloc(jstate.clocations_buf_size * sizeof(pgssLocationLen)); 812 jstate.clocations_count = 0; 813 jstate.highest_extern_param_id = 0; 814 815 /* Compute query ID and mark the Query node with it */ 816 JumbleQuery(&jstate, query); 817 query->queryId = 818 DatumGetUInt64(hash_any_extended(jstate.jumble, jstate.jumble_len, 0)); 819 820 /* 821 * If we are unlucky enough to get a hash of zero, use 1 instead, to 822 * prevent confusion with the utility-statement case. 823 */ 824 if (query->queryId == UINT64CONST(0)) 825 query->queryId = UINT64CONST(1); 826 827 /* 828 * If we were able to identify any ignorable constants, we immediately 829 * create a hash table entry for the query, so that we can record the 830 * normalized form of the query string. If there were no such constants, 831 * the normalized string would be the same as the query text anyway, so 832 * there's no need for an early entry. 833 */ 834 if (jstate.clocations_count > 0) 835 pgss_store(pstate->p_sourcetext, 836 query->queryId, 837 query->stmt_location, 838 query->stmt_len, 839 0, 840 0, 841 NULL, 842 &jstate); 843 } 844 845 /* 846 * ExecutorStart hook: start up tracking if needed 847 */ 848 static void 849 pgss_ExecutorStart(QueryDesc *queryDesc, int eflags) 850 { 851 if (prev_ExecutorStart) 852 prev_ExecutorStart(queryDesc, eflags); 853 else 854 standard_ExecutorStart(queryDesc, eflags); 855 856 /* 857 * If query has queryId zero, don't track it. This prevents double 858 * counting of optimizable statements that are directly contained in 859 * utility statements. 
860 */ 861 if (pgss_enabled() && queryDesc->plannedstmt->queryId != UINT64CONST(0)) 862 { 863 /* 864 * Set up to track total elapsed time in ExecutorRun. Make sure the 865 * space is allocated in the per-query context so it will go away at 866 * ExecutorEnd. 867 */ 868 if (queryDesc->totaltime == NULL) 869 { 870 MemoryContext oldcxt; 871 872 oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); 873 queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL); 874 MemoryContextSwitchTo(oldcxt); 875 } 876 } 877 } 878 879 /* 880 * ExecutorRun hook: all we need do is track nesting depth 881 */ 882 static void 883 pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count, 884 bool execute_once) 885 { 886 nested_level++; 887 PG_TRY(); 888 { 889 if (prev_ExecutorRun) 890 prev_ExecutorRun(queryDesc, direction, count, execute_once); 891 else 892 standard_ExecutorRun(queryDesc, direction, count, execute_once); 893 nested_level--; 894 } 895 PG_CATCH(); 896 { 897 nested_level--; 898 PG_RE_THROW(); 899 } 900 PG_END_TRY(); 901 } 902 903 /* 904 * ExecutorFinish hook: all we need do is track nesting depth 905 */ 906 static void 907 pgss_ExecutorFinish(QueryDesc *queryDesc) 908 { 909 nested_level++; 910 PG_TRY(); 911 { 912 if (prev_ExecutorFinish) 913 prev_ExecutorFinish(queryDesc); 914 else 915 standard_ExecutorFinish(queryDesc); 916 nested_level--; 917 } 918 PG_CATCH(); 919 { 920 nested_level--; 921 PG_RE_THROW(); 922 } 923 PG_END_TRY(); 924 } 925 926 /* 927 * ExecutorEnd hook: store results if needed 928 */ 929 static void 930 pgss_ExecutorEnd(QueryDesc *queryDesc) 931 { 932 uint64 queryId = queryDesc->plannedstmt->queryId; 933 934 if (queryId != UINT64CONST(0) && queryDesc->totaltime && pgss_enabled()) 935 { 936 /* 937 * Make sure stats accumulation is done. (Note: it's okay if several 938 * levels of hook all do this.) 
939 */ 940 InstrEndLoop(queryDesc->totaltime); 941 942 pgss_store(queryDesc->sourceText, 943 queryId, 944 queryDesc->plannedstmt->stmt_location, 945 queryDesc->plannedstmt->stmt_len, 946 queryDesc->totaltime->total * 1000.0, /* convert to msec */ 947 queryDesc->estate->es_processed, 948 &queryDesc->totaltime->bufusage, 949 NULL); 950 } 951 952 if (prev_ExecutorEnd) 953 prev_ExecutorEnd(queryDesc); 954 else 955 standard_ExecutorEnd(queryDesc); 956 } 957 958 /* 959 * ProcessUtility hook 960 */ 961 static void 962 pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString, 963 ProcessUtilityContext context, 964 ParamListInfo params, QueryEnvironment *queryEnv, 965 DestReceiver *dest, char *completionTag) 966 { 967 Node *parsetree = pstmt->utilityStmt; 968 969 /* 970 * If it's an EXECUTE statement, we don't track it and don't increment the 971 * nesting level. This allows the cycles to be charged to the underlying 972 * PREPARE instead (by the Executor hooks), which is much more useful. 973 * 974 * We also don't track execution of PREPARE. If we did, we would get one 975 * hash table entry for the PREPARE (with hash calculated from the query 976 * string), and then a different one with the same query string (but hash 977 * calculated from the query tree) would be used to accumulate costs of 978 * ensuing EXECUTEs. This would be confusing, and inconsistent with other 979 * cases where planning time is not included at all. 980 * 981 * Likewise, we don't track execution of DEALLOCATE. 
982 */ 983 if (pgss_track_utility && pgss_enabled() && 984 !IsA(parsetree, ExecuteStmt) && 985 !IsA(parsetree, PrepareStmt) && 986 !IsA(parsetree, DeallocateStmt)) 987 { 988 instr_time start; 989 instr_time duration; 990 uint64 rows; 991 BufferUsage bufusage_start, 992 bufusage; 993 994 bufusage_start = pgBufferUsage; 995 INSTR_TIME_SET_CURRENT(start); 996 997 nested_level++; 998 PG_TRY(); 999 { 1000 if (prev_ProcessUtility) 1001 prev_ProcessUtility(pstmt, queryString, 1002 context, params, queryEnv, 1003 dest, completionTag); 1004 else 1005 standard_ProcessUtility(pstmt, queryString, 1006 context, params, queryEnv, 1007 dest, completionTag); 1008 nested_level--; 1009 } 1010 PG_CATCH(); 1011 { 1012 nested_level--; 1013 PG_RE_THROW(); 1014 } 1015 PG_END_TRY(); 1016 1017 INSTR_TIME_SET_CURRENT(duration); 1018 INSTR_TIME_SUBTRACT(duration, start); 1019 1020 /* parse command tag to retrieve the number of affected rows. */ 1021 if (completionTag && 1022 strncmp(completionTag, "COPY ", 5) == 0) 1023 rows = pg_strtouint64(completionTag + 5, NULL, 10); 1024 else 1025 rows = 0; 1026 1027 /* calc differences of buffer counters. 
*/ 1028 bufusage.shared_blks_hit = 1029 pgBufferUsage.shared_blks_hit - bufusage_start.shared_blks_hit; 1030 bufusage.shared_blks_read = 1031 pgBufferUsage.shared_blks_read - bufusage_start.shared_blks_read; 1032 bufusage.shared_blks_dirtied = 1033 pgBufferUsage.shared_blks_dirtied - bufusage_start.shared_blks_dirtied; 1034 bufusage.shared_blks_written = 1035 pgBufferUsage.shared_blks_written - bufusage_start.shared_blks_written; 1036 bufusage.local_blks_hit = 1037 pgBufferUsage.local_blks_hit - bufusage_start.local_blks_hit; 1038 bufusage.local_blks_read = 1039 pgBufferUsage.local_blks_read - bufusage_start.local_blks_read; 1040 bufusage.local_blks_dirtied = 1041 pgBufferUsage.local_blks_dirtied - bufusage_start.local_blks_dirtied; 1042 bufusage.local_blks_written = 1043 pgBufferUsage.local_blks_written - bufusage_start.local_blks_written; 1044 bufusage.temp_blks_read = 1045 pgBufferUsage.temp_blks_read - bufusage_start.temp_blks_read; 1046 bufusage.temp_blks_written = 1047 pgBufferUsage.temp_blks_written - bufusage_start.temp_blks_written; 1048 bufusage.blk_read_time = pgBufferUsage.blk_read_time; 1049 INSTR_TIME_SUBTRACT(bufusage.blk_read_time, bufusage_start.blk_read_time); 1050 bufusage.blk_write_time = pgBufferUsage.blk_write_time; 1051 INSTR_TIME_SUBTRACT(bufusage.blk_write_time, bufusage_start.blk_write_time); 1052 1053 pgss_store(queryString, 1054 0, /* signal that it's a utility stmt */ 1055 pstmt->stmt_location, 1056 pstmt->stmt_len, 1057 INSTR_TIME_GET_MILLISEC(duration), 1058 rows, 1059 &bufusage, 1060 NULL); 1061 } 1062 else 1063 { 1064 if (prev_ProcessUtility) 1065 prev_ProcessUtility(pstmt, queryString, 1066 context, params, queryEnv, 1067 dest, completionTag); 1068 else 1069 standard_ProcessUtility(pstmt, queryString, 1070 context, params, queryEnv, 1071 dest, completionTag); 1072 } 1073 } 1074 1075 /* 1076 * Given an arbitrarily long query string, produce a hash for the purposes of 1077 * identifying the query, without normalizing constants. 
 * Used when hashing utility statements.
 *
 * Note that "str" need not be null-terminated; "len" bounds the hashed data.
 */
static uint64
pgss_hash_string(const char *str, int len)
{
	return DatumGetUInt64(hash_any_extended((const unsigned char *) str,
											len, 0));
}

/*
 * Store some statistics for a statement.
 *
 * If queryId is 0 then this is a utility statement and we should compute
 * a suitable queryId internally.
 *
 * If jstate is not NULL then we're trying to create an entry for which
 * we have no statistics as yet; we just want to record the normalized
 * query string.  total_time, rows, bufusage are ignored in this case.
 *
 * Locking: we search the hashtable under shared lock; only if the entry is
 * missing do we escalate to exclusive lock (releasing and reacquiring, so
 * the gc_count recheck below is needed).  Counter updates are protected by
 * the entry's own spinlock per the locking rules at the head of the file.
 */
static void
pgss_store(const char *query, uint64 queryId,
		   int query_location, int query_len,
		   double total_time, uint64 rows,
		   const BufferUsage *bufusage,
		   pgssJumbleState *jstate)
{
	pgssHashKey key;
	pgssEntry  *entry;
	char	   *norm_query = NULL;
	int			encoding = GetDatabaseEncoding();

	Assert(query != NULL);

	/* Safety check... */
	if (!pgss || !pgss_hash)
		return;

	/*
	 * Confine our attention to the relevant part of the string, if the query
	 * is a portion of a multi-statement source string.
	 *
	 * First apply starting offset, unless it's -1 (unknown).
	 */
	if (query_location >= 0)
	{
		Assert(query_location <= strlen(query));
		query += query_location;
		/* Length of 0 (or -1) means "rest of string" */
		if (query_len <= 0)
			query_len = strlen(query);
		else
			Assert(query_len <= strlen(query));
	}
	else
	{
		/* If query location is unknown, distrust query_len as well */
		query_location = 0;
		query_len = strlen(query);
	}

	/*
	 * Discard leading and trailing whitespace, too.  Use scanner_isspace()
	 * not libc's isspace(), because we want to match the lexer's behavior.
	 */
	while (query_len > 0 && scanner_isspace(query[0]))
		query++, query_location++, query_len--;
	while (query_len > 0 && scanner_isspace(query[query_len - 1]))
		query_len--;

	/*
	 * For utility statements, we just hash the query string to get an ID.
	 */
	if (queryId == UINT64CONST(0))
		queryId = pgss_hash_string(query, query_len);

	/* Set up key for hashtable search */
	key.userid = GetUserId();
	key.dbid = MyDatabaseId;
	key.queryid = queryId;

	/* Lookup the hash table entry with shared lock. */
	LWLockAcquire(pgss->lock, LW_SHARED);

	entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);

	/* Create new entry, if not present */
	if (!entry)
	{
		Size		query_offset;
		int			gc_count;
		bool		stored;
		bool		do_gc;

		/*
		 * Create a new, normalized query string if caller asked.  We don't
		 * need to hold the lock while doing this work.  (Note: in any case,
		 * it's possible that someone else creates a duplicate hashtable entry
		 * in the interval where we don't hold the lock below.  That case is
		 * handled by entry_alloc.)
		 */
		if (jstate)
		{
			LWLockRelease(pgss->lock);
			norm_query = generate_normalized_query(jstate, query,
												   query_location,
												   &query_len,
												   encoding);
			LWLockAcquire(pgss->lock, LW_SHARED);
		}

		/* Append new query text to file with only shared lock held */
		stored = qtext_store(norm_query ? norm_query : query, query_len,
							 &query_offset, &gc_count);

		/*
		 * Determine whether we need to garbage collect external query texts
		 * while the shared lock is still held.  This micro-optimization
		 * avoids taking the time to decide this while holding exclusive lock.
		 */
		do_gc = need_gc_qtexts();

		/* Need exclusive lock to make a new hashtable entry - promote */
		LWLockRelease(pgss->lock);
		LWLockAcquire(pgss->lock, LW_EXCLUSIVE);

		/*
		 * A garbage collection may have occurred while we weren't holding the
		 * lock.  In the unlikely event that this happens, the query text we
		 * stored above will have been garbage collected, so write it again.
		 * This should be infrequent enough that doing it while holding
		 * exclusive lock isn't a performance problem.
		 */
		if (!stored || pgss->gc_count != gc_count)
			stored = qtext_store(norm_query ? norm_query : query, query_len,
								 &query_offset, NULL);

		/* If we failed to write to the text file, give up */
		if (!stored)
			goto done;

		/* OK to create a new hashtable entry */
		entry = entry_alloc(&key, query_offset, query_len, encoding,
							jstate != NULL);

		/* If needed, perform garbage collection while exclusive lock held */
		if (do_gc)
			gc_qtexts();
	}

	/* Increment the counts, except when jstate is not NULL */
	if (!jstate)
	{
		/*
		 * Grab the spinlock while updating the counters (see comment about
		 * locking rules at the head of the file)
		 */
		volatile pgssEntry *e = (volatile pgssEntry *) entry;

		SpinLockAcquire(&e->mutex);

		/* "Unstick" entry if it was previously sticky */
		if (e->counters.calls == 0)
			e->counters.usage = USAGE_INIT;

		e->counters.calls += 1;
		e->counters.total_time += total_time;
		if (e->counters.calls == 1)
		{
			/* first real execution: min/max/mean are all this one's time */
			e->counters.min_time = total_time;
			e->counters.max_time = total_time;
			e->counters.mean_time = total_time;
		}
		else
		{
			/*
			 * Welford's method for accurately computing variance. See
			 * <http://www.johndcook.com/blog/standard_deviation/>
			 */
			double		old_mean = e->counters.mean_time;

			e->counters.mean_time +=
				(total_time - old_mean) / e->counters.calls;
			e->counters.sum_var_time +=
				(total_time - old_mean) * (total_time - e->counters.mean_time);

			/* calculate min and max time */
			if (e->counters.min_time > total_time)
				e->counters.min_time = total_time;
			if (e->counters.max_time < total_time)
				e->counters.max_time = total_time;
		}
		e->counters.rows += rows;
		e->counters.shared_blks_hit += bufusage->shared_blks_hit;
		e->counters.shared_blks_read += bufusage->shared_blks_read;
		e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
		e->counters.shared_blks_written += bufusage->shared_blks_written;
		e->counters.local_blks_hit += bufusage->local_blks_hit;
		e->counters.local_blks_read += bufusage->local_blks_read;
		e->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
		e->counters.local_blks_written += bufusage->local_blks_written;
		e->counters.temp_blks_read += bufusage->temp_blks_read;
		e->counters.temp_blks_written += bufusage->temp_blks_written;
		e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time);
		e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time);
		e->counters.usage += USAGE_EXEC(total_time);

		SpinLockRelease(&e->mutex);
	}

done:
	LWLockRelease(pgss->lock);

	/* We postpone this clean-up until we're out of the lock */
	if (norm_query)
		pfree(norm_query);
}

/*
 * Reset all statement statistics.
 */
Datum
pg_stat_statements_reset(PG_FUNCTION_ARGS)
{
	if (!pgss || !pgss_hash)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
	entry_reset();
	PG_RETURN_VOID();
}

/* Number of output arguments (columns) for various API versions */
#define PG_STAT_STATEMENTS_COLS_V1_0	14
#define PG_STAT_STATEMENTS_COLS_V1_1	18
#define PG_STAT_STATEMENTS_COLS_V1_2	19
#define PG_STAT_STATEMENTS_COLS_V1_3	23
#define PG_STAT_STATEMENTS_COLS			23	/* maximum of above */

/*
 * Retrieve statement statistics.
 *
 * The SQL API of this function has changed multiple times, and will likely
 * do so again in future.  To support the case where a newer version of this
 * loadable module is being used with an old SQL declaration of the function,
 * we continue to support the older API versions.  For 1.2 and later, the
 * expected API version is identified by embedding it in the C name of the
 * function.  Unfortunately we weren't bright enough to do that for 1.1.
 */
Datum
pg_stat_statements_1_3(PG_FUNCTION_ARGS)
{
	bool		showtext = PG_GETARG_BOOL(0);

	pg_stat_statements_internal(fcinfo, PGSS_V1_3, showtext);

	return (Datum) 0;
}

Datum
pg_stat_statements_1_2(PG_FUNCTION_ARGS)
{
	bool		showtext = PG_GETARG_BOOL(0);

	pg_stat_statements_internal(fcinfo, PGSS_V1_2, showtext);

	return (Datum) 0;
}

/*
 * Legacy entry point for pg_stat_statements() API versions 1.0 and 1.1.
 * This can be removed someday, perhaps.
 */
Datum
pg_stat_statements(PG_FUNCTION_ARGS)
{
	/* If it's really API 1.1, we'll figure that out below */
	pg_stat_statements_internal(fcinfo, PGSS_V1_0, true);

	return (Datum) 0;
}

/*
 * Common code for all versions of pg_stat_statements()
 *
 * Materializes one row per hashtable entry into a tuplestore, emitting the
 * column set appropriate to api_version.  Query texts are fetched from the
 * external text file, which we try to load speculatively before taking
 * pgss->lock to keep the locked section short.
 */
static void
pg_stat_statements_internal(FunctionCallInfo fcinfo,
							pgssVersion api_version,
							bool showtext)
{
	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
	TupleDesc	tupdesc;
	Tuplestorestate *tupstore;
	MemoryContext per_query_ctx;
	MemoryContext oldcontext;
	Oid			userid = GetUserId();
	bool		is_allowed_role = false;
	char	   *qbuffer = NULL;
	Size		qbuffer_size = 0;
	Size		extent = 0;
	int			gc_count = 0;
	HASH_SEQ_STATUS hash_seq;
	pgssEntry  *entry;

	/* Superusers or members of pg_read_all_stats members are allowed */
	/* NOTE(review): could reuse "userid" instead of a second GetUserId() call */
	is_allowed_role = is_member_of_role(GetUserId(), DEFAULT_ROLE_READ_ALL_STATS);

	/* hash table must exist already */
	if (!pgss || !pgss_hash)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));

	/* check to see if caller supports us returning a tuplestore */
	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("set-valued function called in context that cannot accept a set")));
	if (!(rsinfo->allowedModes & SFRM_Materialize))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("materialize mode required, but it is not " \
						"allowed in this context")));

	/* Switch into long-lived context to construct returned data structures */
	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
	oldcontext = MemoryContextSwitchTo(per_query_ctx);

	/* Build a tuple descriptor for our result type */
	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
		elog(ERROR, "return type must be a row type");

	/*
	 * Check we have the expected number of output arguments.  Aside from
	 * being a good safety check, we need a kluge here to detect API version
	 * 1.1, which was wedged into the code in an ill-considered way.
	 */
	switch (tupdesc->natts)
	{
		case PG_STAT_STATEMENTS_COLS_V1_0:
			if (api_version != PGSS_V1_0)
				elog(ERROR, "incorrect number of output arguments");
			break;
		case PG_STAT_STATEMENTS_COLS_V1_1:
			/* pg_stat_statements() should have told us 1.0 */
			if (api_version != PGSS_V1_0)
				elog(ERROR, "incorrect number of output arguments");
			api_version = PGSS_V1_1;
			break;
		case PG_STAT_STATEMENTS_COLS_V1_2:
			if (api_version != PGSS_V1_2)
				elog(ERROR, "incorrect number of output arguments");
			break;
		case PG_STAT_STATEMENTS_COLS_V1_3:
			if (api_version != PGSS_V1_3)
				elog(ERROR, "incorrect number of output arguments");
			break;
		default:
			elog(ERROR, "incorrect number of output arguments");
	}

	tupstore = tuplestore_begin_heap(true, false, work_mem);
	rsinfo->returnMode = SFRM_Materialize;
	rsinfo->setResult = tupstore;
	rsinfo->setDesc = tupdesc;

	MemoryContextSwitchTo(oldcontext);

	/*
	 * We'd like to load the query text file (if needed) while not holding any
	 * lock on pgss->lock.  In the worst case we'll have to do this again
	 * after we have the lock, but it's unlikely enough to make this a win
	 * despite occasional duplicated work.  We need to reload if anybody
	 * writes to the file (either a retail qtext_store(), or a garbage
	 * collection) between this point and where we've gotten shared lock.  If
	 * a qtext_store is actually in progress when we look, we might as well
	 * skip the speculative load entirely.
	 */
	if (showtext)
	{
		int			n_writers;

		/* Take the mutex so we can examine variables */
		{
			volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;

			SpinLockAcquire(&s->mutex);
			extent = s->extent;
			n_writers = s->n_writers;
			gc_count = s->gc_count;
			SpinLockRelease(&s->mutex);
		}

		/* No point in loading file now if there are active writers */
		if (n_writers == 0)
			qbuffer = qtext_load_file(&qbuffer_size);
	}

	/*
	 * Get shared lock, load or reload the query text file if we must, and
	 * iterate over the hashtable entries.
	 *
	 * With a large hash table, we might be holding the lock rather longer
	 * than one could wish.  However, this only blocks creation of new hash
	 * table entries, and the larger the hash table the less likely that is to
	 * be needed.  So we can hope this is okay.  Perhaps someday we'll decide
	 * we need to partition the hash table to limit the time spent holding any
	 * one lock.
	 */
	LWLockAcquire(pgss->lock, LW_SHARED);

	if (showtext)
	{
		/*
		 * Here it is safe to examine extent and gc_count without taking the
		 * mutex.  Note that although other processes might change
		 * pgss->extent just after we look at it, the strings they then write
		 * into the file cannot yet be referenced in the hashtable, so we
		 * don't care whether we see them or not.
		 *
		 * If qtext_load_file fails, we just press on; we'll return NULL for
		 * every query text.
		 */
		if (qbuffer == NULL ||
			pgss->extent != extent ||
			pgss->gc_count != gc_count)
		{
			if (qbuffer)
				free(qbuffer);
			qbuffer = qtext_load_file(&qbuffer_size);
		}
	}

	hash_seq_init(&hash_seq, pgss_hash);
	while ((entry = hash_seq_search(&hash_seq)) != NULL)
	{
		Datum		values[PG_STAT_STATEMENTS_COLS];
		bool		nulls[PG_STAT_STATEMENTS_COLS];
		int			i = 0;
		Counters	tmp;
		double		stddev;
		int64		queryid = entry->key.queryid;

		memset(values, 0, sizeof(values));
		memset(nulls, 0, sizeof(nulls));

		values[i++] = ObjectIdGetDatum(entry->key.userid);
		values[i++] = ObjectIdGetDatum(entry->key.dbid);

		if (is_allowed_role || entry->key.userid == userid)
		{
			if (api_version >= PGSS_V1_2)
				values[i++] = Int64GetDatumFast(queryid);

			if (showtext)
			{
				char	   *qstr = qtext_fetch(entry->query_offset,
											   entry->query_len,
											   qbuffer,
											   qbuffer_size);

				if (qstr)
				{
					char	   *enc;

					/* convert from the stored encoding to server encoding */
					enc = pg_any_to_server(qstr,
										   entry->query_len,
										   entry->encoding);

					values[i++] = CStringGetTextDatum(enc);

					/* pg_any_to_server may return its input unchanged */
					if (enc != qstr)
						pfree(enc);
				}
				else
				{
					/* Just return a null if we fail to find the text */
					nulls[i++] = true;
				}
			}
			else
			{
				/* Query text not requested */
				nulls[i++] = true;
			}
		}
		else
		{
			/* Don't show queryid */
			if (api_version >= PGSS_V1_2)
				nulls[i++] = true;

			/*
			 * Don't show query text, but hint as to the reason for not doing
			 * so if it was requested
			 */
			if (showtext)
				values[i++] = CStringGetTextDatum("<insufficient privilege>");
			else
				nulls[i++] = true;
		}

		/* copy counters to a local variable to keep locking time short */
		{
			volatile pgssEntry *e = (volatile pgssEntry *) entry;

			SpinLockAcquire(&e->mutex);
			tmp = e->counters;
			SpinLockRelease(&e->mutex);
		}

		/* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
		if (tmp.calls == 0)
			continue;

		values[i++] = Int64GetDatumFast(tmp.calls);
		values[i++] = Float8GetDatumFast(tmp.total_time);
		if (api_version >= PGSS_V1_3)
		{
			values[i++] = Float8GetDatumFast(tmp.min_time);
			values[i++] = Float8GetDatumFast(tmp.max_time);
			values[i++] = Float8GetDatumFast(tmp.mean_time);

			/*
			 * Note we are calculating the population variance here, not the
			 * sample variance, as we have data for the whole population, so
			 * Bessel's correction is not used, and we don't divide by
			 * tmp.calls - 1.
			 */
			if (tmp.calls > 1)
				stddev = sqrt(tmp.sum_var_time / tmp.calls);
			else
				stddev = 0.0;
			values[i++] = Float8GetDatumFast(stddev);
		}
		values[i++] = Int64GetDatumFast(tmp.rows);
		values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
		values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
		if (api_version >= PGSS_V1_1)
			values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied);
		values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
		values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
		values[i++] = Int64GetDatumFast(tmp.local_blks_read);
		if (api_version >= PGSS_V1_1)
			values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied);
		values[i++] = Int64GetDatumFast(tmp.local_blks_written);
		values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
		values[i++] = Int64GetDatumFast(tmp.temp_blks_written);
		if (api_version >= PGSS_V1_1)
		{
			values[i++] = Float8GetDatumFast(tmp.blk_read_time);
			values[i++] = Float8GetDatumFast(tmp.blk_write_time);
		}

		Assert(i == (api_version == PGSS_V1_0 ? PG_STAT_STATEMENTS_COLS_V1_0 :
					 api_version == PGSS_V1_1 ? PG_STAT_STATEMENTS_COLS_V1_1 :
					 api_version == PGSS_V1_2 ? PG_STAT_STATEMENTS_COLS_V1_2 :
					 api_version == PGSS_V1_3 ? PG_STAT_STATEMENTS_COLS_V1_3 :
					 -1 /* fail if you forget to update this assert */ ));

		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
	}

	/* clean up and return the tuplestore */
	LWLockRelease(pgss->lock);

	if (qbuffer)
		free(qbuffer);

	tuplestore_donestoring(tupstore);
}

/*
 * Estimate shared memory space needed.
 */
static Size
pgss_memsize(void)
{
	Size		size;

	size = MAXALIGN(sizeof(pgssSharedState));
	size = add_size(size, hash_estimate_size(pgss_max, sizeof(pgssEntry)));

	return size;
}

/*
 * Allocate a new hashtable entry.
 * caller must hold an exclusive lock on pgss->lock
 *
 * "query" need not be null-terminated; we rely on query_len instead
 *
 * If "sticky" is true, make the new entry artificially sticky so that it will
 * probably still be there when the query finishes execution.  We do this by
 * giving it a median usage value rather than the normal value.  (Strictly
 * speaking, query strings are normalized on a best effort basis, though it
 * would be difficult to demonstrate this even under artificial conditions.)
 *
 * Note: despite needing exclusive lock, it's not an error for the target
 * entry to already exist.  This is because pgss_store releases and
 * reacquires lock after failing to find a match; so someone else could
 * have made the entry while we waited to get exclusive lock.
 */
static pgssEntry *
entry_alloc(pgssHashKey *key, Size query_offset, int query_len, int encoding,
			bool sticky)
{
	pgssEntry  *entry;
	bool		found;

	/* Make space if needed */
	while (hash_get_num_entries(pgss_hash) >= pgss_max)
		entry_dealloc();

	/* Find or create an entry with desired hash code */
	entry = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER, &found);

	if (!found)
	{
		/* New entry, initialize it */

		/* reset the statistics */
		memset(&entry->counters, 0, sizeof(Counters));
		/* set the appropriate initial usage count */
		entry->counters.usage = sticky ? pgss->cur_median_usage : USAGE_INIT;
		/* re-initialize the mutex each time ... we assume no one using it */
		SpinLockInit(&entry->mutex);
		/* ... and don't forget the query text metadata */
		Assert(query_len >= 0);
		entry->query_offset = query_offset;
		entry->query_len = query_len;
		entry->encoding = encoding;
	}

	return entry;
}

/*
 * qsort comparator for sorting into increasing usage order
 */
static int
entry_cmp(const void *lhs, const void *rhs)
{
	double		l_usage = (*(pgssEntry *const *) lhs)->counters.usage;
	double		r_usage = (*(pgssEntry *const *) rhs)->counters.usage;

	if (l_usage < r_usage)
		return -1;
	else if (l_usage > r_usage)
		return +1;
	else
		return 0;
}

/*
 * Deallocate least-used entries.
 *
 * Caller must hold an exclusive lock on pgss->lock.
 */
static void
entry_dealloc(void)
{
	HASH_SEQ_STATUS hash_seq;
	pgssEntry **entries;
	pgssEntry  *entry;
	int			nvictims;
	int			i;
	Size		tottextlen;
	int			nvalidtexts;

	/*
	 * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
	 * While we're scanning the table, apply the decay factor to the usage
	 * values, and update the mean query length.
	 *
	 * Note that the mean query length is almost immediately obsolete, since
	 * we compute it before not after discarding the least-used entries.
	 * Hopefully, that doesn't affect the mean too much; it doesn't seem worth
	 * making two passes to get a more current result.  Likewise, the new
	 * cur_median_usage includes the entries we're about to zap.
	 */

	entries = palloc(hash_get_num_entries(pgss_hash) * sizeof(pgssEntry *));

	i = 0;
	tottextlen = 0;
	nvalidtexts = 0;

	hash_seq_init(&hash_seq, pgss_hash);
	while ((entry = hash_seq_search(&hash_seq)) != NULL)
	{
		entries[i++] = entry;
		/* "Sticky" entries get a different usage decay rate. */
		if (entry->counters.calls == 0)
			entry->counters.usage *= STICKY_DECREASE_FACTOR;
		else
			entry->counters.usage *= USAGE_DECREASE_FACTOR;
		/* In the mean length computation, ignore dropped texts. */
		if (entry->query_len >= 0)
		{
			/* +1 accounts for the terminating null stored in the file */
			tottextlen += entry->query_len + 1;
			nvalidtexts++;
		}
	}

	/* Sort into increasing order by usage */
	qsort(entries, i, sizeof(pgssEntry *), entry_cmp);

	/* Record the (approximate) median usage */
	if (i > 0)
		pgss->cur_median_usage = entries[i / 2]->counters.usage;
	/* Record the mean query length */
	if (nvalidtexts > 0)
		pgss->mean_query_len = tottextlen / nvalidtexts;
	else
		pgss->mean_query_len = ASSUMED_LENGTH_INIT;

	/* Now zap an appropriate fraction of lowest-usage entries */
	nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
	nvictims = Min(nvictims, i);

	for (i = 0; i < nvictims; i++)
	{
		hash_search(pgss_hash, &entries[i]->key, HASH_REMOVE, NULL);
	}

	pfree(entries);
}

/*
 * Given a query string (not necessarily null-terminated), allocate a new
 * entry in the external query text file and store the string there.
 *
 * If successful, returns true, and stores the new entry's offset in the file
 * into *query_offset.  Also, if gc_count isn't NULL, *gc_count is set to the
 * number of garbage collections that have occurred so far.
 *
 * On failure, returns false.
 *
 * At least a shared lock on pgss->lock must be held by the caller, so as
 * to prevent a concurrent garbage collection.  Share-lock-holding callers
 * should pass a gc_count pointer to obtain the number of garbage collections,
 * so that they can recheck the count after obtaining exclusive lock to
 * detect whether a garbage collection occurred (and removed this entry).
 */
static bool
qtext_store(const char *query, int query_len,
			Size *query_offset, int *gc_count)
{
	Size		off;
	int			fd;

	/*
	 * We use a spinlock to protect extent/n_writers/gc_count, so that
	 * multiple processes may execute this function concurrently.
	 *
	 * File space is reserved under the spinlock; the actual write below is
	 * done without it, which is safe because each writer owns a disjoint
	 * region of the file.
	 */
	{
		volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;

		SpinLockAcquire(&s->mutex);
		off = s->extent;
		s->extent += query_len + 1;
		s->n_writers++;
		if (gc_count)
			*gc_count = s->gc_count;
		SpinLockRelease(&s->mutex);
	}

	*query_offset = off;

	/* Now write the data into the successfully-reserved part of the file */
	fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
	if (fd < 0)
		goto error;

	if (lseek(fd, off, SEEK_SET) != off)
		goto error;

	if (write(fd, query, query_len) != query_len)
		goto error;
	if (write(fd, "\0", 1) != 1)
		goto error;

	CloseTransientFile(fd);

	/* Mark our write complete */
	{
		volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;

		SpinLockAcquire(&s->mutex);
		s->n_writers--;
		SpinLockRelease(&s->mutex);
	}

	return true;

error:
	ereport(LOG,
			(errcode_for_file_access(),
			 errmsg("could not write pg_stat_statement file \"%s\": %m",
					PGSS_TEXT_FILE)));

	if (fd >= 0)
		CloseTransientFile(fd);

	/* Mark our write complete */
	{
		volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;

		SpinLockAcquire(&s->mutex);
		s->n_writers--;
		SpinLockRelease(&s->mutex);
	}

	return false;
}

/*
 * Read the external query text file into a malloc'd buffer.
 *
 * Returns NULL (without throwing an error) if unable to read, eg
 * file not there or insufficient memory.
 *
 * On success, the buffer size is also returned into *buffer_size.
 *
 * This can be called without any lock on pgss->lock, but in that case
 * the caller is responsible for verifying that the result is sane.
 */
static char *
qtext_load_file(Size *buffer_size)
{
	char	   *buf;
	int			fd;
	struct stat stat;
	Size		nread;

	fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDONLY | PG_BINARY);
	if (fd < 0)
	{
		/* ENOENT just means no texts stored yet; stay quiet about it */
		if (errno != ENOENT)
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not read pg_stat_statement file \"%s\": %m",
							PGSS_TEXT_FILE)));
		return NULL;
	}

	/* Get file length */
	if (fstat(fd, &stat))
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not stat pg_stat_statement file \"%s\": %m",
						PGSS_TEXT_FILE)));
		CloseTransientFile(fd);
		return NULL;
	}

	/* Allocate buffer; beware that off_t might be wider than size_t */
	if (stat.st_size <= MaxAllocHugeSize)
		buf = (char *) malloc(stat.st_size);
	else
		buf = NULL;
	if (buf == NULL)
	{
		ereport(LOG,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory"),
				 errdetail("Could not allocate enough memory to read pg_stat_statement file \"%s\".",
						   PGSS_TEXT_FILE)));
		CloseTransientFile(fd);
		return NULL;
	}

	/*
	 * OK, slurp in the file.  Windows fails if we try to read more than
	 * INT_MAX bytes at once, and other platforms might not like that either,
	 * so read a very large file in 1GB segments.
	 */
	nread = 0;
	while (nread < stat.st_size)
	{
		int			toread = Min(1024 * 1024 * 1024, stat.st_size - nread);

		/*
		 * If we get a short read and errno doesn't get set, the reason is
		 * probably that garbage collection truncated the file since we did
		 * the fstat(), so we don't log a complaint --- but we don't return
		 * the data, either, since it's most likely corrupt due to concurrent
		 * writes from garbage collection.
		 */
		errno = 0;
		if (read(fd, buf + nread, toread) != toread)
		{
			if (errno)
				ereport(LOG,
						(errcode_for_file_access(),
						 errmsg("could not read pg_stat_statement file \"%s\": %m",
								PGSS_TEXT_FILE)));
			free(buf);
			CloseTransientFile(fd);
			return NULL;
		}
		nread += toread;
	}

	CloseTransientFile(fd);

	*buffer_size = nread;
	return buf;
}

/*
 * Locate a query text in the file image previously read by qtext_load_file().
 *
 * We validate the given offset/length, and return NULL if bogus.  Otherwise,
 * the result points to a null-terminated string within the buffer.
 */
static char *
qtext_fetch(Size query_offset, int query_len,
			char *buffer, Size buffer_size)
{
	/* File read failed? */
	if (buffer == NULL)
		return NULL;
	/* Bogus offset/length? */
	if (query_len < 0 ||
		query_offset + query_len >= buffer_size)
		return NULL;
	/* As a further sanity check, make sure there's a trailing null */
	if (buffer[query_offset + query_len] != '\0')
		return NULL;
	/* Looks OK */
	return buffer + query_offset;
}

/*
 * Do we need to garbage-collect the external query text file?
 *
 * Caller should hold at least a shared lock on pgss->lock.
 */
static bool
need_gc_qtexts(void)
{
	Size		extent;

	/* Read shared extent pointer */
	{
		volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;

		SpinLockAcquire(&s->mutex);
		extent = s->extent;
		SpinLockRelease(&s->mutex);
	}

	/* Don't proceed if file does not exceed 512 bytes per possible entry */
	if (extent < 512 * pgss_max)
		return false;

	/*
	 * Don't proceed if file is less than about 50% bloat.  Nothing can or
	 * should be done in the event of unusually large query texts accounting
	 * for file's large size.  We go to the trouble of maintaining the mean
	 * query length in order to prevent garbage collection from thrashing
	 * uselessly.
	 */
	if (extent < pgss->mean_query_len * pgss_max * 2)
		return false;

	return true;
}

/*
 * Garbage-collect orphaned query texts in external file.
 *
 * This won't be called often in the typical case, since it's likely that
 * there won't be too much churn, and besides, a similar compaction process
 * occurs when serializing to disk at shutdown or as part of resetting.
 * Despite this, it seems prudent to plan for the edge case where the file
 * becomes unreasonably large, with no other method of compaction likely to
 * occur in the foreseeable future.
 *
 * The caller must hold an exclusive lock on pgss->lock.
 *
 * At the first sign of trouble we unlink the query text file to get a clean
 * slate (although existing statistics are retained), rather than risk
 * thrashing by allowing the same problem case to recur indefinitely.
 */
static void
gc_qtexts(void)
{
	char	   *qbuffer;
	Size		qbuffer_size;
	FILE	   *qfile = NULL;
	HASH_SEQ_STATUS hash_seq;
	pgssEntry  *entry;
	Size		extent;
	int			nentries;

	/*
	 * When called from pgss_store, some other session might have proceeded
	 * with garbage collection in the no-lock-held interim of lock strength
	 * escalation.  Check once more that this is actually necessary.
	 */
	if (!need_gc_qtexts())
		return;

	/*
	 * Load the old texts file.  If we fail (out of memory, for instance),
	 * invalidate query texts.  Hopefully this is rare.  It might seem better
	 * to leave things alone on an OOM failure, but the problem is that the
	 * file is only going to get bigger; hoping for a future non-OOM result is
	 * risky and can easily lead to complete denial of service.
2088 */ 2089 qbuffer = qtext_load_file(&qbuffer_size); 2090 if (qbuffer == NULL) 2091 goto gc_fail; 2092 2093 /* 2094 * We overwrite the query texts file in place, so as to reduce the risk of 2095 * an out-of-disk-space failure. Since the file is guaranteed not to get 2096 * larger, this should always work on traditional filesystems; though we 2097 * could still lose on copy-on-write filesystems. 2098 */ 2099 qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W); 2100 if (qfile == NULL) 2101 { 2102 ereport(LOG, 2103 (errcode_for_file_access(), 2104 errmsg("could not write pg_stat_statement file \"%s\": %m", 2105 PGSS_TEXT_FILE))); 2106 goto gc_fail; 2107 } 2108 2109 extent = 0; 2110 nentries = 0; 2111 2112 hash_seq_init(&hash_seq, pgss_hash); 2113 while ((entry = hash_seq_search(&hash_seq)) != NULL) 2114 { 2115 int query_len = entry->query_len; 2116 char *qry = qtext_fetch(entry->query_offset, 2117 query_len, 2118 qbuffer, 2119 qbuffer_size); 2120 2121 if (qry == NULL) 2122 { 2123 /* Trouble ... drop the text */ 2124 entry->query_offset = 0; 2125 entry->query_len = -1; 2126 /* entry will not be counted in mean query length computation */ 2127 continue; 2128 } 2129 2130 if (fwrite(qry, 1, query_len + 1, qfile) != query_len + 1) 2131 { 2132 ereport(LOG, 2133 (errcode_for_file_access(), 2134 errmsg("could not write pg_stat_statement file \"%s\": %m", 2135 PGSS_TEXT_FILE))); 2136 hash_seq_term(&hash_seq); 2137 goto gc_fail; 2138 } 2139 2140 entry->query_offset = extent; 2141 extent += query_len + 1; 2142 nentries++; 2143 } 2144 2145 /* 2146 * Truncate away any now-unused space. If this fails for some odd reason, 2147 * we log it, but there's no need to fail. 
2148 */ 2149 if (ftruncate(fileno(qfile), extent) != 0) 2150 ereport(LOG, 2151 (errcode_for_file_access(), 2152 errmsg("could not truncate pg_stat_statement file \"%s\": %m", 2153 PGSS_TEXT_FILE))); 2154 2155 if (FreeFile(qfile)) 2156 { 2157 ereport(LOG, 2158 (errcode_for_file_access(), 2159 errmsg("could not write pg_stat_statement file \"%s\": %m", 2160 PGSS_TEXT_FILE))); 2161 qfile = NULL; 2162 goto gc_fail; 2163 } 2164 2165 elog(DEBUG1, "pgss gc of queries file shrunk size from %zu to %zu", 2166 pgss->extent, extent); 2167 2168 /* Reset the shared extent pointer */ 2169 pgss->extent = extent; 2170 2171 /* 2172 * Also update the mean query length, to be sure that need_gc_qtexts() 2173 * won't still think we have a problem. 2174 */ 2175 if (nentries > 0) 2176 pgss->mean_query_len = extent / nentries; 2177 else 2178 pgss->mean_query_len = ASSUMED_LENGTH_INIT; 2179 2180 free(qbuffer); 2181 2182 /* 2183 * OK, count a garbage collection cycle. (Note: even though we have 2184 * exclusive lock on pgss->lock, we must take pgss->mutex for this, since 2185 * other processes may examine gc_count while holding only the mutex. 2186 * Also, we have to advance the count *after* we've rewritten the file, 2187 * else other processes might not realize they read a stale file.) 2188 */ 2189 record_gc_qtexts(); 2190 2191 return; 2192 2193 gc_fail: 2194 /* clean up resources */ 2195 if (qfile) 2196 FreeFile(qfile); 2197 if (qbuffer) 2198 free(qbuffer); 2199 2200 /* 2201 * Since the contents of the external file are now uncertain, mark all 2202 * hashtable entries as having invalid texts. 
2203 */ 2204 hash_seq_init(&hash_seq, pgss_hash); 2205 while ((entry = hash_seq_search(&hash_seq)) != NULL) 2206 { 2207 entry->query_offset = 0; 2208 entry->query_len = -1; 2209 } 2210 2211 /* 2212 * Destroy the query text file and create a new, empty one 2213 */ 2214 (void) unlink(PGSS_TEXT_FILE); 2215 qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W); 2216 if (qfile == NULL) 2217 ereport(LOG, 2218 (errcode_for_file_access(), 2219 errmsg("could not write new pg_stat_statement file \"%s\": %m", 2220 PGSS_TEXT_FILE))); 2221 else 2222 FreeFile(qfile); 2223 2224 /* Reset the shared extent pointer */ 2225 pgss->extent = 0; 2226 2227 /* Reset mean_query_len to match the new state */ 2228 pgss->mean_query_len = ASSUMED_LENGTH_INIT; 2229 2230 /* 2231 * Bump the GC count even though we failed. 2232 * 2233 * This is needed to make concurrent readers of file without any lock on 2234 * pgss->lock notice existence of new version of file. Once readers 2235 * subsequently observe a change in GC count with pgss->lock held, that 2236 * forces a safe reopen of file. Writers also require that we bump here, 2237 * of course. (As required by locking protocol, readers and writers don't 2238 * trust earlier file contents until gc_count is found unchanged after 2239 * pgss->lock acquired in shared or exclusive mode respectively.) 2240 */ 2241 record_gc_qtexts(); 2242 } 2243 2244 /* 2245 * Release all entries. 2246 */ 2247 static void 2248 entry_reset(void) 2249 { 2250 HASH_SEQ_STATUS hash_seq; 2251 pgssEntry *entry; 2252 FILE *qfile; 2253 2254 LWLockAcquire(pgss->lock, LW_EXCLUSIVE); 2255 2256 hash_seq_init(&hash_seq, pgss_hash); 2257 while ((entry = hash_seq_search(&hash_seq)) != NULL) 2258 { 2259 hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL); 2260 } 2261 2262 /* 2263 * Write new empty query file, perhaps even creating a new one to recover 2264 * if the file was missing. 
2265 */ 2266 qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W); 2267 if (qfile == NULL) 2268 { 2269 ereport(LOG, 2270 (errcode_for_file_access(), 2271 errmsg("could not create pg_stat_statement file \"%s\": %m", 2272 PGSS_TEXT_FILE))); 2273 goto done; 2274 } 2275 2276 /* If ftruncate fails, log it, but it's not a fatal problem */ 2277 if (ftruncate(fileno(qfile), 0) != 0) 2278 ereport(LOG, 2279 (errcode_for_file_access(), 2280 errmsg("could not truncate pg_stat_statement file \"%s\": %m", 2281 PGSS_TEXT_FILE))); 2282 2283 FreeFile(qfile); 2284 2285 done: 2286 pgss->extent = 0; 2287 /* This counts as a query text garbage collection for our purposes */ 2288 record_gc_qtexts(); 2289 2290 LWLockRelease(pgss->lock); 2291 } 2292 2293 /* 2294 * AppendJumble: Append a value that is substantive in a given query to 2295 * the current jumble. 2296 */ 2297 static void 2298 AppendJumble(pgssJumbleState *jstate, const unsigned char *item, Size size) 2299 { 2300 unsigned char *jumble = jstate->jumble; 2301 Size jumble_len = jstate->jumble_len; 2302 2303 /* 2304 * Whenever the jumble buffer is full, we hash the current contents and 2305 * reset the buffer to contain just that hash value, thus relying on the 2306 * hash to summarize everything so far. 2307 */ 2308 while (size > 0) 2309 { 2310 Size part_size; 2311 2312 if (jumble_len >= JUMBLE_SIZE) 2313 { 2314 uint64 start_hash; 2315 2316 start_hash = DatumGetUInt64(hash_any_extended(jumble, 2317 JUMBLE_SIZE, 0)); 2318 memcpy(jumble, &start_hash, sizeof(start_hash)); 2319 jumble_len = sizeof(start_hash); 2320 } 2321 part_size = Min(size, JUMBLE_SIZE - jumble_len); 2322 memcpy(jumble + jumble_len, item, part_size); 2323 jumble_len += part_size; 2324 item += part_size; 2325 size -= part_size; 2326 } 2327 jstate->jumble_len = jumble_len; 2328 } 2329 2330 /* 2331 * Wrappers around AppendJumble to encapsulate details of serialization 2332 * of individual local variable elements. 
 */
/* Append a scalar field's bytes to the jumble; "jstate" must be in scope */
#define APP_JUMB(item) \
	AppendJumble(jstate, (const unsigned char *) &(item), sizeof(item))
/* Append a null-terminated string, including its terminator */
#define APP_JUMB_STRING(str) \
	AppendJumble(jstate, (const unsigned char *) (str), strlen(str) + 1)

/*
 * JumbleQuery: Selectively serialize the query tree, appending significant
 * data to the "query jumble" while ignoring nonsignificant data.
 *
 * Rule of thumb for what to include is that we should ignore anything not
 * semantically significant (such as alias names) as well as anything that can
 * be deduced from child nodes (else we'd just be double-hashing that piece
 * of information).
 *
 * Note: the order of the appends below is significant — the resulting
 * queryId hash depends on the exact byte sequence fed to the jumble.
 */
static void
JumbleQuery(pgssJumbleState *jstate, Query *query)
{
	Assert(IsA(query, Query));
	Assert(query->utilityStmt == NULL);

	APP_JUMB(query->commandType);
	/* resultRelation is usually predictable from commandType */
	JumbleExpr(jstate, (Node *) query->cteList);
	JumbleRangeTable(jstate, query->rtable);
	JumbleExpr(jstate, (Node *) query->jointree);
	JumbleExpr(jstate, (Node *) query->targetList);
	JumbleExpr(jstate, (Node *) query->onConflict);
	JumbleExpr(jstate, (Node *) query->returningList);
	JumbleExpr(jstate, (Node *) query->groupClause);
	JumbleExpr(jstate, (Node *) query->groupingSets);
	JumbleExpr(jstate, query->havingQual);
	JumbleExpr(jstate, (Node *) query->windowClause);
	JumbleExpr(jstate, (Node *) query->distinctClause);
	JumbleExpr(jstate, (Node *) query->sortClause);
	JumbleExpr(jstate, query->limitOffset);
	JumbleExpr(jstate, query->limitCount);
	/* we ignore rowMarks */
	JumbleExpr(jstate, query->setOperations);
}

/*
 * Jumble a range table
 *
 * For each RTE we append its kind, then whatever fields identify the
 * referenced object for that particular kind.
 */
static void
JumbleRangeTable(pgssJumbleState *jstate, List *rtable)
{
	ListCell   *lc;

	foreach(lc, rtable)
	{
		RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc);

		APP_JUMB(rte->rtekind);
		switch (rte->rtekind)
		{
			case RTE_RELATION:
				APP_JUMB(rte->relid);
				JumbleExpr(jstate, (Node *) rte->tablesample);
				break;
			case RTE_SUBQUERY:
				JumbleQuery(jstate, rte->subquery);
				break;
			case RTE_JOIN:
				APP_JUMB(rte->jointype);
				break;
			case RTE_FUNCTION:
				JumbleExpr(jstate, (Node *) rte->functions);
				break;
			case RTE_TABLEFUNC:
				JumbleExpr(jstate, (Node *) rte->tablefunc);
				break;
			case RTE_VALUES:
				JumbleExpr(jstate, (Node *) rte->values_lists);
				break;
			case RTE_CTE:

				/*
				 * Depending on the CTE name here isn't ideal, but it's the
				 * only info we have to identify the referenced WITH item.
				 */
				APP_JUMB_STRING(rte->ctename);
				APP_JUMB(rte->ctelevelsup);
				break;
			case RTE_NAMEDTUPLESTORE:
				APP_JUMB_STRING(rte->enrname);
				break;
			default:
				elog(ERROR, "unrecognized RTE kind: %d", (int) rte->rtekind);
				break;
		}
	}
}

/*
 * Jumble an expression tree
 *
 * In general this function should handle all the same node types that
 * expression_tree_walker() does, and therefore it's coded to be as parallel
 * to that function as possible.  However, since we are only invoked on
 * queries immediately post-parse-analysis, we need not handle node types
 * that only appear in planning.
 *
 * Note: the reason we don't simply use expression_tree_walker() is that the
 * point of that function is to support tree walkers that don't care about
 * most tree node types, but here we care about all types.  We should complain
 * about any unrecognized node type.
 */
static void
JumbleExpr(pgssJumbleState *jstate, Node *node)
{
	ListCell   *temp;

	if (node == NULL)
		return;

	/* Guard against stack overflow due to overly complex expressions */
	check_stack_depth();

	/*
	 * We always emit the node's NodeTag, then any additional fields that are
	 * considered significant, and then we recurse to any child nodes.
	 * (Emitting the tag first keeps differently-shaped trees from colliding
	 * even when their leaf fields happen to match.)
	 */
	APP_JUMB(node->type);

	switch (nodeTag(node))
	{
		case T_Var:
			{
				Var		   *var = (Var *) node;

				APP_JUMB(var->varno);
				APP_JUMB(var->varattno);
				APP_JUMB(var->varlevelsup);
			}
			break;
		case T_Const:
			{
				Const	   *c = (Const *) node;

				/* We jumble only the constant's type, not its value */
				APP_JUMB(c->consttype);
				/* Also, record its parse location for query normalization */
				RecordConstLocation(jstate, c->location);
			}
			break;
		case T_Param:
			{
				Param	   *p = (Param *) node;

				APP_JUMB(p->paramkind);
				APP_JUMB(p->paramid);
				APP_JUMB(p->paramtype);
				/* Also, track the highest external Param id */
				if (p->paramkind == PARAM_EXTERN &&
					p->paramid > jstate->highest_extern_param_id)
					jstate->highest_extern_param_id = p->paramid;
			}
			break;
		case T_Aggref:
			{
				Aggref	   *expr = (Aggref *) node;

				APP_JUMB(expr->aggfnoid);
				JumbleExpr(jstate, (Node *) expr->aggdirectargs);
				JumbleExpr(jstate, (Node *) expr->args);
				JumbleExpr(jstate, (Node *) expr->aggorder);
				JumbleExpr(jstate, (Node *) expr->aggdistinct);
				JumbleExpr(jstate, (Node *) expr->aggfilter);
			}
			break;
		case T_GroupingFunc:
			{
				GroupingFunc *grpnode = (GroupingFunc *) node;

				JumbleExpr(jstate, (Node *) grpnode->refs);
			}
			break;
		case T_WindowFunc:
			{
				WindowFunc *expr = (WindowFunc *) node;

				APP_JUMB(expr->winfnoid);
				APP_JUMB(expr->winref);
				JumbleExpr(jstate, (Node *) expr->args);
				JumbleExpr(jstate, (Node *) expr->aggfilter);
			}
			break;
		case T_ArrayRef:
			{
				ArrayRef   *aref = (ArrayRef *) node;

				JumbleExpr(jstate, (Node *) aref->refupperindexpr);
				JumbleExpr(jstate, (Node *) aref->reflowerindexpr);
				JumbleExpr(jstate, (Node *) aref->refexpr);
				JumbleExpr(jstate, (Node *) aref->refassgnexpr);
			}
			break;
		case T_FuncExpr:
			{
				FuncExpr   *expr = (FuncExpr *) node;

				APP_JUMB(expr->funcid);
				JumbleExpr(jstate, (Node *) expr->args);
			}
			break;
		case T_NamedArgExpr:
			{
				NamedArgExpr *nae = (NamedArgExpr *) node;

				APP_JUMB(nae->argnumber);
				JumbleExpr(jstate, (Node *) nae->arg);
			}
			break;
		case T_OpExpr:
		case T_DistinctExpr:	/* struct-equivalent to OpExpr */
		case T_NullIfExpr:		/* struct-equivalent to OpExpr */
			{
				OpExpr	   *expr = (OpExpr *) node;

				APP_JUMB(expr->opno);
				JumbleExpr(jstate, (Node *) expr->args);
			}
			break;
		case T_ScalarArrayOpExpr:
			{
				ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) node;

				APP_JUMB(expr->opno);
				APP_JUMB(expr->useOr);
				JumbleExpr(jstate, (Node *) expr->args);
			}
			break;
		case T_BoolExpr:
			{
				BoolExpr   *expr = (BoolExpr *) node;

				APP_JUMB(expr->boolop);
				JumbleExpr(jstate, (Node *) expr->args);
			}
			break;
		case T_SubLink:
			{
				SubLink    *sublink = (SubLink *) node;

				APP_JUMB(sublink->subLinkType);
				APP_JUMB(sublink->subLinkId);
				JumbleExpr(jstate, (Node *) sublink->testexpr);
				JumbleQuery(jstate, castNode(Query, sublink->subselect));
			}
			break;
		case T_FieldSelect:
			{
				FieldSelect *fs = (FieldSelect *) node;

				APP_JUMB(fs->fieldnum);
				JumbleExpr(jstate, (Node *) fs->arg);
			}
			break;
		case T_FieldStore:
			{
				FieldStore *fstore = (FieldStore *) node;

				JumbleExpr(jstate, (Node *) fstore->arg);
				JumbleExpr(jstate, (Node *) fstore->newvals);
			}
			break;
		case T_RelabelType:
			{
				RelabelType *rt = (RelabelType *) node;

				APP_JUMB(rt->resulttype);
				JumbleExpr(jstate, (Node *) rt->arg);
			}
			break;
		case T_CoerceViaIO:
			{
				CoerceViaIO *cio = (CoerceViaIO *) node;

				APP_JUMB(cio->resulttype);
				JumbleExpr(jstate, (Node *) cio->arg);
			}
			break;
		case T_ArrayCoerceExpr:
			{
				ArrayCoerceExpr *acexpr = (ArrayCoerceExpr *) node;

				APP_JUMB(acexpr->resulttype);
				JumbleExpr(jstate, (Node *) acexpr->arg);
				JumbleExpr(jstate, (Node *) acexpr->elemexpr);
			}
			break;
		case T_ConvertRowtypeExpr:
			{
				ConvertRowtypeExpr *crexpr = (ConvertRowtypeExpr *) node;

				APP_JUMB(crexpr->resulttype);
				JumbleExpr(jstate, (Node *) crexpr->arg);
			}
			break;
		case T_CollateExpr:
			{
				CollateExpr *ce = (CollateExpr *) node;

				APP_JUMB(ce->collOid);
				JumbleExpr(jstate, (Node *) ce->arg);
			}
			break;
		case T_CaseExpr:
			{
				CaseExpr   *caseexpr = (CaseExpr *) node;

				JumbleExpr(jstate, (Node *) caseexpr->arg);
				foreach(temp, caseexpr->args)
				{
					CaseWhen   *when = lfirst_node(CaseWhen, temp);

					JumbleExpr(jstate, (Node *) when->expr);
					JumbleExpr(jstate, (Node *) when->result);
				}
				JumbleExpr(jstate, (Node *) caseexpr->defresult);
			}
			break;
		case T_CaseTestExpr:
			{
				CaseTestExpr *ct = (CaseTestExpr *) node;

				APP_JUMB(ct->typeId);
			}
			break;
		case T_ArrayExpr:
			JumbleExpr(jstate, (Node *) ((ArrayExpr *) node)->elements);
			break;
		case T_RowExpr:
			JumbleExpr(jstate, (Node *) ((RowExpr *) node)->args);
			break;
		case T_RowCompareExpr:
			{
				RowCompareExpr *rcexpr = (RowCompareExpr *) node;

				APP_JUMB(rcexpr->rctype);
				JumbleExpr(jstate, (Node *) rcexpr->largs);
				JumbleExpr(jstate, (Node *) rcexpr->rargs);
			}
			break;
		case T_CoalesceExpr:
			JumbleExpr(jstate, (Node *) ((CoalesceExpr *) node)->args);
			break;
		case T_MinMaxExpr:
			{
				MinMaxExpr *mmexpr = (MinMaxExpr *) node;

				APP_JUMB(mmexpr->op);
				JumbleExpr(jstate, (Node *) mmexpr->args);
			}
			break;
		case T_SQLValueFunction:
			{
				SQLValueFunction *svf = (SQLValueFunction *) node;

				APP_JUMB(svf->op);
				/* type is fully determined by op */
				APP_JUMB(svf->typmod);
			}
			break;
		case T_XmlExpr:
			{
				XmlExpr    *xexpr = (XmlExpr *) node;

				APP_JUMB(xexpr->op);
				JumbleExpr(jstate, (Node *) xexpr->named_args);
				JumbleExpr(jstate, (Node *) xexpr->args);
			}
			break;
		case T_NullTest:
			{
				NullTest   *nt = (NullTest *) node;

				APP_JUMB(nt->nulltesttype);
				JumbleExpr(jstate, (Node *) nt->arg);
			}
			break;
		case T_BooleanTest:
			{
				BooleanTest *bt = (BooleanTest *) node;

				APP_JUMB(bt->booltesttype);
				JumbleExpr(jstate, (Node *) bt->arg);
			}
			break;
		case T_CoerceToDomain:
			{
				CoerceToDomain *cd = (CoerceToDomain *) node;

				APP_JUMB(cd->resulttype);
				JumbleExpr(jstate, (Node *) cd->arg);
			}
			break;
		case T_CoerceToDomainValue:
			{
				CoerceToDomainValue *cdv = (CoerceToDomainValue *) node;

				APP_JUMB(cdv->typeId);
			}
			break;
		case T_SetToDefault:
			{
				SetToDefault *sd = (SetToDefault *) node;

				APP_JUMB(sd->typeId);
			}
			break;
		case T_CurrentOfExpr:
			{
				CurrentOfExpr *ce = (CurrentOfExpr *) node;

				APP_JUMB(ce->cvarno);
				if (ce->cursor_name)
					APP_JUMB_STRING(ce->cursor_name);
				APP_JUMB(ce->cursor_param);
			}
			break;
		case T_NextValueExpr:
			{
				NextValueExpr *nve = (NextValueExpr *) node;

				APP_JUMB(nve->seqid);
				APP_JUMB(nve->typeId);
			}
			break;
		case T_InferenceElem:
			{
				InferenceElem *ie = (InferenceElem *) node;

				APP_JUMB(ie->infercollid);
				APP_JUMB(ie->inferopclass);
				JumbleExpr(jstate, ie->expr);
			}
			break;
		case T_TargetEntry:
			{
				TargetEntry *tle = (TargetEntry *) node;

				APP_JUMB(tle->resno);
				APP_JUMB(tle->ressortgroupref);
				JumbleExpr(jstate, (Node *) tle->expr);
			}
			break;
		case T_RangeTblRef:
			{
				RangeTblRef *rtr = (RangeTblRef *) node;

				APP_JUMB(rtr->rtindex);
			}
			break;
		case T_JoinExpr:
			{
				JoinExpr   *join = (JoinExpr *) node;

				APP_JUMB(join->jointype);
				APP_JUMB(join->isNatural);
				APP_JUMB(join->rtindex);
				JumbleExpr(jstate, join->larg);
				JumbleExpr(jstate, join->rarg);
				JumbleExpr(jstate, join->quals);
			}
			break;
		case T_FromExpr:
			{
				FromExpr   *from = (FromExpr *) node;

				JumbleExpr(jstate, (Node *) from->fromlist);
				JumbleExpr(jstate, from->quals);
			}
			break;
		case T_OnConflictExpr:
			{
				OnConflictExpr *conf = (OnConflictExpr *) node;

				APP_JUMB(conf->action);
				JumbleExpr(jstate, (Node *) conf->arbiterElems);
				JumbleExpr(jstate, conf->arbiterWhere);
				JumbleExpr(jstate, (Node *) conf->onConflictSet);
				JumbleExpr(jstate, conf->onConflictWhere);
				APP_JUMB(conf->constraint);
				APP_JUMB(conf->exclRelIndex);
				JumbleExpr(jstate, (Node *) conf->exclRelTlist);
			}
			break;
		case T_List:
			foreach(temp, (List *) node)
			{
				JumbleExpr(jstate, (Node *) lfirst(temp));
			}
			break;
		case T_IntList:
			foreach(temp, (List *) node)
			{
				APP_JUMB(lfirst_int(temp));
			}
			break;
		case T_SortGroupClause:
			{
				SortGroupClause *sgc = (SortGroupClause *) node;

				APP_JUMB(sgc->tleSortGroupRef);
				APP_JUMB(sgc->eqop);
				APP_JUMB(sgc->sortop);
				APP_JUMB(sgc->nulls_first);
			}
			break;
		case T_GroupingSet:
			{
				GroupingSet *gsnode = (GroupingSet *) node;

				JumbleExpr(jstate, (Node *) gsnode->content);
			}
			break;
		case T_WindowClause:
			{
				WindowClause *wc = (WindowClause *) node;

				APP_JUMB(wc->winref);
				APP_JUMB(wc->frameOptions);
				JumbleExpr(jstate, (Node *) wc->partitionClause);
				JumbleExpr(jstate, (Node *) wc->orderClause);
				JumbleExpr(jstate, wc->startOffset);
				JumbleExpr(jstate, wc->endOffset);
			}
			break;
		case T_CommonTableExpr:
			{
				CommonTableExpr *cte = (CommonTableExpr *) node;

				/* we store the string name because RTE_CTE RTEs need it */
				APP_JUMB_STRING(cte->ctename);
				JumbleQuery(jstate, castNode(Query, cte->ctequery));
			}
			break;
		case T_SetOperationStmt:
			{
				SetOperationStmt *setop = (SetOperationStmt *) node;

				APP_JUMB(setop->op);
				APP_JUMB(setop->all);
				JumbleExpr(jstate, setop->larg);
				JumbleExpr(jstate, setop->rarg);
			}
			break;
		case T_RangeTblFunction:
			{
				RangeTblFunction *rtfunc = (RangeTblFunction *) node;

				JumbleExpr(jstate, rtfunc->funcexpr);
			}
			break;
		case T_TableFunc:
			{
				TableFunc  *tablefunc = (TableFunc *) node;

				JumbleExpr(jstate, tablefunc->docexpr);
				JumbleExpr(jstate, tablefunc->rowexpr);
				JumbleExpr(jstate, (Node *) tablefunc->colexprs);
			}
			break;
		case T_TableSampleClause:
			{
				TableSampleClause *tsc = (TableSampleClause *) node;

				APP_JUMB(tsc->tsmhandler);
				JumbleExpr(jstate, (Node *) tsc->args);
				JumbleExpr(jstate, (Node *) tsc->repeatable);
			}
			break;
		default:
			/* Only a warning, since we can stumble along anyway */
			elog(WARNING, "unrecognized node type: %d",
				 (int) nodeTag(node));
			break;
	}
}

/*
 * Record location of constant within query string of query tree
 * that is currently being walked.
2918 */ 2919 static void 2920 RecordConstLocation(pgssJumbleState *jstate, int location) 2921 { 2922 /* -1 indicates unknown or undefined location */ 2923 if (location >= 0) 2924 { 2925 /* enlarge array if needed */ 2926 if (jstate->clocations_count >= jstate->clocations_buf_size) 2927 { 2928 jstate->clocations_buf_size *= 2; 2929 jstate->clocations = (pgssLocationLen *) 2930 repalloc(jstate->clocations, 2931 jstate->clocations_buf_size * 2932 sizeof(pgssLocationLen)); 2933 } 2934 jstate->clocations[jstate->clocations_count].location = location; 2935 /* initialize lengths to -1 to simplify fill_in_constant_lengths */ 2936 jstate->clocations[jstate->clocations_count].length = -1; 2937 jstate->clocations_count++; 2938 } 2939 } 2940 2941 /* 2942 * Generate a normalized version of the query string that will be used to 2943 * represent all similar queries. 2944 * 2945 * Note that the normalized representation may well vary depending on 2946 * just which "equivalent" query is used to create the hashtable entry. 2947 * We assume this is OK. 2948 * 2949 * If query_loc > 0, then "query" has been advanced by that much compared to 2950 * the original string start, so we need to translate the provided locations 2951 * to compensate. (This lets us avoid re-scanning statements before the one 2952 * of interest, so it's worth doing.) 2953 * 2954 * *query_len_p contains the input string length, and is updated with 2955 * the result string length on exit. The resulting string might be longer 2956 * or shorter depending on what happens with replacement of constants. 2957 * 2958 * Returns a palloc'd string. 
 */
static char *
generate_normalized_query(pgssJumbleState *jstate, const char *query,
						  int query_loc, int *query_len_p, int encoding)
{
	/*
	 * NOTE(review): "encoding" is not referenced in this function's body as
	 * written; presumably retained for signature stability — confirm before
	 * removing.
	 */
	char	   *norm_query;
	int			query_len = *query_len_p;
	int			i,
				norm_query_buflen,	/* Space allowed for norm_query */
				len_to_wrt,		/* Length (in bytes) to write */
				quer_loc = 0,	/* Source query byte location */
				n_quer_loc = 0, /* Normalized query byte location */
				last_off = 0,	/* Offset from start for previous tok */
				last_tok_len = 0;	/* Length (in bytes) of that tok */

	/*
	 * Get constants' lengths (core system only gives us locations).  Note
	 * this also ensures the items are sorted by location.
	 */
	fill_in_constant_lengths(jstate, query, query_loc);

	/*
	 * Allow for $n symbols to be longer than the constants they replace.
	 * Constants must take at least one byte in text form, while a $n symbol
	 * certainly isn't more than 11 bytes, even if n reaches INT_MAX.  We
	 * could refine that limit based on the max value of n for the current
	 * query, but it hardly seems worth any extra effort to do so.
	 */
	norm_query_buflen = query_len + jstate->clocations_count * 10;

	/* Allocate result buffer (+1 for the trailing null) */
	norm_query = palloc(norm_query_buflen + 1);

	for (i = 0; i < jstate->clocations_count; i++)
	{
		int			off,		/* Offset from start for cur tok */
					tok_len;	/* Length (in bytes) of that tok */

		off = jstate->clocations[i].location;
		/* Adjust recorded location if we're dealing with partial string */
		off -= query_loc;

		tok_len = jstate->clocations[i].length;

		if (tok_len < 0)
			continue;			/* ignore any duplicates */

		/* Copy next chunk (what precedes the next constant) */
		len_to_wrt = off - last_off;
		len_to_wrt -= last_tok_len;

		Assert(len_to_wrt >= 0);
		memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
		n_quer_loc += len_to_wrt;

		/* And insert a param symbol in place of the constant token */
		n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d",
							  i + 1 + jstate->highest_extern_param_id);

		/* Advance past the constant we just replaced */
		quer_loc = off + tok_len;
		last_off = off;
		last_tok_len = tok_len;
	}

	/*
	 * We've copied up until the last ignorable constant.  Copy over the
	 * remaining bytes of the original query string.
	 */
	len_to_wrt = query_len - quer_loc;

	Assert(len_to_wrt >= 0);
	memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
	n_quer_loc += len_to_wrt;

	Assert(n_quer_loc <= norm_query_buflen);
	norm_query[n_quer_loc] = '\0';

	*query_len_p = n_quer_loc;
	return norm_query;
}

/*
 * Given a valid SQL string and an array of constant-location records,
 * fill in the textual lengths of those constants.
 *
 * The constants may use any allowed constant syntax, such as float literals,
 * bit-strings, single-quoted strings and dollar-quoted strings.  This is
 * accomplished by using the public API for the core scanner.
 *
 * It is the caller's job to ensure that the string is a valid SQL statement
 * with constants at the indicated locations.  Since in practice the string
 * has already been parsed, and the locations that the caller provides will
 * have originated from within the authoritative parser, this should not be
 * a problem.
 *
 * Duplicate constant pointers are possible, and will have their lengths
 * marked as '-1', so that they are later ignored.  (Actually, we assume the
 * lengths were initialized as -1 to start with, and don't change them here.)
 *
 * If query_loc > 0, then "query" has been advanced by that much compared to
 * the original string start, so we need to translate the provided locations
 * to compensate.  (This lets us avoid re-scanning statements before the one
 * of interest, so it's worth doing.)
 *
 * N.B. There is an assumption that a '-' character at a Const location begins
 * a negative numeric constant.  This precludes there ever being another
 * reason for a constant to start with a '-'.
 */
static void
fill_in_constant_lengths(pgssJumbleState *jstate, const char *query,
						 int query_loc)
{
	pgssLocationLen *locs;
	core_yyscan_t yyscanner;
	core_yy_extra_type yyextra;
	core_YYSTYPE yylval;
	YYLTYPE		yylloc;
	int			last_loc = -1;	/* location of previous constant handled */
	int			i;

	/*
	 * Sort the records by location so that we can process them in order while
	 * scanning the query text.
	 */
	if (jstate->clocations_count > 1)
		qsort(jstate->clocations, jstate->clocations_count,
			  sizeof(pgssLocationLen), comp_location);
	locs = jstate->clocations;

	/* initialize the flex scanner --- should match raw_parser() */
	yyscanner = scanner_init(query,
							 &yyextra,
							 ScanKeywords,
							 NumScanKeywords);

	/* we don't want to re-emit any escape string warnings */
	yyextra.escape_string_warning = false;

	/* Search for each constant, in sequence */
	for (i = 0; i < jstate->clocations_count; i++)
	{
		int			loc = locs[i].location;
		int			tok;

		/* Adjust recorded location if we're dealing with partial string */
		loc -= query_loc;

		Assert(loc >= 0);

		if (loc <= last_loc)
			continue;			/* Duplicate constant, ignore */

		/* Lex tokens until we find the desired constant */
		for (;;)
		{
			tok = core_yylex(&yylval, &yylloc, yyscanner);

			/* We should not hit end-of-string, but if we do, behave sanely */
			if (tok == 0)
				break;			/* out of inner for-loop */

			/*
			 * We should find the token position exactly, but if we somehow
			 * run past it, work with that.
			 */
			if (yylloc >= loc)
			{
				if (query[loc] == '-')
				{
					/*
					 * It's a negative value - this is the one and only case
					 * where we replace more than a single token.
					 *
					 * Do not compensate for the core system's special-case
					 * adjustment of location to that of the leading '-'
					 * operator in the event of a negative constant.  It is
					 * also useful for our purposes to start from the minus
					 * symbol.  In this way, queries like "select * from foo
					 * where bar = 1" and "select * from foo where bar = -2"
					 * will have identical normalized query strings.
					 */
					tok = core_yylex(&yylval, &yylloc, yyscanner);
					if (tok == 0)
						break;	/* out of inner for-loop */
				}

				/*
				 * We now rely on the assumption that flex has placed a zero
				 * byte after the text of the current token in scanbuf.
				 */
				locs[i].length = strlen(yyextra.scanbuf + loc);
				break;			/* out of inner for-loop */
			}
		}

		/* If we hit end-of-string, give up, leaving remaining lengths -1 */
		if (tok == 0)
			break;

		last_loc = loc;
	}

	scanner_finish(yyscanner);
}

/*
 * comp_location: comparator for qsorting pgssLocationLen structs by location
 *
 * Returns negative/zero/positive per the usual qsort() comparator contract.
 */
static int
comp_location(const void *a, const void *b)
{
	int			l = ((const pgssLocationLen *) a)->location;
	int			r = ((const pgssLocationLen *) b)->location;

	if (l < r)
		return -1;
	else if (l > r)
		return +1;
	else
		return 0;
}