1 /*
2 * contrib/pageinspect/btreefuncs.c
3 *
4 *
5 * btreefuncs.c
6 *
7 * Copyright (c) 2006 Satoshi Nagayasu <nagayasus@nttdata.co.jp>
8 *
9 * Permission to use, copy, modify, and distribute this software and
10 * its documentation for any purpose, without fee, and without a
11 * written agreement is hereby granted, provided that the above
12 * copyright notice and this paragraph and the following two
13 * paragraphs appear in all copies.
14 *
15 * IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT,
16 * INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
17 * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
18 * DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED
19 * OF THE POSSIBILITY OF SUCH DAMAGE.
20 *
21 * THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS
24 * IS" BASIS, AND THE AUTHOR HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE,
25 * SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
26 */
27
28 #include "postgres.h"
29
30 #include "pageinspect.h"
31
32 #include "access/nbtree.h"
33 #include "access/relation.h"
34 #include "catalog/namespace.h"
35 #include "catalog/pg_am.h"
36 #include "funcapi.h"
37 #include "miscadmin.h"
38 #include "utils/builtins.h"
39 #include "utils/rel.h"
40 #include "utils/varlena.h"
41
42
/* Function-manager info records for the SQL-callable functions in this file */
PG_FUNCTION_INFO_V1(bt_page_stats);
PG_FUNCTION_INFO_V1(bt_page_items);
PG_FUNCTION_INFO_V1(bt_page_items_bytea);
PG_FUNCTION_INFO_V1(bt_metap);
47
/* True if relation r is an index, respectively uses the btree access method */
#define IS_INDEX(r) ((r)->rd_rel->relkind == RELKIND_INDEX)
#define IS_BTREE(r) ((r)->rd_rel->relam == BTREE_AM_OID)

/*
 * Raise an error unless blkno is smaller than the relation's current number
 * of blocks.
 *
 * note: BlockNumber is unsigned, hence can't be negative, so only the upper
 * bound is checked.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to
 * exactly one statement and can safely be followed by ";" in any context,
 * including the "then" arm of an if/else.
 */
#define CHECK_RELATION_BLOCK_RANGE(rel, blkno) \
	do { \
		if (RelationGetNumberOfBlocks(rel) <= (BlockNumber) (blkno)) \
			elog(ERROR, "block number out of range"); \
	} while (0)
55
56 /* ------------------------------------------------
57 * structure for single btree page statistics
58 * ------------------------------------------------
59 */
60 typedef struct BTPageStat
61 {
62 uint32 blkno;
63 uint32 live_items;
64 uint32 dead_items;
65 uint32 page_size;
66 uint32 max_avail;
67 uint32 free_size;
68 uint32 avg_item_size;
69 char type;
70
71 /* opaque data */
72 BlockNumber btpo_prev;
73 BlockNumber btpo_next;
74 union
75 {
76 uint32 level;
77 TransactionId xact;
78 } btpo;
79 uint16 btpo_flags;
80 BTCycleId btpo_cycleid;
81 } BTPageStat;
82
83
84 /* -------------------------------------------------
85 * GetBTPageStatistics()
86 *
87 * Collect statistics of single b-tree page
88 * -------------------------------------------------
89 */
90 static void
GetBTPageStatistics(BlockNumber blkno,Buffer buffer,BTPageStat * stat)91 GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
92 {
93 Page page = BufferGetPage(buffer);
94 PageHeader phdr = (PageHeader) page;
95 OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
96 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
97 int item_size = 0;
98 int off;
99
100 stat->blkno = blkno;
101
102 stat->max_avail = BLCKSZ - (BLCKSZ - phdr->pd_special + SizeOfPageHeaderData);
103
104 stat->dead_items = stat->live_items = 0;
105
106 stat->page_size = PageGetPageSize(page);
107
108 /* page type (flags) */
109 if (P_ISDELETED(opaque))
110 {
111 stat->type = 'd';
112 stat->btpo.xact = opaque->btpo.xact;
113 return;
114 }
115 else if (P_IGNORE(opaque))
116 stat->type = 'e';
117 else if (P_ISLEAF(opaque))
118 stat->type = 'l';
119 else if (P_ISROOT(opaque))
120 stat->type = 'r';
121 else
122 stat->type = 'i';
123
124 /* btpage opaque data */
125 stat->btpo_prev = opaque->btpo_prev;
126 stat->btpo_next = opaque->btpo_next;
127 stat->btpo.level = opaque->btpo.level;
128 stat->btpo_flags = opaque->btpo_flags;
129 stat->btpo_cycleid = opaque->btpo_cycleid;
130
131 /* count live and dead tuples, and free space */
132 for (off = FirstOffsetNumber; off <= maxoff; off++)
133 {
134 IndexTuple itup;
135
136 ItemId id = PageGetItemId(page, off);
137
138 itup = (IndexTuple) PageGetItem(page, id);
139
140 item_size += IndexTupleSize(itup);
141
142 if (!ItemIdIsDead(id))
143 stat->live_items++;
144 else
145 stat->dead_items++;
146 }
147 stat->free_size = PageGetFreeSpace(page);
148
149 if ((stat->live_items + stat->dead_items) > 0)
150 stat->avg_item_size = item_size / (stat->live_items + stat->dead_items);
151 else
152 stat->avg_item_size = 0;
153 }
154
155 /* -----------------------------------------------
156 * bt_page_stats()
157 *
158 * Usage: SELECT * FROM bt_page_stats('t1_pkey', 1);
159 * -----------------------------------------------
160 */
161 Datum
bt_page_stats(PG_FUNCTION_ARGS)162 bt_page_stats(PG_FUNCTION_ARGS)
163 {
164 text *relname = PG_GETARG_TEXT_PP(0);
165 uint32 blkno = PG_GETARG_UINT32(1);
166 Buffer buffer;
167 Relation rel;
168 RangeVar *relrv;
169 Datum result;
170 HeapTuple tuple;
171 TupleDesc tupleDesc;
172 int j;
173 char *values[11];
174 BTPageStat stat;
175
176 if (!superuser())
177 ereport(ERROR,
178 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
179 (errmsg("must be superuser to use pageinspect functions"))));
180
181 relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
182 rel = relation_openrv(relrv, AccessShareLock);
183
184 if (!IS_INDEX(rel) || !IS_BTREE(rel))
185 elog(ERROR, "relation \"%s\" is not a btree index",
186 RelationGetRelationName(rel));
187
188 /*
189 * Reject attempts to read non-local temporary relations; we would be
190 * likely to get wrong data since we have no visibility into the owning
191 * session's local buffers.
192 */
193 if (RELATION_IS_OTHER_TEMP(rel))
194 ereport(ERROR,
195 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
196 errmsg("cannot access temporary tables of other sessions")));
197
198 if (blkno == 0)
199 elog(ERROR, "block 0 is a meta page");
200
201 CHECK_RELATION_BLOCK_RANGE(rel, blkno);
202
203 buffer = ReadBuffer(rel, blkno);
204 LockBuffer(buffer, BUFFER_LOCK_SHARE);
205
206 /* keep compiler quiet */
207 stat.btpo_prev = stat.btpo_next = InvalidBlockNumber;
208 stat.btpo_flags = stat.free_size = stat.avg_item_size = 0;
209
210 GetBTPageStatistics(blkno, buffer, &stat);
211
212 UnlockReleaseBuffer(buffer);
213 relation_close(rel, AccessShareLock);
214
215 /* Build a tuple descriptor for our result type */
216 if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
217 elog(ERROR, "return type must be a row type");
218
219 j = 0;
220 values[j++] = psprintf("%d", stat.blkno);
221 values[j++] = psprintf("%c", stat.type);
222 values[j++] = psprintf("%d", stat.live_items);
223 values[j++] = psprintf("%d", stat.dead_items);
224 values[j++] = psprintf("%d", stat.avg_item_size);
225 values[j++] = psprintf("%d", stat.page_size);
226 values[j++] = psprintf("%d", stat.free_size);
227 values[j++] = psprintf("%d", stat.btpo_prev);
228 values[j++] = psprintf("%d", stat.btpo_next);
229 values[j++] = psprintf("%d", (stat.type == 'd') ? stat.btpo.xact : stat.btpo.level);
230 values[j++] = psprintf("%d", stat.btpo_flags);
231
232 tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
233 values);
234
235 result = HeapTupleGetDatum(tuple);
236
237 PG_RETURN_DATUM(result);
238 }
239
240
241 /*
242 * cross-call data structure for SRF
243 */
244 struct user_args
245 {
246 Page page;
247 OffsetNumber offset;
248 };
249
250 /*-------------------------------------------------------
251 * bt_page_print_tuples()
252 *
253 * Form a tuple describing index tuple at a given offset
254 * ------------------------------------------------------
255 */
256 static Datum
bt_page_print_tuples(FuncCallContext * fctx,Page page,OffsetNumber offset)257 bt_page_print_tuples(FuncCallContext *fctx, Page page, OffsetNumber offset)
258 {
259 char *values[6];
260 HeapTuple tuple;
261 ItemId id;
262 IndexTuple itup;
263 int j;
264 int off;
265 int dlen;
266 char *dump;
267 char *ptr;
268
269 id = PageGetItemId(page, offset);
270
271 if (!ItemIdIsValid(id))
272 elog(ERROR, "invalid ItemId");
273
274 itup = (IndexTuple) PageGetItem(page, id);
275
276 j = 0;
277 values[j++] = psprintf("%d", offset);
278 values[j++] = psprintf("(%u,%u)",
279 ItemPointerGetBlockNumberNoCheck(&itup->t_tid),
280 ItemPointerGetOffsetNumberNoCheck(&itup->t_tid));
281 values[j++] = psprintf("%d", (int) IndexTupleSize(itup));
282 values[j++] = psprintf("%c", IndexTupleHasNulls(itup) ? 't' : 'f');
283 values[j++] = psprintf("%c", IndexTupleHasVarwidths(itup) ? 't' : 'f');
284
285 ptr = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
286 dlen = IndexTupleSize(itup) - IndexInfoFindDataOffset(itup->t_info);
287 dump = palloc0(dlen * 3 + 1);
288 values[j] = dump;
289 for (off = 0; off < dlen; off++)
290 {
291 if (off > 0)
292 *dump++ = ' ';
293 sprintf(dump, "%02x", *(ptr + off) & 0xff);
294 dump += 2;
295 }
296
297 tuple = BuildTupleFromCStrings(fctx->attinmeta, values);
298
299 return HeapTupleGetDatum(tuple);
300 }
301
302 /*-------------------------------------------------------
303 * bt_page_items()
304 *
305 * Get IndexTupleData set in a btree page
306 *
307 * Usage: SELECT * FROM bt_page_items('t1_pkey', 1);
308 *-------------------------------------------------------
309 */
310 Datum
bt_page_items(PG_FUNCTION_ARGS)311 bt_page_items(PG_FUNCTION_ARGS)
312 {
313 text *relname = PG_GETARG_TEXT_PP(0);
314 uint32 blkno = PG_GETARG_UINT32(1);
315 Datum result;
316 FuncCallContext *fctx;
317 MemoryContext mctx;
318 struct user_args *uargs;
319
320 if (!superuser())
321 ereport(ERROR,
322 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
323 (errmsg("must be superuser to use pageinspect functions"))));
324
325 if (SRF_IS_FIRSTCALL())
326 {
327 RangeVar *relrv;
328 Relation rel;
329 Buffer buffer;
330 BTPageOpaque opaque;
331 TupleDesc tupleDesc;
332
333 fctx = SRF_FIRSTCALL_INIT();
334
335 relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
336 rel = relation_openrv(relrv, AccessShareLock);
337
338 if (!IS_INDEX(rel) || !IS_BTREE(rel))
339 elog(ERROR, "relation \"%s\" is not a btree index",
340 RelationGetRelationName(rel));
341
342 /*
343 * Reject attempts to read non-local temporary relations; we would be
344 * likely to get wrong data since we have no visibility into the
345 * owning session's local buffers.
346 */
347 if (RELATION_IS_OTHER_TEMP(rel))
348 ereport(ERROR,
349 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
350 errmsg("cannot access temporary tables of other sessions")));
351
352 if (blkno == 0)
353 elog(ERROR, "block 0 is a meta page");
354
355 CHECK_RELATION_BLOCK_RANGE(rel, blkno);
356
357 buffer = ReadBuffer(rel, blkno);
358 LockBuffer(buffer, BUFFER_LOCK_SHARE);
359
360 /*
361 * We copy the page into local storage to avoid holding pin on the
362 * buffer longer than we must, and possibly failing to release it at
363 * all if the calling query doesn't fetch all rows.
364 */
365 mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
366
367 uargs = palloc(sizeof(struct user_args));
368
369 uargs->page = palloc(BLCKSZ);
370 memcpy(uargs->page, BufferGetPage(buffer), BLCKSZ);
371
372 UnlockReleaseBuffer(buffer);
373 relation_close(rel, AccessShareLock);
374
375 uargs->offset = FirstOffsetNumber;
376
377 opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);
378
379 if (P_ISDELETED(opaque))
380 elog(NOTICE, "page is deleted");
381
382 fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
383
384 /* Build a tuple descriptor for our result type */
385 if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
386 elog(ERROR, "return type must be a row type");
387
388 fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc);
389
390 fctx->user_fctx = uargs;
391
392 MemoryContextSwitchTo(mctx);
393 }
394
395 fctx = SRF_PERCALL_SETUP();
396 uargs = fctx->user_fctx;
397
398 if (fctx->call_cntr < fctx->max_calls)
399 {
400 result = bt_page_print_tuples(fctx, uargs->page, uargs->offset);
401 uargs->offset++;
402 SRF_RETURN_NEXT(fctx, result);
403 }
404 else
405 {
406 pfree(uargs->page);
407 pfree(uargs);
408 SRF_RETURN_DONE(fctx);
409 }
410 }
411
412 /*-------------------------------------------------------
413 * bt_page_items_bytea()
414 *
415 * Get IndexTupleData set in a btree page
416 *
417 * Usage: SELECT * FROM bt_page_items(get_raw_page('t1_pkey', 1));
418 *-------------------------------------------------------
419 */
420
421 Datum
bt_page_items_bytea(PG_FUNCTION_ARGS)422 bt_page_items_bytea(PG_FUNCTION_ARGS)
423 {
424 bytea *raw_page = PG_GETARG_BYTEA_P(0);
425 Datum result;
426 FuncCallContext *fctx;
427 struct user_args *uargs;
428 int raw_page_size;
429
430 if (!superuser())
431 ereport(ERROR,
432 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
433 (errmsg("must be superuser to use raw page functions"))));
434
435 if (SRF_IS_FIRSTCALL())
436 {
437 BTPageOpaque opaque;
438 MemoryContext mctx;
439 TupleDesc tupleDesc;
440
441 raw_page_size = VARSIZE(raw_page) - VARHDRSZ;
442
443 if (raw_page_size < SizeOfPageHeaderData)
444 ereport(ERROR,
445 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
446 errmsg("input page too small (%d bytes)", raw_page_size)));
447
448 fctx = SRF_FIRSTCALL_INIT();
449 mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
450
451 uargs = palloc(sizeof(struct user_args));
452
453 uargs->page = VARDATA(raw_page);
454
455 uargs->offset = FirstOffsetNumber;
456
457 opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);
458
459 if (P_ISMETA(opaque))
460 ereport(ERROR,
461 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
462 errmsg("block is a meta page")));
463
464 if (P_ISDELETED(opaque))
465 elog(NOTICE, "page is deleted");
466
467 fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
468
469 /* Build a tuple descriptor for our result type */
470 if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
471 elog(ERROR, "return type must be a row type");
472
473 fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc);
474
475 fctx->user_fctx = uargs;
476
477 MemoryContextSwitchTo(mctx);
478 }
479
480 fctx = SRF_PERCALL_SETUP();
481 uargs = fctx->user_fctx;
482
483 if (fctx->call_cntr < fctx->max_calls)
484 {
485 result = bt_page_print_tuples(fctx, uargs->page, uargs->offset);
486 uargs->offset++;
487 SRF_RETURN_NEXT(fctx, result);
488 }
489 else
490 {
491 pfree(uargs);
492 SRF_RETURN_DONE(fctx);
493 }
494 }
495
496
497 /* ------------------------------------------------
498 * bt_metap()
499 *
500 * Get a btree's meta-page information
501 *
502 * Usage: SELECT * FROM bt_metap('t1_pkey')
503 * ------------------------------------------------
504 */
505 Datum
bt_metap(PG_FUNCTION_ARGS)506 bt_metap(PG_FUNCTION_ARGS)
507 {
508 text *relname = PG_GETARG_TEXT_PP(0);
509 Datum result;
510 Relation rel;
511 RangeVar *relrv;
512 BTMetaPageData *metad;
513 TupleDesc tupleDesc;
514 int j;
515 char *values[8];
516 Buffer buffer;
517 Page page;
518 HeapTuple tuple;
519
520 if (!superuser())
521 ereport(ERROR,
522 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
523 (errmsg("must be superuser to use pageinspect functions"))));
524
525 relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
526 rel = relation_openrv(relrv, AccessShareLock);
527
528 if (!IS_INDEX(rel) || !IS_BTREE(rel))
529 elog(ERROR, "relation \"%s\" is not a btree index",
530 RelationGetRelationName(rel));
531
532 /*
533 * Reject attempts to read non-local temporary relations; we would be
534 * likely to get wrong data since we have no visibility into the owning
535 * session's local buffers.
536 */
537 if (RELATION_IS_OTHER_TEMP(rel))
538 ereport(ERROR,
539 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
540 errmsg("cannot access temporary tables of other sessions")));
541
542 buffer = ReadBuffer(rel, 0);
543 LockBuffer(buffer, BUFFER_LOCK_SHARE);
544
545 page = BufferGetPage(buffer);
546 metad = BTPageGetMeta(page);
547
548 /* Build a tuple descriptor for our result type */
549 if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
550 elog(ERROR, "return type must be a row type");
551
552 j = 0;
553 values[j++] = psprintf("%d", metad->btm_magic);
554 values[j++] = psprintf("%d", metad->btm_version);
555 values[j++] = psprintf("%d", metad->btm_root);
556 values[j++] = psprintf("%d", metad->btm_level);
557 values[j++] = psprintf("%d", metad->btm_fastroot);
558 values[j++] = psprintf("%d", metad->btm_fastlevel);
559
560 /*
561 * Get values of extended metadata if available, use default values
562 * otherwise.
563 */
564 if (metad->btm_version >= BTREE_NOVAC_VERSION)
565 {
566 /*
567 * kludge: btm_oldest_btpo_xact is declared as int4, which is wrong.
568 * We should at least avoid raising an error when its value happens to
569 * exceed PG_INT32_MAX, though.
570 */
571 values[j++] = psprintf("%d", (int) metad->btm_oldest_btpo_xact);
572 values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples);
573 }
574 else
575 {
576 values[j++] = "0";
577 values[j++] = "-1";
578 }
579
580 tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
581 values);
582
583 result = HeapTupleGetDatum(tuple);
584
585 UnlockReleaseBuffer(buffer);
586 relation_close(rel, AccessShareLock);
587
588 PG_RETURN_DATUM(result);
589 }
590