1 /*
2 * contrib/pageinspect/btreefuncs.c
3 *
4 *
5 * btreefuncs.c
6 *
7 * Copyright (c) 2006 Satoshi Nagayasu <nagayasus@nttdata.co.jp>
8 *
9 * Permission to use, copy, modify, and distribute this software and
10 * its documentation for any purpose, without fee, and without a
11 * written agreement is hereby granted, provided that the above
12 * copyright notice and this paragraph and the following two
13 * paragraphs appear in all copies.
14 *
15 * IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT,
16 * INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
17 * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
18 * DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED
19 * OF THE POSSIBILITY OF SUCH DAMAGE.
20 *
21 * THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS
24 * IS" BASIS, AND THE AUTHOR HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE,
25 * SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
26 */
27
28 #include "postgres.h"
29
30 #include "pageinspect.h"
31
32 #include "access/nbtree.h"
33 #include "catalog/namespace.h"
34 #include "catalog/pg_am.h"
35 #include "funcapi.h"
36 #include "miscadmin.h"
37 #include "utils/builtins.h"
38 #include "utils/rel.h"
39 #include "utils/varlena.h"
40
41
/*
 * Function-manager (version-1 calling convention) declarations for the
 * SQL-callable functions defined in this file.
 */
PG_FUNCTION_INFO_V1(bt_page_stats);
PG_FUNCTION_INFO_V1(bt_page_items);
PG_FUNCTION_INFO_V1(bt_page_items_bytea);
PG_FUNCTION_INFO_V1(bt_metap);
46
/* True if relation "r" is an index, judging by its pg_class relkind. */
#define IS_INDEX(r) ((r)->rd_rel->relkind == RELKIND_INDEX)
/* True if relation "r" uses the btree access method. */
#define IS_BTREE(r) ((r)->rd_rel->relam == BTREE_AM_OID)

/*
 * Raise an ERROR unless "blkno" is smaller than the number of blocks
 * currently in "rel".
 *
 * note: BlockNumber is unsigned, hence can't be negative, so only the upper
 * bound needs to be checked.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to
 * exactly one statement and can be used safely in an unbraced if/else.
 */
#define CHECK_RELATION_BLOCK_RANGE(rel, blkno) \
	do { \
		if (RelationGetNumberOfBlocks(rel) <= (BlockNumber) (blkno)) \
			elog(ERROR, "block number out of range"); \
	} while (0)
54
55 /* ------------------------------------------------
56 * structure for single btree page statistics
57 * ------------------------------------------------
58 */
59 typedef struct BTPageStat
60 {
61 uint32 blkno;
62 uint32 live_items;
63 uint32 dead_items;
64 uint32 page_size;
65 uint32 max_avail;
66 uint32 free_size;
67 uint32 avg_item_size;
68 char type;
69
70 /* opaque data */
71 BlockNumber btpo_prev;
72 BlockNumber btpo_next;
73 union
74 {
75 uint32 level;
76 TransactionId xact;
77 } btpo;
78 uint16 btpo_flags;
79 BTCycleId btpo_cycleid;
80 } BTPageStat;
81
82
83 /* -------------------------------------------------
84 * GetBTPageStatistics()
85 *
86 * Collect statistics of single b-tree page
87 * -------------------------------------------------
88 */
89 static void
GetBTPageStatistics(BlockNumber blkno,Buffer buffer,BTPageStat * stat)90 GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
91 {
92 Page page = BufferGetPage(buffer);
93 PageHeader phdr = (PageHeader) page;
94 OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
95 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
96 int item_size = 0;
97 int off;
98
99 stat->blkno = blkno;
100
101 stat->max_avail = BLCKSZ - (BLCKSZ - phdr->pd_special + SizeOfPageHeaderData);
102
103 stat->dead_items = stat->live_items = 0;
104
105 stat->page_size = PageGetPageSize(page);
106
107 /* page type (flags) */
108 if (P_ISDELETED(opaque))
109 {
110 stat->type = 'd';
111 stat->btpo.xact = opaque->btpo.xact;
112 return;
113 }
114 else if (P_IGNORE(opaque))
115 stat->type = 'e';
116 else if (P_ISLEAF(opaque))
117 stat->type = 'l';
118 else if (P_ISROOT(opaque))
119 stat->type = 'r';
120 else
121 stat->type = 'i';
122
123 /* btpage opaque data */
124 stat->btpo_prev = opaque->btpo_prev;
125 stat->btpo_next = opaque->btpo_next;
126 stat->btpo.level = opaque->btpo.level;
127 stat->btpo_flags = opaque->btpo_flags;
128 stat->btpo_cycleid = opaque->btpo_cycleid;
129
130 /* count live and dead tuples, and free space */
131 for (off = FirstOffsetNumber; off <= maxoff; off++)
132 {
133 IndexTuple itup;
134
135 ItemId id = PageGetItemId(page, off);
136
137 itup = (IndexTuple) PageGetItem(page, id);
138
139 item_size += IndexTupleSize(itup);
140
141 if (!ItemIdIsDead(id))
142 stat->live_items++;
143 else
144 stat->dead_items++;
145 }
146 stat->free_size = PageGetFreeSpace(page);
147
148 if ((stat->live_items + stat->dead_items) > 0)
149 stat->avg_item_size = item_size / (stat->live_items + stat->dead_items);
150 else
151 stat->avg_item_size = 0;
152 }
153
154 /* -----------------------------------------------
155 * bt_page_stats()
156 *
157 * Usage: SELECT * FROM bt_page_stats('t1_pkey', 1);
158 * -----------------------------------------------
159 */
160 Datum
bt_page_stats(PG_FUNCTION_ARGS)161 bt_page_stats(PG_FUNCTION_ARGS)
162 {
163 text *relname = PG_GETARG_TEXT_PP(0);
164 uint32 blkno = PG_GETARG_UINT32(1);
165 Buffer buffer;
166 Relation rel;
167 RangeVar *relrv;
168 Datum result;
169 HeapTuple tuple;
170 TupleDesc tupleDesc;
171 int j;
172 char *values[11];
173 BTPageStat stat;
174
175 if (!superuser())
176 ereport(ERROR,
177 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
178 (errmsg("must be superuser to use pageinspect functions"))));
179
180 relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
181 rel = relation_openrv(relrv, AccessShareLock);
182
183 if (!IS_INDEX(rel) || !IS_BTREE(rel))
184 elog(ERROR, "relation \"%s\" is not a btree index",
185 RelationGetRelationName(rel));
186
187 /*
188 * Reject attempts to read non-local temporary relations; we would be
189 * likely to get wrong data since we have no visibility into the owning
190 * session's local buffers.
191 */
192 if (RELATION_IS_OTHER_TEMP(rel))
193 ereport(ERROR,
194 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
195 errmsg("cannot access temporary tables of other sessions")));
196
197 if (blkno == 0)
198 elog(ERROR, "block 0 is a meta page");
199
200 CHECK_RELATION_BLOCK_RANGE(rel, blkno);
201
202 buffer = ReadBuffer(rel, blkno);
203 LockBuffer(buffer, BUFFER_LOCK_SHARE);
204
205 /* keep compiler quiet */
206 stat.btpo_prev = stat.btpo_next = InvalidBlockNumber;
207 stat.btpo_flags = stat.free_size = stat.avg_item_size = 0;
208
209 GetBTPageStatistics(blkno, buffer, &stat);
210
211 UnlockReleaseBuffer(buffer);
212 relation_close(rel, AccessShareLock);
213
214 /* Build a tuple descriptor for our result type */
215 if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
216 elog(ERROR, "return type must be a row type");
217
218 j = 0;
219 values[j++] = psprintf("%d", stat.blkno);
220 values[j++] = psprintf("%c", stat.type);
221 values[j++] = psprintf("%d", stat.live_items);
222 values[j++] = psprintf("%d", stat.dead_items);
223 values[j++] = psprintf("%d", stat.avg_item_size);
224 values[j++] = psprintf("%d", stat.page_size);
225 values[j++] = psprintf("%d", stat.free_size);
226 values[j++] = psprintf("%d", stat.btpo_prev);
227 values[j++] = psprintf("%d", stat.btpo_next);
228 values[j++] = psprintf("%d", (stat.type == 'd') ? stat.btpo.xact : stat.btpo.level);
229 values[j++] = psprintf("%d", stat.btpo_flags);
230
231 tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
232 values);
233
234 result = HeapTupleGetDatum(tuple);
235
236 PG_RETURN_DATUM(result);
237 }
238
239
240 /*
241 * cross-call data structure for SRF
242 */
243 struct user_args
244 {
245 Page page;
246 OffsetNumber offset;
247 };
248
249 /*-------------------------------------------------------
250 * bt_page_print_tuples()
251 *
252 * Form a tuple describing index tuple at a given offset
253 * ------------------------------------------------------
254 */
255 static Datum
bt_page_print_tuples(FuncCallContext * fctx,Page page,OffsetNumber offset)256 bt_page_print_tuples(FuncCallContext *fctx, Page page, OffsetNumber offset)
257 {
258 char *values[6];
259 HeapTuple tuple;
260 ItemId id;
261 IndexTuple itup;
262 int j;
263 int off;
264 int dlen;
265 char *dump;
266 char *ptr;
267
268 id = PageGetItemId(page, offset);
269
270 if (!ItemIdIsValid(id))
271 elog(ERROR, "invalid ItemId");
272
273 itup = (IndexTuple) PageGetItem(page, id);
274
275 j = 0;
276 values[j++] = psprintf("%d", offset);
277 values[j++] = psprintf("(%u,%u)",
278 ItemPointerGetBlockNumberNoCheck(&itup->t_tid),
279 ItemPointerGetOffsetNumberNoCheck(&itup->t_tid));
280 values[j++] = psprintf("%d", (int) IndexTupleSize(itup));
281 values[j++] = psprintf("%c", IndexTupleHasNulls(itup) ? 't' : 'f');
282 values[j++] = psprintf("%c", IndexTupleHasVarwidths(itup) ? 't' : 'f');
283
284 ptr = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
285 dlen = IndexTupleSize(itup) - IndexInfoFindDataOffset(itup->t_info);
286 dump = palloc0(dlen * 3 + 1);
287 values[j] = dump;
288 for (off = 0; off < dlen; off++)
289 {
290 if (off > 0)
291 *dump++ = ' ';
292 sprintf(dump, "%02x", *(ptr + off) & 0xff);
293 dump += 2;
294 }
295
296 tuple = BuildTupleFromCStrings(fctx->attinmeta, values);
297
298 return HeapTupleGetDatum(tuple);
299 }
300
301 /*-------------------------------------------------------
302 * bt_page_items()
303 *
304 * Get IndexTupleData set in a btree page
305 *
306 * Usage: SELECT * FROM bt_page_items('t1_pkey', 1);
307 *-------------------------------------------------------
308 */
309 Datum
bt_page_items(PG_FUNCTION_ARGS)310 bt_page_items(PG_FUNCTION_ARGS)
311 {
312 text *relname = PG_GETARG_TEXT_PP(0);
313 uint32 blkno = PG_GETARG_UINT32(1);
314 Datum result;
315 FuncCallContext *fctx;
316 MemoryContext mctx;
317 struct user_args *uargs;
318
319 if (!superuser())
320 ereport(ERROR,
321 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
322 (errmsg("must be superuser to use pageinspect functions"))));
323
324 if (SRF_IS_FIRSTCALL())
325 {
326 RangeVar *relrv;
327 Relation rel;
328 Buffer buffer;
329 BTPageOpaque opaque;
330 TupleDesc tupleDesc;
331
332 fctx = SRF_FIRSTCALL_INIT();
333
334 relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
335 rel = relation_openrv(relrv, AccessShareLock);
336
337 if (!IS_INDEX(rel) || !IS_BTREE(rel))
338 elog(ERROR, "relation \"%s\" is not a btree index",
339 RelationGetRelationName(rel));
340
341 /*
342 * Reject attempts to read non-local temporary relations; we would be
343 * likely to get wrong data since we have no visibility into the
344 * owning session's local buffers.
345 */
346 if (RELATION_IS_OTHER_TEMP(rel))
347 ereport(ERROR,
348 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
349 errmsg("cannot access temporary tables of other sessions")));
350
351 if (blkno == 0)
352 elog(ERROR, "block 0 is a meta page");
353
354 CHECK_RELATION_BLOCK_RANGE(rel, blkno);
355
356 buffer = ReadBuffer(rel, blkno);
357 LockBuffer(buffer, BUFFER_LOCK_SHARE);
358
359 /*
360 * We copy the page into local storage to avoid holding pin on the
361 * buffer longer than we must, and possibly failing to release it at
362 * all if the calling query doesn't fetch all rows.
363 */
364 mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
365
366 uargs = palloc(sizeof(struct user_args));
367
368 uargs->page = palloc(BLCKSZ);
369 memcpy(uargs->page, BufferGetPage(buffer), BLCKSZ);
370
371 UnlockReleaseBuffer(buffer);
372 relation_close(rel, AccessShareLock);
373
374 uargs->offset = FirstOffsetNumber;
375
376 opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);
377
378 if (P_ISDELETED(opaque))
379 elog(NOTICE, "page is deleted");
380
381 fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
382
383 /* Build a tuple descriptor for our result type */
384 if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
385 elog(ERROR, "return type must be a row type");
386
387 fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc);
388
389 fctx->user_fctx = uargs;
390
391 MemoryContextSwitchTo(mctx);
392 }
393
394 fctx = SRF_PERCALL_SETUP();
395 uargs = fctx->user_fctx;
396
397 if (fctx->call_cntr < fctx->max_calls)
398 {
399 result = bt_page_print_tuples(fctx, uargs->page, uargs->offset);
400 uargs->offset++;
401 SRF_RETURN_NEXT(fctx, result);
402 }
403 else
404 {
405 pfree(uargs->page);
406 pfree(uargs);
407 SRF_RETURN_DONE(fctx);
408 }
409 }
410
411 /*-------------------------------------------------------
412 * bt_page_items_bytea()
413 *
414 * Get IndexTupleData set in a btree page
415 *
416 * Usage: SELECT * FROM bt_page_items(get_raw_page('t1_pkey', 1));
417 *-------------------------------------------------------
418 */
419
420 Datum
bt_page_items_bytea(PG_FUNCTION_ARGS)421 bt_page_items_bytea(PG_FUNCTION_ARGS)
422 {
423 bytea *raw_page = PG_GETARG_BYTEA_P(0);
424 Datum result;
425 FuncCallContext *fctx;
426 struct user_args *uargs;
427 int raw_page_size;
428
429 if (!superuser())
430 ereport(ERROR,
431 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
432 (errmsg("must be superuser to use pageinspect functions"))));
433
434 if (SRF_IS_FIRSTCALL())
435 {
436 BTPageOpaque opaque;
437 MemoryContext mctx;
438 TupleDesc tupleDesc;
439
440 raw_page_size = VARSIZE(raw_page) - VARHDRSZ;
441
442 if (raw_page_size < SizeOfPageHeaderData)
443 ereport(ERROR,
444 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
445 errmsg("input page too small (%d bytes)", raw_page_size)));
446
447 fctx = SRF_FIRSTCALL_INIT();
448 mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
449
450 uargs = palloc(sizeof(struct user_args));
451
452 uargs->page = VARDATA(raw_page);
453
454 uargs->offset = FirstOffsetNumber;
455
456 opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);
457
458 if (P_ISMETA(opaque))
459 ereport(ERROR,
460 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
461 errmsg("block is a meta page")));
462
463 if (P_ISDELETED(opaque))
464 elog(NOTICE, "page is deleted");
465
466 fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
467
468 /* Build a tuple descriptor for our result type */
469 if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
470 elog(ERROR, "return type must be a row type");
471
472 fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc);
473
474 fctx->user_fctx = uargs;
475
476 MemoryContextSwitchTo(mctx);
477 }
478
479 fctx = SRF_PERCALL_SETUP();
480 uargs = fctx->user_fctx;
481
482 if (fctx->call_cntr < fctx->max_calls)
483 {
484 result = bt_page_print_tuples(fctx, uargs->page, uargs->offset);
485 uargs->offset++;
486 SRF_RETURN_NEXT(fctx, result);
487 }
488 else
489 {
490 pfree(uargs);
491 SRF_RETURN_DONE(fctx);
492 }
493 }
494
495
496 /* ------------------------------------------------
497 * bt_metap()
498 *
499 * Get a btree's meta-page information
500 *
501 * Usage: SELECT * FROM bt_metap('t1_pkey')
502 * ------------------------------------------------
503 */
504 Datum
bt_metap(PG_FUNCTION_ARGS)505 bt_metap(PG_FUNCTION_ARGS)
506 {
507 text *relname = PG_GETARG_TEXT_PP(0);
508 Datum result;
509 Relation rel;
510 RangeVar *relrv;
511 BTMetaPageData *metad;
512 TupleDesc tupleDesc;
513 int j;
514 char *values[8];
515 Buffer buffer;
516 Page page;
517 HeapTuple tuple;
518
519 if (!superuser())
520 ereport(ERROR,
521 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
522 (errmsg("must be superuser to use pageinspect functions"))));
523
524 relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
525 rel = relation_openrv(relrv, AccessShareLock);
526
527 if (!IS_INDEX(rel) || !IS_BTREE(rel))
528 elog(ERROR, "relation \"%s\" is not a btree index",
529 RelationGetRelationName(rel));
530
531 /*
532 * Reject attempts to read non-local temporary relations; we would be
533 * likely to get wrong data since we have no visibility into the owning
534 * session's local buffers.
535 */
536 if (RELATION_IS_OTHER_TEMP(rel))
537 ereport(ERROR,
538 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
539 errmsg("cannot access temporary tables of other sessions")));
540
541 buffer = ReadBuffer(rel, 0);
542 LockBuffer(buffer, BUFFER_LOCK_SHARE);
543
544 page = BufferGetPage(buffer);
545 metad = BTPageGetMeta(page);
546
547 /* Build a tuple descriptor for our result type */
548 if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
549 elog(ERROR, "return type must be a row type");
550
551 j = 0;
552 values[j++] = psprintf("%d", metad->btm_magic);
553 values[j++] = psprintf("%d", metad->btm_version);
554 values[j++] = psprintf("%d", metad->btm_root);
555 values[j++] = psprintf("%d", metad->btm_level);
556 values[j++] = psprintf("%d", metad->btm_fastroot);
557 values[j++] = psprintf("%d", metad->btm_fastlevel);
558
559 /*
560 * Get values of extended metadata if available, use default values
561 * otherwise.
562 */
563 if (metad->btm_version == BTREE_VERSION)
564 {
565 /*
566 * kludge: btm_oldest_btpo_xact is declared as int4, which is wrong.
567 * We should at least avoid raising an error when its value happens to
568 * exceed PG_INT32_MAX, though.
569 */
570 values[j++] = psprintf("%d", (int) metad->btm_oldest_btpo_xact);
571 values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples);
572 }
573 else
574 {
575 values[j++] = "0";
576 values[j++] = "-1";
577 }
578
579 tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
580 values);
581
582 result = HeapTupleGetDatum(tuple);
583
584 UnlockReleaseBuffer(buffer);
585 relation_close(rel, AccessShareLock);
586
587 PG_RETURN_DATUM(result);
588 }
589