1 /*
2  * contrib/pageinspect/btreefuncs.c
3  *
4  *
5  * btreefuncs.c
6  *
7  * Copyright (c) 2006 Satoshi Nagayasu <nagayasus@nttdata.co.jp>
8  *
9  * Permission to use, copy, modify, and distribute this software and
10  * its documentation for any purpose, without fee, and without a
11  * written agreement is hereby granted, provided that the above
12  * copyright notice and this paragraph and the following two
13  * paragraphs appear in all copies.
14  *
15  * IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT,
16  * INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
17  * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
18  * DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED
19  * OF THE POSSIBILITY OF SUCH DAMAGE.
20  *
21  * THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS
24  * IS" BASIS, AND THE AUTHOR HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE,
25  * SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
26  */
27 
28 #include "postgres.h"
29 
30 #include "pageinspect.h"
31 
32 #include "access/nbtree.h"
33 #include "access/relation.h"
34 #include "catalog/namespace.h"
35 #include "catalog/pg_am.h"
36 #include "funcapi.h"
37 #include "miscadmin.h"
38 #include "utils/builtins.h"
39 #include "utils/rel.h"
40 #include "utils/varlena.h"
41 
42 
43 PG_FUNCTION_INFO_V1(bt_metap);
44 PG_FUNCTION_INFO_V1(bt_page_items);
45 PG_FUNCTION_INFO_V1(bt_page_items_bytea);
46 PG_FUNCTION_INFO_V1(bt_page_stats);
47 
/* Tests applied to an opened relation before it is treated as a btree. */
#define IS_INDEX(r) ((r)->rd_rel->relkind == RELKIND_INDEX)
#define IS_BTREE(r) ((r)->rd_rel->relam == BTREE_AM_OID)

/*
 * Raise an ERROR unless blkno is smaller than the relation's current
 * number of blocks.
 *
 * note: BlockNumber is unsigned, hence can't be negative
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to
 * exactly one statement and can safely be followed by a semicolon anywhere
 * a statement is allowed, including as the body of an if that has an else.
 */
#define CHECK_RELATION_BLOCK_RANGE(rel, blkno) \
	do { \
		if (RelationGetNumberOfBlocks(rel) <= (BlockNumber) (blkno)) \
			elog(ERROR, "block number out of range"); \
	} while (0)
55 
56 /* ------------------------------------------------
57  * structure for single btree page statistics
58  * ------------------------------------------------
59  */
60 typedef struct BTPageStat
61 {
62 	uint32		blkno;
63 	uint32		live_items;
64 	uint32		dead_items;
65 	uint32		page_size;
66 	uint32		max_avail;
67 	uint32		free_size;
68 	uint32		avg_item_size;
69 	char		type;
70 
71 	/* opaque data */
72 	BlockNumber btpo_prev;
73 	BlockNumber btpo_next;
74 	union
75 	{
76 		uint32		level;
77 		TransactionId xact;
78 	}			btpo;
79 	uint16		btpo_flags;
80 	BTCycleId	btpo_cycleid;
81 } BTPageStat;
82 
83 
84 /* -------------------------------------------------
85  * GetBTPageStatistics()
86  *
87  * Collect statistics of single b-tree page
88  * -------------------------------------------------
89  */
90 static void
GetBTPageStatistics(BlockNumber blkno,Buffer buffer,BTPageStat * stat)91 GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
92 {
93 	Page		page = BufferGetPage(buffer);
94 	PageHeader	phdr = (PageHeader) page;
95 	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
96 	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
97 	int			item_size = 0;
98 	int			off;
99 
100 	stat->blkno = blkno;
101 
102 	stat->max_avail = BLCKSZ - (BLCKSZ - phdr->pd_special + SizeOfPageHeaderData);
103 
104 	stat->dead_items = stat->live_items = 0;
105 
106 	stat->page_size = PageGetPageSize(page);
107 
108 	/* page type (flags) */
109 	if (P_ISDELETED(opaque))
110 	{
111 		stat->type = 'd';
112 		stat->btpo.xact = opaque->btpo.xact;
113 		return;
114 	}
115 	else if (P_IGNORE(opaque))
116 		stat->type = 'e';
117 	else if (P_ISLEAF(opaque))
118 		stat->type = 'l';
119 	else if (P_ISROOT(opaque))
120 		stat->type = 'r';
121 	else
122 		stat->type = 'i';
123 
124 	/* btpage opaque data */
125 	stat->btpo_prev = opaque->btpo_prev;
126 	stat->btpo_next = opaque->btpo_next;
127 	stat->btpo.level = opaque->btpo.level;
128 	stat->btpo_flags = opaque->btpo_flags;
129 	stat->btpo_cycleid = opaque->btpo_cycleid;
130 
131 	/* count live and dead tuples, and free space */
132 	for (off = FirstOffsetNumber; off <= maxoff; off++)
133 	{
134 		IndexTuple	itup;
135 
136 		ItemId		id = PageGetItemId(page, off);
137 
138 		itup = (IndexTuple) PageGetItem(page, id);
139 
140 		item_size += IndexTupleSize(itup);
141 
142 		if (!ItemIdIsDead(id))
143 			stat->live_items++;
144 		else
145 			stat->dead_items++;
146 	}
147 	stat->free_size = PageGetFreeSpace(page);
148 
149 	if ((stat->live_items + stat->dead_items) > 0)
150 		stat->avg_item_size = item_size / (stat->live_items + stat->dead_items);
151 	else
152 		stat->avg_item_size = 0;
153 }
154 
155 /* -----------------------------------------------
156  * bt_page_stats()
157  *
158  * Usage: SELECT * FROM bt_page_stats('t1_pkey', 1);
159  * -----------------------------------------------
160  */
161 Datum
bt_page_stats(PG_FUNCTION_ARGS)162 bt_page_stats(PG_FUNCTION_ARGS)
163 {
164 	text	   *relname = PG_GETARG_TEXT_PP(0);
165 	uint32		blkno = PG_GETARG_UINT32(1);
166 	Buffer		buffer;
167 	Relation	rel;
168 	RangeVar   *relrv;
169 	Datum		result;
170 	HeapTuple	tuple;
171 	TupleDesc	tupleDesc;
172 	int			j;
173 	char	   *values[11];
174 	BTPageStat	stat;
175 
176 	if (!superuser())
177 		ereport(ERROR,
178 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
179 				 (errmsg("must be superuser to use pageinspect functions"))));
180 
181 	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
182 	rel = relation_openrv(relrv, AccessShareLock);
183 
184 	if (!IS_INDEX(rel) || !IS_BTREE(rel))
185 		elog(ERROR, "relation \"%s\" is not a btree index",
186 			 RelationGetRelationName(rel));
187 
188 	/*
189 	 * Reject attempts to read non-local temporary relations; we would be
190 	 * likely to get wrong data since we have no visibility into the owning
191 	 * session's local buffers.
192 	 */
193 	if (RELATION_IS_OTHER_TEMP(rel))
194 		ereport(ERROR,
195 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
196 				 errmsg("cannot access temporary tables of other sessions")));
197 
198 	if (blkno == 0)
199 		elog(ERROR, "block 0 is a meta page");
200 
201 	CHECK_RELATION_BLOCK_RANGE(rel, blkno);
202 
203 	buffer = ReadBuffer(rel, blkno);
204 	LockBuffer(buffer, BUFFER_LOCK_SHARE);
205 
206 	/* keep compiler quiet */
207 	stat.btpo_prev = stat.btpo_next = InvalidBlockNumber;
208 	stat.btpo_flags = stat.free_size = stat.avg_item_size = 0;
209 
210 	GetBTPageStatistics(blkno, buffer, &stat);
211 
212 	UnlockReleaseBuffer(buffer);
213 	relation_close(rel, AccessShareLock);
214 
215 	/* Build a tuple descriptor for our result type */
216 	if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
217 		elog(ERROR, "return type must be a row type");
218 
219 	j = 0;
220 	values[j++] = psprintf("%d", stat.blkno);
221 	values[j++] = psprintf("%c", stat.type);
222 	values[j++] = psprintf("%d", stat.live_items);
223 	values[j++] = psprintf("%d", stat.dead_items);
224 	values[j++] = psprintf("%d", stat.avg_item_size);
225 	values[j++] = psprintf("%d", stat.page_size);
226 	values[j++] = psprintf("%d", stat.free_size);
227 	values[j++] = psprintf("%d", stat.btpo_prev);
228 	values[j++] = psprintf("%d", stat.btpo_next);
229 	values[j++] = psprintf("%d", (stat.type == 'd') ? stat.btpo.xact : stat.btpo.level);
230 	values[j++] = psprintf("%d", stat.btpo_flags);
231 
232 	tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
233 								   values);
234 
235 	result = HeapTupleGetDatum(tuple);
236 
237 	PG_RETURN_DATUM(result);
238 }
239 
240 
241 /*
242  * cross-call data structure for SRF
243  */
244 struct user_args
245 {
246 	Page		page;
247 	OffsetNumber offset;
248 };
249 
250 /*-------------------------------------------------------
251  * bt_page_print_tuples()
252  *
253  * Form a tuple describing index tuple at a given offset
254  * ------------------------------------------------------
255  */
256 static Datum
bt_page_print_tuples(FuncCallContext * fctx,Page page,OffsetNumber offset)257 bt_page_print_tuples(FuncCallContext *fctx, Page page, OffsetNumber offset)
258 {
259 	char	   *values[6];
260 	HeapTuple	tuple;
261 	ItemId		id;
262 	IndexTuple	itup;
263 	int			j;
264 	int			off;
265 	int			dlen;
266 	char	   *dump;
267 	char	   *ptr;
268 
269 	id = PageGetItemId(page, offset);
270 
271 	if (!ItemIdIsValid(id))
272 		elog(ERROR, "invalid ItemId");
273 
274 	itup = (IndexTuple) PageGetItem(page, id);
275 
276 	j = 0;
277 	values[j++] = psprintf("%d", offset);
278 	values[j++] = psprintf("(%u,%u)",
279 						   ItemPointerGetBlockNumberNoCheck(&itup->t_tid),
280 						   ItemPointerGetOffsetNumberNoCheck(&itup->t_tid));
281 	values[j++] = psprintf("%d", (int) IndexTupleSize(itup));
282 	values[j++] = psprintf("%c", IndexTupleHasNulls(itup) ? 't' : 'f');
283 	values[j++] = psprintf("%c", IndexTupleHasVarwidths(itup) ? 't' : 'f');
284 
285 	ptr = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
286 	dlen = IndexTupleSize(itup) - IndexInfoFindDataOffset(itup->t_info);
287 	dump = palloc0(dlen * 3 + 1);
288 	values[j] = dump;
289 	for (off = 0; off < dlen; off++)
290 	{
291 		if (off > 0)
292 			*dump++ = ' ';
293 		sprintf(dump, "%02x", *(ptr + off) & 0xff);
294 		dump += 2;
295 	}
296 
297 	tuple = BuildTupleFromCStrings(fctx->attinmeta, values);
298 
299 	return HeapTupleGetDatum(tuple);
300 }
301 
302 /*-------------------------------------------------------
303  * bt_page_items()
304  *
305  * Get IndexTupleData set in a btree page
306  *
307  * Usage: SELECT * FROM bt_page_items('t1_pkey', 1);
308  *-------------------------------------------------------
309  */
310 Datum
bt_page_items(PG_FUNCTION_ARGS)311 bt_page_items(PG_FUNCTION_ARGS)
312 {
313 	text	   *relname = PG_GETARG_TEXT_PP(0);
314 	uint32		blkno = PG_GETARG_UINT32(1);
315 	Datum		result;
316 	FuncCallContext *fctx;
317 	MemoryContext mctx;
318 	struct user_args *uargs;
319 
320 	if (!superuser())
321 		ereport(ERROR,
322 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
323 				 (errmsg("must be superuser to use pageinspect functions"))));
324 
325 	if (SRF_IS_FIRSTCALL())
326 	{
327 		RangeVar   *relrv;
328 		Relation	rel;
329 		Buffer		buffer;
330 		BTPageOpaque opaque;
331 		TupleDesc	tupleDesc;
332 
333 		fctx = SRF_FIRSTCALL_INIT();
334 
335 		relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
336 		rel = relation_openrv(relrv, AccessShareLock);
337 
338 		if (!IS_INDEX(rel) || !IS_BTREE(rel))
339 			elog(ERROR, "relation \"%s\" is not a btree index",
340 				 RelationGetRelationName(rel));
341 
342 		/*
343 		 * Reject attempts to read non-local temporary relations; we would be
344 		 * likely to get wrong data since we have no visibility into the
345 		 * owning session's local buffers.
346 		 */
347 		if (RELATION_IS_OTHER_TEMP(rel))
348 			ereport(ERROR,
349 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
350 					 errmsg("cannot access temporary tables of other sessions")));
351 
352 		if (blkno == 0)
353 			elog(ERROR, "block 0 is a meta page");
354 
355 		CHECK_RELATION_BLOCK_RANGE(rel, blkno);
356 
357 		buffer = ReadBuffer(rel, blkno);
358 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
359 
360 		/*
361 		 * We copy the page into local storage to avoid holding pin on the
362 		 * buffer longer than we must, and possibly failing to release it at
363 		 * all if the calling query doesn't fetch all rows.
364 		 */
365 		mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
366 
367 		uargs = palloc(sizeof(struct user_args));
368 
369 		uargs->page = palloc(BLCKSZ);
370 		memcpy(uargs->page, BufferGetPage(buffer), BLCKSZ);
371 
372 		UnlockReleaseBuffer(buffer);
373 		relation_close(rel, AccessShareLock);
374 
375 		uargs->offset = FirstOffsetNumber;
376 
377 		opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);
378 
379 		if (P_ISDELETED(opaque))
380 			elog(NOTICE, "page is deleted");
381 
382 		fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
383 
384 		/* Build a tuple descriptor for our result type */
385 		if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
386 			elog(ERROR, "return type must be a row type");
387 
388 		fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc);
389 
390 		fctx->user_fctx = uargs;
391 
392 		MemoryContextSwitchTo(mctx);
393 	}
394 
395 	fctx = SRF_PERCALL_SETUP();
396 	uargs = fctx->user_fctx;
397 
398 	if (fctx->call_cntr < fctx->max_calls)
399 	{
400 		result = bt_page_print_tuples(fctx, uargs->page, uargs->offset);
401 		uargs->offset++;
402 		SRF_RETURN_NEXT(fctx, result);
403 	}
404 	else
405 	{
406 		pfree(uargs->page);
407 		pfree(uargs);
408 		SRF_RETURN_DONE(fctx);
409 	}
410 }
411 
412 /*-------------------------------------------------------
413  * bt_page_items_bytea()
414  *
415  * Get IndexTupleData set in a btree page
416  *
417  * Usage: SELECT * FROM bt_page_items(get_raw_page('t1_pkey', 1));
418  *-------------------------------------------------------
419  */
420 
421 Datum
bt_page_items_bytea(PG_FUNCTION_ARGS)422 bt_page_items_bytea(PG_FUNCTION_ARGS)
423 {
424 	bytea	   *raw_page = PG_GETARG_BYTEA_P(0);
425 	Datum		result;
426 	FuncCallContext *fctx;
427 	struct user_args *uargs;
428 	int			raw_page_size;
429 
430 	if (!superuser())
431 		ereport(ERROR,
432 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
433 				 (errmsg("must be superuser to use raw page functions"))));
434 
435 	if (SRF_IS_FIRSTCALL())
436 	{
437 		BTPageOpaque opaque;
438 		MemoryContext mctx;
439 		TupleDesc	tupleDesc;
440 
441 		raw_page_size = VARSIZE(raw_page) - VARHDRSZ;
442 
443 		if (raw_page_size < SizeOfPageHeaderData)
444 			ereport(ERROR,
445 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
446 					 errmsg("input page too small (%d bytes)", raw_page_size)));
447 
448 		fctx = SRF_FIRSTCALL_INIT();
449 		mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
450 
451 		uargs = palloc(sizeof(struct user_args));
452 
453 		uargs->page = VARDATA(raw_page);
454 
455 		uargs->offset = FirstOffsetNumber;
456 
457 		opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);
458 
459 		if (P_ISMETA(opaque))
460 			ereport(ERROR,
461 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
462 					 errmsg("block is a meta page")));
463 
464 		if (P_ISDELETED(opaque))
465 			elog(NOTICE, "page is deleted");
466 
467 		fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
468 
469 		/* Build a tuple descriptor for our result type */
470 		if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
471 			elog(ERROR, "return type must be a row type");
472 
473 		fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc);
474 
475 		fctx->user_fctx = uargs;
476 
477 		MemoryContextSwitchTo(mctx);
478 	}
479 
480 	fctx = SRF_PERCALL_SETUP();
481 	uargs = fctx->user_fctx;
482 
483 	if (fctx->call_cntr < fctx->max_calls)
484 	{
485 		result = bt_page_print_tuples(fctx, uargs->page, uargs->offset);
486 		uargs->offset++;
487 		SRF_RETURN_NEXT(fctx, result);
488 	}
489 	else
490 	{
491 		pfree(uargs);
492 		SRF_RETURN_DONE(fctx);
493 	}
494 }
495 
496 
497 /* ------------------------------------------------
498  * bt_metap()
499  *
500  * Get a btree's meta-page information
501  *
502  * Usage: SELECT * FROM bt_metap('t1_pkey')
503  * ------------------------------------------------
504  */
505 Datum
bt_metap(PG_FUNCTION_ARGS)506 bt_metap(PG_FUNCTION_ARGS)
507 {
508 	text	   *relname = PG_GETARG_TEXT_PP(0);
509 	Datum		result;
510 	Relation	rel;
511 	RangeVar   *relrv;
512 	BTMetaPageData *metad;
513 	TupleDesc	tupleDesc;
514 	int			j;
515 	char	   *values[8];
516 	Buffer		buffer;
517 	Page		page;
518 	HeapTuple	tuple;
519 
520 	if (!superuser())
521 		ereport(ERROR,
522 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
523 				 (errmsg("must be superuser to use pageinspect functions"))));
524 
525 	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
526 	rel = relation_openrv(relrv, AccessShareLock);
527 
528 	if (!IS_INDEX(rel) || !IS_BTREE(rel))
529 		elog(ERROR, "relation \"%s\" is not a btree index",
530 			 RelationGetRelationName(rel));
531 
532 	/*
533 	 * Reject attempts to read non-local temporary relations; we would be
534 	 * likely to get wrong data since we have no visibility into the owning
535 	 * session's local buffers.
536 	 */
537 	if (RELATION_IS_OTHER_TEMP(rel))
538 		ereport(ERROR,
539 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
540 				 errmsg("cannot access temporary tables of other sessions")));
541 
542 	buffer = ReadBuffer(rel, 0);
543 	LockBuffer(buffer, BUFFER_LOCK_SHARE);
544 
545 	page = BufferGetPage(buffer);
546 	metad = BTPageGetMeta(page);
547 
548 	/* Build a tuple descriptor for our result type */
549 	if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
550 		elog(ERROR, "return type must be a row type");
551 
552 	j = 0;
553 	values[j++] = psprintf("%d", metad->btm_magic);
554 	values[j++] = psprintf("%d", metad->btm_version);
555 	values[j++] = psprintf("%d", metad->btm_root);
556 	values[j++] = psprintf("%d", metad->btm_level);
557 	values[j++] = psprintf("%d", metad->btm_fastroot);
558 	values[j++] = psprintf("%d", metad->btm_fastlevel);
559 
560 	/*
561 	 * Get values of extended metadata if available, use default values
562 	 * otherwise.
563 	 */
564 	if (metad->btm_version >= BTREE_NOVAC_VERSION)
565 	{
566 		/*
567 		 * kludge: btm_oldest_btpo_xact is declared as int4, which is wrong.
568 		 * We should at least avoid raising an error when its value happens to
569 		 * exceed PG_INT32_MAX, though.
570 		 */
571 		values[j++] = psprintf("%d", (int) metad->btm_oldest_btpo_xact);
572 		values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples);
573 	}
574 	else
575 	{
576 		values[j++] = "0";
577 		values[j++] = "-1";
578 	}
579 
580 	tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
581 								   values);
582 
583 	result = HeapTupleGetDatum(tuple);
584 
585 	UnlockReleaseBuffer(buffer);
586 	relation_close(rel, AccessShareLock);
587 
588 	PG_RETURN_DATUM(result);
589 }
590