1 /*
2  * contrib/pageinspect/btreefuncs.c
3  *
4  *
5  * btreefuncs.c
6  *
7  * Copyright (c) 2006 Satoshi Nagayasu <nagayasus@nttdata.co.jp>
8  *
9  * Permission to use, copy, modify, and distribute this software and
10  * its documentation for any purpose, without fee, and without a
11  * written agreement is hereby granted, provided that the above
12  * copyright notice and this paragraph and the following two
13  * paragraphs appear in all copies.
14  *
15  * IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT,
16  * INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
17  * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
18  * DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED
19  * OF THE POSSIBILITY OF SUCH DAMAGE.
20  *
21  * THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS
24  * IS" BASIS, AND THE AUTHOR HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE,
25  * SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
26  */
27 
28 #include "postgres.h"
29 
30 #include "pageinspect.h"
31 
32 #include "access/nbtree.h"
33 #include "catalog/namespace.h"
34 #include "catalog/pg_am.h"
35 #include "funcapi.h"
36 #include "miscadmin.h"
37 #include "utils/builtins.h"
38 #include "utils/rel.h"
39 #include "utils/varlena.h"
40 
41 
/* Declare the SQL-callable entry points defined in this file. */
PG_FUNCTION_INFO_V1(bt_metap);
PG_FUNCTION_INFO_V1(bt_page_items);
PG_FUNCTION_INFO_V1(bt_page_items_bytea);
PG_FUNCTION_INFO_V1(bt_page_stats);

/* Tests applied to an already-opened Relation: is it an index / a btree? */
#define IS_INDEX(r) ((r)->rd_rel->relkind == RELKIND_INDEX)
#define IS_BTREE(r) ((r)->rd_rel->relam == BTREE_AM_OID)

/*
 * Raise an ERROR unless blkno is below the relation's current block count.
 *
 * note: BlockNumber is unsigned, hence can't be negative
 *
 * The body is wrapped in do { ... } while (0) so that an invocation followed
 * by a semicolon is exactly one statement; a bare { ... } block would break
 * when used as the body of an if that has an else branch.
 */
#define CHECK_RELATION_BLOCK_RANGE(rel, blkno) \
	do { \
		if (RelationGetNumberOfBlocks(rel) <= (BlockNumber) (blkno)) \
			elog(ERROR, "block number out of range"); \
	} while (0)
54 
55 /* ------------------------------------------------
56  * structure for single btree page statistics
57  * ------------------------------------------------
58  */
59 typedef struct BTPageStat
60 {
61 	uint32		blkno;
62 	uint32		live_items;
63 	uint32		dead_items;
64 	uint32		page_size;
65 	uint32		max_avail;
66 	uint32		free_size;
67 	uint32		avg_item_size;
68 	char		type;
69 
70 	/* opaque data */
71 	BlockNumber btpo_prev;
72 	BlockNumber btpo_next;
73 	union
74 	{
75 		uint32		level;
76 		TransactionId xact;
77 	}			btpo;
78 	uint16		btpo_flags;
79 	BTCycleId	btpo_cycleid;
80 } BTPageStat;
81 
82 
83 /* -------------------------------------------------
84  * GetBTPageStatistics()
85  *
86  * Collect statistics of single b-tree page
87  * -------------------------------------------------
88  */
89 static void
GetBTPageStatistics(BlockNumber blkno,Buffer buffer,BTPageStat * stat)90 GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
91 {
92 	Page		page = BufferGetPage(buffer);
93 	PageHeader	phdr = (PageHeader) page;
94 	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
95 	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
96 	int			item_size = 0;
97 	int			off;
98 
99 	stat->blkno = blkno;
100 
101 	stat->max_avail = BLCKSZ - (BLCKSZ - phdr->pd_special + SizeOfPageHeaderData);
102 
103 	stat->dead_items = stat->live_items = 0;
104 
105 	stat->page_size = PageGetPageSize(page);
106 
107 	/* page type (flags) */
108 	if (P_ISDELETED(opaque))
109 	{
110 		stat->type = 'd';
111 		stat->btpo.xact = opaque->btpo.xact;
112 		return;
113 	}
114 	else if (P_IGNORE(opaque))
115 		stat->type = 'e';
116 	else if (P_ISLEAF(opaque))
117 		stat->type = 'l';
118 	else if (P_ISROOT(opaque))
119 		stat->type = 'r';
120 	else
121 		stat->type = 'i';
122 
123 	/* btpage opaque data */
124 	stat->btpo_prev = opaque->btpo_prev;
125 	stat->btpo_next = opaque->btpo_next;
126 	stat->btpo.level = opaque->btpo.level;
127 	stat->btpo_flags = opaque->btpo_flags;
128 	stat->btpo_cycleid = opaque->btpo_cycleid;
129 
130 	/* count live and dead tuples, and free space */
131 	for (off = FirstOffsetNumber; off <= maxoff; off++)
132 	{
133 		IndexTuple	itup;
134 
135 		ItemId		id = PageGetItemId(page, off);
136 
137 		itup = (IndexTuple) PageGetItem(page, id);
138 
139 		item_size += IndexTupleSize(itup);
140 
141 		if (!ItemIdIsDead(id))
142 			stat->live_items++;
143 		else
144 			stat->dead_items++;
145 	}
146 	stat->free_size = PageGetFreeSpace(page);
147 
148 	if ((stat->live_items + stat->dead_items) > 0)
149 		stat->avg_item_size = item_size / (stat->live_items + stat->dead_items);
150 	else
151 		stat->avg_item_size = 0;
152 }
153 
154 /* -----------------------------------------------
155  * bt_page_stats()
156  *
157  * Usage: SELECT * FROM bt_page_stats('t1_pkey', 1);
158  * -----------------------------------------------
159  */
160 Datum
bt_page_stats(PG_FUNCTION_ARGS)161 bt_page_stats(PG_FUNCTION_ARGS)
162 {
163 	text	   *relname = PG_GETARG_TEXT_PP(0);
164 	uint32		blkno = PG_GETARG_UINT32(1);
165 	Buffer		buffer;
166 	Relation	rel;
167 	RangeVar   *relrv;
168 	Datum		result;
169 	HeapTuple	tuple;
170 	TupleDesc	tupleDesc;
171 	int			j;
172 	char	   *values[11];
173 	BTPageStat	stat;
174 
175 	if (!superuser())
176 		ereport(ERROR,
177 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
178 				 (errmsg("must be superuser to use pageinspect functions"))));
179 
180 	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
181 	rel = relation_openrv(relrv, AccessShareLock);
182 
183 	if (!IS_INDEX(rel) || !IS_BTREE(rel))
184 		elog(ERROR, "relation \"%s\" is not a btree index",
185 			 RelationGetRelationName(rel));
186 
187 	/*
188 	 * Reject attempts to read non-local temporary relations; we would be
189 	 * likely to get wrong data since we have no visibility into the owning
190 	 * session's local buffers.
191 	 */
192 	if (RELATION_IS_OTHER_TEMP(rel))
193 		ereport(ERROR,
194 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
195 				 errmsg("cannot access temporary tables of other sessions")));
196 
197 	if (blkno == 0)
198 		elog(ERROR, "block 0 is a meta page");
199 
200 	CHECK_RELATION_BLOCK_RANGE(rel, blkno);
201 
202 	buffer = ReadBuffer(rel, blkno);
203 	LockBuffer(buffer, BUFFER_LOCK_SHARE);
204 
205 	/* keep compiler quiet */
206 	stat.btpo_prev = stat.btpo_next = InvalidBlockNumber;
207 	stat.btpo_flags = stat.free_size = stat.avg_item_size = 0;
208 
209 	GetBTPageStatistics(blkno, buffer, &stat);
210 
211 	UnlockReleaseBuffer(buffer);
212 	relation_close(rel, AccessShareLock);
213 
214 	/* Build a tuple descriptor for our result type */
215 	if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
216 		elog(ERROR, "return type must be a row type");
217 
218 	j = 0;
219 	values[j++] = psprintf("%d", stat.blkno);
220 	values[j++] = psprintf("%c", stat.type);
221 	values[j++] = psprintf("%d", stat.live_items);
222 	values[j++] = psprintf("%d", stat.dead_items);
223 	values[j++] = psprintf("%d", stat.avg_item_size);
224 	values[j++] = psprintf("%d", stat.page_size);
225 	values[j++] = psprintf("%d", stat.free_size);
226 	values[j++] = psprintf("%d", stat.btpo_prev);
227 	values[j++] = psprintf("%d", stat.btpo_next);
228 	values[j++] = psprintf("%d", (stat.type == 'd') ? stat.btpo.xact : stat.btpo.level);
229 	values[j++] = psprintf("%d", stat.btpo_flags);
230 
231 	tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
232 								   values);
233 
234 	result = HeapTupleGetDatum(tuple);
235 
236 	PG_RETURN_DATUM(result);
237 }
238 
239 
240 /*
241  * cross-call data structure for SRF
242  */
243 struct user_args
244 {
245 	Page		page;
246 	OffsetNumber offset;
247 };
248 
249 /*-------------------------------------------------------
250  * bt_page_print_tuples()
251  *
252  * Form a tuple describing index tuple at a given offset
253  * ------------------------------------------------------
254  */
255 static Datum
bt_page_print_tuples(FuncCallContext * fctx,Page page,OffsetNumber offset)256 bt_page_print_tuples(FuncCallContext *fctx, Page page, OffsetNumber offset)
257 {
258 	char	   *values[6];
259 	HeapTuple	tuple;
260 	ItemId		id;
261 	IndexTuple	itup;
262 	int			j;
263 	int			off;
264 	int			dlen;
265 	char	   *dump;
266 	char	   *ptr;
267 
268 	id = PageGetItemId(page, offset);
269 
270 	if (!ItemIdIsValid(id))
271 		elog(ERROR, "invalid ItemId");
272 
273 	itup = (IndexTuple) PageGetItem(page, id);
274 
275 	j = 0;
276 	values[j++] = psprintf("%d", offset);
277 	values[j++] = psprintf("(%u,%u)",
278 						   ItemPointerGetBlockNumberNoCheck(&itup->t_tid),
279 						   ItemPointerGetOffsetNumberNoCheck(&itup->t_tid));
280 	values[j++] = psprintf("%d", (int) IndexTupleSize(itup));
281 	values[j++] = psprintf("%c", IndexTupleHasNulls(itup) ? 't' : 'f');
282 	values[j++] = psprintf("%c", IndexTupleHasVarwidths(itup) ? 't' : 'f');
283 
284 	ptr = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
285 	dlen = IndexTupleSize(itup) - IndexInfoFindDataOffset(itup->t_info);
286 	dump = palloc0(dlen * 3 + 1);
287 	values[j] = dump;
288 	for (off = 0; off < dlen; off++)
289 	{
290 		if (off > 0)
291 			*dump++ = ' ';
292 		sprintf(dump, "%02x", *(ptr + off) & 0xff);
293 		dump += 2;
294 	}
295 
296 	tuple = BuildTupleFromCStrings(fctx->attinmeta, values);
297 
298 	return HeapTupleGetDatum(tuple);
299 }
300 
301 /*-------------------------------------------------------
302  * bt_page_items()
303  *
304  * Get IndexTupleData set in a btree page
305  *
306  * Usage: SELECT * FROM bt_page_items('t1_pkey', 1);
307  *-------------------------------------------------------
308  */
309 Datum
bt_page_items(PG_FUNCTION_ARGS)310 bt_page_items(PG_FUNCTION_ARGS)
311 {
312 	text	   *relname = PG_GETARG_TEXT_PP(0);
313 	uint32		blkno = PG_GETARG_UINT32(1);
314 	Datum		result;
315 	FuncCallContext *fctx;
316 	MemoryContext mctx;
317 	struct user_args *uargs;
318 
319 	if (!superuser())
320 		ereport(ERROR,
321 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
322 				 (errmsg("must be superuser to use pageinspect functions"))));
323 
324 	if (SRF_IS_FIRSTCALL())
325 	{
326 		RangeVar   *relrv;
327 		Relation	rel;
328 		Buffer		buffer;
329 		BTPageOpaque opaque;
330 		TupleDesc	tupleDesc;
331 
332 		fctx = SRF_FIRSTCALL_INIT();
333 
334 		relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
335 		rel = relation_openrv(relrv, AccessShareLock);
336 
337 		if (!IS_INDEX(rel) || !IS_BTREE(rel))
338 			elog(ERROR, "relation \"%s\" is not a btree index",
339 				 RelationGetRelationName(rel));
340 
341 		/*
342 		 * Reject attempts to read non-local temporary relations; we would be
343 		 * likely to get wrong data since we have no visibility into the
344 		 * owning session's local buffers.
345 		 */
346 		if (RELATION_IS_OTHER_TEMP(rel))
347 			ereport(ERROR,
348 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
349 					 errmsg("cannot access temporary tables of other sessions")));
350 
351 		if (blkno == 0)
352 			elog(ERROR, "block 0 is a meta page");
353 
354 		CHECK_RELATION_BLOCK_RANGE(rel, blkno);
355 
356 		buffer = ReadBuffer(rel, blkno);
357 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
358 
359 		/*
360 		 * We copy the page into local storage to avoid holding pin on the
361 		 * buffer longer than we must, and possibly failing to release it at
362 		 * all if the calling query doesn't fetch all rows.
363 		 */
364 		mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
365 
366 		uargs = palloc(sizeof(struct user_args));
367 
368 		uargs->page = palloc(BLCKSZ);
369 		memcpy(uargs->page, BufferGetPage(buffer), BLCKSZ);
370 
371 		UnlockReleaseBuffer(buffer);
372 		relation_close(rel, AccessShareLock);
373 
374 		uargs->offset = FirstOffsetNumber;
375 
376 		opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);
377 
378 		if (P_ISDELETED(opaque))
379 			elog(NOTICE, "page is deleted");
380 
381 		fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
382 
383 		/* Build a tuple descriptor for our result type */
384 		if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
385 			elog(ERROR, "return type must be a row type");
386 
387 		fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc);
388 
389 		fctx->user_fctx = uargs;
390 
391 		MemoryContextSwitchTo(mctx);
392 	}
393 
394 	fctx = SRF_PERCALL_SETUP();
395 	uargs = fctx->user_fctx;
396 
397 	if (fctx->call_cntr < fctx->max_calls)
398 	{
399 		result = bt_page_print_tuples(fctx, uargs->page, uargs->offset);
400 		uargs->offset++;
401 		SRF_RETURN_NEXT(fctx, result);
402 	}
403 	else
404 	{
405 		pfree(uargs->page);
406 		pfree(uargs);
407 		SRF_RETURN_DONE(fctx);
408 	}
409 }
410 
411 /*-------------------------------------------------------
412  * bt_page_items_bytea()
413  *
414  * Get IndexTupleData set in a btree page
415  *
416  * Usage: SELECT * FROM bt_page_items(get_raw_page('t1_pkey', 1));
417  *-------------------------------------------------------
418  */
419 
420 Datum
bt_page_items_bytea(PG_FUNCTION_ARGS)421 bt_page_items_bytea(PG_FUNCTION_ARGS)
422 {
423 	bytea	   *raw_page = PG_GETARG_BYTEA_P(0);
424 	Datum		result;
425 	FuncCallContext *fctx;
426 	struct user_args *uargs;
427 	int			raw_page_size;
428 
429 	if (!superuser())
430 		ereport(ERROR,
431 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
432 				 (errmsg("must be superuser to use pageinspect functions"))));
433 
434 	if (SRF_IS_FIRSTCALL())
435 	{
436 		BTPageOpaque opaque;
437 		MemoryContext mctx;
438 		TupleDesc	tupleDesc;
439 
440 		raw_page_size = VARSIZE(raw_page) - VARHDRSZ;
441 
442 		if (raw_page_size < SizeOfPageHeaderData)
443 			ereport(ERROR,
444 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
445 					 errmsg("input page too small (%d bytes)", raw_page_size)));
446 
447 		fctx = SRF_FIRSTCALL_INIT();
448 		mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
449 
450 		uargs = palloc(sizeof(struct user_args));
451 
452 		uargs->page = VARDATA(raw_page);
453 
454 		uargs->offset = FirstOffsetNumber;
455 
456 		opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);
457 
458 		if (P_ISMETA(opaque))
459 			ereport(ERROR,
460 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
461 					 errmsg("block is a meta page")));
462 
463 		if (P_ISDELETED(opaque))
464 			elog(NOTICE, "page is deleted");
465 
466 		fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
467 
468 		/* Build a tuple descriptor for our result type */
469 		if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
470 			elog(ERROR, "return type must be a row type");
471 
472 		fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc);
473 
474 		fctx->user_fctx = uargs;
475 
476 		MemoryContextSwitchTo(mctx);
477 	}
478 
479 	fctx = SRF_PERCALL_SETUP();
480 	uargs = fctx->user_fctx;
481 
482 	if (fctx->call_cntr < fctx->max_calls)
483 	{
484 		result = bt_page_print_tuples(fctx, uargs->page, uargs->offset);
485 		uargs->offset++;
486 		SRF_RETURN_NEXT(fctx, result);
487 	}
488 	else
489 	{
490 		pfree(uargs);
491 		SRF_RETURN_DONE(fctx);
492 	}
493 }
494 
495 
496 /* ------------------------------------------------
497  * bt_metap()
498  *
499  * Get a btree's meta-page information
500  *
501  * Usage: SELECT * FROM bt_metap('t1_pkey')
502  * ------------------------------------------------
503  */
504 Datum
bt_metap(PG_FUNCTION_ARGS)505 bt_metap(PG_FUNCTION_ARGS)
506 {
507 	text	   *relname = PG_GETARG_TEXT_PP(0);
508 	Datum		result;
509 	Relation	rel;
510 	RangeVar   *relrv;
511 	BTMetaPageData *metad;
512 	TupleDesc	tupleDesc;
513 	int			j;
514 	char	   *values[6];
515 	Buffer		buffer;
516 	Page		page;
517 	HeapTuple	tuple;
518 
519 	if (!superuser())
520 		ereport(ERROR,
521 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
522 				 (errmsg("must be superuser to use pageinspect functions"))));
523 
524 	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
525 	rel = relation_openrv(relrv, AccessShareLock);
526 
527 	if (!IS_INDEX(rel) || !IS_BTREE(rel))
528 		elog(ERROR, "relation \"%s\" is not a btree index",
529 			 RelationGetRelationName(rel));
530 
531 	/*
532 	 * Reject attempts to read non-local temporary relations; we would be
533 	 * likely to get wrong data since we have no visibility into the owning
534 	 * session's local buffers.
535 	 */
536 	if (RELATION_IS_OTHER_TEMP(rel))
537 		ereport(ERROR,
538 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
539 				 errmsg("cannot access temporary tables of other sessions")));
540 
541 	buffer = ReadBuffer(rel, 0);
542 	LockBuffer(buffer, BUFFER_LOCK_SHARE);
543 
544 	page = BufferGetPage(buffer);
545 	metad = BTPageGetMeta(page);
546 
547 	/* Build a tuple descriptor for our result type */
548 	if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
549 		elog(ERROR, "return type must be a row type");
550 
551 	j = 0;
552 	values[j++] = psprintf("%d", metad->btm_magic);
553 	values[j++] = psprintf("%d", metad->btm_version);
554 	values[j++] = psprintf("%d", metad->btm_root);
555 	values[j++] = psprintf("%d", metad->btm_level);
556 	values[j++] = psprintf("%d", metad->btm_fastroot);
557 	values[j++] = psprintf("%d", metad->btm_fastlevel);
558 
559 	tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
560 								   values);
561 
562 	result = HeapTupleGetDatum(tuple);
563 
564 	UnlockReleaseBuffer(buffer);
565 	relation_close(rel, AccessShareLock);
566 
567 	PG_RETURN_DATUM(result);
568 }
569