/*-------------------------------------------------------------------------
 *
 * shm_toc.c
 *	  shared memory segment table of contents
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/backend/storage/ipc/shm_toc.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "port/atomics.h"
#include "storage/shm_toc.h"
#include "storage/spin.h"

typedef struct shm_toc_entry
{
	uint64		key;			/* Arbitrary identifier */
	Size		offset;			/* Offset, in bytes, from TOC start */
} shm_toc_entry;

struct shm_toc
{
	uint64		toc_magic;		/* Magic number identifying this TOC */
	slock_t		toc_mutex;		/* Spinlock for mutual exclusion */
	Size		toc_total_bytes;	/* Bytes managed by this TOC */
	Size		toc_allocated_bytes;	/* Bytes allocated of those managed */
	uint32		toc_nentry;		/* Number of entries in TOC */
	shm_toc_entry toc_entry[FLEXIBLE_ARRAY_MEMBER];
};

/*
 * Initialize a region of shared memory with a table of contents.
 */
shm_toc *
shm_toc_create(uint64 magic, void *address, Size nbytes)
{
	shm_toc    *toc = (shm_toc *) address;

	Assert(nbytes > offsetof(shm_toc, toc_entry));
	toc->toc_magic = magic;
	SpinLockInit(&toc->toc_mutex);

	/*
	 * The alignment code in shm_toc_allocate() assumes that the starting
	 * value is buffer-aligned.
	 */
	toc->toc_total_bytes = BUFFERALIGN_DOWN(nbytes);
	toc->toc_allocated_bytes = 0;
	toc->toc_nentry = 0;

	return toc;
}
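
/*
 * A minimal usage sketch (hypothetical caller; MY_MAGIC and segsize are the
 * caller's own, and dsm_create()/dsm_segment_address() come from dsm.h):
 *
 *		dsm_segment *seg = dsm_create(segsize, 0);
 *		shm_toc    *toc = shm_toc_create(MY_MAGIC,
 *										 dsm_segment_address(seg),
 *										 segsize);
 *
 * segsize would normally be computed beforehand via shm_toc_estimate(),
 * below, so that the TOC and its chunks are guaranteed to fit.
 */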

/*
 * Attach to an existing table of contents.  If the magic number found at
 * the target address doesn't match our expectations, return NULL.
 */
shm_toc *
shm_toc_attach(uint64 magic, void *address)
{
	shm_toc    *toc = (shm_toc *) address;

	if (toc->toc_magic != magic)
		return NULL;

	Assert(toc->toc_total_bytes >= toc->toc_allocated_bytes);
	Assert(toc->toc_total_bytes > offsetof(shm_toc, toc_entry));

	return toc;
}
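
/*
 * Sketch of the attaching side (hypothetical; assumes the creating process
 * handed the dsm_handle to this one, e.g. in bgw_main_arg):
 *
 *		dsm_segment *seg = dsm_attach(handle);
 *		shm_toc    *toc = shm_toc_attach(MY_MAGIC, dsm_segment_address(seg));
 *
 *		if (toc == NULL)
 *			elog(ERROR, "bad magic number in dynamic shared memory segment");
 */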

/*
 * Allocate shared memory from a segment managed by a table of contents.
 *
 * This is not a full-blown allocator; there's no way to free memory.  It's
 * just a way of dividing a single physical shared memory segment into logical
 * chunks that may be used for different purposes.
 *
 * We allocate backwards from the end of the segment, so that the TOC entries
 * can grow forward from the start of the segment.
 */
void *
shm_toc_allocate(shm_toc *toc, Size nbytes)
{
	volatile shm_toc *vtoc = toc;
	Size		total_bytes;
	Size		allocated_bytes;
	Size		nentry;
	Size		toc_bytes;

	/*
	 * Make sure request is well-aligned.  XXX: MAXALIGN is not enough,
	 * because atomic ops might need a wider alignment.  We don't have a
	 * proper definition for the minimum to make atomic ops safe, but
	 * BUFFERALIGN ought to be enough.
	 */
	nbytes = BUFFERALIGN(nbytes);

	SpinLockAcquire(&toc->toc_mutex);

	total_bytes = vtoc->toc_total_bytes;
	allocated_bytes = vtoc->toc_allocated_bytes;
	nentry = vtoc->toc_nentry;
	toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
		+ allocated_bytes;

	/* Check for memory exhaustion and overflow. */
	if (toc_bytes + nbytes > total_bytes || toc_bytes + nbytes < toc_bytes)
	{
		SpinLockRelease(&toc->toc_mutex);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of shared memory")));
	}
	vtoc->toc_allocated_bytes += nbytes;

	SpinLockRelease(&toc->toc_mutex);

	return ((char *) toc) + (total_bytes - allocated_bytes - nbytes);
}
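
/*
 * For example (hypothetical caller; SharedState and buflen are the
 * caller's own), two successive allocations:
 *
 *		SharedState *state = shm_toc_allocate(toc, sizeof(SharedState));
 *		char	   *buf = shm_toc_allocate(toc, buflen);
 *
 * Because allocation proceeds backwards, "state" lands at the very end of
 * the segment and "buf" immediately below it; both pointers are
 * BUFFERALIGN'd, per the alignment note above.
 */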

/*
 * Return the number of bytes that can still be allocated.
 */
Size
shm_toc_freespace(shm_toc *toc)
{
	volatile shm_toc *vtoc = toc;
	Size		total_bytes;
	Size		allocated_bytes;
	Size		nentry;
	Size		toc_bytes;

	SpinLockAcquire(&toc->toc_mutex);
	total_bytes = vtoc->toc_total_bytes;
	allocated_bytes = vtoc->toc_allocated_bytes;
	nentry = vtoc->toc_nentry;
	SpinLockRelease(&toc->toc_mutex);

	toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry);
	Assert(allocated_bytes + BUFFERALIGN(toc_bytes) <= total_bytes);
	return total_bytes - (allocated_bytes + BUFFERALIGN(toc_bytes));
}

/*
 * Insert a TOC entry.
 *
 * The idea here is that the process setting up the shared memory segment will
 * register the addresses of data structures within the segment using this
 * function.  Each data structure will be identified using a 64-bit key, which
 * is assumed to be a well-known or discoverable integer.  Other processes
 * accessing the shared memory segment can pass the same key to
 * shm_toc_lookup() to discover the addresses of those data structures.
 *
 * Since the shared memory segment may be mapped at different addresses within
 * different backends, we store relative rather than absolute pointers.
 *
 * This won't scale well to a large number of keys.  Hopefully, that isn't
 * necessary; if it proves to be, we might need to provide a more sophisticated
 * data structure here.  But the real idea here is just to give someone mapping
 * a dynamic shared memory segment the ability to find the bare minimum number
 * of pointers that they need to bootstrap.  If you're storing a lot of stuff
 * in the TOC, you're doing it wrong.
 */
void
shm_toc_insert(shm_toc *toc, uint64 key, void *address)
{
	volatile shm_toc *vtoc = toc;
	Size		total_bytes;
	Size		allocated_bytes;
	Size		nentry;
	Size		toc_bytes;
	Size		offset;

	/* Relativize pointer. */
	Assert(address > (void *) toc);
	offset = ((char *) address) - (char *) toc;

	SpinLockAcquire(&toc->toc_mutex);

	total_bytes = vtoc->toc_total_bytes;
	allocated_bytes = vtoc->toc_allocated_bytes;
	nentry = vtoc->toc_nentry;
	toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
		+ allocated_bytes;

	/* Check for memory exhaustion and overflow. */
	if (toc_bytes + sizeof(shm_toc_entry) > total_bytes ||
		toc_bytes + sizeof(shm_toc_entry) < toc_bytes ||
		nentry >= PG_UINT32_MAX)
	{
		SpinLockRelease(&toc->toc_mutex);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of shared memory")));
	}

	Assert(offset < total_bytes);
	vtoc->toc_entry[nentry].key = key;
	vtoc->toc_entry[nentry].offset = offset;

	/*
	 * By placing a write barrier after filling in the entry and before
	 * updating the number of entries, we make it safe to read the TOC
	 * unlocked.
	 */
	pg_write_barrier();

	vtoc->toc_nentry++;

	SpinLockRelease(&toc->toc_mutex);
}
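
/*
 * Typical use, continuing the hypothetical example above (the key values
 * are arbitrary, caller-chosen constants):
 *
 *		#define MY_KEY_STATE	UINT64CONST(0)
 *		#define MY_KEY_BUFFER	UINT64CONST(1)
 *
 *		shm_toc_insert(toc, MY_KEY_STATE, state);
 *		shm_toc_insert(toc, MY_KEY_BUFFER, buf);
 *
 * Normally only the process that set up the segment inserts entries; the
 * write barrier above then lets other processes read them via
 * shm_toc_lookup() without taking the spinlock.
 */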

/*
 * Look up a TOC entry.
 *
 * If the key is not found, returns NULL if noError is true, otherwise
 * throws elog(ERROR).
 *
 * Unlike the other functions in this file, this operation acquires no lock;
 * it uses only barriers.  It probably wouldn't hurt concurrency very much even
 * if it did get a lock, but since it's reasonably likely that a group of
 * worker processes could each read a series of entries from the same TOC
 * right around the same time, there seems to be some value in avoiding it.
 */
void *
shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
{
	uint32		nentry;
	uint32		i;

	/*
	 * Read the number of entries before we examine any entry.  We assume that
	 * reading a uint32 is atomic.
	 */
	nentry = toc->toc_nentry;
	pg_read_barrier();

	/* Now search for a matching entry. */
	for (i = 0; i < nentry; ++i)
	{
		if (toc->toc_entry[i].key == key)
			return ((char *) toc) + toc->toc_entry[i].offset;
	}

	/* No matching entry was found. */
	if (!noError)
		elog(ERROR, "could not find key " UINT64_FORMAT " in shm TOC at %p",
			 key, toc);
	return NULL;
}
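
/*
 * Continuing the hypothetical example, a worker that has attached to the
 * segment recovers its pointers like this:
 *
 *		SharedState *state = shm_toc_lookup(toc, MY_KEY_STATE, false);
 *		char	   *buf = shm_toc_lookup(toc, MY_KEY_BUFFER, false);
 *
 * Passing noError = false turns a missing key into a hard error, which is
 * appropriate for keys the worker cannot operate without.
 */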

/*
 * Estimate how much shared memory will be required to store a TOC and its
 * dependent data structures.
 */
Size
shm_toc_estimate(shm_toc_estimator *e)
{
	Size		sz;

	sz = offsetof(shm_toc, toc_entry);
	sz = add_size(sz, mul_size(e->number_of_keys, sizeof(shm_toc_entry)));
	sz = add_size(sz, e->space_for_chunks);

	return BUFFERALIGN(sz);
}
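
/*
 * The intended call sequence, as a sketch (hypothetical sizes; the
 * estimator macros are defined in shm_toc.h):
 *
 *		shm_toc_estimator e;
 *		Size		segsize;
 *
 *		shm_toc_initialize_estimator(&e);
 *		shm_toc_estimate_chunk(&e, sizeof(SharedState));
 *		shm_toc_estimate_chunk(&e, buflen);
 *		shm_toc_estimate_keys(&e, 2);
 *		segsize = shm_toc_estimate(&e);
 *
 * The result is suitable to pass to dsm_create(), after which
 * shm_toc_create() lays out the TOC within the new segment.
 */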