1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2021, Oracle and/or its affiliates.
4 
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8 
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation.  The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15 
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 GNU General Public License, version 2.0, for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /**************************************************//**
28 @file fil/fil0fil.cc
29 The tablespace memory cache
30 
31 Created 10/25/1995 Heikki Tuuri
32 *******************************************************/
33 
34 #include "ha_prototypes.h"
35 
36 #ifndef UNIV_HOTBACKUP
37 #include "btr0btr.h"
38 #include "buf0buf.h"
39 #include "dict0boot.h"
40 #include "dict0dict.h"
41 #include "fsp0file.h"
42 #include "fsp0fsp.h"
43 #include "fsp0space.h"
44 #include "fsp0sysspace.h"
45 #include "hash0hash.h"
46 #include "log0recv.h"
47 #include "mach0data.h"
48 #include "mem0mem.h"
49 #include "mtr0log.h"
50 #include "os0file.h"
51 #include "page0zip.h"
52 #include "row0mysql.h"
53 #include "row0trunc.h"
54 # include "buf0lru.h"
55 # include "ibuf0ibuf.h"
56 # include "os0event.h"
57 # include "sync0sync.h"
58 #endif /* !UNIV_HOTBACKUP */
59 #include "buf0flu.h"
60 #include "srv0start.h"
61 #include "trx0purge.h"
62 #include "ut0new.h"
63 #include "btr0sea.h"
64 #include "log0log.h"
65 
/** Attempts to close one file that is in the LRU list.
The caller must hold the fil_sys mutex.
@param[in]	print_info	if true, prints information why it
				cannot close a file
@return true if success, false if the caller should retry later.
A retry has a good chance of finding a suitable node in the LRU list,
because i/o's generally complete in < 100 ms, and InnoDB writes at most
128 pages from the buffer pool in a batch and then immediately flushes
the files. */
static bool fil_try_to_close_file_in_LRU(bool print_info);
78 
79 /*
80 		IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE
81 		=============================================
82 
83 The tablespace cache is responsible for providing fast read/write access to
84 tablespaces and logs of the database. File creation and deletion is done
85 in other modules which know more of the logic of the operation, however.
86 
87 A tablespace consists of a chain of files. The size of the files does not
88 have to be divisible by the database block size, because we may just leave
89 the last incomplete block unused. When a new file is appended to the
90 tablespace, the maximum size of the file is also specified. At the moment,
91 we think that it is best to extend the file to its maximum size already at
92 the creation of the file, because then we can avoid dynamically extending
93 the file when more space is needed for the tablespace.
94 
95 A block's position in the tablespace is specified with a 32-bit unsigned
96 integer. The files in the chain are thought to be catenated, and the block
97 corresponding to an address n is the nth block in the catenated file (where
98 the first block is named the 0th block, and the incomplete block fragments
99 at the end of files are not taken into account). A tablespace can be extended
100 by appending a new file at the end of the chain.
101 
102 Our tablespace concept is similar to the one of Oracle.
103 
104 To acquire more speed in disk transfers, a technique called disk striping is
105 sometimes used. This means that logical block addresses are divided in a
106 round-robin fashion across several disks. Windows NT supports disk striping,
107 so there we do not need to support it in the database. Disk striping is
108 implemented in hardware in RAID disks. We conclude that it is not necessary
109 to implement it in the database. Oracle 7 does not support disk striping,
110 either.
111 
112 Another trick used at some database sites is replacing tablespace files by
113 raw disks, that is, the whole physical disk drive, or a partition of it, is
114 opened as a single file, and it is accessed through byte offsets calculated
115 from the start of the disk or the partition. This is recommended in some
116 books on database tuning to achieve more speed in i/o. Using raw disk
117 certainly prevents the OS from fragmenting disk space, but it is not clear
118 if it really adds speed. We measured on the Pentium 100 MHz + NT + NTFS file
119 system + EIDE Conner disk only a negligible difference in speed when reading
120 from a file, versus reading from a raw disk.
121 
To have fast access to a tablespace or a log file, we put the data structures
in a hash table. Each tablespace and log file is given a unique 32-bit
identifier.
125 
126 Some operating systems do not support many open files at the same time,
127 though NT seems to tolerate at least 900 open files. Therefore, we put the
128 open files in an LRU-list. If we need to open another file, we may close the
129 file at the end of the LRU-list. When an i/o-operation is pending on a file,
130 the file cannot be closed. We take the file nodes with pending i/o-operations
131 out of the LRU-list and keep a count of pending operations. When an operation
132 completes, we decrement the count and return the file node to the LRU-list if
133 the count drops to zero. */
134 
135 /** This tablespace name is used internally during recovery to open a
136 general tablespace before the data dictionary are recovered and available. */
137 const char general_space_name[] = "innodb_general";
138 
139 /** Reference to the server data directory. Usually it is the
140 current working directory ".", but in the MySQL Embedded Server Library
141 it is an absolute path. */
142 const char*	fil_path_to_mysql_datadir;
143 Folder		folder_mysql_datadir;
144 
145 /** Common InnoDB file extentions */
146 const char* dot_ext[] = { "", ".ibd", ".isl", ".cfg", ".cfp" };
147 
148 /** The number of fsyncs done to the log */
149 ulint	fil_n_log_flushes			= 0;
150 
151 /** Number of pending redo log flushes */
152 ulint	fil_n_pending_log_flushes		= 0;
153 /** Number of pending tablespace flushes */
154 ulint	fil_n_pending_tablespace_flushes	= 0;
155 
156 /** Number of files currently open */
157 ulint	fil_n_file_opened			= 0;
158 
159 /** The null file address */
160 fil_addr_t	fil_addr_null = {FIL_NULL, 0};
161 
162 /** The tablespace memory cache; also the totality of logs (the log
163 data space) is stored here; below we talk about tablespaces, but also
164 the ib_logfiles form a 'space' and it is handled here */
165 struct fil_system_t {
166 #ifndef UNIV_HOTBACKUP
167 	ib_mutex_t	mutex;		/*!< The mutex protecting the cache */
168 #endif /* !UNIV_HOTBACKUP */
169 	hash_table_t*	spaces;		/*!< The hash table of spaces in the
170 					system; they are hashed on the space
171 					id */
172 	hash_table_t*	name_hash;	/*!< hash table based on the space
173 					name */
174 	UT_LIST_BASE_NODE_T(fil_node_t) LRU;
175 					/*!< base node for the LRU list of the
176 					most recently used open files with no
177 					pending i/o's; if we start an i/o on
178 					the file, we first remove it from this
179 					list, and return it to the start of
180 					the list when the i/o ends;
181 					log files and the system tablespace are
182 					not put to this list: they are opened
183 					after the startup, and kept open until
184 					shutdown */
185 	UT_LIST_BASE_NODE_T(fil_space_t) unflushed_spaces;
186 					/*!< base node for the list of those
187 					tablespaces whose files contain
188 					unflushed writes; those spaces have
189 					at least one file node where
190 					modification_counter > flush_counter */
191 	ulint		n_open;		/*!< number of files currently open */
192 	ulint		max_n_open;	/*!< n_open is not allowed to exceed
193 					this */
194 	int64_t		modification_counter;/*!< when we write to a file we
195 					increment this by one */
196 	ulint		max_assigned_id;/*!< maximum space id in the existing
197 					tables, or assigned during the time
198 					mysqld has been up; at an InnoDB
199 					startup we scan the data dictionary
200 					and set here the maximum of the
201 					space id's of the tables there */
202 	UT_LIST_BASE_NODE_T(fil_space_t) space_list;
203 					/*!< list of all file spaces */
204 	UT_LIST_BASE_NODE_T(fil_space_t) named_spaces;
205 					/*!< list of all file spaces
206 					for which a MLOG_FILE_NAME
207 					record has been written since
208 					the latest redo log checkpoint.
209 					Protected only by log_sys->mutex. */
210 	bool		space_id_reuse_warned;
211 					/* !< true if fil_space_create()
212 					has issued a warning about
213 					potential space_id reuse */
214 };
215 
216 /** The tablespace memory cache. This variable is NULL before the module is
217 initialized. */
218 static fil_system_t*	fil_system	= NULL;
219 
220 #ifdef UNIV_HOTBACKUP
221 static ulint	srv_data_read;
222 static ulint	srv_data_written;
223 #endif /* UNIV_HOTBACKUP */
224 
/** Determine if user has explicitly disabled fsync().
On Windows this is always 0. Elsewhere it holds for a space whose purpose
is FIL_TYPE_TABLESPACE when srv_unix_file_flush_method is
SRV_UNIX_O_DIRECT_NO_FSYNC. */
#ifdef _WIN32
# define fil_buffering_disabled(s)	(0)
#else /* _WIN32 */
# define fil_buffering_disabled(s)					\
	((s)->purpose == FIL_TYPE_TABLESPACE				\
	 && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)
#endif /* _WIN32 */
234 
235 /** Determine if the space id is a user tablespace id or not.
236 @param[in]	space_id	Space ID to check
237 @return true if it is a user tablespace ID */
238 UNIV_INLINE
239 bool
fil_is_user_tablespace_id(ulint space_id)240 fil_is_user_tablespace_id(
241 	ulint	space_id)
242 {
243 	return(!srv_is_undo_tablespace(space_id)
244 	       && space_id != srv_tmp_space.space_id());
245 }
246 
#ifdef UNIV_DEBUG
/** fil_validate_skip() runs fil_validate() once per this many calls */
# define FIL_VALIDATE_SKIP	17

/** Checks the consistency of the tablespace cache some of the time.
@return true if ok or the check was skipped */
static
bool
fil_validate_skip(void)
{
	/** Calls left before fil_validate() is invoked again. Use a
	signed type because of the race condition below. */
	static int	calls_left = FIL_VALIDATE_SKIP;

	/* There is a race condition on the counter, but it does not
	matter, because this call is only for heuristic purposes: we
	want to reduce the call frequency of the costly fil_validate()
	check in debug builds. */
	if (--calls_left <= 0) {
		calls_left = FIL_VALIDATE_SKIP;

		return(fil_validate());
	}

	return(true);
}
#endif /* UNIV_DEBUG */
275 
276 /********************************************************************//**
277 Determines if a file node belongs to the least-recently-used list.
278 @return true if the file belongs to fil_system->LRU mutex. */
279 UNIV_INLINE
280 bool
fil_space_belongs_in_lru(const fil_space_t * space)281 fil_space_belongs_in_lru(
282 /*=====================*/
283 	const fil_space_t*	space)	/*!< in: file space */
284 {
285 	switch (space->purpose) {
286 	case FIL_TYPE_LOG:
287 		return(false);
288 	case FIL_TYPE_TABLESPACE:
289 	case FIL_TYPE_TEMPORARY:
290 	case FIL_TYPE_IMPORT:
291 		return(fil_is_user_tablespace_id(space->id));
292 	}
293 
294 	ut_ad(0);
295 	return(false);
296 }
297 
298 /********************************************************************//**
299 NOTE: you must call fil_mutex_enter_and_prepare_for_io() first!
300 
301 Prepares a file node for i/o. Opens the file if it is closed. Updates the
302 pending i/o's field in the node and the system appropriately. Takes the node
303 off the LRU list if it is in the LRU list. The caller must hold the fil_sys
304 mutex.
305 @return false if the file can't be opened, otherwise true */
306 static
307 bool
308 fil_node_prepare_for_io(
309 /*====================*/
310 	fil_node_t*	node,	/*!< in: file node */
311 	fil_system_t*	system,	/*!< in: tablespace memory cache */
312 	fil_space_t*	space);	/*!< in: space */
313 
314 /**
315 Updates the data structures when an i/o operation finishes. Updates the
316 pending i/o's field in the node appropriately.
317 @param[in,out] node		file node
318 @param[in,out] system		tablespace instance
319 @param[in] type			IO context */
320 static
321 void
322 fil_node_complete_io(
323 	fil_node_t*		node,
324 	fil_system_t*		system,
325 	const IORequest&	type);
326 
327 /** Reads data from a space to a buffer. Remember that the possible incomplete
328 blocks at the end of file are ignored: they are not taken into account when
329 calculating the byte offset within a space.
330 @param[in]	page_id		page id
331 @param[in]	page_size	page size
332 @param[in]	byte_offset	remainder of offset in bytes; in aio this
333 must be divisible by the OS block size
334 @param[in]	len		how many bytes to read; this must not cross a
335 file boundary; in aio this must be a block size multiple
336 @param[in,out]	buf		buffer where to store data read; in aio this
337 must be appropriately aligned
338 @return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
339 i/o on a tablespace which does not exist */
340 UNIV_INLINE
341 dberr_t
fil_read(const page_id_t & page_id,const page_size_t & page_size,ulint byte_offset,ulint len,void * buf)342 fil_read(
343 	const page_id_t&	page_id,
344 	const page_size_t&	page_size,
345 	ulint			byte_offset,
346 	ulint			len,
347 	void*			buf)
348 {
349 	return(fil_io(IORequestRead, true, page_id, page_size,
350 		      byte_offset, len, buf, NULL));
351 }
352 
353 /** Writes data to a space from a buffer. Remember that the possible incomplete
354 blocks at the end of file are ignored: they are not taken into account when
355 calculating the byte offset within a space.
356 @param[in]	page_id		page id
357 @param[in]	page_size	page size
358 @param[in]	byte_offset	remainder of offset in bytes; in aio this
359 must be divisible by the OS block size
360 @param[in]	len		how many bytes to write; this must not cross
361 a file boundary; in aio this must be a block size multiple
362 @param[in]	buf		buffer from which to write; in aio this must
363 be appropriately aligned
364 @return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
365 i/o on a tablespace which does not exist */
366 UNIV_INLINE
367 dberr_t
fil_write(const page_id_t & page_id,const page_size_t & page_size,ulint byte_offset,ulint len,void * buf)368 fil_write(
369 	const page_id_t&	page_id,
370 	const page_size_t&	page_size,
371 	ulint			byte_offset,
372 	ulint			len,
373 	void*			buf)
374 {
375 	ut_ad(!srv_read_only_mode);
376 
377 	return(fil_io(IORequestWrite, true, page_id, page_size,
378 		      byte_offset, len, buf, NULL));
379 }
380 
381 /*******************************************************************//**
382 Returns the table space by a given id, NULL if not found. */
383 UNIV_INLINE
384 fil_space_t*
fil_space_get_by_id(ulint id)385 fil_space_get_by_id(
386 /*================*/
387 	ulint	id)	/*!< in: space id */
388 {
389 	fil_space_t*	space;
390 
391 	ut_ad(mutex_own(&fil_system->mutex));
392 
393 	HASH_SEARCH(hash, fil_system->spaces, id,
394 		    fil_space_t*, space,
395 		    ut_ad(space->magic_n == FIL_SPACE_MAGIC_N),
396 		    space->id == id);
397 
398 	return(space);
399 }
400 
401 /*******************************************************************//**
402 Returns the table space by a given name, NULL if not found. */
403 UNIV_INLINE
404 fil_space_t*
fil_space_get_by_name(const char * name)405 fil_space_get_by_name(
406 /*==================*/
407 	const char*	name)	/*!< in: space name */
408 {
409 	fil_space_t*	space;
410 	ulint		fold;
411 
412 	ut_ad(mutex_own(&fil_system->mutex));
413 
414 	fold = ut_fold_string(name);
415 
416 	HASH_SEARCH(name_hash, fil_system->name_hash, fold,
417 		    fil_space_t*, space,
418 		    ut_ad(space->magic_n == FIL_SPACE_MAGIC_N),
419 		    !strcmp(name, space->name));
420 
421 	return(space);
422 }
423 
424 /** Look up a tablespace.
425 The caller should hold an InnoDB table lock or a MDL that prevents
426 the tablespace from being dropped during the operation,
427 or the caller should be in single-threaded crash recovery mode
428 (no user connections that could drop tablespaces).
429 If this is not the case, fil_space_acquire() and fil_space_release()
430 should be used instead.
431 @param[in]	id	tablespace ID
432 @return tablespace, or NULL if not found */
433 fil_space_t*
fil_space_get(ulint id)434 fil_space_get(
435 	ulint	id)
436 {
437 	mutex_enter(&fil_system->mutex);
438 	fil_space_t*	space = fil_space_get_by_id(id);
439 	mutex_exit(&fil_system->mutex);
440 	ut_ad(space == NULL || space->purpose != FIL_TYPE_LOG);
441 	return(space);
442 }
443 #ifndef UNIV_HOTBACKUP
444 /** Returns the latch of a file space.
445 @param[in]	id	space id
446 @param[out]	flags	tablespace flags
447 @return latch protecting storage allocation */
448 rw_lock_t*
fil_space_get_latch(ulint id,ulint * flags)449 fil_space_get_latch(
450 	ulint	id,
451 	ulint*	flags)
452 {
453 	fil_space_t*	space;
454 
455 	ut_ad(fil_system);
456 
457 	mutex_enter(&fil_system->mutex);
458 
459 	space = fil_space_get_by_id(id);
460 
461 	ut_a(space);
462 
463 	if (flags) {
464 		*flags = space->flags;
465 	}
466 
467 	mutex_exit(&fil_system->mutex);
468 
469 	return(&(space->latch));
470 }
471 
#ifdef UNIV_DEBUG
/** Gets the type of a file space.
The space must exist; this is enforced with ut_a().
@param[in]	id	tablespace identifier
@return file type */
fil_type_t
fil_space_get_type(ulint id)
{
	ut_ad(fil_system);

	mutex_enter(&fil_system->mutex);

	fil_space_t*	space = fil_space_get_by_id(id);

	ut_a(space);

	/* Read the purpose while fil_system->mutex is still held:
	fil_space_set_imported() modifies this field under the same
	mutex, so reading it after mutex_exit() would be unsynchronized. */
	const fil_type_t	purpose = space->purpose;

	mutex_exit(&fil_system->mutex);

	return(purpose);
}
#endif /* UNIV_DEBUG */
495 
496 /** Note that a tablespace has been imported.
497 It is initially marked as FIL_TYPE_IMPORT so that no logging is
498 done during the import process when the space ID is stamped to each page.
499 Now we change it to FIL_SPACE_TABLESPACE to start redo and undo logging.
500 NOTE: temporary tablespaces are never imported.
501 @param[in]	id	tablespace identifier */
502 void
fil_space_set_imported(ulint id)503 fil_space_set_imported(
504 	ulint	id)
505 {
506 	ut_ad(fil_system != NULL);
507 
508 	mutex_enter(&fil_system->mutex);
509 
510 	fil_space_t*	space = fil_space_get_by_id(id);
511 
512 	ut_ad(space->purpose == FIL_TYPE_IMPORT);
513 	space->purpose = FIL_TYPE_TABLESPACE;
514 
515 	mutex_exit(&fil_system->mutex);
516 }
517 #endif /* !UNIV_HOTBACKUP */
518 
519 /**********************************************************************//**
520 Checks if all the file nodes in a space are flushed. The caller must hold
521 the fil_system mutex.
522 @return true if all are flushed */
523 static
524 bool
fil_space_is_flushed(fil_space_t * space)525 fil_space_is_flushed(
526 /*=================*/
527 	fil_space_t*	space)	/*!< in: space */
528 {
529 	ut_ad(mutex_own(&fil_system->mutex));
530 
531 	for (const fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
532 	     node != NULL;
533 	     node = UT_LIST_GET_NEXT(chain, node)) {
534 
535 		if (node->modification_counter > node->flush_counter) {
536 
537 			ut_ad(!fil_buffering_disabled(space));
538 			return(false);
539 		}
540 	}
541 
542 	return(true);
543 }
544 
#if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)

#include <sys/ioctl.h>
/** FusionIO atomic write control info */
#define DFS_IOCTL_ATOMIC_WRITE_SET	_IOW(0x95, 2, uint)

/** Try and enable FusionIO atomic writes.
Nothing is attempted unless srv_unix_file_flush_method is
SRV_UNIX_O_DIRECT.
@param[in]	file	OS file handle
@return true if successful, i.e. the ioctl() did not return -1 */
bool
fil_fusionio_enable_atomic_write(pfs_os_file_t file)
{
	if (srv_unix_file_flush_method != SRV_UNIX_O_DIRECT) {

		return(false);
	}

	uint	atomic = 1;

	ut_a(file.m_file != -1);

	const int	rc = ioctl(
		file.m_file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic);

	return(rc != -1);
}
#endif /* !NO_FALLOCATE && UNIV_LINUX */
571 
572 /** Append a file to the chain of files of a space.
573 @param[in]	name		file name of a file that is not open
574 @param[in]	size		file size in entire database blocks
575 @param[in,out]	space		tablespace from fil_space_create()
576 @param[in]	is_raw		whether this is a raw device or partition
577 @param[in]	punch_hole	true if supported for this node
578 @param[in]	atomic_write	true if the file has atomic write enabled
579 @param[in]	max_pages	maximum number of pages in file,
580 ULINT_MAX means the file size is unlimited.
581 @return pointer to the file name
582 @retval NULL if error */
583 static
584 fil_node_t*
fil_node_create_low(const char * name,ulint size,fil_space_t * space,bool is_raw,bool punch_hole,bool atomic_write,ulint max_pages=ULINT_MAX)585 fil_node_create_low(
586 	const char*	name,
587 	ulint		size,
588 	fil_space_t*	space,
589 	bool		is_raw,
590 	bool		punch_hole,
591 	bool		atomic_write,
592 	ulint		max_pages = ULINT_MAX)
593 {
594 	fil_node_t*	node;
595 
596 	ut_ad(name != NULL);
597 	ut_ad(fil_system != NULL);
598 
599 	if (space == NULL) {
600 		return(NULL);
601 	}
602 
603 	node = reinterpret_cast<fil_node_t*>(ut_zalloc_nokey(sizeof(*node)));
604 
605 	node->name = mem_strdup(name);
606 
607 	ut_a(!is_raw || srv_start_raw_disk_in_use);
608 
609 	node->sync_event = os_event_create("fsync_event");
610 
611 	node->is_raw_disk = is_raw;
612 
613 	node->size = size;
614 
615 	node->flush_size = size;
616 
617 	node->magic_n = FIL_NODE_MAGIC_N;
618 
619 	node->init_size = size;
620 	node->max_size = max_pages;
621 
622 	mutex_enter(&fil_system->mutex);
623 
624 	space->size += size;
625 
626 	node->space = space;
627 
628 	os_file_stat_t	stat_info;
629 
630 #ifdef UNIV_DEBUG
631 	dberr_t err =
632 #endif /* UNIV_DEBUG */
633 
634 	os_file_get_status(
635 		node->name, &stat_info, false,
636 		fsp_is_system_temporary(space->id) ? true : srv_read_only_mode);
637 
638 	ut_ad(err == DB_SUCCESS);
639 
640 	node->block_size = stat_info.block_size;
641 
642 	/* In this debugging mode, we can overcome the limitation of some
643 	OSes like Windows that support Punch Hole but have a hole size
644 	effectively too large.  By setting the block size to be half the
645 	page size, we can bypass one of the checks that would normally
646 	turn Page Compression off.  This execution mode allows compression
647 	to be tested even when full punch hole support is not available. */
648 	DBUG_EXECUTE_IF("ignore_punch_hole",
649 		node->block_size = ut_min(stat_info.block_size,
650 					  static_cast<size_t>(UNIV_PAGE_SIZE / 2));
651 	);
652 
653 	if (!IORequest::is_punch_hole_supported()
654 	    || !punch_hole
655 	    || node->block_size >= srv_page_size) {
656 
657 		fil_no_punch_hole(node);
658 	} else {
659 		node->punch_hole = punch_hole;
660 	}
661 
662 	node->atomic_write = atomic_write;
663 
664 	UT_LIST_ADD_LAST(space->chain, node);
665 	mutex_exit(&fil_system->mutex);
666 
667 	return(node);
668 }
669 
670 /** Appends a new file to the chain of files of a space. File must be closed.
671 @param[in]	name		file name (file must be closed)
672 @param[in]	size		file size in database blocks, rounded downwards to
673 				an integer
674 @param[in,out]	space		space where to append
675 @param[in]	is_raw		true if a raw device or a raw disk partition
676 @param[in]	atomic_write	true if the file has atomic write enabled
677 @param[in]	max_pages	maximum number of pages in file,
678 ULINT_MAX means the file size is unlimited.
679 @return pointer to the file name
680 @retval NULL if error */
681 char*
fil_node_create(const char * name,ulint size,fil_space_t * space,bool is_raw,bool atomic_write,ulint max_pages)682 fil_node_create(
683 	const char*	name,
684 	ulint		size,
685 	fil_space_t*	space,
686 	bool		is_raw,
687 	bool		atomic_write,
688 	ulint		max_pages)
689 {
690 	fil_node_t*	node;
691 
692 	node = fil_node_create_low(
693 		name, size, space, is_raw, IORequest::is_punch_hole_supported(),
694 		atomic_write, max_pages);
695 
696 	return(node == NULL ? NULL : node->name);
697 }
698 
/** Open a file node of a tablespace.
The caller must own the fil_system mutex.

If the size of the node is not yet known (node->size == 0), or the node
is the first file of a FIL_TYPE_TABLESPACE space that is opened while
srv_startup_is_before_trx_rollback_phase holds (and the space was not
truncated by undo truncate), the file is first opened read-only with a
simple open, its first page is read to check the space id, page size and
flags against the in-memory space, the header fields are copied to the
space, and the file is closed again. After that the file is opened for
regular i/o, the open-file counters are incremented and the node is put
to fil_system->LRU if fil_space_belongs_in_lru() says so.
@param[in,out]	node	File node
@return false if the file can't be opened, or if the file is flagged as
encrypted but space->encryption_type is not Encryption::AES (outside
recovery); otherwise true */
static
bool
fil_node_open_file(
	fil_node_t*	node)
{
	os_offset_t	size_bytes;
	bool		success;
	byte*		buf2;
	byte*		page;
	ulint		space_id;
	ulint		flags;
	ulint		min_size;
	bool		read_only_mode;
	fil_space_t*	space = node->space;

	ut_ad(mutex_own(&fil_system->mutex));
	ut_a(node->n_pending == 0);
	ut_a(!node->is_open);

	/* The system temporary tablespace is never opened in read-only
	mode; every other space follows srv_read_only_mode. */
	read_only_mode = !fsp_is_system_temporary(space->id)
		&& srv_read_only_mode;

	if (node->size == 0
	    || (space->purpose == FIL_TYPE_TABLESPACE
		&& node == UT_LIST_GET_FIRST(space->chain)
		&& !undo::Truncate::was_tablespace_truncated(space->id)
		&& srv_startup_is_before_trx_rollback_phase)) {
		/* We do not know the size of the file yet. First we
		open the file in the normal mode, no async I/O here,
		for simplicity. Then do some checks, and close the
		file again.  NOTE that we could not use the simple
		file read function os_file_read() in Windows to read
		from a file opened for async I/O! */

retry:
		node->handle = os_file_create_simple_no_error_handling(
			innodb_data_file_key, node->name, OS_FILE_OPEN,
			OS_FILE_READ_ONLY, read_only_mode, &success);

		if (!success) {
			/* The following call prints an error message */
			ulint err = os_file_get_last_error(true);
			/* NOTE(review): EMFILE + 100 presumably is how
			os_file_get_last_error() reports "too many open
			files" (errno plus a fixed offset) - confirm.
			In that case try to free a file descriptor by
			closing a file from the LRU list and open again. */
			if (err == EMFILE + 100) {
				if (fil_try_to_close_file_in_LRU(true))
					goto retry;
                       }

			ib::warn() << "Cannot open '" << node->name << "'."
				" Have you deleted .ibd files under a"
				" running mysqld server?";

			return(false);
		}

		size_bytes = os_file_get_size(node->handle);
		ut_a(size_bytes != (os_offset_t) -1);

#ifdef UNIV_HOTBACKUP
		/* For space 0 only the size is taken from the file; the
		first page is not read or validated. */
		if (space->id == 0) {
			node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
			os_file_close(node->handle);
			goto add_size;
		}
#endif /* UNIV_HOTBACKUP */
		ut_a(space->purpose != FIL_TYPE_LOG);

		/* Read the first page of the tablespace */

		/* During recovery two pages are read, because the page
		that follows the first one is inspected for compression
		metadata further below. */
		const ulint buf2_size = recv_recovery_is_on()
			? (2 * UNIV_PAGE_SIZE) : UNIV_PAGE_SIZE;
		/* One extra page is allocated so that the read buffer can
		be aligned to UNIV_PAGE_SIZE inside the allocation. */
		buf2 = static_cast<byte*>(
			ut_malloc_nokey(buf2_size + UNIV_PAGE_SIZE));

		/* Align the memory for file i/o if we might have O_DIRECT
		set */
		page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
		ut_ad(page == page_align(page));

		IORequest	request(IORequest::READ);

		/* NOTE(review): the result stored in 'success' is not
		examined before the page contents are parsed below; the
		variable is next assigned by os_file_create(). Confirm
		whether a failed read should be handled here. */
		success = os_file_read(
			request, node->handle, page, 0, buf2_size);

		space_id = fsp_header_get_space_id(page);
		flags = fsp_header_get_flags(page);

		/* Close the file now that we have read the space id from it */

		os_file_close(node->handle);

		const page_size_t	page_size(flags);

		min_size = FIL_IBD_FILE_INITIAL_SIZE * page_size.physical();

		if (size_bytes < min_size) {

			ib::error() << "The size of tablespace file "
				<< node->name << " is only " << size_bytes
				<< ", should be at least " << min_size << "!";

			ut_error;
		}

		/* The values read from the file must agree with what the
		in-memory space object says. */
		if (space_id != space->id) {
			ib::fatal() << "Tablespace id is " << space->id
				<< " in the data dictionary but in file "
				<< node->name << " it is " << space_id << "!";
		}

		const page_size_t	space_page_size(space->flags);

		if (!page_size.equals_to(space_page_size)) {
			ib::fatal() << "Tablespace file " << node->name
				<< " has page size " << page_size
				<< " (flags=" << ib::hex(flags) << ") but the"
				" data dictionary expects page size "
				<< space_page_size << " (flags="
				<< ib::hex(space->flags) << ")!";
		}

		if (space->flags != flags) {

			ib::fatal()
				<< "Table flags are "
				<< ib::hex(space->flags) << " in the data"
				" dictionary but the flags in file "
				<< node->name << " are " << ib::hex(flags)
				<< "!";
		}

		{
			/* Copy the size and free list information from the
			header of the first page to the space. */
			ulint	size		= fsp_header_get_field(
				page, FSP_SIZE);
			ulint	free_limit	= fsp_header_get_field(
				page, FSP_FREE_LIMIT);
			ulint	free_len	= flst_get_len(
				FSP_HEADER_OFFSET + FSP_FREE + page);
			ut_ad(space->free_limit == 0
			      || space->free_limit == free_limit);
			ut_ad(space->free_len == 0
			      || space->free_len == free_len);
			space->size_in_header = size;
			space->free_limit = free_limit;
			space->free_len = free_len;

			/* Set estimated value for space->compression_type
			during recovery process. */
			if (recv_recovery_is_on()
			    && (Compression::is_compressed_page(
					page + page_size.physical())
			        || Compression::is_compressed_encrypted_page(
					page + page_size.physical()))) {
				ut_ad(buf2_size >= (2 * UNIV_PAGE_SIZE));
				Compression::meta_t header;
				Compression::deserialize_header(
					page + page_size.physical(), &header);
				space->compression_type = header.m_algorithm;
			}
		}

		ut_free(buf2);

		/* For an encrypted tablespace, we need to check that the
		encryption key and iv (initial vector) have been read. */
		if (FSP_FLAGS_GET_ENCRYPTION(flags)
		    && !recv_recovery_is_on()) {
			if (space->encryption_type != Encryption::AES) {
				ib::error()
					<< "Can't read encryption"
					<< " key from file "
					<< node->name << "!";
				return(false);
			}
		}

		if (node->size == 0) {
			ulint	extent_size;

			extent_size = page_size.physical() * FSP_EXTENT_SIZE;

			/* After apply-incremental, tablespaces are not extended
			to a whole megabyte. Do not cut off valid data. */
#ifndef UNIV_HOTBACKUP
			/* Truncate the size to a multiple of extent size. */
			if (size_bytes >= extent_size) {
				size_bytes = ut_2pow_round(size_bytes,
							   extent_size);
			}
#endif /* !UNIV_HOTBACKUP */
			node->size = (ulint)
				(size_bytes / page_size.physical());

#ifdef UNIV_HOTBACKUP
add_size:
#endif /* UNIV_HOTBACKUP */
			/* The node size was unknown until now, so it has
			not been accounted for in the space size yet. */
			space->size += node->size;
		}
	}

	/* printf("Opening file %s\n", node->name); */

	/* Open the file for reading and writing, in Windows normally in the
	unbuffered async I/O mode, though global variables may make
	os_file_create() to fall back to the normal file I/O mode. */

	if (space->purpose == FIL_TYPE_LOG) {
		node->handle = os_file_create(
			innodb_log_file_key, node->name, OS_FILE_OPEN,
			OS_FILE_AIO, OS_LOG_FILE, read_only_mode, &success);
	} else if (node->is_raw_disk) {
		node->handle = os_file_create(
			innodb_data_file_key, node->name, OS_FILE_OPEN_RAW,
			OS_FILE_AIO, OS_DATA_FILE, read_only_mode, &success);
	} else {
		node->handle = os_file_create(
			innodb_data_file_key, node->name, OS_FILE_OPEN,
			OS_FILE_AIO, OS_DATA_FILE, read_only_mode, &success);
	}

	/* A failure of the final open is treated as fatal. */
	ut_a(success);

	node->is_open = true;

	fil_system->n_open++;
	fil_n_file_opened++;

	if (fil_space_belongs_in_lru(space)) {

		/* Put the node to the LRU list */
		UT_LIST_ADD_FIRST(fil_system->LRU, node);
	}

	return(true);
}
937 
938 /** Close a file node.
939 @param[in,out]	node	File node */
940 static
941 void
fil_node_close_file(fil_node_t * node)942 fil_node_close_file(
943 	fil_node_t*	node)
944 {
945 	bool	ret;
946 
947 	ut_ad(mutex_own(&(fil_system->mutex)));
948 	ut_a(node->is_open);
949 	ut_a(node->n_pending == 0);
950 	ut_a(node->n_pending_flushes == 0);
951 	ut_a(!node->being_extended);
952 #ifndef UNIV_HOTBACKUP
953 	ut_a(node->modification_counter == node->flush_counter
954 	     || node->space->purpose == FIL_TYPE_TEMPORARY
955 	     || srv_fast_shutdown == 2);
956 #endif /* !UNIV_HOTBACKUP */
957 
958 	ret = os_file_close(node->handle);
959 	ut_a(ret);
960 
961 	/* printf("Closing file %s\n", node->name); */
962 
963 	node->is_open = false;
964 	ut_a(fil_system->n_open > 0);
965 	fil_system->n_open--;
966 	fil_n_file_opened--;
967 
968 	if (fil_space_belongs_in_lru(node->space)) {
969 
970 		ut_a(UT_LIST_GET_LEN(fil_system->LRU) > 0);
971 
972 		/* The node is in the LRU list, remove it */
973 		UT_LIST_REMOVE(fil_system->LRU, node);
974 	}
975 }
976 
977 /** Tries to close a file in the LRU list. The caller must hold the fil_sys
978 mutex.
979 @return true if success, false if should retry later; since i/o's
980 generally complete in < 100 ms, and as InnoDB writes at most 128 pages
981 from the buffer pool in a batch, and then immediately flushes the
982 files, there is a good chance that the next time we find a suitable
983 node from the LRU list.
984 @param[in] print_info	if true, prints information why it
985 			cannot close a file*/
986 static
987 bool
fil_try_to_close_file_in_LRU(bool print_info)988 fil_try_to_close_file_in_LRU(
989 
990 	bool	print_info)
991 {
992 	fil_node_t*	node;
993 
994 	ut_ad(mutex_own(&fil_system->mutex));
995 
996 	if (print_info) {
997 		ib::info() << "fil_sys open file LRU len "
998 			<< UT_LIST_GET_LEN(fil_system->LRU);
999 	}
1000 
1001 	for (node = UT_LIST_GET_LAST(fil_system->LRU);
1002 	     node != NULL;
1003 	     node = UT_LIST_GET_PREV(LRU, node)) {
1004 
1005 		if (node->modification_counter == node->flush_counter
1006 		    && node->n_pending_flushes == 0
1007 		    && !node->being_extended) {
1008 
1009 			fil_node_close_file(node);
1010 
1011 			return(true);
1012 		}
1013 
1014 		if (!print_info) {
1015 			continue;
1016 		}
1017 
1018 		if (node->n_pending_flushes > 0) {
1019 
1020 			ib::info() << "Cannot close file " << node->name
1021 				<< ", because n_pending_flushes "
1022 				<< node->n_pending_flushes;
1023 		}
1024 
1025 		if (node->modification_counter != node->flush_counter) {
1026 			ib::warn() << "Cannot close file " << node->name
1027 				<< ", because modification count "
1028 				<< node->modification_counter <<
1029 				" != flush count " << node->flush_counter;
1030 		}
1031 
1032 		if (node->being_extended) {
1033 			ib::info() << "Cannot close file " << node->name
1034 				<< ", because it is being extended";
1035 		}
1036 	}
1037 
1038 	return(false);
1039 }
1040 
1041 /*******************************************************************//**
1042 Reserves the fil_system mutex and tries to make sure we can open at least one
1043 file while holding it. This should be called before calling
1044 fil_node_prepare_for_io(), because that function may need to open a file. */
1045 static
1046 void
fil_mutex_enter_and_prepare_for_io(ulint space_id)1047 fil_mutex_enter_and_prepare_for_io(
1048 /*===============================*/
1049 	ulint	space_id)	/*!< in: space id */
1050 {
1051 	fil_space_t*	space;
1052 	bool		success;
1053 	bool		print_info	= false;
1054 	ulint		count		= 0;
1055 	ulint		count2		= 0;
1056 
1057 	for (;;) {
1058 		mutex_enter(&fil_system->mutex);
1059 
1060 		if (space_id == 0 || space_id >= SRV_LOG_SPACE_FIRST_ID) {
1061 			/* We keep log files and system tablespace files always
1062 			open; this is important in preventing deadlocks in this
1063 			module, as a page read completion often performs
1064 			another read from the insert buffer. The insert buffer
1065 			is in tablespace 0, and we cannot end up waiting in
1066 			this function. */
1067 			return;
1068 		}
1069 
1070 		space = fil_space_get_by_id(space_id);
1071 
1072 		if (space != NULL && space->stop_ios) {
1073 			/* We are going to do a rename file and want to stop
1074 			new i/o's for a while. */
1075 
1076 			if (count2 > 20000) {
1077 				ib::warn() << "Tablespace " << space->name
1078 					<< " has i/o ops stopped for a long"
1079 					" time " << count2;
1080 			}
1081 
1082 			mutex_exit(&fil_system->mutex);
1083 
1084 #ifndef UNIV_HOTBACKUP
1085 
1086 			/* Wake the i/o-handler threads to make sure pending
1087 			i/o's are performed */
1088 			os_aio_simulated_wake_handler_threads();
1089 
1090 			/* The sleep here is just to give IO helper threads a
1091 			bit of time to do some work. It is not required that
1092 			all IO related to the tablespace being renamed must
1093 			be flushed here as we do fil_flush() in
1094 			fil_rename_tablespace() as well. */
1095 			os_thread_sleep(20000);
1096 
1097 #endif /* UNIV_HOTBACKUP */
1098 
1099 			/* Flush tablespaces so that we can close modified
1100 			files in the LRU list */
1101 			fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
1102 
1103 			os_thread_sleep(20000);
1104 
1105 			count2++;
1106 
1107 			continue;
1108 		}
1109 
1110 		if (fil_system->n_open < fil_system->max_n_open) {
1111 
1112 			return;
1113 		}
1114 
1115 		/* If the file is already open, no need to do anything; if the
1116 		space does not exist, we handle the situation in the function
1117 		which called this function. */
1118 
1119 		if (space == NULL || UT_LIST_GET_FIRST(space->chain)->is_open) {
1120 
1121 			return;
1122 		}
1123 
1124 		if (count > 1) {
1125 			print_info = true;
1126 		}
1127 
1128 		/* Too many files are open, try to close some */
1129 		do {
1130 			success = fil_try_to_close_file_in_LRU(print_info);
1131 
1132 		} while (success
1133 			 && fil_system->n_open >= fil_system->max_n_open);
1134 
1135 		if (fil_system->n_open < fil_system->max_n_open) {
1136 			/* Ok */
1137 			return;
1138 		}
1139 
1140 		if (count >= 2) {
1141 			ib::warn() << "Too many (" << fil_system->n_open
1142 				<< ") files stay open while the maximum"
1143 				" allowed value would be "
1144 				<< fil_system->max_n_open << ". You may need"
1145 				" to raise the value of innodb_open_files in"
1146 				" my.cnf.";
1147 
1148 			return;
1149 		}
1150 
1151 		mutex_exit(&fil_system->mutex);
1152 
1153 #ifndef UNIV_HOTBACKUP
1154 		/* Wake the i/o-handler threads to make sure pending i/o's are
1155 		performed */
1156 		os_aio_simulated_wake_handler_threads();
1157 
1158 		os_thread_sleep(20000);
1159 #endif /* !UNIV_HOTBACKUP */
1160 		/* Flush tablespaces so that we can close modified files in
1161 		the LRU list. */
1162 
1163 		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
1164 
1165 		count++;
1166 	}
1167 }
1168 
1169 /** Prepare to free a file node object from a tablespace memory cache.
1170 @param[in,out]	node	file node
1171 @param[in]	space	tablespace */
1172 static
1173 void
fil_node_close_to_free(fil_node_t * node,fil_space_t * space)1174 fil_node_close_to_free(
1175 	fil_node_t*	node,
1176 	fil_space_t*	space)
1177 {
1178 	ut_ad(mutex_own(&fil_system->mutex));
1179 	ut_a(node->magic_n == FIL_NODE_MAGIC_N);
1180 	ut_a(node->n_pending == 0);
1181 	ut_a(!node->being_extended);
1182 
1183 	if (node->is_open) {
1184 		/* We fool the assertion in fil_node_close_file() to think
1185 		there are no unflushed modifications in the file */
1186 
1187 		node->modification_counter = node->flush_counter;
1188 		os_event_set(node->sync_event);
1189 
1190 		if (fil_buffering_disabled(space)) {
1191 
1192 			ut_ad(!space->is_in_unflushed_spaces);
1193 			ut_ad(fil_space_is_flushed(space));
1194 
1195 		} else if (space->is_in_unflushed_spaces
1196 			   && fil_space_is_flushed(space)) {
1197 
1198 			space->is_in_unflushed_spaces = false;
1199 
1200 			UT_LIST_REMOVE(fil_system->unflushed_spaces, space);
1201 		}
1202 
1203 		fil_node_close_file(node);
1204 	}
1205 }
1206 
1207 /** Detach a space object from the tablespace memory cache.
1208 Closes the files in the chain but does not delete them.
1209 There must not be any pending i/o's or flushes on the files.
1210 @param[in,out]	space		tablespace */
1211 static
1212 void
fil_space_detach(fil_space_t * space)1213 fil_space_detach(
1214 	fil_space_t*	space)
1215 {
1216 	ut_ad(mutex_own(&fil_system->mutex));
1217 
1218 	HASH_DELETE(fil_space_t, hash, fil_system->spaces, space->id, space);
1219 
1220 	fil_space_t*	fnamespace = fil_space_get_by_name(space->name);
1221 
1222 	ut_a(space == fnamespace);
1223 
1224 	HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash,
1225 		    ut_fold_string(space->name), space);
1226 
1227 	if (space->is_in_unflushed_spaces) {
1228 
1229 		ut_ad(!fil_buffering_disabled(space));
1230 		space->is_in_unflushed_spaces = false;
1231 
1232 		UT_LIST_REMOVE(fil_system->unflushed_spaces, space);
1233 	}
1234 
1235 	UT_LIST_REMOVE(fil_system->space_list, space);
1236 
1237 	ut_a(space->magic_n == FIL_SPACE_MAGIC_N);
1238 	ut_a(space->n_pending_flushes == 0);
1239 
1240 	for (fil_node_t* fil_node = UT_LIST_GET_FIRST(space->chain);
1241 	     fil_node != NULL;
1242 	     fil_node = UT_LIST_GET_NEXT(chain, fil_node)) {
1243 
1244 		fil_node_close_to_free(fil_node, space);
1245 	}
1246 }
1247 
1248 /** Free a tablespace object on which fil_space_detach() was invoked.
1249 There must not be any pending i/o's or flushes on the files.
1250 @param[in,out]	space		tablespace */
1251 static
1252 void
fil_space_free_low(fil_space_t * space)1253 fil_space_free_low(
1254 	fil_space_t*	space)
1255 {
1256 	/* The tablespace must not be in fil_system->named_spaces. */
1257 	ut_ad(srv_fast_shutdown == 2 || space->max_lsn == 0);
1258 
1259 	for (fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
1260 	     node != NULL; ) {
1261 		ut_d(space->size -= node->size);
1262 		os_event_destroy(node->sync_event);
1263 		ut_free(node->name);
1264 		fil_node_t* old_node = node;
1265 		node = UT_LIST_GET_NEXT(chain, node);
1266 		ut_free(old_node);
1267 	}
1268 
1269 	ut_ad(space->size == 0);
1270 
1271 	rw_lock_free(&space->latch);
1272 
1273 	ut_free(space->name);
1274 	ut_free(space);
1275 }
1276 
1277 /** Frees a space object from the tablespace memory cache.
1278 Closes the files in the chain but does not delete them.
1279 There must not be any pending i/o's or flushes on the files.
1280 @param[in]	id		tablespace identifier
1281 @param[in]	x_latched	whether the caller holds X-mode space->latch
1282 @return true if success */
1283 bool
fil_space_free(ulint id,bool x_latched)1284 fil_space_free(
1285 	ulint		id,
1286 	bool		x_latched)
1287 {
1288 	ut_ad(id != TRX_SYS_SPACE);
1289 
1290 	mutex_enter(&fil_system->mutex);
1291 	fil_space_t*	space = fil_space_get_by_id(id);
1292 
1293 	if (space != NULL) {
1294 		fil_space_detach(space);
1295 	}
1296 
1297 	mutex_exit(&fil_system->mutex);
1298 
1299 	if (space != NULL) {
1300 		if (x_latched) {
1301 			rw_lock_x_unlock(&space->latch);
1302 		}
1303 
1304 		bool	need_mutex = !recv_recovery_on;
1305 
1306 		if (need_mutex) {
1307 			log_mutex_enter();
1308 		}
1309 
1310 		ut_ad(log_mutex_own());
1311 
1312 		if (space->max_lsn != 0) {
1313 			ut_d(space->max_lsn = 0);
1314 			UT_LIST_REMOVE(fil_system->named_spaces, space);
1315 		}
1316 
1317 		if (need_mutex) {
1318 			log_mutex_exit();
1319 		}
1320 
1321 		fil_space_free_low(space);
1322 	}
1323 
1324 	return(space != NULL);
1325 }
1326 
1327 /** Create a space memory object and put it to the fil_system hash table.
1328 The tablespace name is independent from the tablespace file-name.
1329 Error messages are issued to the server log.
1330 @param[in]	name	Tablespace name
1331 @param[in]	id	Tablespace identifier
1332 @param[in]	flags	Tablespace flags
1333 @param[in]	purpose	Tablespace purpose
1334 @return pointer to created tablespace, to be filled in with fil_node_create()
1335 @retval NULL on failure (such as when the same tablespace exists) */
1336 fil_space_t*
fil_space_create(const char * name,ulint id,ulint flags,fil_type_t purpose)1337 fil_space_create(
1338 	const char*	name,
1339 	ulint		id,
1340 	ulint		flags,
1341 	fil_type_t	purpose)
1342 {
1343 	fil_space_t*	space;
1344 
1345 	ut_ad(fil_system);
1346 	ut_ad(fsp_flags_is_valid(flags));
1347 	ut_ad(srv_page_size == UNIV_PAGE_SIZE_ORIG || flags != 0);
1348 
1349 	DBUG_EXECUTE_IF("fil_space_create_failure", return(NULL););
1350 
1351 	mutex_enter(&fil_system->mutex);
1352 
1353 	/* Look for a matching tablespace. */
1354 	space = fil_space_get_by_name(name);
1355 
1356 	if (space != NULL) {
1357 		mutex_exit(&fil_system->mutex);
1358 
1359 		ib::warn() << "Tablespace '" << name << "' exists in the"
1360 			" cache with id " << space->id << " != " << id;
1361 
1362 		return(NULL);
1363 	}
1364 
1365 	space = fil_space_get_by_id(id);
1366 
1367 	if (space != NULL) {
1368 		ib::error() << "Trying to add tablespace '" << name
1369 			<< "' with id " << id
1370 			<< " to the tablespace memory cache, but tablespace '"
1371 			<< space->name << "' already exists in the cache!";
1372 		mutex_exit(&fil_system->mutex);
1373 		return(NULL);
1374 	}
1375 
1376 	space = static_cast<fil_space_t*>(ut_zalloc_nokey(sizeof(*space)));
1377 
1378 	space->id = id;
1379 	space->name = mem_strdup(name);
1380 
1381 	UT_LIST_INIT(space->chain, &fil_node_t::chain);
1382 
1383 	/* This warning is not applicable while MEB scanning the redo logs */
1384 #ifndef UNIV_HOTBACKUP
1385 	if (fil_type_is_data(purpose)
1386 	    && !recv_recovery_on
1387 	    && id > fil_system->max_assigned_id) {
1388 
1389 		if (!fil_system->space_id_reuse_warned) {
1390 			fil_system->space_id_reuse_warned = true;
1391 
1392 			ib::warn() << "Allocated tablespace ID " << id
1393 				<< " for " << name << ", old maximum was "
1394 				<< fil_system->max_assigned_id;
1395 		}
1396 
1397 		fil_system->max_assigned_id = id;
1398 	}
1399 #endif /* !UNIV_HOTBACKUP */
1400 	space->purpose = purpose;
1401 	space->flags = flags;
1402 
1403 	space->magic_n = FIL_SPACE_MAGIC_N;
1404 
1405 	space->encryption_type = Encryption::NONE;
1406 
1407 	rw_lock_create(fil_space_latch_key, &space->latch, SYNC_FSP);
1408 
1409 	if (space->purpose == FIL_TYPE_TEMPORARY) {
1410 #ifndef UNIV_HOTBACKUP
1411 		ut_d(space->latch.set_temp_fsp());
1412 #endif /* !UNIV_HOTBACKUP */
1413 	}
1414 
1415 	HASH_INSERT(fil_space_t, hash, fil_system->spaces, id, space);
1416 
1417 	HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash,
1418 		    ut_fold_string(name), space);
1419 
1420 	UT_LIST_ADD_LAST(fil_system->space_list, space);
1421 
1422 	if (id < SRV_LOG_SPACE_FIRST_ID && id > fil_system->max_assigned_id) {
1423 
1424 		fil_system->max_assigned_id = id;
1425 	}
1426 
1427 	mutex_exit(&fil_system->mutex);
1428 
1429 	return(space);
1430 }
1431 
1432 /*******************************************************************//**
1433 Assigns a new space id for a new single-table tablespace. This works simply by
1434 incrementing the global counter. If 4 billion id's is not enough, we may need
1435 to recycle id's.
1436 @return true if assigned, false if not */
1437 bool
fil_assign_new_space_id(ulint * space_id)1438 fil_assign_new_space_id(
1439 /*====================*/
1440 	ulint*	space_id)	/*!< in/out: space id */
1441 {
1442 	ulint	id;
1443 	bool	success;
1444 
1445 	mutex_enter(&fil_system->mutex);
1446 
1447 	id = *space_id;
1448 
1449 	if (id < fil_system->max_assigned_id) {
1450 		id = fil_system->max_assigned_id;
1451 	}
1452 
1453 	id++;
1454 
1455 	if (id > (SRV_LOG_SPACE_FIRST_ID / 2) && (id % 1000000UL == 0)) {
1456 		ib::warn() << "You are running out of new single-table"
1457 			" tablespace id's. Current counter is " << id
1458 			<< " and it must not exceed" << SRV_LOG_SPACE_FIRST_ID
1459 			<< "! To reset the counter to zero you have to dump"
1460 			" all your tables and recreate the whole InnoDB"
1461 			" installation.";
1462 	}
1463 
1464 	success = (id < SRV_LOG_SPACE_FIRST_ID);
1465 
1466 	if (success) {
1467 		*space_id = fil_system->max_assigned_id = id;
1468 	} else {
1469 		ib::warn() << "You have run out of single-table tablespace"
1470 			" id's! Current counter is " << id
1471 			<< ". To reset the counter to zero"
1472 			" you have to dump all your tables and"
1473 			" recreate the whole InnoDB installation.";
1474 		*space_id = ULINT_UNDEFINED;
1475 	}
1476 
1477 	mutex_exit(&fil_system->mutex);
1478 
1479 	return(success);
1480 }
1481 
1482 /*******************************************************************//**
1483 Returns a pointer to the fil_space_t that is in the memory cache
1484 associated with a space id. The caller must lock fil_system->mutex.
1485 @return file_space_t pointer, NULL if space not found */
1486 UNIV_INLINE
1487 fil_space_t*
fil_space_get_space(ulint id)1488 fil_space_get_space(
1489 /*================*/
1490 	ulint	id)	/*!< in: space id */
1491 {
1492 	fil_space_t*	space;
1493 	fil_node_t*	node;
1494 
1495 	ut_ad(fil_system);
1496 
1497 	space = fil_space_get_by_id(id);
1498 	if (space == NULL || space->size != 0) {
1499 		return(space);
1500 	}
1501 
1502 	switch (space->purpose) {
1503 	case FIL_TYPE_LOG:
1504 		break;
1505 	case FIL_TYPE_TEMPORARY:
1506 	case FIL_TYPE_TABLESPACE:
1507 	case FIL_TYPE_IMPORT:
1508 		ut_a(id != 0);
1509 
1510 		mutex_exit(&fil_system->mutex);
1511 
1512 		/* It is possible that the space gets evicted at this point
1513 		before the fil_mutex_enter_and_prepare_for_io() acquires
1514 		the fil_system->mutex. Check for this after completing the
1515 		call to fil_mutex_enter_and_prepare_for_io(). */
1516 		fil_mutex_enter_and_prepare_for_io(id);
1517 
1518 		/* We are still holding the fil_system->mutex. Check if
1519 		the space is still in memory cache. */
1520 		space = fil_space_get_by_id(id);
1521 		if (space == NULL) {
1522 			return(NULL);
1523 		}
1524 
1525 		/* The following code must change when InnoDB supports
1526 		multiple datafiles per tablespace. */
1527 		ut_a(1 == UT_LIST_GET_LEN(space->chain));
1528 
1529 		node = UT_LIST_GET_FIRST(space->chain);
1530 
1531 		/* It must be a single-table tablespace and we have not opened
1532 		the file yet; the following calls will open it and update the
1533 		size fields */
1534 
1535 		if (!fil_node_prepare_for_io(node, fil_system, space)) {
1536 			/* The single-table tablespace can't be opened,
1537 			because the ibd file is missing. */
1538 			return(NULL);
1539 		}
1540 
1541 		fil_node_complete_io(node, fil_system, IORequestRead);
1542 	}
1543 
1544 	return(space);
1545 }
1546 
1547 /** Returns the path from the first fil_node_t found with this space ID.
1548 The caller is responsible for freeing the memory allocated here for the
1549 value returned.
1550 @param[in]	id	Tablespace ID
1551 @return own: A copy of fil_node_t::path, NULL if space ID is zero
1552 or not found. */
1553 char*
fil_space_get_first_path(ulint id)1554 fil_space_get_first_path(
1555 	ulint		id)
1556 {
1557 	fil_space_t*	space;
1558 	fil_node_t*	node;
1559 	char*		path;
1560 
1561 	ut_ad(fil_system);
1562 	ut_a(id);
1563 
1564 	fil_mutex_enter_and_prepare_for_io(id);
1565 
1566 	space = fil_space_get_space(id);
1567 
1568 	if (space == NULL) {
1569 		mutex_exit(&fil_system->mutex);
1570 
1571 		return(NULL);
1572 	}
1573 
1574 	ut_ad(mutex_own(&fil_system->mutex));
1575 
1576 	node = UT_LIST_GET_FIRST(space->chain);
1577 
1578 	path = mem_strdup(node->name);
1579 
1580 	mutex_exit(&fil_system->mutex);
1581 
1582 	return(path);
1583 }
1584 
1585 /*******************************************************************//**
1586 Returns the size of the space in pages. The tablespace must be cached in the
1587 memory cache.
1588 @return space size, 0 if space not found */
1589 ulint
fil_space_get_size(ulint id)1590 fil_space_get_size(
1591 /*===============*/
1592 	ulint	id)	/*!< in: space id */
1593 {
1594 	fil_space_t*	space;
1595 	ulint		size;
1596 
1597 	ut_ad(fil_system);
1598 	mutex_enter(&fil_system->mutex);
1599 
1600 	space = fil_space_get_space(id);
1601 
1602 	size = space ? space->size : 0;
1603 
1604 	mutex_exit(&fil_system->mutex);
1605 
1606 	return(size);
1607 }
1608 
1609 /*******************************************************************//**
1610 Returns the flags of the space. The tablespace must be cached
1611 in the memory cache.
1612 @return flags, ULINT_UNDEFINED if space not found */
1613 ulint
fil_space_get_flags(ulint id)1614 fil_space_get_flags(
1615 /*================*/
1616 	ulint	id)	/*!< in: space id */
1617 {
1618 	fil_space_t*	space;
1619 	ulint		flags;
1620 
1621 	ut_ad(fil_system);
1622 
1623 	mutex_enter(&fil_system->mutex);
1624 
1625 	space = fil_space_get_space(id);
1626 
1627 	if (space == NULL) {
1628 		mutex_exit(&fil_system->mutex);
1629 
1630 		return(ULINT_UNDEFINED);
1631 	}
1632 
1633 	flags = space->flags;
1634 
1635 	mutex_exit(&fil_system->mutex);
1636 
1637 	return(flags);
1638 }
1639 
1640 /** Check if table is mark for truncate.
1641 @param[in]	id	space id
1642 @return true if tablespace is marked for truncate. */
1643 bool
fil_space_is_being_truncated(ulint id)1644 fil_space_is_being_truncated(
1645 	ulint id)
1646 {
1647 	bool	mark_for_truncate;
1648 	mutex_enter(&fil_system->mutex);
1649 	mark_for_truncate = fil_space_get_by_id(id)->is_being_truncated;
1650 	mutex_exit(&fil_system->mutex);
1651 	return(mark_for_truncate);
1652 }
1653 
1654 /** Open each fil_node_t of a named fil_space_t if not already open.
1655 @param[in]	name	Tablespace name
1656 @return true if all nodes are open  */
1657 bool
fil_space_open(const char * name)1658 fil_space_open(
1659 	const char*	name)
1660 {
1661 	ut_ad(fil_system != NULL);
1662 
1663 	mutex_enter(&fil_system->mutex);
1664 
1665 	fil_space_t*	space = fil_space_get_by_name(name);
1666 	fil_node_t*	node;
1667 
1668 	for (node = UT_LIST_GET_FIRST(space->chain);
1669 	     node != NULL;
1670 	     node = UT_LIST_GET_NEXT(chain, node)) {
1671 
1672 		if (!node->is_open
1673 		    && !fil_node_open_file(node)) {
1674 			mutex_exit(&fil_system->mutex);
1675 			return(false);
1676 		}
1677 	}
1678 
1679 	mutex_exit(&fil_system->mutex);
1680 
1681 	return(true);
1682 }
1683 
1684 /** Close each fil_node_t of a named fil_space_t if open.
1685 @param[in]	name	Tablespace name */
1686 void
fil_space_close(const char * name)1687 fil_space_close(
1688 	const char*	name)
1689 {
1690 	if (fil_system == NULL) {
1691 		return;
1692 	}
1693 
1694 	mutex_enter(&fil_system->mutex);
1695 
1696 	fil_space_t*	space = fil_space_get_by_name(name);
1697 	if (space == NULL) {
1698 		mutex_exit(&fil_system->mutex);
1699 		return;
1700 	}
1701 
1702 	for (fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
1703 	     node != NULL;
1704 	     node = UT_LIST_GET_NEXT(chain, node)) {
1705 
1706 		if (node->is_open) {
1707 			fil_node_close_file(node);
1708 		}
1709 	}
1710 
1711 	mutex_exit(&fil_system->mutex);
1712 }
1713 
1714 /** Returns the page size of the space and whether it is compressed or not.
1715 The tablespace must be cached in the memory cache.
1716 @param[in]	id	space id
1717 @param[out]	found	true if tablespace was found
1718 @return page size */
1719 const page_size_t
fil_space_get_page_size(ulint id,bool * found)1720 fil_space_get_page_size(
1721 	ulint	id,
1722 	bool*	found)
1723 {
1724 	const ulint	flags = fil_space_get_flags(id);
1725 
1726 	if (flags == ULINT_UNDEFINED) {
1727 		*found = false;
1728 		return(univ_page_size);
1729 	}
1730 
1731 	*found = true;
1732 
1733 	return(page_size_t(flags));
1734 }
1735 
1736 /****************************************************************//**
1737 Initializes the tablespace memory cache. */
1738 void
fil_init(ulint hash_size,ulint max_n_open)1739 fil_init(
1740 /*=====*/
1741 	ulint	hash_size,	/*!< in: hash table size */
1742 	ulint	max_n_open)	/*!< in: max number of open files */
1743 {
1744 	ut_a(fil_system == NULL);
1745 
1746 	ut_a(hash_size > 0);
1747 	ut_a(max_n_open > 0);
1748 
1749 	fil_system = static_cast<fil_system_t*>(
1750 		ut_zalloc_nokey(sizeof(*fil_system)));
1751 
1752 	mutex_create(LATCH_ID_FIL_SYSTEM, &fil_system->mutex);
1753 
1754 	fil_system->spaces = hash_create(hash_size);
1755 	fil_system->name_hash = hash_create(hash_size);
1756 
1757 	UT_LIST_INIT(fil_system->LRU, &fil_node_t::LRU);
1758 	UT_LIST_INIT(fil_system->space_list, &fil_space_t::space_list);
1759 	UT_LIST_INIT(fil_system->unflushed_spaces,
1760 		     &fil_space_t::unflushed_spaces);
1761 	UT_LIST_INIT(fil_system->named_spaces, &fil_space_t::named_spaces);
1762 
1763 	fil_system->max_n_open = max_n_open;
1764 }
1765 
1766 /*******************************************************************//**
1767 Opens all log files and system tablespace data files. They stay open until the
1768 database server shutdown. This should be called at a server startup after the
1769 space objects for the log and the system tablespace have been created. The
1770 purpose of this operation is to make sure we never run out of file descriptors
1771 if we need to read from the insert buffer or to write to the log. */
1772 void
fil_open_log_and_system_tablespace_files(void)1773 fil_open_log_and_system_tablespace_files(void)
1774 /*==========================================*/
1775 {
1776 	fil_space_t*	space;
1777 
1778 	mutex_enter(&fil_system->mutex);
1779 
1780 	for (space = UT_LIST_GET_FIRST(fil_system->space_list);
1781 	     space != NULL;
1782 	     space = UT_LIST_GET_NEXT(space_list, space)) {
1783 
1784 		fil_node_t*	node;
1785 
1786 		if (fil_space_belongs_in_lru(space)) {
1787 
1788 			continue;
1789 		}
1790 
1791 		for (node = UT_LIST_GET_FIRST(space->chain);
1792 		     node != NULL;
1793 		     node = UT_LIST_GET_NEXT(chain, node)) {
1794 
1795 			if (!node->is_open) {
1796 				if (!fil_node_open_file(node)) {
1797 					/* This func is called during server's
1798 					startup. If some file of log or system
1799 					tablespace is missing, the server
1800 					can't start successfully. So we should
1801 					assert for it. */
1802 					ut_a(0);
1803 				}
1804 			}
1805 
1806 			if (fil_system->max_n_open < 10 + fil_system->n_open) {
1807 
1808 				ib::warn() << "You must raise the value of"
1809 					" innodb_open_files in my.cnf!"
1810 					" Remember that InnoDB keeps all"
1811 					" log files and all system"
1812 					" tablespace files open"
1813 					" for the whole time mysqld is"
1814 					" running, and needs to open also"
1815 					" some .ibd files if the"
1816 					" file-per-table storage model is used."
1817 					" Current open files "
1818 					<< fil_system->n_open
1819 					<< ", max allowed open files "
1820 					<< fil_system->max_n_open
1821 					<< ".";
1822 			}
1823 		}
1824 	}
1825 
1826 	mutex_exit(&fil_system->mutex);
1827 }
1828 
1829 /*******************************************************************//**
1830 Closes all open files. There must not be any pending i/o's or not flushed
1831 modifications in the files. */
1832 void
fil_close_all_files(void)1833 fil_close_all_files(void)
1834 /*=====================*/
1835 {
1836 	fil_space_t*	space;
1837 
1838 	/* At shutdown, we should not have any files in this list. */
1839 	ut_ad(srv_fast_shutdown == 2
1840 	      || UT_LIST_GET_LEN(fil_system->named_spaces) == 0);
1841 
1842 	mutex_enter(&fil_system->mutex);
1843 
1844 	for (space = UT_LIST_GET_FIRST(fil_system->space_list);
1845 	     space != NULL; ) {
1846 		fil_node_t*	node;
1847 		fil_space_t*	prev_space = space;
1848 
1849 		for (node = UT_LIST_GET_FIRST(space->chain);
1850 		     node != NULL;
1851 		     node = UT_LIST_GET_NEXT(chain, node)) {
1852 
1853 			if (node->is_open) {
1854 				fil_node_close_file(node);
1855 			}
1856 		}
1857 
1858 		space = UT_LIST_GET_NEXT(space_list, space);
1859 		fil_space_detach(prev_space);
1860 		fil_space_free_low(prev_space);
1861 	}
1862 
1863 	mutex_exit(&fil_system->mutex);
1864 
1865 	ut_ad(srv_fast_shutdown == 2
1866 	      || UT_LIST_GET_LEN(fil_system->named_spaces) == 0);
1867 }
1868 
1869 /*******************************************************************//**
1870 Closes the redo log files. There must not be any pending i/o's or not
1871 flushed modifications in the files. */
1872 void
fil_close_log_files(bool free)1873 fil_close_log_files(
1874 /*================*/
1875 	bool	free)	/*!< in: whether to free the memory object */
1876 {
1877 	fil_space_t*	space;
1878 
1879 	mutex_enter(&fil_system->mutex);
1880 
1881 	space = UT_LIST_GET_FIRST(fil_system->space_list);
1882 
1883 	while (space != NULL) {
1884 		fil_node_t*	node;
1885 		fil_space_t*	prev_space = space;
1886 
1887 		if (space->purpose != FIL_TYPE_LOG) {
1888 			space = UT_LIST_GET_NEXT(space_list, space);
1889 			continue;
1890 		}
1891 
1892 		/* Log files are not in the fil_system->named_spaces list. */
1893 		ut_ad(space->max_lsn == 0);
1894 
1895 		for (node = UT_LIST_GET_FIRST(space->chain);
1896 		     node != NULL;
1897 		     node = UT_LIST_GET_NEXT(chain, node)) {
1898 
1899 			if (node->is_open) {
1900 				fil_node_close_file(node);
1901 			}
1902 		}
1903 
1904 		space = UT_LIST_GET_NEXT(space_list, space);
1905 
1906 		if (free) {
1907 			fil_space_detach(prev_space);
1908 			fil_space_free_low(prev_space);
1909 		}
1910 	}
1911 
1912 	mutex_exit(&fil_system->mutex);
1913 }
1914 
1915 /*******************************************************************//**
1916 Sets the max tablespace id counter if the given number is bigger than the
1917 previous value. */
1918 void
fil_set_max_space_id_if_bigger(ulint max_id)1919 fil_set_max_space_id_if_bigger(
1920 /*===========================*/
1921 	ulint	max_id)	/*!< in: maximum known id */
1922 {
1923 	if (max_id >= SRV_LOG_SPACE_FIRST_ID) {
1924 		ib::fatal() << "Max tablespace id is too high, " << max_id;
1925 	}
1926 
1927 	mutex_enter(&fil_system->mutex);
1928 
1929 	if (fil_system->max_assigned_id < max_id) {
1930 
1931 		fil_system->max_assigned_id = max_id;
1932 	}
1933 
1934 	mutex_exit(&fil_system->mutex);
1935 }
1936 
1937 /** Write the flushed LSN to the page header of the first page in the
1938 system tablespace.
1939 @param[in]	lsn	flushed LSN
1940 @return DB_SUCCESS or error number */
1941 dberr_t
fil_write_flushed_lsn(lsn_t lsn)1942 fil_write_flushed_lsn(
1943 	lsn_t	lsn)
1944 {
1945 	byte*	buf1;
1946 	byte*	buf;
1947 	dberr_t	err;
1948 
1949 	buf1 = static_cast<byte*>(ut_malloc_nokey(2 * UNIV_PAGE_SIZE));
1950 	buf = static_cast<byte*>(ut_align(buf1, UNIV_PAGE_SIZE));
1951 
1952 	const page_id_t	page_id(TRX_SYS_SPACE, 0);
1953 
1954 	err = fil_read(page_id, univ_page_size, 0, univ_page_size.physical(),
1955 		       buf);
1956 
1957 	if (err == DB_SUCCESS) {
1958 		mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn);
1959 
1960 		err = fil_write(page_id, univ_page_size, 0,
1961 				univ_page_size.physical(), buf);
1962 
1963 		fil_flush_file_spaces(FIL_TYPE_TABLESPACE);
1964 	}
1965 
1966 	ut_free(buf1);
1967 
1968 	return(err);
1969 }
1970 #ifndef UNIV_HOTBACKUP
1971 /** Acquire a tablespace when it could be dropped concurrently.
1972 Used by background threads that do not necessarily hold proper locks
1973 for concurrency control.
1974 @param[in]	id	tablespace ID
1975 @param[in]	silent	whether to silently ignore missing tablespaces
1976 @return the tablespace, or NULL if missing or being deleted */
1977 inline
1978 fil_space_t*
fil_space_acquire_low(ulint id,bool silent)1979 fil_space_acquire_low(
1980 	ulint	id,
1981 	bool	silent)
1982 {
1983 	fil_space_t*	space;
1984 
1985 	mutex_enter(&fil_system->mutex);
1986 
1987 	space = fil_space_get_by_id(id);
1988 
1989 	if (space == NULL) {
1990 		if (!silent) {
1991 			ib::warn() << "Trying to access missing"
1992 				" tablespace " << id;
1993 		}
1994 	} else if (space->stop_new_ops || space->is_being_truncated) {
1995 		space = NULL;
1996 	} else {
1997 		space->n_pending_ops++;
1998 	}
1999 
2000 	mutex_exit(&fil_system->mutex);
2001 
2002 	return(space);
2003 }
2004 
2005 /** Acquire a tablespace when it could be dropped concurrently.
2006 Used by background threads that do not necessarily hold proper locks
2007 for concurrency control.
2008 @param[in]	id	tablespace ID
2009 @return the tablespace, or NULL if missing or being deleted */
2010 fil_space_t*
fil_space_acquire(ulint id)2011 fil_space_acquire(
2012 	ulint	id)
2013 {
2014 	return(fil_space_acquire_low(id, false));
2015 }
2016 
2017 /** Acquire a tablespace that may not exist.
2018 Used by background threads that do not necessarily hold proper locks
2019 for concurrency control.
2020 @param[in]	id	tablespace ID
2021 @return the tablespace, or NULL if missing or being deleted */
2022 fil_space_t*
fil_space_acquire_silent(ulint id)2023 fil_space_acquire_silent(
2024 	ulint	id)
2025 {
2026 	return(fil_space_acquire_low(id, true));
2027 }
2028 
2029 /** Release a tablespace acquired with fil_space_acquire().
2030 @param[in,out]	space	tablespace to release  */
2031 void
fil_space_release(fil_space_t * space)2032 fil_space_release(
2033 	fil_space_t*	space)
2034 {
2035 	mutex_enter(&fil_system->mutex);
2036 	ut_ad(space->magic_n == FIL_SPACE_MAGIC_N);
2037 	ut_ad(space->n_pending_ops > 0);
2038 	space->n_pending_ops--;
2039 	mutex_exit(&fil_system->mutex);
2040 }
2041 #endif /* !UNIV_HOTBACKUP */
2042 
2043 /********************************************************//**
2044 Creates the database directory for a table if it does not exist yet. */
2045 void
fil_create_directory_for_tablename(const char * name)2046 fil_create_directory_for_tablename(
2047 /*===============================*/
2048 	const char*	name)	/*!< in: name in the standard
2049 				'databasename/tablename' format */
2050 {
2051 	const char*	namend;
2052 	char*		path;
2053 	ulint		len;
2054 
2055 	len = strlen(fil_path_to_mysql_datadir);
2056 	namend = strchr(name, '/');
2057 	ut_a(namend);
2058 	path = static_cast<char*>(ut_malloc_nokey(len + (namend - name) + 2));
2059 
2060 	memcpy(path, fil_path_to_mysql_datadir, len);
2061 	path[len] = '/';
2062 	memcpy(path + len + 1, name, namend - name);
2063 	path[len + (namend - name) + 1] = 0;
2064 
2065 	os_normalize_path(path);
2066 
2067 	bool	success = os_file_create_directory(path, false);
2068 	ut_a(success);
2069 
2070 	ut_free(path);
2071 }
2072 
2073 /** Write a log record about an operation on a tablespace file.
2074 @param[in]	type		MLOG_FILE_NAME or MLOG_FILE_DELETE
2075 or MLOG_FILE_CREATE2 or MLOG_FILE_RENAME2
2076 @param[in]	space_id	tablespace identifier
2077 @param[in]	first_page_no	first page number in the file
2078 @param[in]	path		file path
2079 @param[in]	new_path	if type is MLOG_FILE_RENAME2, the new name
2080 @param[in]	flags		if type is MLOG_FILE_CREATE2, the space flags
2081 @param[in,out]	mtr		mini-transaction */
2082 static
2083 void
fil_op_write_log(mlog_id_t type,ulint space_id,ulint first_page_no,const char * path,const char * new_path,ulint flags,mtr_t * mtr)2084 fil_op_write_log(
2085 	mlog_id_t	type,
2086 	ulint		space_id,
2087 	ulint		first_page_no,
2088 	const char*	path,
2089 	const char*	new_path,
2090 	ulint		flags,
2091 	mtr_t*		mtr)
2092 {
2093 	byte*		log_ptr;
2094 	ulint		len;
2095 
2096 	ut_ad(first_page_no == 0);
2097 
2098 	/* fil_name_parse() requires that there be at least one path
2099 	separator and that the file path end with ".ibd". */
2100 	ut_ad(strchr(path, OS_PATH_SEPARATOR) != NULL);
2101 	ut_ad(strcmp(&path[strlen(path) - strlen(DOT_IBD)], DOT_IBD) == 0);
2102 
2103 	log_ptr = mlog_open(mtr, 11 + 4 + 2 + 1);
2104 
2105 	if (log_ptr == NULL) {
2106 		/* Logging in mtr is switched off during crash recovery:
2107 		in that case mlog_open returns NULL */
2108 		return;
2109 	}
2110 
2111 	log_ptr = mlog_write_initial_log_record_low(
2112 		type, space_id, first_page_no, log_ptr, mtr);
2113 
2114 	if (type == MLOG_FILE_CREATE2) {
2115 		mach_write_to_4(log_ptr, flags);
2116 		log_ptr += 4;
2117 	}
2118 
2119 	/* Let us store the strings as null-terminated for easier readability
2120 	and handling */
2121 
2122 	len = strlen(path) + 1;
2123 
2124 	mach_write_to_2(log_ptr, len);
2125 	log_ptr += 2;
2126 	mlog_close(mtr, log_ptr);
2127 
2128 	mlog_catenate_string(
2129 		mtr, reinterpret_cast<const byte*>(path), len);
2130 
2131 	switch (type) {
2132 	case MLOG_FILE_RENAME2:
2133 		ut_ad(strchr(new_path, OS_PATH_SEPARATOR) != NULL);
2134 		len = strlen(new_path) + 1;
2135 		log_ptr = mlog_open(mtr, 2 + len);
2136 		ut_a(log_ptr);
2137 		mach_write_to_2(log_ptr, len);
2138 		log_ptr += 2;
2139 		mlog_close(mtr, log_ptr);
2140 
2141 		mlog_catenate_string(
2142 			mtr, reinterpret_cast<const byte*>(new_path), len);
2143 		break;
2144 	case MLOG_FILE_NAME:
2145 	case MLOG_FILE_DELETE:
2146 	case MLOG_FILE_CREATE2:
2147 		break;
2148 	default:
2149 		ut_ad(0);
2150 	}
2151 }
2152 #ifndef UNIV_HOTBACKUP
2153 /** Write redo log for renaming a file.
2154 @param[in]	space_id	tablespace id
2155 @param[in]	first_page_no	first page number in the file
2156 @param[in]	old_name	tablespace file name
2157 @param[in]	new_name	tablespace file name after renaming
2158 @param[in,out]	mtr		mini-transaction */
2159 static
2160 void
fil_name_write_rename(ulint space_id,ulint first_page_no,const char * old_name,const char * new_name,mtr_t * mtr)2161 fil_name_write_rename(
2162 	ulint		space_id,
2163 	ulint		first_page_no,
2164 	const char*	old_name,
2165 	const char*	new_name,
2166 	mtr_t*		mtr)
2167 {
2168 	ut_ad(!is_predefined_tablespace(space_id));
2169 
2170 	fil_op_write_log(
2171 		MLOG_FILE_RENAME2,
2172 		space_id, first_page_no, old_name, new_name, 0, mtr);
2173 }
2174 #endif /* !UNIV_HOTBACKUP */
2175 /** Write MLOG_FILE_NAME for a file.
2176 @param[in]	space_id	tablespace id
2177 @param[in]	first_page_no	first page number in the file
2178 @param[in]	name		tablespace file name
2179 @param[in,out]	mtr		mini-transaction */
2180 static
2181 void
fil_name_write(ulint space_id,ulint first_page_no,const char * name,mtr_t * mtr)2182 fil_name_write(
2183 	ulint		space_id,
2184 	ulint		first_page_no,
2185 	const char*	name,
2186 	mtr_t*		mtr)
2187 {
2188 	fil_op_write_log(
2189 		MLOG_FILE_NAME, space_id, first_page_no, name, NULL, 0, mtr);
2190 }
2191 
2192 /** Write MLOG_FILE_NAME for a file.
2193 @param[in]	space		tablespace
2194 @param[in]	first_page_no	first page number in the file
2195 @param[in]	file		tablespace file
2196 @param[in,out]	mtr		mini-transaction */
2197 static
2198 void
fil_name_write(const fil_space_t * space,ulint first_page_no,const fil_node_t * file,mtr_t * mtr)2199 fil_name_write(
2200 	const fil_space_t*	space,
2201 	ulint			first_page_no,
2202 	const fil_node_t*	file,
2203 	mtr_t*			mtr)
2204 {
2205 	fil_name_write(space->id, first_page_no, file->name, mtr);
2206 }
2207 
2208 #ifndef UNIV_HOTBACKUP
2209 /********************************************************//**
2210 Recreates table indexes by applying
2211 TRUNCATE log record during recovery.
2212 @return DB_SUCCESS or error code */
2213 dberr_t
fil_recreate_table(ulint space_id,ulint format_flags,ulint flags,const char * name,truncate_t & truncate)2214 fil_recreate_table(
2215 /*===============*/
2216 	ulint		space_id,	/*!< in: space id */
2217 	ulint		format_flags,	/*!< in: page format */
2218 	ulint		flags,		/*!< in: tablespace flags */
2219 	const char*	name,		/*!< in: table name */
2220 	truncate_t&	truncate)	/*!< in: The information of
2221 					TRUNCATE log record */
2222 {
2223 	dberr_t			err = DB_SUCCESS;
2224 	bool			found;
2225 	const page_size_t	page_size(fil_space_get_page_size(space_id,
2226 								  &found));
2227 
2228 	if (!found) {
2229 		ib::info() << "Missing .ibd file for table '" << name
2230 			<< "' with tablespace " << space_id;
2231 		return(DB_ERROR);
2232 	}
2233 
2234 	ut_ad(!truncate_t::s_fix_up_active);
2235 	truncate_t::s_fix_up_active = true;
2236 
2237 	/* Step-1: Scan for active indexes from REDO logs and drop
2238 	all the indexes using low level function that take root_page_no
2239 	and space-id. */
2240 	truncate.drop_indexes(space_id);
2241 
2242 	/* Step-2: Scan for active indexes and re-create them. */
2243 	err = truncate.create_indexes(
2244 		name, space_id, page_size, flags, format_flags);
2245 	if (err != DB_SUCCESS) {
2246 		ib::info() << "Failed to create indexes for the table '"
2247 			<< name << "' with tablespace " << space_id
2248 			<< " while fixing up truncate action";
2249 		return(err);
2250 	}
2251 
2252 	truncate_t::s_fix_up_active = false;
2253 
2254 	return(err);
2255 }
2256 
2257 /********************************************************//**
2258 Recreates the tablespace and table indexes by applying
2259 TRUNCATE log record during recovery.
2260 @return DB_SUCCESS or error code */
2261 dberr_t
fil_recreate_tablespace(ulint space_id,ulint format_flags,ulint flags,const char * name,truncate_t & truncate,lsn_t recv_lsn)2262 fil_recreate_tablespace(
2263 /*====================*/
2264 	ulint		space_id,	/*!< in: space id */
2265 	ulint		format_flags,	/*!< in: page format */
2266 	ulint		flags,		/*!< in: tablespace flags */
2267 	const char*	name,		/*!< in: table name */
2268 	truncate_t&	truncate,	/*!< in: The information of
2269 					TRUNCATE log record */
2270 	lsn_t		recv_lsn)	/*!< in: the end LSN of
2271 						the log record */
2272 {
2273 	dberr_t		err = DB_SUCCESS;
2274 	mtr_t		mtr;
2275 
2276 	ut_ad(!truncate_t::s_fix_up_active);
2277 	truncate_t::s_fix_up_active = true;
2278 
2279 	/* Step-1: Invalidate buffer pool pages belonging to the tablespace
2280 	to re-create. */
2281 	buf_LRU_flush_or_remove_pages(space_id, BUF_REMOVE_ALL_NO_WRITE, 0);
2282 
2283 	/* Remove all insert buffer entries for the tablespace */
2284 	ibuf_delete_for_discarded_space(space_id);
2285 
2286 	/* Step-2: truncate tablespace (reset the size back to original or
2287 	default size) of tablespace. */
2288 	err = truncate.truncate(
2289 		space_id, truncate.get_dir_path(), name, flags, true);
2290 
2291 	if (err != DB_SUCCESS) {
2292 
2293 		ib::info() << "Cannot access .ibd file for table '"
2294 			<< name << "' with tablespace " << space_id
2295 			<< " while truncating";
2296 		return(DB_ERROR);
2297 	}
2298 
2299 	bool			found;
2300 	const page_size_t&	page_size =
2301 		fil_space_get_page_size(space_id, &found);
2302 
2303 	if (!found) {
2304 		ib::info() << "Missing .ibd file for table '" << name
2305 			<< "' with tablespace " << space_id;
2306 		return(DB_ERROR);
2307 	}
2308 
2309 	/* Step-3: Initialize Header. */
2310 	if (page_size.is_compressed()) {
2311 		byte*	buf;
2312 		page_t*	page;
2313 
2314 		buf = static_cast<byte*>(ut_zalloc_nokey(3 * UNIV_PAGE_SIZE));
2315 
2316 		/* Align the memory for file i/o */
2317 		page = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
2318 
2319 		flags = fsp_flags_set_page_size(flags, univ_page_size);
2320 
2321 		fsp_header_init_fields(page, space_id, flags);
2322 
2323 		mach_write_to_4(
2324 			page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id);
2325 
2326 		page_zip_des_t  page_zip;
2327 		page_zip_set_size(&page_zip, page_size.physical());
2328 		page_zip.data = page + UNIV_PAGE_SIZE;
2329 
2330 #ifdef UNIV_DEBUG
2331 		page_zip.m_start =
2332 #endif /* UNIV_DEBUG */
2333 		page_zip.m_end = page_zip.m_nonempty = page_zip.n_blobs = 0;
2334 		buf_flush_init_for_writing(
2335 			NULL, page, &page_zip, 0,
2336 			fsp_is_checksum_disabled(space_id));
2337 
2338 		err = fil_write(page_id_t(space_id, 0), page_size, 0,
2339 				page_size.physical(), page_zip.data);
2340 
2341 		ut_free(buf);
2342 
2343 		if (err != DB_SUCCESS) {
2344 			ib::info() << "Failed to clean header of the"
2345 				" table '" << name << "' with tablespace "
2346 				<< space_id;
2347 			return(err);
2348 		}
2349 	}
2350 
2351 	mtr_start(&mtr);
2352 	/* Don't log the operation while fixing up table truncate operation
2353 	as crash at this level can still be sustained with recovery restarting
2354 	from last checkpoint. */
2355 	mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
2356 
2357 	/* Initialize the first extent descriptor page and
2358 	the second bitmap page for the new tablespace. */
2359 	fsp_header_init(space_id, FIL_IBD_FILE_INITIAL_SIZE, &mtr);
2360 	mtr_commit(&mtr);
2361 
2362 	/* Step-4: Re-Create Indexes to newly re-created tablespace.
2363 	This operation will restore tablespace back to what it was
2364 	when it was created during CREATE TABLE. */
2365 	err = truncate.create_indexes(
2366 		name, space_id, page_size, flags, format_flags);
2367 	if (err != DB_SUCCESS) {
2368 		return(err);
2369 	}
2370 
2371 	/* Step-5: Write new created pages into ibd file handle and
2372 	flush it to disk for the tablespace, in case i/o-handler thread
2373 	deletes the bitmap page from buffer. */
2374 	mtr_start(&mtr);
2375 
2376 	mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
2377 
2378 	mutex_enter(&fil_system->mutex);
2379 
2380 	fil_space_t*	space = fil_space_get_by_id(space_id);
2381 
2382 	mutex_exit(&fil_system->mutex);
2383 
2384 	fil_node_t*	node = UT_LIST_GET_FIRST(space->chain);
2385 
2386 	for (ulint page_no = 0; page_no < node->size; ++page_no) {
2387 
2388 		const page_id_t	cur_page_id(space_id, page_no);
2389 
2390 		buf_block_t*	block = buf_page_get(cur_page_id, page_size,
2391 						     RW_X_LATCH, &mtr);
2392 
2393 		byte*	page = buf_block_get_frame(block);
2394 
2395 		if (!fsp_flags_is_compressed(flags)) {
2396 
2397 			ut_ad(!page_size.is_compressed());
2398 
2399 			buf_flush_init_for_writing(
2400 				block, page, NULL, recv_lsn,
2401 				fsp_is_checksum_disabled(space_id));
2402 
2403 			err = fil_write(cur_page_id, page_size, 0,
2404 					page_size.physical(), page);
2405 		} else {
2406 			ut_ad(page_size.is_compressed());
2407 
2408 			/* We don't want to rewrite empty pages. */
2409 
2410 			if (fil_page_get_type(page) != 0) {
2411 				page_zip_des_t*  page_zip =
2412 					buf_block_get_page_zip(block);
2413 
2414 				buf_flush_init_for_writing(
2415 					block, page, page_zip, recv_lsn,
2416 					fsp_is_checksum_disabled(space_id));
2417 
2418 				err = fil_write(cur_page_id, page_size, 0,
2419 						page_size.physical(),
2420 						page_zip->data);
2421 			} else {
2422 #ifdef UNIV_DEBUG
2423 				const byte*	data = block->page.zip.data;
2424 
2425 				/* Make sure that the page is really empty */
2426 				for (ulint i = 0;
2427 				     i < page_size.physical();
2428 				     ++i) {
2429 
2430 					ut_a(data[i] == 0);
2431 				}
2432 #endif /* UNIV_DEBUG */
2433 			}
2434 		}
2435 
2436 		if (err != DB_SUCCESS) {
2437 			ib::info() << "Cannot write page " << page_no
2438 				<< " into a .ibd file for table '"
2439 				<< name << "' with tablespace " << space_id;
2440 		}
2441 	}
2442 
2443 	mtr_commit(&mtr);
2444 
2445 	truncate_t::s_fix_up_active = false;
2446 
2447 	return(err);
2448 }
#endif /* !UNIV_HOTBACKUP */
2450 /** Replay a file rename operation if possible.
2451 @param[in]	space_id	tablespace identifier
2452 @param[in]	first_page_no	first page number in the file
2453 @param[in]	name		old file name
2454 @param[in]	new_name	new file name
2455 @return	whether the operation was successfully applied
2456 (the name did not exist, or new_name did not exist and
2457 name was successfully renamed to new_name)  */
2458 bool
fil_op_replay_rename(ulint space_id,ulint first_page_no,const char * name,const char * new_name)2459 fil_op_replay_rename(
2460 	ulint		space_id,
2461 	ulint		first_page_no,
2462 	const char*	name,
2463 	const char*	new_name)
2464 {
2465 #ifdef UNIV_HOTBACKUP
2466 	ut_ad(recv_replay_file_ops);
2467 #endif /* UNIV_HOTBACKUP */
2468 	ut_ad(first_page_no == 0);
2469 
2470 	/* In order to replay the rename, the following must hold:
2471 	* The new name is not already used.
2472 	* A tablespace exists with the old name.
2473 	* The space ID for that tablepace matches this log entry.
2474 	This will prevent unintended renames during recovery. */
2475 	fil_space_t*	space = fil_space_get(space_id);
2476 
2477 	if (space == NULL) {
2478 		return(true);
2479 	}
2480 
2481 	const bool name_match
2482 		= strcmp(name, UT_LIST_GET_FIRST(space->chain)->name) == 0;
2483 
2484 	if (!name_match) {
2485 		return(true);
2486 	}
2487 
2488 	/* Create the database directory for the new name, if
2489 	it does not exist yet */
2490 
2491 	const char*	namend = strrchr(new_name, OS_PATH_SEPARATOR);
2492 	ut_a(namend != NULL);
2493 
2494 	char*		dir = static_cast<char*>(
2495 		ut_malloc_nokey(namend - new_name + 1));
2496 
2497 	memcpy(dir, new_name, namend - new_name);
2498 	dir[namend - new_name] = '\0';
2499 
2500 	bool		success = os_file_create_directory(dir, false);
2501 	ut_a(success);
2502 
2503 	ulint		dirlen = 0;
2504 
2505 	if (const char* dirend = strrchr(dir, OS_PATH_SEPARATOR)) {
2506 		dirlen = dirend - dir + 1;
2507 	}
2508 
2509 	ut_free(dir);
2510 
2511 	/* New path must not exist. */
2512 	dberr_t		err = fil_rename_tablespace_check(
2513 		space_id, name, new_name, false);
2514 	if (err != DB_SUCCESS) {
2515 		ib::error() << " Cannot replay file rename."
2516 			" Remove either file and try again.";
2517 		return(false);
2518 	}
2519 
2520 	char*		new_table = mem_strdupl(
2521 		new_name + dirlen,
2522 		strlen(new_name + dirlen)
2523 		- 4 /* remove ".ibd" */);
2524 
2525 	ut_ad(new_table[namend - new_name - dirlen]
2526 	      == OS_PATH_SEPARATOR);
2527 #if OS_PATH_SEPARATOR != '/'
2528 	new_table[namend - new_name - dirlen] = '/';
2529 #endif
2530 
2531 	if (!fil_rename_tablespace(
2532 		    space_id, name, new_table, new_name)) {
2533 		ut_error;
2534 	}
2535 
2536 	ut_free(new_table);
2537 	return(true);
2538 }
2539 
/** File operations for tablespace; selects what the pending-operation
checks are being run for. */
enum fil_operation_t {
	/** delete a single-table tablespace */
	FIL_OPERATION_DELETE,

	/** close a single-table tablespace */
	FIL_OPERATION_CLOSE,

	/** truncate a single-table tablespace */
	FIL_OPERATION_TRUNCATE
};
2546 
2547 /** Check for pending operations.
2548 @param[in]	space	tablespace
2549 @param[in]	count	number of attempts so far
2550 @return 0 if no operations else count + 1. */
2551 static
2552 ulint
fil_check_pending_ops(fil_space_t * space,ulint count)2553 fil_check_pending_ops(
2554 	fil_space_t*	space,
2555 	ulint		count)
2556 {
2557 	ut_ad(mutex_own(&fil_system->mutex));
2558 
2559 	const ulint	n_pending_ops = space ? space->n_pending_ops : 0;
2560 
2561 	if (n_pending_ops) {
2562 
2563 		if (count > 5000) {
2564 			ib::warn() << "Trying to close/delete/truncate"
2565 				" tablespace '" << space->name
2566 				<< "' but there are " << n_pending_ops
2567 				<< " pending operations on it.";
2568 		}
2569 
2570 		return(count + 1);
2571 	}
2572 
2573 	return(0);
2574 }
2575 
2576 /*******************************************************************//**
2577 Check for pending IO.
2578 @return 0 if no pending else count + 1. */
2579 static
2580 ulint
fil_check_pending_io(fil_operation_t operation,fil_space_t * space,fil_node_t ** node,ulint count)2581 fil_check_pending_io(
2582 /*=================*/
2583 	fil_operation_t	operation,	/*!< in: File operation */
2584 	fil_space_t*	space,		/*!< in/out: Tablespace to check */
2585 	fil_node_t**	node,		/*!< out: Node in space list */
2586 	ulint		count)		/*!< in: number of attempts so far */
2587 {
2588 	ut_ad(mutex_own(&fil_system->mutex));
2589 	ut_a(space->n_pending_ops == 0);
2590 
2591 	switch (operation) {
2592 	case FIL_OPERATION_DELETE:
2593 	case FIL_OPERATION_CLOSE:
2594 		break;
2595 	case FIL_OPERATION_TRUNCATE:
2596 		space->is_being_truncated = true;
2597 		break;
2598 	}
2599 
2600 	/* The following code must change when InnoDB supports
2601 	multiple datafiles per tablespace. */
2602 	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
2603 
2604 	*node = UT_LIST_GET_FIRST(space->chain);
2605 
2606 	if (space->n_pending_flushes > 0 || (*node)->n_pending > 0) {
2607 
2608 		ut_a(!(*node)->being_extended);
2609 
2610 		if (count > 1000) {
2611 			ib::warn() << "Trying to delete/close/truncate"
2612 				" tablespace '" << space->name
2613 				<< "' but there are "
2614 				<< space->n_pending_flushes
2615 				<< " flushes and " << (*node)->n_pending
2616 				<< " pending i/o's on it.";
2617 		}
2618 
2619 		return(count + 1);
2620 	}
2621 
2622 	return(0);
2623 }
2624 
2625 /*******************************************************************//**
2626 Check pending operations on a tablespace.
2627 @return DB_SUCCESS or error failure. */
2628 static
2629 dberr_t
fil_check_pending_operations(ulint id,fil_operation_t operation,fil_space_t ** space,char ** path)2630 fil_check_pending_operations(
2631 /*=========================*/
2632 	ulint		id,		/*!< in: space id */
2633 	fil_operation_t	operation,	/*!< in: File operation */
2634 	fil_space_t**	space,		/*!< out: tablespace instance
2635 					in memory */
2636 	char**		path)		/*!< out/own: tablespace path */
2637 {
2638 	ulint		count = 0;
2639 
2640 	ut_a(!is_system_tablespace(id));
2641 	ut_ad(space);
2642 
2643 	*space = 0;
2644 
2645 	mutex_enter(&fil_system->mutex);
2646 	fil_space_t* sp = fil_space_get_by_id(id);
2647 	if (sp) {
2648 		sp->stop_new_ops = true;
2649 	}
2650 	mutex_exit(&fil_system->mutex);
2651 
2652 	/* Check for pending operations. */
2653 
2654 	do {
2655 		mutex_enter(&fil_system->mutex);
2656 
2657 		sp = fil_space_get_by_id(id);
2658 
2659 		count = fil_check_pending_ops(sp, count);
2660 
2661 		mutex_exit(&fil_system->mutex);
2662 
2663 		if (count > 0) {
2664 			os_thread_sleep(20000);
2665 		}
2666 
2667 	} while (count > 0);
2668 
2669 	/* Check for pending IO. */
2670 
2671 	*path = 0;
2672 
2673 	do {
2674 		mutex_enter(&fil_system->mutex);
2675 
2676 		sp = fil_space_get_by_id(id);
2677 
2678 		if (sp == NULL) {
2679 			mutex_exit(&fil_system->mutex);
2680 			return(DB_TABLESPACE_NOT_FOUND);
2681 		}
2682 
2683 		fil_node_t*	node;
2684 
2685 		count = fil_check_pending_io(operation, sp, &node, count);
2686 
2687 		if (count == 0) {
2688 			*path = mem_strdup(node->name);
2689 		}
2690 
2691 		mutex_exit(&fil_system->mutex);
2692 
2693 		if (count > 0) {
2694 			os_thread_sleep(20000);
2695 		}
2696 
2697 	} while (count > 0);
2698 
2699 	ut_ad(sp);
2700 
2701 	*space = sp;
2702 	return(DB_SUCCESS);
2703 }
2704 
2705 /*******************************************************************//**
2706 Closes a single-table tablespace. The tablespace must be cached in the
2707 memory cache. Free all pages used by the tablespace.
2708 @return DB_SUCCESS or error */
2709 dberr_t
fil_close_tablespace(trx_t * trx,ulint id)2710 fil_close_tablespace(
2711 /*=================*/
2712 	trx_t*		trx,	/*!< in/out: Transaction covering the close */
2713 	ulint		id)	/*!< in: space id */
2714 {
2715 	char*		path = 0;
2716 	fil_space_t*	space = 0;
2717 	dberr_t		err;
2718 
2719 	ut_a(!is_system_tablespace(id));
2720 
2721 	err = fil_check_pending_operations(id, FIL_OPERATION_CLOSE,
2722 					   &space, &path);
2723 
2724 	if (err != DB_SUCCESS) {
2725 		return(err);
2726 	}
2727 
2728 	ut_a(space);
2729 	ut_a(path != 0);
2730 
2731 	rw_lock_x_lock(&space->latch);
2732 
2733 	/* Invalidate in the buffer pool all pages belonging to the
2734 	tablespace. Since we have set space->stop_new_ops = true, readahead
2735 	or ibuf merge can no longer read more pages of this tablespace to the
2736 	buffer pool. Thus we can clean the tablespace out of the buffer pool
2737 	completely and permanently. The flag stop_new_ops also prevents
2738 	fil_flush() from being applied to this tablespace. */
2739 
2740 	buf_LRU_flush_or_remove_pages(id, BUF_REMOVE_FLUSH_WRITE, trx);
2741 
2742 	/* If the free is successful, the X lock will be released before
2743 	the space memory data structure is freed. */
2744 
2745 	if (!fil_space_free(id, true)) {
2746 		rw_lock_x_unlock(&space->latch);
2747 		err = DB_TABLESPACE_NOT_FOUND;
2748 	} else {
2749 		err = DB_SUCCESS;
2750 	}
2751 
2752 	/* If it is a delete then also delete any generated files, otherwise
2753 	when we drop the database the remove directory will fail. */
2754 
2755 	char*	cfg_name = fil_make_filepath(path, NULL, CFG, false);
2756 	if (cfg_name != NULL) {
2757 		os_file_delete_if_exists(innodb_data_file_key, cfg_name, NULL);
2758 		ut_free(cfg_name);
2759 	}
2760 
2761 	char*	cfp_name = fil_make_filepath(path, NULL, CFP, false);
2762 	if (cfp_name != NULL) {
2763 		os_file_delete_if_exists(innodb_data_file_key, cfp_name, NULL);
2764 		ut_free(cfp_name);
2765 	}
2766 
2767 	ut_free(path);
2768 
2769 	return(err);
2770 }
2771 
2772 /** Deletes an IBD tablespace, either general or single-table.
2773 The tablespace must be cached in the memory cache. This will delete the
2774 datafile, fil_space_t & fil_node_t entries from the file_system_t cache.
2775 @param[in]	space_id	Tablespace id
2776 @param[in]	buf_remove	Specify the action to take on the pages
2777 for this table in the buffer pool.
2778 @return DB_SUCCESS or error */
2779 dberr_t
fil_delete_tablespace(ulint id,buf_remove_t buf_remove)2780 fil_delete_tablespace(
2781 	ulint		id,
2782 	buf_remove_t	buf_remove)
2783 {
2784 	char*		path = 0;
2785 	fil_space_t*	space = 0;
2786 
2787 	ut_a(!is_system_tablespace(id));
2788 
2789 	dberr_t err = fil_check_pending_operations(
2790 		id, FIL_OPERATION_DELETE, &space, &path);
2791 
2792 	if (err != DB_SUCCESS) {
2793 
2794 		ib::error() << "Cannot delete tablespace " << id
2795 			<< " because it is not found in the tablespace"
2796 			" memory cache.";
2797 
2798 		return(err);
2799 	}
2800 
2801 	ut_a(space);
2802 	ut_a(path != 0);
2803 
2804 #ifndef UNIV_HOTBACKUP
2805 	/* IMPORTANT: Because we have set space::stop_new_ops there
2806 	can't be any new ibuf merges, reads or flushes. We are here
2807 	because node::n_pending was zero above. However, it is still
2808 	possible to have pending read and write requests:
2809 
2810 	A read request can happen because the reader thread has
2811 	gone through the ::stop_new_ops check in buf_page_init_for_read()
2812 	before the flag was set and has not yet incremented ::n_pending
2813 	when we checked it above.
2814 
2815 	A write request can be issued any time because we don't check
2816 	the ::stop_new_ops flag when queueing a block for write.
2817 
2818 	We deal with pending write requests in the following function
2819 	where we'd minimally evict all dirty pages belonging to this
2820 	space from the flush_list. Note that if a block is IO-fixed
2821 	we'll wait for IO to complete.
2822 
2823 	To deal with potential read requests, we will check the
2824 	::stop_new_ops flag in fil_io(). */
2825 
2826 	buf_LRU_flush_or_remove_pages(id, buf_remove, 0);
2827 
2828 #endif /* !UNIV_HOTBACKUP */
2829 
2830 	/* If it is a delete then also delete any generated files, otherwise
2831 	when we drop the database the remove directory will fail. */
2832 	{
2833 #ifdef UNIV_HOTBACKUP
2834 		/* When replaying the operation in MySQL Enterprise
2835 		Backup, we do not try to write any log record. */
2836 #else /* UNIV_HOTBACKUP */
2837 		/* Before deleting the file, write a log record about
2838 		it, so that InnoDB crash recovery will expect the file
2839 		to be gone. */
2840 		mtr_t		mtr;
2841 
2842 		mtr_start(&mtr);
2843 		fil_op_write_log(MLOG_FILE_DELETE, id, 0, path, NULL, 0, &mtr);
2844 		mtr_commit(&mtr);
2845 		/* Even if we got killed shortly after deleting the
2846 		tablespace file, the record must have already been
2847 		written to the redo log. */
2848 		log_write_up_to(mtr.commit_lsn(), true);
2849 #endif /* UNIV_HOTBACKUP */
2850 
2851 		char*	cfg_name = fil_make_filepath(path, NULL, CFG, false);
2852 		if (cfg_name != NULL) {
2853 			os_file_delete_if_exists(innodb_data_file_key, cfg_name, NULL);
2854 			ut_free(cfg_name);
2855 		}
2856 
2857 		char*	cfp_name = fil_make_filepath(path, NULL, CFP, false);
2858 		if (cfp_name != NULL) {
2859 			os_file_delete_if_exists(innodb_data_file_key, cfp_name, NULL);
2860 			ut_free(cfp_name);
2861 		}
2862 	}
2863 
2864 	/* Delete the link file pointing to the ibd file we are deleting. */
2865 	if (FSP_FLAGS_HAS_DATA_DIR(space->flags)) {
2866 
2867 		RemoteDatafile::delete_link_file(space->name);
2868 
2869 	} else if (FSP_FLAGS_GET_SHARED(space->flags)) {
2870 
2871 		RemoteDatafile::delete_link_file(base_name(path));
2872 
2873 	}
2874 
2875 	mutex_enter(&fil_system->mutex);
2876 
2877 	/* Double check the sanity of pending ops after reacquiring
2878 	the fil_system::mutex. */
2879 	if (const fil_space_t* s = fil_space_get_by_id(id)) {
2880 		ut_a(s == space);
2881 		ut_a(space->n_pending_ops == 0);
2882 		ut_a(UT_LIST_GET_LEN(space->chain) == 1);
2883 		fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
2884 		ut_a(node->n_pending == 0);
2885 
2886 		fil_space_detach(space);
2887 		mutex_exit(&fil_system->mutex);
2888 
2889 		log_mutex_enter();
2890 
2891 		if (space->max_lsn != 0) {
2892 			ut_d(space->max_lsn = 0);
2893 			UT_LIST_REMOVE(fil_system->named_spaces, space);
2894 		}
2895 
2896 		log_mutex_exit();
2897 		fil_space_free_low(space);
2898 
2899 		if (!os_file_delete(innodb_data_file_key, path)
2900 		    && !os_file_delete_if_exists(
2901 			    innodb_data_file_key, path, NULL)) {
2902 
2903 			/* Note: This is because we have removed the
2904 			tablespace instance from the cache. */
2905 
2906 			err = DB_IO_ERROR;
2907 		}
2908 	} else {
2909 		mutex_exit(&fil_system->mutex);
2910 		err = DB_TABLESPACE_NOT_FOUND;
2911 	}
2912 
2913 	ut_free(path);
2914 
2915 	return(err);
2916 }
2917 #ifndef UNIV_HOTBACKUP
2918 /** Truncate the tablespace to needed size.
2919 @param[in]	space_id	id of tablespace to truncate
2920 @param[in]	size_in_pages	truncate size.
2921 @return true if truncate was successful. */
2922 bool
fil_truncate_tablespace(ulint space_id,ulint size_in_pages)2923 fil_truncate_tablespace(
2924 	ulint		space_id,
2925 	ulint		size_in_pages)
2926 {
2927 	/* Step-1: Prepare tablespace for truncate. This involves
2928 	stopping all the new operations + IO on that tablespace
2929 	and ensuring that related pages are flushed to disk. */
2930 	if (fil_prepare_for_truncate(space_id) != DB_SUCCESS) {
2931 		return(false);
2932 	}
2933 
2934 	/* Step-2: Invalidate buffer pool pages belonging to the tablespace
2935 	to re-create. Remove all insert buffer entries for the tablespace */
2936 	buf_LRU_flush_or_remove_pages(space_id, BUF_REMOVE_ALL_NO_WRITE, 0);
2937 
2938 	/* Step-3: Truncate the tablespace and accordingly update
2939 	the fil_space_t handler that is used to access this tablespace. */
2940 	mutex_enter(&fil_system->mutex);
2941 	fil_space_t*	space = fil_space_get_by_id(space_id);
2942 
2943 	/* The following code must change when InnoDB supports
2944 	multiple datafiles per tablespace. */
2945 	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
2946 
2947 	fil_node_t*	node = UT_LIST_GET_FIRST(space->chain);
2948 
2949 	ut_ad(node->is_open);
2950 
2951 	space->size = node->size = size_in_pages;
2952 
2953 	bool success = os_file_truncate(node->name, node->handle, 0);
2954 	if (success) {
2955 
2956 		os_offset_t	size = size_in_pages * UNIV_PAGE_SIZE;
2957 
2958 		success = os_file_set_size(
2959 			node->name, node->handle, size, srv_read_only_mode);
2960 
2961 		if (success) {
2962 			space->stop_new_ops = false;
2963 			space->is_being_truncated = false;
2964 		}
2965 	}
2966 
2967 	mutex_exit(&fil_system->mutex);
2968 
2969 	return(success);
2970 }
2971 
2972 /*******************************************************************//**
2973 Prepare for truncating a single-table tablespace.
2974 1) Check pending operations on a tablespace;
2975 2) Remove all insert buffer entries for the tablespace;
2976 @return DB_SUCCESS or error */
2977 dberr_t
fil_prepare_for_truncate(ulint id)2978 fil_prepare_for_truncate(
2979 /*=====================*/
2980 	ulint	id)		/*!< in: space id */
2981 {
2982 	char*		path = 0;
2983 	fil_space_t*	space = 0;
2984 
2985 	ut_a(!is_system_tablespace(id));
2986 
2987 	dberr_t	err = fil_check_pending_operations(
2988 		id, FIL_OPERATION_TRUNCATE, &space, &path);
2989 
2990 	ut_free(path);
2991 
2992 	if (err == DB_TABLESPACE_NOT_FOUND) {
2993 		ib::error() << "Cannot truncate tablespace " << id
2994 			<< " because it is not found in the tablespace"
2995 			" memory cache.";
2996 	}
2997 
2998 	return(err);
2999 }
3000 
3001 /** Reinitialize the original tablespace header with the same space id
3002 for single tablespace
3003 @param[in]      table		table belongs to tablespace
3004 @param[in]      size            size in blocks
3005 @param[in]      trx             Transaction covering truncate */
3006 void
fil_reinit_space_header_for_table(dict_table_t * table,ulint size,trx_t * trx)3007 fil_reinit_space_header_for_table(
3008 	dict_table_t*	table,
3009 	ulint		size,
3010 	trx_t*		trx)
3011 {
3012 	ulint	id = table->space;
3013 
3014 	ut_a(!is_system_tablespace(id));
3015 
3016 	/* Invalidate in the buffer pool all pages belonging
3017 	to the tablespace. The buffer pool scan may take long
3018 	time to complete, therefore we release dict_sys->mutex
3019 	and the dict operation lock during the scan and aquire
3020 	it again after the buffer pool scan.*/
3021 
3022 	/* Release the lock on the indexes too. So that
3023 	they won't violate the latch ordering. */
3024 	dict_table_x_unlock_indexes(table);
3025 	row_mysql_unlock_data_dictionary(trx);
3026 	DEBUG_SYNC_C("trunc_table_index_dropped_release_dict_lock");
3027 
3028 	DEBUG_SYNC_C("simulate_buffer_pool_scan");
3029 	buf_LRU_flush_or_remove_pages(id, BUF_REMOVE_ALL_NO_WRITE, 0);
3030 
3031 	row_mysql_lock_data_dictionary(trx);
3032 
3033 	dict_table_x_lock_indexes(table);
3034 
3035 	/* Remove all insert buffer entries for the tablespace */
3036 	ibuf_delete_for_discarded_space(id);
3037 
3038 	mutex_enter(&fil_system->mutex);
3039 
3040 	fil_space_t*	space = fil_space_get_by_id(id);
3041 
3042 	/* The following code must change when InnoDB supports
3043 	multiple datafiles per tablespace. */
3044 	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
3045 
3046 	fil_node_t*	node = UT_LIST_GET_FIRST(space->chain);
3047 
3048 	space->size = node->size = size;
3049 
3050 	mutex_exit(&fil_system->mutex);
3051 
3052 	mtr_t	mtr;
3053 
3054 	mtr_start(&mtr);
3055 	mtr.set_named_space(id);
3056 
3057 	fsp_header_init(id, size, &mtr);
3058 
3059 	mtr_commit(&mtr);
3060 }
3061 
3062 #ifdef UNIV_DEBUG
3063 /** Increase redo skipped count for a tablespace.
3064 @param[in]	id	space id */
3065 void
fil_space_inc_redo_skipped_count(ulint id)3066 fil_space_inc_redo_skipped_count(
3067 	ulint		id)
3068 {
3069 	fil_space_t*	space;
3070 
3071 	mutex_enter(&fil_system->mutex);
3072 
3073 	space = fil_space_get_by_id(id);
3074 
3075 	ut_a(space != NULL);
3076 
3077 	space->redo_skipped_count++;
3078 
3079 	mutex_exit(&fil_system->mutex);
3080 }
3081 
3082 /** Decrease redo skipped count for a tablespace.
3083 @param[in]	id	space id */
3084 void
fil_space_dec_redo_skipped_count(ulint id)3085 fil_space_dec_redo_skipped_count(
3086 	ulint		id)
3087 {
3088 	fil_space_t*	space;
3089 
3090 	mutex_enter(&fil_system->mutex);
3091 
3092 	space = fil_space_get_by_id(id);
3093 
3094 	ut_a(space != NULL);
3095 	ut_a(space->redo_skipped_count > 0);
3096 
3097 	space->redo_skipped_count--;
3098 
3099 	mutex_exit(&fil_system->mutex);
3100 }
3101 
3102 /**
3103 Check whether a single-table tablespace is redo skipped.
3104 @param[in]	id	space id
3105 @return true if redo skipped */
3106 bool
fil_space_is_redo_skipped(ulint id)3107 fil_space_is_redo_skipped(
3108 	ulint		id)
3109 {
3110 	fil_space_t*	space;
3111 	bool		is_redo_skipped;
3112 
3113 	mutex_enter(&fil_system->mutex);
3114 
3115 	space = fil_space_get_by_id(id);
3116 
3117 	ut_a(space != NULL);
3118 
3119 	is_redo_skipped = space->redo_skipped_count > 0;
3120 
3121 	mutex_exit(&fil_system->mutex);
3122 
3123 	return(is_redo_skipped);
3124 }
3125 #endif
3126 
3127 /*******************************************************************//**
3128 Discards a single-table tablespace. The tablespace must be cached in the
3129 memory cache. Discarding is like deleting a tablespace, but
3130 
3131  1. We do not drop the table from the data dictionary;
3132 
3133  2. We remove all insert buffer entries for the tablespace immediately;
3134     in DROP TABLE they are only removed gradually in the background;
3135 
3136  3. Free all the pages in use by the tablespace.
3137 @return DB_SUCCESS or error */
3138 dberr_t
fil_discard_tablespace(ulint id)3139 fil_discard_tablespace(
3140 /*===================*/
3141 	ulint	id)	/*!< in: space id */
3142 {
3143 	dberr_t	err;
3144 
3145 	switch (err = fil_delete_tablespace(id, BUF_REMOVE_ALL_NO_WRITE)) {
3146 	case DB_SUCCESS:
3147 		break;
3148 
3149 	case DB_IO_ERROR:
3150 		ib::warn() << "While deleting tablespace " << id
3151 			<< " in DISCARD TABLESPACE. File rename/delete"
3152 			" failed: " << ut_strerr(err);
3153 		break;
3154 
3155 	case DB_TABLESPACE_NOT_FOUND:
3156 		ib::warn() << "Cannot delete tablespace " << id
3157 			<< " in DISCARD TABLESPACE: " << ut_strerr(err);
3158 		break;
3159 
3160 	default:
3161 		ut_error;
3162 	}
3163 
3164 	/* Remove all insert buffer entries for the tablespace */
3165 
3166 	ibuf_delete_for_discarded_space(id);
3167 
3168 	return(err);
3169 }
3170 #endif /* !UNIV_HOTBACKUP */
3171 
3172 /*******************************************************************//**
3173 Allocates and builds a file name from a path, a table or tablespace name
3174 and a suffix. The string must be freed by caller with ut_free().
3175 @param[in] path NULL or the direcory path or the full path and filename.
3176 @param[in] name NULL if path is full, or Table/Tablespace name
3177 @param[in] suffix NULL or the file extention to use.
3178 @param[in] trim_name true if the last name on the path should be trimmed.
3179 @return own: file name */
3180 char*
fil_make_filepath(const char * path,const char * name,ib_extention ext,bool trim_name)3181 fil_make_filepath(
3182 	const char*	path,
3183 	const char*	name,
3184 	ib_extention	ext,
3185 	bool		trim_name)
3186 {
3187 	/* The path may contain the basename of the file, if so we do not
3188 	need the name.  If the path is NULL, we can use the default path,
3189 	but there needs to be a name. */
3190 	ut_ad(path != NULL || name != NULL);
3191 
3192 	/* If we are going to strip a name off the path, there better be a
3193 	path and a new name to put back on. */
3194 	ut_ad(!trim_name || (path != NULL && name != NULL));
3195 
3196 	if (path == NULL) {
3197 		path = fil_path_to_mysql_datadir;
3198 	}
3199 
3200 	ulint	len		= 0;	/* current length */
3201 	ulint	path_len	= strlen(path);
3202 	ulint	name_len	= (name ? strlen(name) : 0);
3203 	const char* suffix	= dot_ext[ext];
3204 	ulint	suffix_len	= strlen(suffix);
3205 	ulint	full_len	= path_len + 1 + name_len + suffix_len + 1;
3206 
3207 	char*	full_name = static_cast<char*>(ut_malloc_nokey(full_len));
3208 	if (full_name == NULL) {
3209 		return NULL;
3210 	}
3211 
3212 	/* If the name is a relative path, do not prepend "./". */
3213 	if (path[0] == '.'
3214 	    && (path[1] == '\0' || path[1] == OS_PATH_SEPARATOR)
3215 	    && name != NULL && name[0] == '.') {
3216 		path = NULL;
3217 		path_len = 0;
3218 	}
3219 
3220 	if (path != NULL) {
3221 		memcpy(full_name, path, path_len);
3222 		len = path_len;
3223 		full_name[len] = '\0';
3224 		os_normalize_path(full_name);
3225 	}
3226 
3227 	if (trim_name) {
3228 		/* Find the offset of the last DIR separator and set it to
3229 		null in order to strip off the old basename from this path. */
3230 		char* last_dir_sep = strrchr(full_name, OS_PATH_SEPARATOR);
3231 		if (last_dir_sep) {
3232 			last_dir_sep[0] = '\0';
3233 			len = strlen(full_name);
3234 		}
3235 	}
3236 
3237 	if (name != NULL) {
3238 		if (len && full_name[len - 1] != OS_PATH_SEPARATOR) {
3239 			/* Add a DIR separator */
3240 			full_name[len] = OS_PATH_SEPARATOR;
3241 			full_name[++len] = '\0';
3242 		}
3243 
3244 		char*	ptr = &full_name[len];
3245 		memcpy(ptr, name, name_len);
3246 		len += name_len;
3247 		full_name[len] = '\0';
3248 		os_normalize_path(ptr);
3249 	}
3250 
3251 	/* Make sure that the specified suffix is at the end of the filepath
3252 	string provided. This assumes that the suffix starts with '.'.
3253 	If the first char of the suffix is found in the filepath at the same
3254 	length as the suffix from the end, then we will assume that there is
3255 	a previous suffix that needs to be replaced. */
3256 	if (suffix != NULL) {
3257 		/* Need room for the trailing null byte. */
3258 		ut_ad(len < full_len);
3259 
3260 		if ((len > suffix_len)
3261 		   && (full_name[len - suffix_len] == suffix[0])) {
3262 			/* Another suffix exists, make it the one requested. */
3263 			memcpy(&full_name[len - suffix_len], suffix, suffix_len);
3264 
3265 		} else {
3266 			/* No previous suffix, add it. */
3267 			ut_ad(len + suffix_len < full_len);
3268 			memcpy(&full_name[len], suffix, suffix_len);
3269 			full_name[len + suffix_len] = '\0';
3270 		}
3271 	}
3272 
3273 	return(full_name);
3274 }
3275 
3276 /** Test if a tablespace file can be renamed to a new filepath by checking
3277 if that the old filepath exists and the new filepath does not exist.
3278 @param[in]	space_id	tablespace id
3279 @param[in]	old_path	old filepath
3280 @param[in]	new_path	new filepath
3281 @param[in]	is_discarded	whether the tablespace is discarded
3282 @return innodb error code */
3283 dberr_t
fil_rename_tablespace_check(ulint space_id,const char * old_path,const char * new_path,bool is_discarded)3284 fil_rename_tablespace_check(
3285 	ulint		space_id,
3286 	const char*	old_path,
3287 	const char*	new_path,
3288 	bool		is_discarded)
3289 {
3290 	bool	exists = false;
3291 	os_file_type_t	ftype;
3292 
3293 	if (!is_discarded
3294 	    && os_file_status(old_path, &exists, &ftype)
3295 	    && !exists) {
3296 		ib::error() << "Cannot rename '" << old_path
3297 			<< "' to '" << new_path
3298 			<< "' for space ID " << space_id
3299 			<< " because the source file"
3300 			<< " does not exist.";
3301 		return(DB_TABLESPACE_NOT_FOUND);
3302 	}
3303 
3304 	exists = false;
3305 	if (!os_file_status(new_path, &exists, &ftype) || exists) {
3306 		ib::error() << "Cannot rename '" << old_path
3307 			<< "' to '" << new_path
3308 			<< "' for space ID " << space_id
3309 			<< " because the target file exists."
3310 			" Remove the target file and try again.";
3311 		return(DB_TABLESPACE_EXISTS);
3312 	}
3313 
3314 	return(DB_SUCCESS);
3315 }
3316 
/** Rename a single-table tablespace.
The tablespace must exist in the memory cache.
The function first waits, retrying in a loop, until the single data file
of the tablespace has no pending i/o or flushes and is not being extended,
flushes it if needed and closes it. It then renames the file on disk and,
if that worked, switches the cached file name and tablespace name (and the
name hash entry) to the new ones.
@param[in]	id		tablespace identifier
@param[in]	old_path	old file name; only used in the messages
printed here, the file actually renamed is the one cached in the node
@param[in]	new_name	new table name in the
databasename/tablename format
@param[in]	new_path_in	new file name,
or NULL if it is located in the normal data directory
@return true if success */
bool
fil_rename_tablespace(
	ulint		id,
	const char*	old_path,
	const char*	new_name,
	const char*	new_path_in)
{
	bool		sleep		= false;
	bool		flush		= false;
	fil_space_t*	space;
	fil_node_t*	node;
	ulint		count		= 0;
	ut_a(id != 0);

	ut_ad(strchr(new_name, '/') != NULL);
retry:
	/* Each pass through this label is one attempt to find the data
	file idle so that it can be closed. */
	count++;

	/* Print a warning on every 1000th attempt. */
	if (!(count % 1000)) {
		ib::warn() << "Cannot rename file " << old_path
			<< " (space id " << id << "), retried " << count
			<< " times."
			" There are either pending IOs or flushes or"
			" the file is being extended.";
	}

	mutex_enter(&fil_system->mutex);

	space = fil_space_get_by_id(id);

	DBUG_EXECUTE_IF("fil_rename_tablespace_failure_1", space = NULL; );

	if (space == NULL) {
		ib::error() << "Cannot find space id " << id
			<< " in the tablespace memory cache, though the file '"
			<< old_path
			<< "' in a rename operation should have that id.";
		/* Common failure exit; the goto statements below jump
		into this block with fil_system->mutex held. */
func_exit:
		mutex_exit(&fil_system->mutex);
		return(false);
	}

	/* Give up after too many attempts and let i/o proceed again. */
	if (count > 25000) {
		space->stop_ios = false;
		goto func_exit;
	}

	/* The space must be reachable by its current name ... */
	if (space != fil_space_get_by_name(space->name)) {
		ib::error() << "Cannot find " << space->name
			<< " in tablespace memory cache";
		space->stop_ios = false;
		goto func_exit;
	}

	/* ... and the new name must not be taken yet. */
	if (fil_space_get_by_name(new_name)) {
		ib::error() << new_name
			<< " is already in tablespace memory cache";
		space->stop_ios = false;
		goto func_exit;
	}

	/* We temporarily close the .ibd file because we do not trust that
	operating systems can rename an open file. For the closing we have to
	wait until there are no pending i/o's or flushes on the file. */

	space->stop_ios = true;

	/* The following code must change when InnoDB supports
	multiple datafiles per tablespace. */
	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
	node = UT_LIST_GET_FIRST(space->chain);

	if (node->n_pending > 0
	    || node->n_pending_flushes > 0
	    || node->being_extended) {
		/* There are pending i/o's or flushes or the file is
		currently being extended, sleep for a while and
		retry */
		sleep = true;

	} else if (node->modification_counter > node->flush_counter) {
		/* Flush the space */
		sleep = flush = true;

	} else if (node->is_open) {
		/* Close the file */

		fil_node_close_file(node);
	}

	mutex_exit(&fil_system->mutex);

	if (sleep) {
		os_thread_sleep(20000);

		/* The flush is done without holding fil_system->mutex. */
		if (flush) {
			fil_flush(id);
		}

		sleep = flush = false;
		goto retry;
	}

	ut_ad(space->stop_ios);

	/* Build the new names. Without an explicit new path the file name
	is derived from the new table name with the IBD extension. Both
	new strings are allocated here and exactly one pair (old or new)
	is freed at the end of the function. */
	char*	new_file_name = new_path_in == NULL
		? fil_make_filepath(NULL, new_name, IBD, false)
		: mem_strdup(new_path_in);
	char*	old_file_name = node->name;
	char*	new_space_name = mem_strdup(new_name);
	char*	old_space_name = space->name;
	ulint	old_fold = ut_fold_string(old_space_name);
	ulint	new_fold = ut_fold_string(new_space_name);

	ut_ad(strchr(old_file_name, OS_PATH_SEPARATOR) != NULL);
	ut_ad(strchr(new_file_name, OS_PATH_SEPARATOR) != NULL);

#ifndef UNIV_HOTBACKUP
	/* Outside of recovery, write the rename to the redo log first and
	then take the log mutex for the duration of the file rename. */
	if (!recv_recovery_on) {
		mtr_t		mtr;

		mtr.start();
		fil_name_write_rename(
			id, 0, old_file_name, new_file_name, &mtr);
		mtr.commit();
		log_mutex_enter();
	}
#endif /* !UNIV_HOTBACKUP */

	/* log_sys->mutex is above fil_system->mutex in the latching order */
	ut_ad(log_mutex_own());
	mutex_enter(&fil_system->mutex);

	ut_ad(space->name == old_space_name);
	/* We already checked these. */
	ut_ad(space == fil_space_get_by_name(old_space_name));
	ut_ad(!fil_space_get_by_name(new_space_name));
	ut_ad(node->name == old_file_name);

	bool	success;

	/* Debug-only fault injection: jump over the rename to the
	skip_rename label, which is defined by the second
	DBUG_EXECUTE_IF below and forces a failure result. */
	DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2",
			goto skip_rename; );

	success = os_file_rename(
		innodb_data_file_key, old_file_name, new_file_name);

	DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2",
			skip_rename: success = false; );

	ut_ad(node->name == old_file_name);

	if (success) {
		node->name = new_file_name;
	}

#ifndef UNIV_HOTBACKUP
	/* Release the log mutex taken above. */
	if (!recv_recovery_on) {
		log_mutex_exit();
	}
#endif /* !UNIV_HOTBACKUP */

	ut_ad(space->name == old_space_name);

	if (success) {
		/* Re-hash the space under its new name. */
		HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash,
			    old_fold, space);
		space->name = new_space_name;
		HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash,
			    new_fold, space);
	} else {
		/* Because nothing was renamed, we must free the new
		names, not the old ones. */
		old_file_name = new_file_name;
		old_space_name = new_space_name;
	}

	/* Allow i/o on the tablespace again, whatever the outcome. */
	ut_ad(space->stop_ios);
	space->stop_ios = false;
	mutex_exit(&fil_system->mutex);

	/* Free whichever pair of names is no longer referenced. */
	ut_free(old_file_name);
	ut_free(old_space_name);

	return(success);
}
3512 
/** Create a new General or Single-Table tablespace
The data file is created and sized, its first page is initialized with
the space id and flags, written and flushed. After that the tablespace
and its single file node are registered in the tablespace memory cache
and, unless the tablespace is temporary, the creation is written to the
redo log. If a step fails after the file was created, the file is closed
and deleted again before returning.
@param[in]	space_id	Tablespace ID
@param[in]	name		Tablespace name in dbname/tablename format.
For general tablespaces, the 'dbname/' part may be missing.
@param[in]	path		Path and filename of the datafile to create.
@param[in]	flags		Tablespace flags
@param[in]	size		Initial size of the tablespace file in pages,
must be >= FIL_IBD_FILE_INITIAL_SIZE
@return DB_SUCCESS or error code */
dberr_t
fil_ibd_create(
	ulint		space_id,
	const char*	name,
	const char*	path,
	ulint		flags,
	ulint		size)
{
	pfs_os_file_t	file;
	dberr_t		err;
	byte*		buf2;
	byte*		page;
	bool		success;
	bool		is_temp = FSP_FLAGS_GET_TEMPORARY(flags);
	bool		has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags);
	bool		has_shared_space = FSP_FLAGS_GET_SHARED(flags);
	fil_space_t*	space = NULL;

	ut_ad(!is_system_tablespace(space_id));
	ut_ad(!srv_read_only_mode);
	ut_a(space_id < SRV_LOG_SPACE_FIRST_ID);
	ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE);
	ut_a(fsp_flags_is_valid(flags));

	/* Create the subdirectories in the path, if they are
	not there already. */
	if (!has_shared_space) {
		err = os_file_create_subdirs_if_needed(path);
		if (err != DB_SUCCESS) {
			return(err);
		}
	}

	file = os_file_create(
		innodb_data_file_key, path,
		OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT,
		OS_FILE_NORMAL,
		OS_DATA_FILE,
		srv_read_only_mode,
		&success);

	if (!success) {
		/* The following call will print an error message */
		ulint	error = os_file_get_last_error(true);

		ib::error() << "Cannot create file '" << path << "'";

		if (error == OS_FILE_ALREADY_EXISTS) {
			ib::error() << "The file '" << path << "'"
				" already exists though the"
				" corresponding table did not exist"
				" in the InnoDB data dictionary."
				" Have you moved InnoDB .ibd files"
				" around without using the SQL commands"
				" DISCARD TABLESPACE and IMPORT TABLESPACE,"
				" or did mysqld crash in the middle of"
				" CREATE TABLE?"
				" You can resolve the problem by removing"
				" the file '" << path
				<< "' under the 'datadir' of MySQL.";

			return(DB_TABLESPACE_EXISTS);
		}

		if (error == OS_FILE_DISK_FULL) {
			return(DB_OUT_OF_FILE_SPACE);
		}

		return(DB_ERROR);
	}

	/* Set in every branch below; passed on to the file node. */
	bool	atomic_write;

	/* Bring the new file to its initial size of 'size' pages. */
#if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
	if (fil_fusionio_enable_atomic_write(file)) {

		/* This is required by FusionIO HW/Firmware */
		int     ret = posix_fallocate(file.m_file, 0, size * UNIV_PAGE_SIZE);

		if (ret != 0) {

			ib::error() <<
				"posix_fallocate(): Failed to preallocate"
				" data for file " << path
				<< ", desired size "
				<< size * UNIV_PAGE_SIZE
				<< " Operating system error number " << ret
				<< ". Check"
				" that the disk is not full or a disk quota"
				" exceeded. Make sure the file system supports"
				" this function. Some operating system error"
				" numbers are described at " REFMAN
				" operating-system-error-codes.html";

			success = false;
		} else {
			success = true;
		}

		atomic_write = true;
	} else {
		atomic_write = false;

		success = os_file_set_size(
			path, file, size * UNIV_PAGE_SIZE, srv_read_only_mode);
	}
#else
	atomic_write = false;

	success = os_file_set_size(
		path, file, size * UNIV_PAGE_SIZE, srv_read_only_mode);

#endif /* !NO_FALLOCATE && UNIV_LINUX */

	/* Any failure to size the file is reported as lack of space;
	the partially created file is removed. */
	if (!success) {
		os_file_close(file);
		os_file_delete(innodb_data_file_key, path);
		return(DB_OUT_OF_FILE_SPACE);
	}

	/* Note: We are actually punching a hole, previous contents will
	be lost after this call, if it succeeds. In this case the file
	should be full of NULs. */

	bool	punch_hole = os_is_sparse_file_supported(path, file);

	if (punch_hole) {

		/* A failed hole punch is not an error for the create;
		it only turns punch_hole off for the file node. */
		dberr_t	punch_err;
		punch_err = os_file_punch_hole(file.m_file, 0, size * UNIV_PAGE_SIZE);
		if (punch_err != DB_SUCCESS) {
			punch_hole = false;
		}
	}

	/* printf("Creating tablespace %s id %lu\n", path, space_id); */

	/* We have to write the space id to the file immediately and flush the
	file to disk. This is because in crash recovery we must be aware what
	tablespaces exist and what are their space id's, so that we can apply
	the log records to the right file. It may take quite a while until
	buffer pool flush algorithms write anything to the file and flush it to
	disk. If we would not write here anything, the file would be filled
	with zeros from the call of os_file_set_size(), until a buffer pool
	flush would write to it. */

	/* Three pages are allocated: the start is aligned up to a page
	boundary, 'page' is the uncompressed first page and the area at
	page + UNIV_PAGE_SIZE is used for the compressed page below. */
	buf2 = static_cast<byte*>(ut_malloc_nokey(3 * UNIV_PAGE_SIZE));
	/* Align the memory for file i/o if we might have O_DIRECT set */
	page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));

	memset(page, '\0', UNIV_PAGE_SIZE);
#ifndef UNIV_HOTBACKUP
	/* Add the UNIV_PAGE_SIZE to the table flags and write them to the
	tablespace header. */
	flags = fsp_flags_set_page_size(flags, univ_page_size);
#endif /* !UNIV_HOTBACKUP */
	fsp_header_init_fields(page, space_id, flags);
	mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id);

	const page_size_t	page_size(flags);
	IORequest		request(IORequest::WRITE);

	if (!page_size.is_compressed()) {

		/* Uncompressed tablespace: write 'page' itself. */
		buf_flush_init_for_writing(
			NULL, page, NULL, 0,
			fsp_is_checksum_disabled(space_id));

		err = os_file_write(
			request, path, file, page, 0, page_size.physical());

		ut_ad(err != DB_IO_NO_PUNCH_HOLE);

	} else {
		/* Compressed tablespace: set up a page_zip descriptor on
		the second page of the buffer and write that instead. */
		page_zip_des_t	page_zip;

		page_zip_set_size(&page_zip, page_size.physical());
		page_zip.data = page + UNIV_PAGE_SIZE;
#ifdef UNIV_DEBUG
		page_zip.m_start =
#endif /* UNIV_DEBUG */
			page_zip.m_end = page_zip.m_nonempty =
			page_zip.n_blobs = 0;

		buf_flush_init_for_writing(
			NULL, page, &page_zip, 0,
			fsp_is_checksum_disabled(space_id));

		err = os_file_write(
			request, path, file, page_zip.data, 0,
			page_size.physical());

		ut_a(err != DB_IO_NO_PUNCH_HOLE);

		punch_hole = false;
	}

	ut_free(buf2);

	if (err != DB_SUCCESS) {

		ib::error()
			<< "Could not write the first page to"
			<< " tablespace '" << path << "'";

		os_file_close(file);
		os_file_delete(innodb_data_file_key, path);

		return(DB_ERROR);
	}

	success = os_file_flush(file);

	if (!success) {
		ib::error() << "File flush of tablespace '"
			<< path << "' failed";
		os_file_close(file);
		os_file_delete(innodb_data_file_key, path);
		return(DB_ERROR);
	}

	/* MEB creates isl files during copy-back, hence they
	should not be created during apply log operation. */
#ifndef UNIV_HOTBACKUP
	if (has_data_dir || has_shared_space) {
		/* Make the ISL file if the IBD file is not
		in the default location. */
		err = RemoteDatafile::create_link_file(name, path,
						       has_shared_space);
		if (err != DB_SUCCESS) {
			os_file_close(file);
			os_file_delete(innodb_data_file_key, path);
			return(err);
		}
	}
#endif /* !UNIV_HOTBACKUP */
	/* Register the tablespace and its data file in the memory
	cache. From here on failures go through error_exit_1. */
	space = fil_space_create(name, space_id, flags, is_temp
				 ? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE);

	DEBUG_SYNC_C("fil_ibd_created_space");

	if (!fil_node_create_low(
			path, size, space, false, punch_hole, atomic_write)) {

		err = DB_ERROR;
		goto error_exit_1;
	}

	/* For encryption tablespace, initial encryption information. */
	if (FSP_FLAGS_GET_ENCRYPTION(space->flags)) {
		/* The result is only checked in debug builds; err is
		overwritten with DB_SUCCESS further down. */
		err = fil_set_encryption(space->id,
					 Encryption::AES,
					 NULL,
					 NULL);
		ut_ad(err == DB_SUCCESS);
	}

#ifndef UNIV_HOTBACKUP
	if (!is_temp) {
		mtr_t			mtr;
		/* NOTE: this 'file' is the first file node of the space;
		it shadows the pfs_os_file_t 'file' handle declared at
		function scope within this block. */
		const fil_node_t*	file = UT_LIST_GET_FIRST(space->chain);

		mtr_start(&mtr);
		fil_op_write_log(
			MLOG_FILE_CREATE2, space_id, 0, file->name,
			NULL, space->flags, &mtr);
		fil_name_write(space, 0, file, &mtr);
		mtr_commit(&mtr);
	}
#endif /* !UNIV_HOTBACKUP */
	err = DB_SUCCESS;

	/* Error code is set.  Cleanup the various variables used.
	These labels reflect the order in which variables are assigned or
	actions are done. */
error_exit_1:
	/* This code is reached on success as well; only the file close
	is unconditional. */
	if (err != DB_SUCCESS && (has_data_dir || has_shared_space)) {
		RemoteDatafile::delete_link_file(name);
	}

	os_file_close(file);
	if (err != DB_SUCCESS) {
		os_file_delete(innodb_data_file_key, path);
	}

	return(err);
}
3809 
3810 #ifndef UNIV_HOTBACKUP
3811 /** Try to open a single-table tablespace and optionally check that the
3812 space id in it is correct. If this does not succeed, print an error message
3813 to the .err log. This function is used to open a tablespace when we start
3814 mysqld after the dictionary has been booted, and also in IMPORT TABLESPACE.
3815 
3816 NOTE that we assume this operation is used either at the database startup
3817 or under the protection of the dictionary mutex, so that two users cannot
3818 race here. This operation does not leave the file associated with the
3819 tablespace open, but closes it after we have looked at the space id in it.
3820 
3821 If the validate boolean is set, we read the first page of the file and
3822 check that the space id in the file is what we expect. We assume that
3823 this function runs much faster if no check is made, since accessing the
3824 file inode probably is much faster (the OS caches them) than accessing
3825 the first page of the file.  This boolean may be initially false, but if
3826 a remote tablespace is found it will be changed to true.
3827 
3828 If the fix_dict boolean is set, then it is safe to use an internal SQL
3829 statement to update the dictionary tables if they are incorrect.
3830 
3831 @param[in]	validate	true if we should validate the tablespace
3832 @param[in]	fix_dict	true if the dictionary is available to be fixed
3833 @param[in]	purpose		FIL_TYPE_TABLESPACE or FIL_TYPE_TEMPORARY
3834 @param[in]	id		tablespace ID
3835 @param[in]	flags		tablespace flags
3836 @param[in]	space_name	tablespace name of the datafile
3837 If file-per-table, it is the table name in the databasename/tablename format
3838 @param[in]	path_in		expected filepath, usually read from dictionary
3839 @return DB_SUCCESS or error code */
3840 dberr_t
fil_ibd_open(bool validate,bool fix_dict,fil_type_t purpose,ulint id,ulint flags,const char * space_name,const char * path_in)3841 fil_ibd_open(
3842 	bool		validate,
3843 	bool		fix_dict,
3844 	fil_type_t	purpose,
3845 	ulint		id,
3846 	ulint		flags,
3847 	const char*	space_name,
3848 	const char*	path_in)
3849 {
3850 	dberr_t		err = DB_SUCCESS;
3851 	bool		dict_filepath_same_as_default = false;
3852 	bool		link_file_found = false;
3853 	bool		link_file_is_bad = false;
3854 	bool		is_shared = FSP_FLAGS_GET_SHARED(flags);
3855 	bool		is_encrypted = FSP_FLAGS_GET_ENCRYPTION(flags);
3856 	Datafile	df_default;	/* default location */
3857 	Datafile	df_dict;	/* dictionary location */
3858 	RemoteDatafile	df_remote;	/* remote location */
3859 	ulint		tablespaces_found = 0;
3860 	ulint		valid_tablespaces_found = 0;
3861 	bool		for_import = (purpose == FIL_TYPE_IMPORT);
3862 
3863 	ut_ad(!fix_dict || rw_lock_own(dict_operation_lock, RW_LOCK_X));
3864 
3865 	ut_ad(!fix_dict || mutex_own(&dict_sys->mutex));
3866 	ut_ad(!fix_dict || !srv_read_only_mode);
3867 	ut_ad(!fix_dict || srv_log_file_size != 0);
3868 	ut_ad(fil_type_is_data(purpose));
3869 
3870 	if (!fsp_flags_is_valid(flags)) {
3871 		return(DB_CORRUPTION);
3872 	}
3873 
3874 	df_default.init(space_name, flags);
3875 	df_dict.init(space_name, flags);
3876 	df_remote.init(space_name, flags);
3877 
3878 	/* Discover the correct file by looking in three possible locations
3879 	while avoiding unecessary effort. */
3880 
3881 	if (is_shared) {
3882 		/* Shared tablespaces will have a path_in since the filename
3883 		is not generated from the tablespace name. Use the basename
3884 		from this path_in with the default datadir as a filepath to
3885 		the default location */
3886 		ut_a(path_in);
3887 		const char*	sep = strrchr(path_in, OS_PATH_SEPARATOR);
3888 		const char*	basename = (sep == NULL) ? path_in : &sep[1];
3889 		df_default.make_filepath(NULL, basename, IBD);
3890 
3891 		/* Always validate shared tablespaces. */
3892 		validate = true;
3893 
3894 		/* Set the ISL filepath in the default location. */
3895 		df_remote.set_link_filepath(path_in);
3896 	} else {
3897 		/* We will always look for an ibd in the default location. */
3898 		df_default.make_filepath(NULL, space_name, IBD);
3899 	}
3900 
3901 	/* Look for a filepath embedded in an ISL where the default file
3902 	would be. */
3903 	if (df_remote.open_read_only(true) == DB_SUCCESS) {
3904 		ut_ad(df_remote.is_open());
3905 
3906 		/* Always validate a file opened from an ISL pointer */
3907 		validate = true;
3908 		++tablespaces_found;
3909 		link_file_found = true;
3910 	} else if (df_remote.filepath() != NULL) {
3911 		/* An ISL file was found but contained a bad filepath in it.
3912 		Better validate anything we do find. */
3913 		validate = true;
3914 	}
3915 
3916 	/* Attempt to open the tablespace at the dictionary filepath. */
3917 	if (path_in) {
3918 		if (df_default.same_filepath_as(path_in)) {
3919 			dict_filepath_same_as_default = true;
3920 		} else {
3921 			/* Dict path is not the default path. Always validate
3922 			remote files. If default is opened, it was moved. */
3923 			validate = true;
3924 
3925 			df_dict.set_filepath(path_in);
3926 			if (df_dict.open_read_only(true) == DB_SUCCESS) {
3927 				ut_ad(df_dict.is_open());
3928 				++tablespaces_found;
3929 			}
3930 		}
3931 	}
3932 
3933 	/* Always look for a file at the default location. But don't log
3934 	an error if the tablespace is already open in remote or dict. */
3935 	ut_a(df_default.filepath());
3936 	const bool	strict = (tablespaces_found == 0);
3937 	if (df_default.open_read_only(strict) == DB_SUCCESS) {
3938 		ut_ad(df_default.is_open());
3939 		++tablespaces_found;
3940 	}
3941 
3942 	/* Check if multiple locations point to the same file. */
3943 	if (tablespaces_found > 1 && df_default.same_as(df_remote)) {
3944 		/* A link file was found with the default path in it.
3945 		Use the default path and delete the link file. */
3946 		--tablespaces_found;
3947 		df_remote.delete_link_file();
3948 		df_remote.close();
3949 	}
3950 	if (tablespaces_found > 1 && df_default.same_as(df_dict)) {
3951 		--tablespaces_found;
3952 		df_dict.close();
3953 	}
3954 	if (tablespaces_found > 1 && df_remote.same_as(df_dict)) {
3955 		--tablespaces_found;
3956 		df_dict.close();
3957 	}
3958 
3959 	bool	atomic_write;
3960 
3961 #if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
3962 	if (!srv_use_doublewrite_buf && df_default.is_open()) {
3963 
3964 		atomic_write = fil_fusionio_enable_atomic_write(
3965 			df_default.handle());
3966 
3967 	} else {
3968 		atomic_write = false;
3969 	}
3970 #else
3971 	atomic_write = false;
3972 #endif /* !NO_FALLOCATE && UNIV_LINUX */
3973 
3974 	/*  We have now checked all possible tablespace locations and
3975 	have a count of how many unique files we found.  If things are
3976 	normal, we only found 1. */
3977 	/* For encrypted tablespace, we need to check the
3978 	encryption in header of first page. */
3979 	if (!validate && tablespaces_found == 1 && !is_encrypted) {
3980 
3981 		goto skip_validate;
3982 	}
3983 
3984 	/* Read and validate the first page of these three tablespace
3985 	locations, if found. */
3986 	valid_tablespaces_found +=
3987 		(df_remote.validate_to_dd(id, flags, for_import)
3988 			== DB_SUCCESS) ? 1 : 0;
3989 
3990 	valid_tablespaces_found +=
3991 		(df_default.validate_to_dd(id, flags, for_import)
3992 			== DB_SUCCESS) ? 1 : 0;
3993 
3994 	valid_tablespaces_found +=
3995 		(df_dict.validate_to_dd(id, flags, for_import)
3996 			== DB_SUCCESS) ? 1 : 0;
3997 
3998 	/* Make sense of these three possible locations.
3999 	First, bail out if no tablespace files were found. */
4000 	if (valid_tablespaces_found == 0) {
4001 		if (!is_encrypted) {
4002 			/* The following call prints an error message.
4003 			For encrypted tablespace we skip print, since it should
4004 			be keyring plugin issues. */
4005 			os_file_get_last_error(true);
4006 			ib::error() << "Could not find a valid tablespace file for `"
4007 				<< space_name << "`. " << TROUBLESHOOT_DATADICT_MSG;
4008 		}
4009 
4010 		return(DB_CORRUPTION);
4011 	}
4012 
4013 	if (!validate && !is_encrypted) {
4014 		return(DB_SUCCESS);
4015 	}
4016 
4017 	if (validate && is_encrypted && fil_space_get(id)) {
4018 		return(DB_SUCCESS);
4019 	}
4020 
4021 	/* Do not open any tablespaces if more than one tablespace with
4022 	the correct space ID and flags were found. */
4023 	if (tablespaces_found > 1) {
4024 		ib::error() << "A tablespace for `" << space_name
4025 			<< "` has been found in multiple places;";
4026 
4027 		if (df_default.is_open()) {
4028 			ib::error() << "Default location: "
4029 				<< df_default.filepath()
4030 				<< ", Space ID=" << df_default.space_id()
4031 				<< ", Flags=" << df_default.flags();
4032 		}
4033 		if (df_remote.is_open()) {
4034 			ib::error() << "Remote location: "
4035 				<< df_remote.filepath()
4036 				<< ", Space ID=" << df_remote.space_id()
4037 				<< ", Flags=" << df_remote.flags();
4038 		}
4039 		if (df_dict.is_open()) {
4040 			ib::error() << "Dictionary location: "
4041 				<< df_dict.filepath()
4042 				<< ", Space ID=" << df_dict.space_id()
4043 				<< ", Flags=" << df_dict.flags();
4044 		}
4045 
4046 		/* Force-recovery will allow some tablespaces to be
4047 		skipped by REDO if there was more than one file found.
4048 		Unlike during the REDO phase of recovery, we now know
4049 		if the tablespace is valid according to the dictionary,
4050 		which was not available then. So if we did not force
4051 		recovery and there is only one good tablespace, ignore
4052 		any bad tablespaces. */
4053 		if (valid_tablespaces_found > 1 || srv_force_recovery > 0) {
4054 			ib::error() << "Will not open tablespace `"
4055 				<< space_name << "`";
4056 
4057 			/* If the file is not open it cannot be valid. */
4058 			ut_ad(df_default.is_open() || !df_default.is_valid());
4059 			ut_ad(df_dict.is_open()    || !df_dict.is_valid());
4060 			ut_ad(df_remote.is_open()  || !df_remote.is_valid());
4061 
4062 			/* Having established that, this is an easy way to
4063 			look for corrupted data files. */
4064 			if (df_default.is_open() != df_default.is_valid()
4065 			    || df_dict.is_open() != df_dict.is_valid()
4066 			    || df_remote.is_open() != df_remote.is_valid()) {
4067 				return(DB_CORRUPTION);
4068 			}
4069 			return(DB_ERROR);
4070 		}
4071 
4072 		/* There is only one valid tablespace found and we did
4073 		not use srv_force_recovery during REDO.  Use this one
4074 		tablespace and clean up invalid tablespace pointers */
4075 		if (df_default.is_open() && !df_default.is_valid()) {
4076 			df_default.close();
4077 			tablespaces_found--;
4078 		}
4079 		if (df_dict.is_open() && !df_dict.is_valid()) {
4080 			df_dict.close();
4081 			/* Leave dict.filepath so that SYS_DATAFILES
4082 			can be corrected below. */
4083 			tablespaces_found--;
4084 		}
4085 		if (df_remote.is_open() && !df_remote.is_valid()) {
4086 			df_remote.close();
4087 			tablespaces_found--;
4088 			link_file_is_bad = true;
4089 		}
4090 	}
4091 
4092 	/* At this point, there should be only one filepath. */
4093 	ut_a(tablespaces_found == 1);
4094 	ut_a(valid_tablespaces_found == 1);
4095 
4096 	/* Only fix the dictionary at startup when there is only one thread.
4097 	Calls to dict_load_table() can be done while holding other latches. */
4098 	if (!fix_dict) {
4099 		goto skip_validate;
4100 	}
4101 
4102 	/* We may need to update what is stored in SYS_DATAFILES or
4103 	SYS_TABLESPACES or adjust the link file.  Since a failure to
4104 	update SYS_TABLESPACES or SYS_DATAFILES does not prevent opening
4105 	and using the tablespace either this time or the next, we do not
4106 	check the return code or fail to open the tablespace. But if it
4107 	fails, dict_update_filepath() will issue a warning to the log. */
4108 	if (df_dict.filepath()) {
4109 		ut_ad(path_in != NULL);
4110 		ut_ad(df_dict.same_filepath_as(path_in));
4111 
4112 		if (df_remote.is_open()) {
4113 			if (!df_remote.same_filepath_as(path_in)) {
4114 				dict_update_filepath(id, df_remote.filepath());
4115 			}
4116 
4117 		} else if (df_default.is_open()) {
4118 			ut_ad(!dict_filepath_same_as_default);
4119 			dict_update_filepath(id, df_default.filepath());
4120 			if (link_file_is_bad) {
4121 				RemoteDatafile::delete_link_file(space_name);
4122 			}
4123 
4124 		} else if (!is_shared
4125 			   && (!link_file_found || link_file_is_bad)) {
4126 			ut_ad(df_dict.is_open());
4127 			/* Fix the link file if we got our filepath
4128 			from the dictionary but a link file did not
4129 			exist or it did not point to a valid file. */
4130 			RemoteDatafile::delete_link_file(space_name);
4131 			RemoteDatafile::create_link_file(
4132 				space_name, df_dict.filepath());
4133 		}
4134 
4135 	} else if (df_remote.is_open()) {
4136 		if (dict_filepath_same_as_default) {
4137 			dict_update_filepath(id, df_remote.filepath());
4138 
4139 		} else if (path_in == NULL) {
4140 			/* SYS_DATAFILES record for this space ID
4141 			was not found. */
4142 			dict_replace_tablespace_and_filepath(
4143 				id, space_name, df_remote.filepath(), flags);
4144 		}
4145 
4146 	} else if (df_default.is_open()) {
4147 		/* We opened the tablespace in the default location.
4148 		SYS_DATAFILES.PATH needs to be updated if it is different
4149 		from this default path or if the SYS_DATAFILES.PATH was not
4150 		supplied and it should have been. Also update the dictionary
4151 		if we found an ISL file (since !df_remote.is_open).  Since
4152 		path_in is not suppled for file-per-table, we must assume
4153 		that it matched the ISL. */
4154 		if ((path_in != NULL && !dict_filepath_same_as_default)
4155 		    || (path_in == NULL
4156 		        && (DICT_TF_HAS_DATA_DIR(flags)
4157 		            || DICT_TF_HAS_SHARED_SPACE(flags)))
4158 		    || df_remote.filepath() != NULL) {
4159 			dict_replace_tablespace_and_filepath(
4160 				id, space_name, df_default.filepath(), flags);
4161 		}
4162 	}
4163 
4164 skip_validate:
4165 	if (err == DB_SUCCESS) {
4166 		fil_space_t*	space = fil_space_create(
4167 			space_name, id, flags, purpose);
4168 
4169 		/* We do not measure the size of the file, that is why
4170 		we pass the 0 below */
4171 
4172 		if (fil_node_create_low(
4173 			    df_remote.is_open() ? df_remote.filepath() :
4174 			    df_dict.is_open() ? df_dict.filepath() :
4175 			    df_default.filepath(), 0, space, false,
4176 			    true, atomic_write) == NULL) {
4177 
4178 			err = DB_ERROR;
4179 		}
4180 
4181 		/* For encryption tablespace, initialize encryption
4182 		information.*/
4183 		if (err == DB_SUCCESS && is_encrypted && !for_import) {
4184 			Datafile& df_current = df_remote.is_open() ?
4185 				df_remote: df_dict.is_open() ?
4186 				df_dict : df_default;
4187 
4188 			byte*	key = df_current.m_encryption_key;
4189 			byte*	iv = df_current.m_encryption_iv;
4190 			ut_ad(key && iv);
4191 
4192 			err = fil_set_encryption(space->id, Encryption::AES,
4193 						 key, iv);
4194 			ut_ad(err == DB_SUCCESS);
4195 		}
4196 	}
4197 
4198 	return(err);
4199 }
4200 #endif /* !UNIV_HOTBACKUP */
4201 
4202 #ifdef UNIV_HOTBACKUP
4203 /*******************************************************************//**
4204 Allocates a file name for an old version of a single-table tablespace.
4205 The string must be freed by caller with ut_free()!
4206 @return own: file name */
4207 static
4208 char*
fil_make_ibbackup_old_name(const char * name)4209 fil_make_ibbackup_old_name(
4210 /*=======================*/
4211 	const char*	name)		/*!< in: original file name */
4212 {
4213 	static const char	suffix[] = "_ibbackup_old_vers_";
4214 	char*			path;
4215 	ulint			len = strlen(name);
4216 
4217 	path = static_cast<char*>(ut_malloc_nokey(len + 15 + sizeof(suffix)));
4218 
4219 	memcpy(path, name, len);
4220 	memcpy(path + len, suffix, sizeof(suffix) - 1);
4221 	ut_sprintf_timestamp_without_extra_chars(
4222 		path + len + sizeof(suffix) - 1);
4223 	return(path);
4224 }
4225 #endif /* UNIV_HOTBACKUP */
4226 
4227 /** Looks for a pre-existing fil_space_t with the given tablespace ID
4228 and, if found, returns the name and filepath in newly allocated buffers
4229 that the caller must free.
4230 @param[in]	space_id	The tablespace ID to search for.
4231 @param[out]	name		Name of the tablespace found.
4232 @param[out]	filepath	The filepath of the first datafile for the
4233 tablespace.
4234 @return true if tablespace is found, false if not. */
4235 bool
fil_space_read_name_and_filepath(ulint space_id,char ** name,char ** filepath)4236 fil_space_read_name_and_filepath(
4237 	ulint	space_id,
4238 	char**	name,
4239 	char**	filepath)
4240 {
4241 	bool	success = false;
4242 	*name = NULL;
4243 	*filepath = NULL;
4244 
4245 	mutex_enter(&fil_system->mutex);
4246 
4247 	fil_space_t*	space = fil_space_get_by_id(space_id);
4248 
4249 	if (space != NULL) {
4250 		*name = mem_strdup(space->name);
4251 
4252 		fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
4253 		*filepath = mem_strdup(node->name);
4254 
4255 		success = true;
4256 	}
4257 
4258 	mutex_exit(&fil_system->mutex);
4259 
4260 	return(success);
4261 }
4262 
4263 /** Convert a file name to a tablespace name.
4264 @param[in]	filename	directory/databasename/tablename.ibd
4265 @return database/tablename string, to be freed with ut_free() */
4266 char*
fil_path_to_space_name(const char * filename)4267 fil_path_to_space_name(
4268 	const char*	filename)
4269 {
4270 	/* Strip the file name prefix and suffix, leaving
4271 	only databasename/tablename. */
4272 	ulint		filename_len	= strlen(filename);
4273 	const char*	end		= filename + filename_len;
4274 #ifdef HAVE_MEMRCHR
4275 	const char*	tablename	= 1 + static_cast<const char*>(
4276 		memrchr(filename, OS_PATH_SEPARATOR,
4277 			filename_len));
4278 	const char*	dbname		= 1 + static_cast<const char*>(
4279 		memrchr(filename, OS_PATH_SEPARATOR,
4280 			tablename - filename - 1));
4281 #else /* HAVE_MEMRCHR */
4282 	const char*	tablename	= filename;
4283 	const char*	dbname		= NULL;
4284 
4285 	while (const char* t = static_cast<const char*>(
4286 		       memchr(tablename, OS_PATH_SEPARATOR,
4287 			      end - tablename))) {
4288 		dbname = tablename;
4289 		tablename = t + 1;
4290 	}
4291 #endif /* HAVE_MEMRCHR */
4292 
4293 	ut_ad(dbname != NULL);
4294 	ut_ad(tablename > dbname);
4295 	ut_ad(tablename < end);
4296 	ut_ad(end - tablename > 4);
4297 	ut_ad(memcmp(end - 4, DOT_IBD, 4) == 0);
4298 
4299 	char*	name = mem_strdupl(dbname, end - dbname - 4);
4300 
4301 	ut_ad(name[tablename - dbname - 1] == OS_PATH_SEPARATOR);
4302 #if OS_PATH_SEPARATOR != '/'
4303 	/* space->name uses '/', not OS_PATH_SEPARATOR. */
4304 	name[tablename - dbname - 1] = '/';
4305 #endif
4306 
4307 	return(name);
4308 }
4309 
4310 /** Discover the correct IBD file to open given a remote or missing
4311 filepath from the REDO log.  MEB and administrators can move a crashed
4312 database to another location on the same machine and try to recover it.
4313 Remote IBD files might be moved as well to the new location.
4314     The problem with this is that the REDO log contains the old location
4315 which may be still accessible.  During recovery, if files are found in
4316 both locations, we can chose on based on these priorities;
4317 1. Default location
4318 2. ISL location
4319 3. REDO location
4320 @param[in]	space_id	tablespace ID
4321 @param[in]	df		Datafile object with path from redo
4322 @return true if a valid datafile was found, false if not */
4323 bool
fil_ibd_discover(ulint space_id,Datafile & df)4324 fil_ibd_discover(
4325 	ulint		space_id,
4326 	Datafile&	df)
4327 {
4328 	Datafile	df_def_gen;	/* default general datafile */
4329 	Datafile	df_def_per;	/* default file-per-table datafile */
4330 	RemoteDatafile	df_rem_gen;	/* remote general datafile*/
4331 	RemoteDatafile	df_rem_per;	/* remote file-per-table datafile */
4332 
4333 	/* Look for the datafile in the default location. If it is
4334 	a general tablespace, it will be in the datadir. */
4335 	const char*	filename = df.filepath();
4336 	const char*	basename = base_name(filename);
4337 	df_def_gen.init(basename, 0);
4338 	df_def_gen.make_filepath(NULL, basename, IBD);
4339 	if (df_def_gen.open_read_only(false) == DB_SUCCESS
4340 	    && df_def_gen.validate_for_recovery() == DB_SUCCESS
4341 	    && df_def_gen.space_id() == space_id) {
4342 		df.set_filepath(df_def_gen.filepath());
4343 		df.open_read_only(false);
4344 		return(true);
4345 	}
4346 
4347 	/* If this datafile is file-per-table it will have a schema dir. */
4348 	ulint		sep_found = 0;
4349 	const char*	db = basename;
4350 	for (; db > filename && sep_found < 2; db--) {
4351 		if (db[0] == OS_PATH_SEPARATOR) {
4352 			sep_found++;
4353 		}
4354 	}
4355 	if (sep_found == 2) {
4356 		db += 2;
4357 		df_def_per.init(db, 0);
4358 		df_def_per.make_filepath(NULL, db, IBD);
4359 		if (df_def_per.open_read_only(false) == DB_SUCCESS
4360 		    && df_def_per.validate_for_recovery() == DB_SUCCESS
4361 		    && df_def_per.space_id() == space_id) {
4362 			df.set_filepath(df_def_per.filepath());
4363 			df.open_read_only(false);
4364 			return(true);
4365 		}
4366 	}
4367 
4368 	/* Did not find a general or file-per-table datafile in the
4369 	default location.  Look for a remote general tablespace. */
4370 	df_rem_gen.set_name(basename);
4371 	if (df_rem_gen.open_link_file() == DB_SUCCESS) {
4372 
4373 		/* An ISL file was found with contents. */
4374 		if (df_rem_gen.open_read_only(false) != DB_SUCCESS
4375 		    || df_rem_gen.validate_for_recovery() != DB_SUCCESS) {
4376 
4377 			/* Assume that this ISL file is intended to be used.
4378 			Do not continue looking for another if this file
4379 			cannot be opened or is not a valid IBD file. */
4380 			ib::error() << "ISL file '"
4381 				<< df_rem_gen.link_filepath()
4382 				<< "' was found but the linked file '"
4383 				<< df_rem_gen.filepath()
4384 				<< "' could not be opened or is not correct.";
4385 			return(false);
4386 		}
4387 
4388 		/* Use this file if it has the space_id from the MLOG
4389 		record. */
4390 		if (df_rem_gen.space_id() == space_id) {
4391 			df.set_filepath(df_rem_gen.filepath());
4392 			df.open_read_only(false);
4393 			return(true);
4394 		}
4395 
4396 		/* Since old MLOG records can use the same basename in
4397 		multiple CREATE/DROP sequences, this ISL file could be
4398 		pointing to a later version of this basename.ibd file
4399 		which has a different space_id. Keep looking. */
4400 	}
4401 
4402 	/* Look for a remote file-per-table tablespace. */
4403 	if (sep_found == 2) {
4404 		df_rem_per.set_name(db);
4405 		if (df_rem_per.open_link_file() == DB_SUCCESS) {
4406 
4407 			/* An ISL file was found with contents. */
4408 			if (df_rem_per.open_read_only(false) != DB_SUCCESS
4409 				|| df_rem_per.validate_for_recovery()
4410 				   != DB_SUCCESS) {
4411 
4412 				/* Assume that this ISL file is intended to
4413 				be used. Do not continue looking for another
4414 				if this file cannot be opened or is not
4415 				a valid IBD file. */
4416 				ib::error() << "ISL file '"
4417 					<< df_rem_per.link_filepath()
4418 					<< "' was found but the linked file '"
4419 					<< df_rem_per.filepath()
4420 					<< "' could not be opened or is"
4421 					" not correct.";
4422 				return(false);
4423 			}
4424 
4425 			/* Use this file if it has the space_id from the
4426 			MLOG record. */
4427 			if (df_rem_per.space_id() == space_id) {
4428 				df.set_filepath(df_rem_per.filepath());
4429 				df.open_read_only(false);
4430 				return(true);
4431 			}
4432 
4433 			/* Since old MLOG records can use the same basename
4434 			in multiple CREATE/DROP TABLE sequences, this ISL
4435 			file could be pointing to a later version of this
4436 			basename.ibd file which has a different space_id.
4437 			Keep looking. */
4438 		}
4439 	}
4440 
4441 	/* No ISL files were found in the default location. Use the location
4442 	given in the redo log. */
4443 	if (df.open_read_only(false) == DB_SUCCESS
4444 	    && df.validate_for_recovery() == DB_SUCCESS
4445 	    && df.space_id() == space_id) {
4446 		return(true);
4447 	}
4448 
4449 	/* A datafile was not discovered for the filename given. */
4450 	return(false);
4451 }
4452 
4453 /** Open an ibd tablespace and add it to the InnoDB data structures.
4454 This is similar to fil_ibd_open() except that it is used while processing
4455 the REDO log, so the data dictionary is not available and very little
4456 validation is done. The tablespace name is extracred from the
4457 dbname/tablename.ibd portion of the filename, which assumes that the file
4458 is a file-per-table tablespace.  Any name will do for now.  General
4459 tablespace names will be read from the dictionary after it has been
4460 recovered.  The tablespace flags are read at this time from the first page
4461 of the file in validate_for_recovery().
4462 @param[in]	space_id	tablespace ID
4463 @param[in]	filename	path/to/databasename/tablename.ibd
4464 @param[out]	space		the tablespace, or NULL on error
4465 @return status of the operation */
4466 enum fil_load_status
fil_ibd_load(ulint space_id,const char * filename,fil_space_t * & space)4467 fil_ibd_load(
4468 	ulint		space_id,
4469 	const char*	filename,
4470 	fil_space_t*&	space)
4471 {
4472 	/* If the a space is already in the file system cache with this
4473 	space ID, then there is nothing to do. */
4474 	mutex_enter(&fil_system->mutex);
4475 	space = fil_space_get_by_id(space_id);
4476 	mutex_exit(&fil_system->mutex);
4477 
4478 	if (space != NULL) {
4479 		/* Compare the filename we are trying to open with the
4480 		filename from the first node of the tablespace we opened
4481 		previously. Fail if it is different. */
4482 		fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
4483 
4484 		if (0 != strcmp(innobase_basename(filename),
4485 				innobase_basename(node->name))) {
4486 #ifdef  UNIV_HOTBACKUP
4487 			ib::trace()
4488 #else
4489 			ib::info()
4490 #endif /* UNIV_HOTBACKUP */
4491 				<< "Ignoring data file '" << filename
4492 				<< "' with space ID " << space->id
4493 				<< ". Another data file called " << node->name
4494 				<< " exists with the same space ID.";
4495 
4496 				space = NULL;
4497 				return(FIL_LOAD_ID_CHANGED);
4498 		}
4499 		return(FIL_LOAD_OK);
4500 	}
4501 
4502 	/* If the filepath in the redo log is a default location in or
4503 	under the datadir, then just try to open it there. */
4504 	Datafile	file;
4505 	file.set_filepath(filename);
4506 
4507 	Folder		folder(filename, dirname_length(filename));
4508 	if (folder_mysql_datadir >= folder) {
4509 		file.open_read_only(false);
4510 	}
4511 
4512 	if (!file.is_open()) {
4513 		/* The file has been moved or it is a remote datafile. */
4514 		if (!fil_ibd_discover(space_id, file)
4515 		    || !file.is_open()) {
4516 			return(FIL_LOAD_NOT_FOUND);
4517 		}
4518 	}
4519 
4520 	os_offset_t	size;
4521 
4522 	/* Read and validate the first page of the tablespace.
4523 	Assign a tablespace name based on the tablespace type. */
4524 	switch (file.validate_for_recovery()) {
4525 		os_offset_t	minimum_size;
4526 	case DB_SUCCESS:
4527 		if (file.space_id() != space_id) {
4528 #ifdef UNIV_HOTBACKUP
4529 			ib::trace()
4530 #else /* !UNIV_HOTBACKUP */
4531 			ib::info()
4532 #endif /* UNIV_HOTBACKUP */
4533 				<< "Ignoring data file '"
4534 				<< file.filepath()
4535 				<< "' with space ID " << file.space_id()
4536 				<< ", since the redo log references "
4537 				<< file.filepath() << " with space ID "
4538 				<< space_id << ".";
4539 			return(FIL_LOAD_ID_CHANGED);
4540 		}
4541 
4542 		/* Get and test the file size. */
4543 		size = os_file_get_size(file.handle());
4544 
4545 		/* Every .ibd file is created >= 4 pages in size.
4546 		Smaller files cannot be OK. */
4547 		minimum_size = FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE;
4548 
4549 		if (size == static_cast<os_offset_t>(-1)) {
4550 			/* The following call prints an error message */
4551 			os_file_get_last_error(true);
4552 
4553 			ib::error() << "Could not measure the size of"
4554 				" single-table tablespace file '"
4555 				<< file.filepath() << "'";
4556 
4557 		} else if (size < minimum_size) {
4558 #ifndef UNIV_HOTBACKUP
4559 			ib::error() << "The size of tablespace file '"
4560 				<< file.filepath() << "' is only " << size
4561 				<< ", should be at least " << minimum_size
4562 				<< "!";
4563 #else
4564 			/* In MEB, we work around this error. */
4565 			file.set_space_id(ULINT_UNDEFINED);
4566 			file.set_flags(0);
4567 			break;
4568 #endif /* !UNIV_HOTBACKUP */
4569 		} else {
4570 			/* Everything is fine so far. */
4571 			break;
4572 		}
4573 
4574 		/* Fall through to error handling */
4575 
4576 	case DB_TABLESPACE_EXISTS:
4577 #ifdef UNIV_HOTBACKUP
4578 		if (file.flags() == ~(ulint)0) {
4579 			return FIL_LOAD_OK;
4580 		}
4581 #endif /* UNIV_HOTBACKUP */
4582 
4583 		return(FIL_LOAD_INVALID);
4584 
4585 	default:
4586 		return(FIL_LOAD_NOT_FOUND);
4587 	}
4588 
4589 	ut_ad(space == NULL);
4590 
4591 #ifdef UNIV_HOTBACKUP
4592 	if (file.space_id() == ULINT_UNDEFINED || file.space_id() == 0) {
4593 		char*	new_path;
4594 
4595 		ib::info() << "Renaming tablespace file '" << file.filepath()
4596 			<< "' with space ID " << file.space_id() << " to "
4597 			<< file.name() << "_ibbackup_old_vers_<timestamp>"
4598 			" because its size " << size() << " is too small"
4599 			" (< 4 pages 16 kB each), or the space id in the"
4600 			" file header is not sensible. This can happen in"
4601 			" an mysqlbackup run, and is not dangerous.";
4602 		file.close();
4603 
4604 		new_path = fil_make_ibbackup_old_name(file.filepath());
4605 
4606 		bool	success = os_file_rename(
4607 			innodb_data_file_key, file.filepath(), new_path);
4608 
4609 		ut_a(success);
4610 
4611 		ut_free(new_path);
4612 
4613 		return(FIL_LOAD_ID_CHANGED);
4614 	}
4615 
4616 	/* A backup may contain the same space several times, if the space got
4617 	renamed at a sensitive time. Since it is enough to have one version of
4618 	the space, we rename the file if a space with the same space id
4619 	already exists in the tablespace memory cache. We rather rename the
4620 	file than delete it, because if there is a bug, we do not want to
4621 	destroy valuable data. */
4622 
4623 	mutex_enter(&fil_system->mutex);
4624 	space = fil_space_get_by_id(space_id);
4625 	mutex_exit(&fil_system->mutex);
4626 
4627 	if (space != NULL) {
4628 		ib::info() << "Renaming data file '" << file.filepath()
4629 			<< "' with space ID " << space_id << " to "
4630 			<< file.name()
4631 			<< "_ibbackup_old_vers_<timestamp> because space "
4632 			<< space->name << " with the same id was scanned"
4633 			" earlier. This can happen if you have renamed tables"
4634 			" during an mysqlbackup run.";
4635 		file.close();
4636 
4637 		char*	new_path = fil_make_ibbackup_old_name(file.filepath());
4638 
4639 		bool	success = os_file_rename(
4640 			innodb_data_file_key, file.filepath(), new_path);
4641 
4642 		ut_a(success);
4643 
4644 		ut_free(new_path);
4645 		return(FIL_LOAD_OK);
4646 	}
4647 #endif /* UNIV_HOTBACKUP */
4648 
4649 	bool is_temp = FSP_FLAGS_GET_TEMPORARY(file.flags());
4650 	space = fil_space_create(
4651 		file.name(), space_id, file.flags(),
4652 		is_temp ? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE);
4653 
4654 	if (space == NULL) {
4655 		return(FIL_LOAD_INVALID);
4656 	}
4657 
4658 	ut_ad(space->id == file.space_id());
4659 	ut_ad(space->id == space_id);
4660 
4661 	/* We do not use the size information we have about the file, because
4662 	the rounding formula for extents and pages is somewhat complex; we
4663 	let fil_node_open() do that task. */
4664 
4665 	if (!fil_node_create_low(file.filepath(), 0, space,
4666 				 false, true, false)) {
4667 		ut_error;
4668 	}
4669 
4670 	/* For encryption tablespace, initial encryption information. */
4671 	if (FSP_FLAGS_GET_ENCRYPTION(space->flags)
4672 	    && file.m_encryption_key != NULL) {
4673 		dberr_t err = fil_set_encryption(space->id,
4674 						 Encryption::AES,
4675 						 file.m_encryption_key,
4676 						 file.m_encryption_iv);
4677 		if (err != DB_SUCCESS) {
4678 			ib::error() << "Can't set encryption information for"
4679 				" tablespace " << space->name << "!";
4680 		}
4681 	}
4682 
4683 
4684 	return(FIL_LOAD_OK);
4685 }
4686 
4687 /***********************************************************************//**
4688 A fault-tolerant function that tries to read the next file name in the
4689 directory. We retry 100 times if os_file_readdir_next_file() returns -1. The
4690 idea is to read as much good data as we can and jump over bad data.
4691 @return 0 if ok, -1 if error even after the retries, 1 if at the end
4692 of the directory */
4693 int
fil_file_readdir_next_file(dberr_t * err,const char * dirname,os_file_dir_t dir,os_file_stat_t * info)4694 fil_file_readdir_next_file(
4695 /*=======================*/
4696 	dberr_t*	err,	/*!< out: this is set to DB_ERROR if an error
4697 				was encountered, otherwise not changed */
4698 	const char*	dirname,/*!< in: directory name or path */
4699 	os_file_dir_t	dir,	/*!< in: directory stream */
4700 	os_file_stat_t*	info)	/*!< in/out: buffer where the
4701 				info is returned */
4702 {
4703 	for (ulint i = 0; i < 100; i++) {
4704 		int	ret = os_file_readdir_next_file(dirname, dir, info);
4705 
4706 		if (ret != -1) {
4707 
4708 			return(ret);
4709 		}
4710 
4711 		ib::error() << "os_file_readdir_next_file() returned -1 in"
4712 			" directory " << dirname
4713 			<< ", crash recovery may have failed"
4714 			" for some .ibd files!";
4715 
4716 		*err = DB_ERROR;
4717 	}
4718 
4719 	return(-1);
4720 }
4721 
4722 /*******************************************************************//**
4723 Report that a tablespace for a table was not found. */
4724 static
4725 void
fil_report_missing_tablespace(const char * name,ulint space_id)4726 fil_report_missing_tablespace(
4727 /*===========================*/
4728 	const char*	name,			/*!< in: table name */
4729 	ulint		space_id)		/*!< in: table's space id */
4730 {
4731 	ib::error() << "Table " << name
4732 		<< " in the InnoDB data dictionary has tablespace id "
4733 		<< space_id << ","
4734 		" but tablespace with that id or name does not exist. Have"
4735 		" you deleted or moved .ibd files? This may also be a table"
4736 		" created with CREATE TEMPORARY TABLE whose .ibd and .frm"
4737 		" files MySQL automatically removed, but the table still"
4738 		" exists in the InnoDB internal data dictionary.";
4739 }
4740 
4741 #ifndef UNIV_HOTBACKUP
4742 /** Returns true if a matching tablespace exists in the InnoDB tablespace
4743 memory cache. Note that if we have not done a crash recovery at the database
4744 startup, there may be many tablespaces which are not yet in the memory cache.
4745 @param[in]	id		Tablespace ID
4746 @param[in]	name		Tablespace name used in fil_space_create().
4747 @param[in]	print_error_if_does_not_exist
4748 				Print detailed error information to the
4749 error log if a matching tablespace is not found from memory.
4750 @param[in]	adjust_space	Whether to adjust space id on mismatch
4751 @param[in]	heap		Heap memory
4752 @param[in]	table_id	table id
4753 @return true if a matching tablespace exists in the memory cache */
4754 bool
fil_space_for_table_exists_in_mem(ulint id,const char * name,bool print_error_if_does_not_exist,bool adjust_space,mem_heap_t * heap,table_id_t table_id)4755 fil_space_for_table_exists_in_mem(
4756 	ulint		id,
4757 	const char*	name,
4758 	bool		print_error_if_does_not_exist,
4759 	bool		adjust_space,
4760 	mem_heap_t*	heap,
4761 	table_id_t	table_id)
4762 {
4763 	fil_space_t*	fnamespace = NULL;
4764 	fil_space_t*	space;
4765 
4766 	ut_ad(fil_system);
4767 
4768 	mutex_enter(&fil_system->mutex);
4769 
4770 	/* Look if there is a space with the same id */
4771 
4772 	space = fil_space_get_by_id(id);
4773 
4774 	if (space != NULL
4775 	    && FSP_FLAGS_GET_SHARED(space->flags)
4776 	    && adjust_space
4777 	    && srv_sys_tablespaces_open
4778 	    && 0 == strncmp(space->name, general_space_name,
4779 			    strlen(general_space_name))) {
4780 		/* This name was assigned during recovery in fil_ibd_load().
4781 		This general tablespace was opened from an MLOG_FILE_NAME log
4782 		entry where the tablespace name does not exist.  Replace the
4783 		temporary name with this name and return this space. */
4784 		HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash,
4785 			    ut_fold_string(space->name), space);
4786 		ut_free(space->name);
4787 		space->name = mem_strdup(name);
4788 		HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash,
4789 			    ut_fold_string(space->name), space);
4790 
4791 		mutex_exit(&fil_system->mutex);
4792 
4793 		return(true);
4794 	}
4795 
4796 	if (space != NULL) {
4797 		if (FSP_FLAGS_GET_SHARED(space->flags)
4798 		    && !srv_sys_tablespaces_open) {
4799 
4800 			/* No need to check the name */
4801 			mutex_exit(&fil_system->mutex);
4802 			return(true);
4803 		}
4804 
4805 		/* If this space has the expected name, use it. */
4806 		fnamespace = fil_space_get_by_name(name);
4807 		if (space == fnamespace) {
4808 			/* Found */
4809 
4810 			mutex_exit(&fil_system->mutex);
4811 
4812 			return(true);
4813 		}
4814 	}
4815 
4816 	/* Info from "fnamespace" comes from the ibd file itself, it can
4817 	be different from data obtained from System tables since file
4818 	operations are not transactional. If adjust_space is set, and the
4819 	mismatching space are between a user table and its temp table, we
4820 	shall adjust the ibd file name according to system table info */
4821 	if (adjust_space
4822 	    && space != NULL
4823 	    && row_is_mysql_tmp_table_name(space->name)
4824 	    && !row_is_mysql_tmp_table_name(name)) {
4825 
4826 		mutex_exit(&fil_system->mutex);
4827 
4828 		DBUG_EXECUTE_IF("ib_crash_before_adjust_fil_space",
4829 				DBUG_SUICIDE(););
4830 
4831 		if (fnamespace) {
4832 			const char*	tmp_name;
4833 
4834 			tmp_name = dict_mem_create_temporary_tablename(
4835 				heap, name, table_id);
4836 
4837 			fil_rename_tablespace(
4838 				fnamespace->id,
4839 				UT_LIST_GET_FIRST(fnamespace->chain)->name,
4840 				tmp_name, NULL);
4841 		}
4842 
4843 		DBUG_EXECUTE_IF("ib_crash_after_adjust_one_fil_space",
4844 				DBUG_SUICIDE(););
4845 
4846 		fil_rename_tablespace(
4847 			id, UT_LIST_GET_FIRST(space->chain)->name,
4848 			name, NULL);
4849 
4850 		DBUG_EXECUTE_IF("ib_crash_after_adjust_fil_space",
4851 				DBUG_SUICIDE(););
4852 
4853 		mutex_enter(&fil_system->mutex);
4854 		fnamespace = fil_space_get_by_name(name);
4855 		ut_ad(space == fnamespace);
4856 		mutex_exit(&fil_system->mutex);
4857 
4858 		return(true);
4859 	}
4860 
4861 	if (!print_error_if_does_not_exist) {
4862 
4863 		mutex_exit(&fil_system->mutex);
4864 
4865 		return(false);
4866 	}
4867 
4868 	if (space == NULL) {
4869 		if (fnamespace == NULL) {
4870 			if (print_error_if_does_not_exist) {
4871 				fil_report_missing_tablespace(name, id);
4872 			}
4873 		} else {
4874 			ib::error() << "Table " << name << " in InnoDB data"
4875 				" dictionary has tablespace id " << id
4876 				<< ", but a tablespace with that id does not"
4877 				" exist. There is a tablespace of name "
4878 				<< fnamespace->name << " and id "
4879 				<< fnamespace->id << ", though. Have you"
4880 				" deleted or moved .ibd files?";
4881 		}
4882 error_exit:
4883 		ib::warn() << TROUBLESHOOT_DATADICT_MSG;
4884 
4885 		mutex_exit(&fil_system->mutex);
4886 
4887 		return(false);
4888 	}
4889 
4890 	if (0 != strcmp(space->name, name)) {
4891 
4892 		ib::error() << "Table " << name << " in InnoDB data dictionary"
4893 			" has tablespace id " << id << ", but the tablespace"
4894 			" with that id has name " << space->name << "."
4895 			" Have you deleted or moved .ibd files?";
4896 
4897 		if (fnamespace != NULL) {
4898 			ib::error() << "There is a tablespace with the right"
4899 				" name: " << fnamespace->name << ", but its id"
4900 				" is " << fnamespace->id << ".";
4901 		}
4902 
4903 		goto error_exit;
4904 	}
4905 
4906 	mutex_exit(&fil_system->mutex);
4907 
4908 	return(false);
4909 }
4910 #endif /* !UNIV_HOTBACKUP */
4911 /** Return the space ID based on the tablespace name.
4912 The tablespace must be found in the tablespace memory cache.
4913 This call is made from external to this module, so the mutex is not owned.
4914 @param[in]	tablespace	Tablespace name
4915 @return space ID if tablespace found, ULINT_UNDEFINED if space not. */
4916 ulint
fil_space_get_id_by_name(const char * tablespace)4917 fil_space_get_id_by_name(
4918 	const char*	tablespace)
4919 {
4920 	mutex_enter(&fil_system->mutex);
4921 
4922 	/* Search for a space with the same name. */
4923 	fil_space_t*	space = fil_space_get_by_name(tablespace);
4924 	ulint		id = (space == NULL) ? ULINT_UNDEFINED : space->id;
4925 
4926 	mutex_exit(&fil_system->mutex);
4927 
4928 	return(id);
4929 }
4930 
4931 /**
4932 Fill the pages with NULs
4933 @param[in] node		File node
4934 @param[in] page_size	physical page size
4935 @param[in] start	Offset from the start of the file in bytes
4936 @param[in] len		Length in bytes
4937 @param[in] read_only_mode
4938 			if true, then read only mode checks are enforced.
4939 @return DB_SUCCESS or error code */
4940 static
4941 dberr_t
fil_write_zeros(const fil_node_t * node,ulint page_size,os_offset_t start,ulint len,bool read_only_mode)4942 fil_write_zeros(
4943 	const fil_node_t*	node,
4944 	ulint			page_size,
4945 	os_offset_t		start,
4946 	ulint			len,
4947 	bool			read_only_mode)
4948 {
4949 	ut_a(len > 0);
4950 
4951 	/* Extend at most 1M at a time */
4952 	ulint	n_bytes = ut_min(static_cast<ulint>(1024 * 1024), len);
4953 	byte*	ptr = reinterpret_cast<byte*>(ut_zalloc_nokey(n_bytes
4954 							      + page_size));
4955 	byte*	buf = reinterpret_cast<byte*>(ut_align(ptr, page_size));
4956 
4957 	os_offset_t		offset = start;
4958 	dberr_t			err = DB_SUCCESS;
4959 	const os_offset_t	end = start + len;
4960 	IORequest		request(IORequest::WRITE);
4961 
4962 	while (offset < end) {
4963 
4964 #ifdef UNIV_HOTBACKUP
4965 		err = os_file_write(
4966 			request, node->name, node->handle, buf, offset,
4967 			n_bytes);
4968 #else
4969 		err = os_aio_func(
4970 			request, OS_AIO_SYNC, node->name,
4971 			node->handle, buf, offset, n_bytes, read_only_mode,
4972 			NULL, NULL);
4973 #endif /* UNIV_HOTBACKUP */
4974 
4975 		if (err != DB_SUCCESS) {
4976 			break;
4977 		}
4978 
4979 		offset += n_bytes;
4980 
4981 		n_bytes = ut_min(n_bytes, static_cast<ulint>(end - offset));
4982 
4983 		DBUG_EXECUTE_IF("ib_crash_during_tablespace_extension",
4984 				DBUG_SUICIDE(););
4985 	}
4986 
4987 	ut_free(ptr);
4988 
4989 	return(err);
4990 }
4991 
/** Try to extend a tablespace if it is smaller than the specified size.
Only the last file node of the tablespace is extended. The node is
flagged as being_extended while the file is grown, so the fil_system
mutex is not held during the actual file operations. The tablespace is
flushed with fil_flush() before returning from the extension path.
@param[in,out]	space	tablespace
@param[in]	size	desired size in pages
@return whether the tablespace is at least as big as requested; false
also when the last file node could not be prepared for I/O */
bool
fil_space_extend(
	fil_space_t*	space,
	ulint		size)
{
	/* In read-only mode we allow write to shared temporary tablespace
	as intrinsic table created by Optimizer reside in this tablespace. */
	ut_ad(!srv_read_only_mode || fsp_is_system_temporary(space->id));

	/* Re-entered (after a sleep) whenever another thread is found to
	be extending the same node. */
retry:

#ifdef UNIV_HOTBACKUP
	page_size_t	page_length(space->flags);
	ulint   actual_size = space->size;
	ib::trace() << "space id : " << space->id << ", space name : "
		<< space->name << ", space size : " << actual_size << " pages,"
		<< " desired space size : " << size << " pages,"
		<< " page size : " << page_length.physical();
#endif /* UNIV_HOTBACKUP */

	bool		success = true;

	/* From here on the fil_system mutex is held; every exit path
	below releases it explicitly. */
	fil_mutex_enter_and_prepare_for_io(space->id);

	if (space->size >= size) {
		/* Space already big enough */
		mutex_exit(&fil_system->mutex);
		return(true);
	}

	page_size_t	pageSize(space->flags);
	const ulint	page_size = pageSize.physical();
	/* Only the last file of the tablespace is ever extended */
	fil_node_t*	node = UT_LIST_GET_LAST(space->chain);

	if (!node->being_extended) {
		/* Mark this node as undergoing extension. This flag
		is used by other threads to wait for the extension
		operation to finish. */
		node->being_extended = true;
	} else {
		/* Another thread is currently extending the file. Wait
		for it to finish.  It'd have been better to use an event
		driven mechanism but the entire module is peppered with
		polling code. */

		mutex_exit(&fil_system->mutex);
		os_thread_sleep(100000);
		goto retry;
	}

	if (!fil_node_prepare_for_io(node, fil_system, space)) {
		/* The tablespace data file, such as .ibd file, is missing */
		node->being_extended = false;
		mutex_exit(&fil_system->mutex);

		return(false);
	}

	/* At this point it is safe to release fil_system mutex. No
	other thread can rename, delete or close the file because
	we have set the node->being_extended flag. */
	mutex_exit(&fil_system->mutex);

	ulint		pages_added;

	/* Note: This code is going to be executed independent of FusionIO HW
	if the OS supports posix_fallocate() */

	ut_ad(size > space->size);

	/* Current size of the file in bytes, as reported by the OS */
	os_offset_t	node_start = os_file_get_size(node->handle);
	ut_a(node_start != (os_offset_t) -1);

	/* Node first page number */
	ulint		node_first_page = space->size - node->size;

	/* Number of physical pages in the node/file */
	ulint		n_node_physical_pages
		= static_cast<ulint>(node_start) / page_size;

	/* Number of pages to extend in the node/file */
	lint		n_node_extend;

	n_node_extend = size - (node_first_page + node->size);

	/* If we already have enough physical pages to satisfy the
	extend request on the node then ignore it */
	if (node->size + n_node_extend > n_node_physical_pages) {

		DBUG_EXECUTE_IF("ib_crash_during_tablespace_extension",
				DBUG_SUICIDE(););

		os_offset_t     len;
		dberr_t		err = DB_SUCCESS;

		/* Number of bytes that must be appended to the file so that
		it holds node->size + n_node_extend pages */
		len = ((node->size + n_node_extend) * page_size) - node_start;
		ut_ad(len > 0);

#if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
		int     ret = posix_fallocate(node->handle.m_file, node_start, len);

		DBUG_EXECUTE_IF("ib_posix_fallocate_fail_eintr",
				ret = EINTR;);

		DBUG_EXECUTE_IF("ib_posix_fallocate_fail_einval",
				ret = EINVAL;);

		if (ret != 0) {
			/* We already pass the valid offset and len in,
			if EINVAL is returned, it could only mean that the
			file system doesn't support fallocate(), currently
			one known case is ext3 with O_DIRECT.

			Also because above call could be interrupted,
			in this case, simply go to plan B by writing zeroes.

			Both error messages for above two scenarios are
			skipped in case of flooding error messages, because
			they can be ignored by users. */
			if (ret != EINTR && ret != EINVAL) {
				ib::error()
					<< "posix_fallocate(): Failed to"
					" preallocate data for file "
					<< node->name << ", desired size "
					<< len << " bytes."
					" Operating system error number "
					<< ret << ". Check"
					" that the disk is not full or a disk"
					" quota exceeded. Make sure the file"
					" system supports this function."
					" Some operating system error"
					" numbers are described at " REFMAN
					"operating-system-error-codes.html";
			}

			/* Any failure above forces the zero-fill path below */
			err = DB_IO_ERROR;
		}
#endif /* NO_FALLOCATE || !UNIV_LINUX */

		/* Write zeroes unless the node uses atomic writes and the
		preallocation above (if compiled in) did not fail */
		if (!node->atomic_write || err == DB_IO_ERROR) {

			bool	read_only_mode;

			/* Read-only mode checks are only requested for
			FIL_TYPE_TEMPORARY tablespaces */
			read_only_mode = (space->purpose != FIL_TYPE_TEMPORARY
					  ? false : srv_read_only_mode);

			err = fil_write_zeros(
				node, page_size, node_start,
				static_cast<ulint>(len), read_only_mode);

			if (err != DB_SUCCESS) {

				ib::warn()
					<< "Error while writing " << len
					<< " zeroes to " << node->name
					<< " starting at offset " << node_start;
			}
		}

		/* Check how many pages actually added */
		os_offset_t	end = os_file_get_size(node->handle);
		ut_a(end != static_cast<os_offset_t>(-1) && end >= node_start);

		/* Success means the file grew by exactly len bytes; anything
		else is recorded in os_has_said_disk_full */
		os_has_said_disk_full = !(success = (end == node_start + len));

		/* Only whole pages are counted */
		pages_added = static_cast<ulint>(end - node_start) / page_size;

	} else {
		/* The file is already physically large enough: only the
		in-memory sizes are adjusted below */
		success = true;
		pages_added = n_node_extend;
		os_has_said_disk_full = FALSE;
	}

	mutex_enter(&fil_system->mutex);

	ut_a(node->being_extended);

	node->size += pages_added;
	space->size += pages_added;
	node->being_extended = false;

	/* Pairs with the fil_node_prepare_for_io() call above */
	fil_node_complete_io(node, fil_system, IORequestWrite);

#ifndef UNIV_HOTBACKUP
	/* Keep the last data file size info up to date, rounded to
	full megabytes */
	ulint	pages_per_mb = (1024 * 1024) / page_size;
	ulint	size_in_pages = ((node->size / pages_per_mb) * pages_per_mb);

	if (space->id == srv_sys_space.space_id()) {
		srv_sys_space.set_last_file_size(size_in_pages);
	} else if (space->id == srv_tmp_space.space_id()) {
		srv_tmp_space.set_last_file_size(size_in_pages);
	}
#else
	ib::trace() << "extended space : " << space->name << " from "
		<< actual_size << " pages to " << space->size << " pages "
		<< ", desired space size : " << size << " pages.";
#endif /* !UNIV_HOTBACKUP */

	mutex_exit(&fil_system->mutex);

	fil_flush(space->id);

	return(success);
}
5202 
5203 #ifdef UNIV_HOTBACKUP
5204 /********************************************************************//**
5205 Extends all tablespaces to the size stored in the space header. During the
5206 mysqlbackup --apply-log phase we extended the spaces on-demand so that log
5207 records could be applied, but that may have left spaces still too small
5208 compared to the size stored in the space header. */
5209 void
fil_extend_tablespaces_to_stored_len(void)5210 fil_extend_tablespaces_to_stored_len(void)
5211 /*======================================*/
5212 {
5213 	byte*		buf;
5214 	ulint		actual_size;
5215 	ulint		size_in_header;
5216 	dberr_t		error;
5217 	bool		success;
5218 
5219 	buf = (byte*)ut_malloc_nokey(UNIV_PAGE_SIZE);
5220 
5221 	mutex_enter(&fil_system->mutex);
5222 
5223 	for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system->space_list);
5224 	     space != NULL;
5225 	     space = UT_LIST_GET_NEXT(space_list, space)) {
5226 
5227 		ut_a(space->purpose == FIL_TYPE_TABLESPACE);
5228 
5229 		mutex_exit(&fil_system->mutex); /* no need to protect with a
5230 					      mutex, because this is a
5231 					      single-threaded operation */
5232 		error = fil_read(
5233 			page_id_t(space->id, 0),
5234 			page_size_t(space->flags),
5235 			0, univ_page_size.physical(), buf);
5236 
5237 		ut_a(error == DB_SUCCESS);
5238 
5239 		size_in_header = fsp_header_get_field(buf, FSP_SIZE);
5240 
5241 		success = fil_space_extend(space, size_in_header);
5242 		if (!success) {
5243 			ib::error() << "Could not extend the tablespace of "
5244 				<< space->name  << " to the size stored in"
5245 				" header, " << size_in_header << " pages;"
5246 				" size after extension " << actual_size
5247 				<< " pages. Check that you have free disk"
5248 				" space and retry!";
5249 			ut_a(success);
5250 		}
5251 
5252 		mutex_enter(&fil_system->mutex);
5253 	}
5254 
5255 	mutex_exit(&fil_system->mutex);
5256 
5257 	ut_free(buf);
5258 }
5259 #endif
5260 
5261 /*========== RESERVE FREE EXTENTS (for a B-tree split, for example) ===*/
5262 
5263 /*******************************************************************//**
5264 Tries to reserve free extents in a file space.
5265 @return true if succeed */
5266 bool
fil_space_reserve_free_extents(ulint id,ulint n_free_now,ulint n_to_reserve)5267 fil_space_reserve_free_extents(
5268 /*===========================*/
5269 	ulint	id,		/*!< in: space id */
5270 	ulint	n_free_now,	/*!< in: number of free extents now */
5271 	ulint	n_to_reserve)	/*!< in: how many one wants to reserve */
5272 {
5273 	fil_space_t*	space;
5274 	bool		success;
5275 
5276 	ut_ad(fil_system);
5277 
5278 	mutex_enter(&fil_system->mutex);
5279 
5280 	space = fil_space_get_by_id(id);
5281 
5282 	ut_a(space);
5283 
5284 	if (space->n_reserved_extents + n_to_reserve > n_free_now) {
5285 		success = false;
5286 	} else {
5287 		space->n_reserved_extents += n_to_reserve;
5288 		success = true;
5289 	}
5290 
5291 	mutex_exit(&fil_system->mutex);
5292 
5293 	return(success);
5294 }
5295 
5296 /*******************************************************************//**
5297 Releases free extents in a file space. */
5298 void
fil_space_release_free_extents(ulint id,ulint n_reserved)5299 fil_space_release_free_extents(
5300 /*===========================*/
5301 	ulint	id,		/*!< in: space id */
5302 	ulint	n_reserved)	/*!< in: how many one reserved */
5303 {
5304 	fil_space_t*	space;
5305 
5306 	ut_ad(fil_system);
5307 
5308 	mutex_enter(&fil_system->mutex);
5309 
5310 	space = fil_space_get_by_id(id);
5311 
5312 	ut_a(space);
5313 	ut_a(space->n_reserved_extents >= n_reserved);
5314 
5315 	space->n_reserved_extents -= n_reserved;
5316 
5317 	mutex_exit(&fil_system->mutex);
5318 }
5319 
5320 /*******************************************************************//**
5321 Gets the number of reserved extents. If the database is silent, this number
5322 should be zero. */
5323 ulint
fil_space_get_n_reserved_extents(ulint id)5324 fil_space_get_n_reserved_extents(
5325 /*=============================*/
5326 	ulint	id)		/*!< in: space id */
5327 {
5328 	fil_space_t*	space;
5329 	ulint		n;
5330 
5331 	ut_ad(fil_system);
5332 
5333 	mutex_enter(&fil_system->mutex);
5334 
5335 	space = fil_space_get_by_id(id);
5336 
5337 	ut_a(space);
5338 
5339 	n = space->n_reserved_extents;
5340 
5341 	mutex_exit(&fil_system->mutex);
5342 
5343 	return(n);
5344 }
5345 
5346 /*============================ FILE I/O ================================*/
5347 
5348 /********************************************************************//**
5349 NOTE: you must call fil_mutex_enter_and_prepare_for_io() first!
5350 
5351 Prepares a file node for i/o. Opens the file if it is closed. Updates the
5352 pending i/o's field in the node and the system appropriately. Takes the node
5353 off the LRU list if it is in the LRU list. The caller must hold the fil_sys
5354 mutex.
5355 @return false if the file can't be opened, otherwise true */
5356 static
5357 bool
fil_node_prepare_for_io(fil_node_t * node,fil_system_t * system,fil_space_t * space)5358 fil_node_prepare_for_io(
5359 /*====================*/
5360 	fil_node_t*	node,	/*!< in: file node */
5361 	fil_system_t*	system,	/*!< in: tablespace memory cache */
5362 	fil_space_t*	space)	/*!< in: space */
5363 {
5364 	ut_ad(node && system && space);
5365 	ut_ad(mutex_own(&(system->mutex)));
5366 
5367 	if (system->n_open > system->max_n_open + 5) {
5368 		ib::warn() << "Open files " << system->n_open
5369 			<< " exceeds the limit " << system->max_n_open;
5370 	}
5371 
5372 	if (!node->is_open) {
5373 		/* File is closed: open it */
5374 		ut_a(node->n_pending == 0);
5375 
5376 		if (!fil_node_open_file(node)) {
5377 			return(false);
5378 		}
5379 	}
5380 
5381 	if (node->n_pending == 0 && fil_space_belongs_in_lru(space)) {
5382 		/* The node is in the LRU list, remove it */
5383 
5384 		ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
5385 
5386 		UT_LIST_REMOVE(system->LRU, node);
5387 	}
5388 
5389 	node->n_pending++;
5390 
5391 	return(true);
5392 }
5393 
5394 /********************************************************************//**
5395 Updates the data structures when an i/o operation finishes. Updates the
5396 pending i/o's field in the node appropriately. */
5397 static
5398 void
fil_node_complete_io(fil_node_t * node,fil_system_t * system,const IORequest & type)5399 fil_node_complete_io(
5400 /*=================*/
5401 	fil_node_t*	node,	/*!< in: file node */
5402 	fil_system_t*	system,	/*!< in: tablespace memory cache */
5403 	const IORequest&type)	/*!< in: IO_TYPE_*, marks the node as
5404 				modified if TYPE_IS_WRITE() */
5405 {
5406 	ut_ad(mutex_own(&system->mutex));
5407 	ut_a(node->n_pending > 0);
5408 
5409 	--node->n_pending;
5410 
5411 	ut_ad(type.validate());
5412 
5413 	if (type.is_write()) {
5414 
5415 		ut_ad(!srv_read_only_mode
5416 		      || fsp_is_system_temporary(node->space->id));
5417 
5418 		++system->modification_counter;
5419 
5420 		node->modification_counter = system->modification_counter;
5421 
5422 		if (fil_buffering_disabled(node->space)) {
5423 
5424 			/* We don't need to keep track of unflushed
5425 			changes as user has explicitly disabled
5426 			buffering. */
5427 			ut_ad(!node->space->is_in_unflushed_spaces);
5428 			node->flush_counter = node->modification_counter;
5429 
5430 		} else if (!node->space->is_in_unflushed_spaces) {
5431 
5432 			node->space->is_in_unflushed_spaces = true;
5433 
5434 			UT_LIST_ADD_FIRST(
5435 				system->unflushed_spaces, node->space);
5436 		}
5437 	}
5438 
5439 	if (node->n_pending == 0 && fil_space_belongs_in_lru(node->space)) {
5440 
5441 		/* The node must be put back to the LRU list */
5442 		UT_LIST_ADD_FIRST(system->LRU, node);
5443 	}
5444 }
5445 
5446 /** Report information about an invalid page access. */
5447 static
5448 void
fil_report_invalid_page_access(ulint block_offset,ulint space_id,const char * space_name,ulint byte_offset,ulint len,bool is_read)5449 fil_report_invalid_page_access(
5450 	ulint		block_offset,	/*!< in: block offset */
5451 	ulint		space_id,	/*!< in: space id */
5452 	const char*	space_name,	/*!< in: space name */
5453 	ulint		byte_offset,	/*!< in: byte offset */
5454 	ulint		len,		/*!< in: I/O length */
5455 	bool		is_read)	/*!< in: I/O type */
5456 {
5457 	ib::error()
5458 		<< "Trying to access page number " << block_offset << " in"
5459 		" space " << space_id << ", space name " << space_name << ","
5460 		" which is outside the tablespace bounds. Byte offset "
5461 		<< byte_offset << ", len " << len << ", i/o type " <<
5462 		(is_read ? "read" : "write")
5463 		<< ". If you get this error at mysqld startup, please check"
5464 		" that your my.cnf matches the ibdata files that you have in"
5465 		" the MySQL server.";
5466 
5467 	ib::error() << "Server exits"
5468 #ifdef UNIV_DEBUG
5469 		<< " at " << __FILE__ << "[" << __LINE__ << "]"
5470 #endif
5471 		<< ".";
5472 
5473 	_exit(1);
5474 }
5475 
5476 /** Set encryption information for IORequest.
5477 @param[in,out]	req_type	IO request
5478 @param[in]	page_id		page id
5479 @param[in]	space		table space */
5480 inline
5481 void
fil_io_set_encryption(IORequest & req_type,const page_id_t & page_id,fil_space_t * space)5482 fil_io_set_encryption(
5483 	IORequest&		req_type,
5484 	const page_id_t&	page_id,
5485 	fil_space_t*		space)
5486 {
5487 	/* Don't encrypt the log, page 0 of all tablespaces, all pages
5488 	from the system tablespace. */
5489 	if (!req_type.is_log() && page_id.page_no() > 0
5490 	    && space->encryption_type != Encryption::NONE)
5491 	{
5492 		req_type.encryption_key(space->encryption_key,
5493 					space->encryption_klen,
5494 					space->encryption_iv);
5495 		req_type.encryption_algorithm(Encryption::AES);
5496 	} else {
5497 		req_type.clear_encrypted();
5498 	}
5499 }
5500 
5501 /** Reads or writes data. This operation could be asynchronous (aio).
5502 
5503 @param[in,out] type	IO context
5504 @param[in] sync		true if synchronous aio is desired
5505 @param[in] page_id	page id
5506 @param[in] page_size	page size
5507 @param[in] byte_offset	remainder of offset in bytes; in aio this
5508 			must be divisible by the OS block size
5509 @param[in] len		how many bytes to read or write; this must
5510 			not cross a file boundary; in aio this must
5511 			be a block size multiple
5512 @param[in,out] buf	buffer where to store read data or from where
5513 			to write; in aio this must be appropriately
5514 			aligned
5515 @param[in] message	message for aio handler if non-sync aio
5516 			used, else ignored
5517 
5518 @return DB_SUCCESS, DB_TABLESPACE_DELETED or DB_TABLESPACE_TRUNCATED
5519 	if we are trying to do i/o on a tablespace which does not exist */
5520 dberr_t
fil_io(const IORequest & type,bool sync,const page_id_t & page_id,const page_size_t & page_size,ulint byte_offset,ulint len,void * buf,void * message)5521 fil_io(
5522 	const IORequest&	type,
5523 	bool			sync,
5524 	const page_id_t&	page_id,
5525 	const page_size_t&	page_size,
5526 	ulint			byte_offset,
5527 	ulint			len,
5528 	void*			buf,
5529 	void*			message)
5530 {
5531 	os_offset_t		offset;
5532 	IORequest		req_type(type);
5533 
5534 	ut_ad(req_type.validate());
5535 
5536 	ut_ad(len > 0);
5537 	ut_ad(byte_offset < UNIV_PAGE_SIZE);
5538 	ut_ad(!page_size.is_compressed() || byte_offset == 0);
5539 	ut_ad(UNIV_PAGE_SIZE == (ulong)(1 << UNIV_PAGE_SIZE_SHIFT));
5540 #if (1 << UNIV_PAGE_SIZE_SHIFT_MAX) != UNIV_PAGE_SIZE_MAX
5541 # error "(1 << UNIV_PAGE_SIZE_SHIFT_MAX) != UNIV_PAGE_SIZE_MAX"
5542 #endif
5543 #if (1 << UNIV_PAGE_SIZE_SHIFT_MIN) != UNIV_PAGE_SIZE_MIN
5544 # error "(1 << UNIV_PAGE_SIZE_SHIFT_MIN) != UNIV_PAGE_SIZE_MIN"
5545 #endif
5546 	ut_ad(fil_validate_skip());
5547 
5548 #ifndef UNIV_HOTBACKUP
5549 
5550 	/* ibuf bitmap pages must be read in the sync AIO mode: */
5551 	ut_ad(recv_no_ibuf_operations
5552 	      || req_type.is_write()
5553 	      || !ibuf_bitmap_page(page_id, page_size)
5554 	      || sync
5555 	      || req_type.is_log());
5556 
5557 	ulint	mode;
5558 
5559 	if (sync) {
5560 
5561 		mode = OS_AIO_SYNC;
5562 
5563 	} else if (req_type.is_log()) {
5564 
5565 		mode = OS_AIO_LOG;
5566 
5567 	} else if (req_type.is_read()
5568 		   && !recv_no_ibuf_operations
5569 		   && ibuf_page(page_id, page_size, NULL)) {
5570 
5571 		mode = OS_AIO_IBUF;
5572 
5573 		/* Reduce probability of deadlock bugs in connection with ibuf:
5574 		do not let the ibuf i/o handler sleep */
5575 
5576 		req_type.clear_do_not_wake();
5577 	} else {
5578 		mode = OS_AIO_NORMAL;
5579 	}
5580 #else /* !UNIV_HOTBACKUP */
5581 	ut_a(sync);
5582 	ulint mode = OS_AIO_SYNC;
5583 #endif /* !UNIV_HOTBACKUP */
5584 
5585 #ifndef UNIV_HOTBACKUP
5586 	if (req_type.is_read()) {
5587 
5588 		srv_stats.data_read.add(len);
5589 
5590 	} else if (req_type.is_write()) {
5591 
5592 		ut_ad(!srv_read_only_mode
5593 		      || fsp_is_system_temporary(page_id.space()));
5594 
5595 		srv_stats.data_written.add(len);
5596 	}
5597 #endif /* !UNIV_HOTBACKUP */
5598 
5599 	/* Reserve the fil_system mutex and make sure that we can open at
5600 	least one file while holding it, if the file is not already open */
5601 
5602 	fil_mutex_enter_and_prepare_for_io(page_id.space());
5603 
5604 	fil_space_t*	space = fil_space_get_by_id(page_id.space());
5605 
5606 	/* If we are deleting a tablespace we don't allow async read operations
5607 	on that. However, we do allow write operations and sync read operations. */
5608 	if (space == NULL
5609 	    || (req_type.is_read()
5610 		&& !sync
5611 		&& space->stop_new_ops
5612 		&& !space->is_being_truncated)) {
5613 
5614 		mutex_exit(&fil_system->mutex);
5615 
5616 		if (!req_type.ignore_missing()) {
5617 			ib::error()
5618 				<< "Trying to do I/O to a tablespace which"
5619 				" does not exist. I/O type: "
5620 				<< (req_type.is_read() ? "read" : "write")
5621 				<< ", page: " << page_id
5622 				<< ", I/O length: " << len << " bytes";
5623 		}
5624 
5625 		return(DB_TABLESPACE_DELETED);
5626 	}
5627 
5628 	ut_ad(mode != OS_AIO_IBUF || fil_type_is_data(space->purpose));
5629 
5630 	ulint		cur_page_no = page_id.page_no();
5631 	fil_node_t*	node = UT_LIST_GET_FIRST(space->chain);
5632 
5633 	for (;;) {
5634 
5635 		if (node == NULL) {
5636 
5637 			if (req_type.ignore_missing()) {
5638 				mutex_exit(&fil_system->mutex);
5639 				return(DB_ERROR);
5640 			}
5641 
5642 			fil_report_invalid_page_access(
5643 				page_id.page_no(), page_id.space(),
5644 				space->name, byte_offset, len,
5645 				req_type.is_read());
5646 
5647 		} else if (fil_is_user_tablespace_id(space->id)
5648 			   && node->size == 0) {
5649 
5650 			/* We do not know the size of a single-table tablespace
5651 			before we open the file */
5652 			break;
5653 
5654 		} else if (node->size > cur_page_no) {
5655 			/* Found! */
5656 			break;
5657 
5658 		} else {
5659 			if (space->id != srv_sys_space.space_id()
5660 			    && UT_LIST_GET_LEN(space->chain) == 1
5661 			    && (srv_is_tablespace_truncated(space->id)
5662 				|| space->is_being_truncated
5663 				|| srv_was_tablespace_truncated(space))
5664 			    && req_type.is_read()) {
5665 
5666 				/* Handle page which is outside the truncated
5667 				tablespace bounds when recovering from a crash
5668 				happened during a truncation */
5669 				mutex_exit(&fil_system->mutex);
5670 				return(DB_TABLESPACE_TRUNCATED);
5671 			}
5672 
5673 			cur_page_no -= node->size;
5674 
5675 			node = UT_LIST_GET_NEXT(chain, node);
5676 		}
5677 	}
5678 
5679 	/* Open file if closed */
5680 	if (!fil_node_prepare_for_io(node, fil_system, space)) {
5681 		if (fil_type_is_data(space->purpose)
5682 		    && fil_is_user_tablespace_id(space->id)) {
5683 			mutex_exit(&fil_system->mutex);
5684 
5685 			if (!req_type.ignore_missing()) {
5686 				ib::error()
5687 					<< "Trying to do I/O to a tablespace"
5688 					" which exists without .ibd data file."
5689 					" I/O type: "
5690 					<< (req_type.is_read()
5691 					    ? "read" : "write")
5692 					<< ", page: "
5693 					<< page_id_t(page_id.space(),
5694 						     cur_page_no)
5695 					<< ", I/O length: " << len << " bytes";
5696 			}
5697 
5698 			return(DB_TABLESPACE_DELETED);
5699 		}
5700 
5701 		/* The tablespace is for log. Currently, we just assert here
5702 		to prevent handling errors along the way fil_io returns.
5703 		Also, if the log files are missing, it would be hard to
5704 		promise the server can continue running. */
5705 		ut_a(0);
5706 	}
5707 
5708 	/* Check that at least the start offset is within the bounds of a
5709 	single-table tablespace, including rollback tablespaces. */
5710 	if (node->size <= cur_page_no
5711 	    && space->id != srv_sys_space.space_id()
5712 	    && fil_type_is_data(space->purpose)) {
5713 
5714 		if (req_type.ignore_missing()) {
5715 			/* If we can tolerate the non-existent pages, we
5716 			should return with DB_ERROR and let caller decide
5717 			what to do. */
5718 			fil_node_complete_io(node, fil_system, req_type);
5719 			mutex_exit(&fil_system->mutex);
5720 			return(DB_ERROR);
5721 		}
5722 
5723 		fil_report_invalid_page_access(
5724 			page_id.page_no(), page_id.space(),
5725 			space->name, byte_offset, len, req_type.is_read());
5726 	}
5727 
5728 	/* Now we have made the changes in the data structures of fil_system */
5729 	mutex_exit(&fil_system->mutex);
5730 
5731 	/* Calculate the low 32 bits and the high 32 bits of the file offset */
5732 
5733 	if (!page_size.is_compressed()) {
5734 
5735 		offset = ((os_offset_t) cur_page_no
5736 			  << UNIV_PAGE_SIZE_SHIFT) + byte_offset;
5737 
5738 		ut_a(node->size - cur_page_no
5739 		     >= ((byte_offset + len + (UNIV_PAGE_SIZE - 1))
5740 			 / UNIV_PAGE_SIZE));
5741 	} else {
5742 		ulint	size_shift;
5743 
5744 		switch (page_size.physical()) {
5745 		case 1024: size_shift = 10; break;
5746 		case 2048: size_shift = 11; break;
5747 		case 4096: size_shift = 12; break;
5748 		case 8192: size_shift = 13; break;
5749 		case 16384: size_shift = 14; break;
5750 		case 32768: size_shift = 15; break;
5751 		case 65536: size_shift = 16; break;
5752 		default: ut_error;
5753 		}
5754 
5755 		offset = ((os_offset_t) cur_page_no << size_shift)
5756 			+ byte_offset;
5757 
5758 		ut_a(node->size - cur_page_no
5759 		     >= (len + (page_size.physical() - 1))
5760 		     / page_size.physical());
5761 	}
5762 
5763 	/* Do AIO */
5764 
5765 	ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
5766 	ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0);
5767 
5768 	/* Don't compress the log, page 0 of all tablespaces, tables
5769 	compresssed with the old scheme and all pages from the system
5770 	tablespace. */
5771 
5772 	if (req_type.is_write()
5773 	    && !req_type.is_log()
5774 	    && !page_size.is_compressed()
5775 	    && page_id.page_no() > 0
5776 	    && IORequest::is_punch_hole_supported()
5777 	    && node->punch_hole) {
5778 
5779 		ut_ad(!req_type.is_log());
5780 
5781 		req_type.set_punch_hole();
5782 
5783 		req_type.compression_algorithm(space->compression_type);
5784 
5785 	} else {
5786 		req_type.clear_compressed();
5787 	}
5788 
5789 	/* Set encryption information. */
5790 	fil_io_set_encryption(req_type, page_id, space);
5791 
5792 	req_type.block_size(node->block_size);
5793 
5794 	dberr_t	err;
5795 
5796 #ifdef UNIV_HOTBACKUP
5797 	/* In mysqlbackup do normal i/o, not aio */
5798 	if (req_type.is_read()) {
5799 
5800 		err = os_file_read(req_type, node->handle, buf, offset, len);
5801 
5802 	} else {
5803 
5804 		ut_ad(!srv_read_only_mode
5805 		      || fsp_is_system_temporary(page_id.space()));
5806 
5807 		err = os_file_write(
5808 			req_type, node->name, node->handle, buf, offset, len);
5809 	}
5810 #else /* UNIV_HOTBACKUP */
5811 	/* Queue the aio request */
5812 	err = os_aio(
5813 		req_type,
5814 		mode, node->name, node->handle, buf, offset, len,
5815 		fsp_is_system_temporary(page_id.space())
5816 		? false : srv_read_only_mode,
5817 		node, message);
5818 
5819 #endif /* UNIV_HOTBACKUP */
5820 
5821 	if (err == DB_IO_NO_PUNCH_HOLE) {
5822 
5823 		err = DB_SUCCESS;
5824 
5825 		if (node->punch_hole) {
5826 
5827 			ib::warn()
5828 				<< "Punch hole failed for '"
5829 				<< node->name << "'";
5830 		}
5831 
5832 		fil_no_punch_hole(node);
5833 	}
5834 
	/* We can try to recover the page from the double write buffer if
	the decompression fails or the page is corrupt. */
5837 
5838 	ut_a(req_type.is_dblwr_recover() || err == DB_SUCCESS);
5839 
5840 	if (sync) {
5841 		/* The i/o operation is already completed when we return from
5842 		os_aio: */
5843 
5844 		mutex_enter(&fil_system->mutex);
5845 
5846 		fil_node_complete_io(node, fil_system, req_type);
5847 
5848 		mutex_exit(&fil_system->mutex);
5849 
5850 		ut_ad(fil_validate_skip());
5851 	}
5852 
5853 	return(err);
5854 }
5855 
5856 #ifndef UNIV_HOTBACKUP
5857 /**********************************************************************//**
5858 Waits for an aio operation to complete. This function is used to write the
5859 handler for completed requests. The aio array of pending requests is divided
5860 into segments (see os0file.cc for more info). The thread specifies which
5861 segment it wants to wait for. */
5862 void
fil_aio_wait(ulint segment)5863 fil_aio_wait(
5864 /*=========*/
5865 	ulint	segment)	/*!< in: the number of the segment in the aio
5866 				array to wait for */
5867 {
5868 	fil_node_t*	node;
5869 	IORequest	type;
5870 	void*		message;
5871 
5872 	ut_ad(fil_validate_skip());
5873 
5874 	dberr_t	err = os_aio_handler(segment, &node, &message, &type);
5875 
5876 	ut_a(err == DB_SUCCESS);
5877 
5878 	if (node == NULL) {
5879 		ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
5880 		return;
5881 	}
5882 
5883 	srv_set_io_thread_op_info(segment, "complete io for fil node");
5884 
5885 	mutex_enter(&fil_system->mutex);
5886 
5887 	fil_node_complete_io(node, fil_system, type);
5888 
5889 	mutex_exit(&fil_system->mutex);
5890 
5891 	ut_ad(fil_validate_skip());
5892 
5893 	/* Do the i/o handling */
5894 	/* IMPORTANT: since i/o handling for reads will read also the insert
5895 	buffer in tablespace 0, you have to be very careful not to introduce
5896 	deadlocks in the i/o system. We keep tablespace 0 data files always
5897 	open, and use a special i/o thread to serve insert buffer requests. */
5898 
5899 	switch (node->space->purpose) {
5900 	case FIL_TYPE_TABLESPACE:
5901 	case FIL_TYPE_TEMPORARY:
5902 	case FIL_TYPE_IMPORT:
5903 		srv_set_io_thread_op_info(segment, "complete io for buf page");
5904 
5905 		/* async single page writes from the dblwr buffer don't have
5906 		access to the page */
5907 		if (message != NULL) {
5908 			buf_page_io_complete(static_cast<buf_page_t*>(message));
5909 		}
5910 		return;
5911 	case FIL_TYPE_LOG:
5912 		srv_set_io_thread_op_info(segment, "complete io for log");
5913 		log_io_complete(static_cast<log_group_t*>(message));
5914 		return;
5915 	}
5916 
5917 	ut_ad(0);
5918 }
5919 #endif /* !UNIV_HOTBACKUP */
5920 
5921 /**********************************************************************//**
5922 Flushes to disk possible writes cached by the OS. If the space does not exist
5923 or is being dropped, does not do anything. */
5924 void
fil_flush(ulint space_id)5925 fil_flush(
5926 /*======*/
5927 	ulint	space_id)	/*!< in: file space id (this can be a group of
5928 				log files or a tablespace of the database) */
5929 {
5930 	fil_node_t*	node;
5931 	pfs_os_file_t	file;
5932 
5933 	mutex_enter(&fil_system->mutex);
5934 
5935 	fil_space_t*	space = fil_space_get_by_id(space_id);
5936 
5937 	if (space == NULL
5938 	    || space->purpose == FIL_TYPE_TEMPORARY
5939 	    || space->stop_new_ops
5940 	    || space->is_being_truncated) {
5941 		mutex_exit(&fil_system->mutex);
5942 
5943 		return;
5944 	}
5945 
5946 	bool fbd = fil_buffering_disabled(space);
5947 	if (fbd) {
5948 
5949 		/* No need to flush. User has explicitly disabled
5950 		buffering. However, flush should be called if the file
5951                 size changes to keep OS metadata in sync. */
5952 		ut_ad(!space->is_in_unflushed_spaces);
5953 		ut_ad(fil_space_is_flushed(space));
5954 
5955 		/* Flush only if the file size changes */
5956 		bool no_flush = true;
5957 		for (node = UT_LIST_GET_FIRST(space->chain);
5958 		     node != NULL;
5959 		     node = UT_LIST_GET_NEXT(chain, node)) {
5960 #ifdef UNIV_DEBUG
5961 			ut_ad(node->modification_counter
5962 			      == node->flush_counter);
5963 #endif /* UNIV_DEBUG */
5964 			if (node->flush_size != node->size) {
5965 				/* Found at least one file whose size has changed */
5966 				no_flush = false;
5967 				break;
5968 			}
5969 		}
5970 
5971 		if (no_flush) {
5972 			mutex_exit(&fil_system->mutex);
5973 			return;
5974 		}
5975 	}
5976 
5977 	space->n_pending_flushes++;	/*!< prevent dropping of the space while
5978 					we are flushing */
5979 	for (node = UT_LIST_GET_FIRST(space->chain);
5980 	     node != NULL;
5981 	     node = UT_LIST_GET_NEXT(chain, node)) {
5982 
5983 		int64_t	old_mod_counter = node->modification_counter;
5984 
5985 		if (!node->is_open) {
5986 			continue;
5987 		}
5988 
5989 		/* Skip flushing if the file size has not changed since
5990 		last flush was done and the flush mode is O_DIRECT_NO_FSYNC */
5991 		if (fbd && (node->flush_size == node->size)) {
5992 			continue;
5993 		}
5994 
5995 		/* If we are here and the flush mode is O_DIRECT_NO_FSYNC, then
5996 		it means that the file size has changed and hence, it shold be
5997 		flushed, irrespective of the mod_counter and flush counter values,
5998 		which are always same in case of O_DIRECT_NO_FSYNC to avoid flush
5999 		on every write operation.
6000 		For other flush modes, if the flush_counter is same or ahead of
6001 		the mode_counter, skip the flush. */
6002 		if (!fbd && (old_mod_counter <= node->flush_counter)) {
6003 			continue;
6004 		}
6005 
6006 		switch (space->purpose) {
6007 		case FIL_TYPE_TEMPORARY:
6008 			ut_ad(0); // we already checked for this
6009 		case FIL_TYPE_TABLESPACE:
6010 		case FIL_TYPE_IMPORT:
6011 			fil_n_pending_tablespace_flushes++;
6012 			break;
6013 		case FIL_TYPE_LOG:
6014 			fil_n_pending_log_flushes++;
6015 			fil_n_log_flushes++;
6016 			break;
6017 		}
6018 #ifdef _WIN32
6019 		if (node->is_raw_disk) {
6020 
6021 			goto skip_flush;
6022 		}
6023 #endif /* _WIN32 */
6024 retry:
6025 		if (node->n_pending_flushes > 0) {
6026 			/* We want to avoid calling os_file_flush() on
6027 			the file twice at the same time, because we do
6028 			not know what bugs OS's may contain in file
6029 			i/o */
6030 
6031 #ifndef UNIV_HOTBACKUP
6032 			int64_t	sig_count = os_event_reset(node->sync_event);
6033 #endif /* !UNIV_HOTBACKUP */
6034 
6035 			mutex_exit(&fil_system->mutex);
6036 
6037 			os_event_wait_low(node->sync_event, sig_count);
6038 
6039 			mutex_enter(&fil_system->mutex);
6040 
6041 			if (node->flush_counter >= old_mod_counter) {
6042 
6043 				goto skip_flush;
6044 			}
6045 
6046 			goto retry;
6047 		}
6048 
6049 		ut_a(node->is_open);
6050 		file = node->handle;
6051 		node->n_pending_flushes++;
6052 
6053 		mutex_exit(&fil_system->mutex);
6054 
6055 		os_file_flush(file);
6056 
6057 		node->flush_size = node->size;
6058 
6059 		mutex_enter(&fil_system->mutex);
6060 
6061 		os_event_set(node->sync_event);
6062 
6063 		node->n_pending_flushes--;
6064 skip_flush:
6065 		if (node->flush_counter < old_mod_counter) {
6066 			node->flush_counter = old_mod_counter;
6067 
6068 			if (space->is_in_unflushed_spaces
6069 			    && fil_space_is_flushed(space)) {
6070 
6071 				space->is_in_unflushed_spaces = false;
6072 
6073 				UT_LIST_REMOVE(
6074 					fil_system->unflushed_spaces,
6075 					space);
6076 			}
6077 		}
6078 
6079 		switch (space->purpose) {
6080 		case FIL_TYPE_TEMPORARY:
6081 			ut_ad(0); // we already checked for this
6082 		case FIL_TYPE_TABLESPACE:
6083 		case FIL_TYPE_IMPORT:
6084 			fil_n_pending_tablespace_flushes--;
6085 			continue;
6086 		case FIL_TYPE_LOG:
6087 			fil_n_pending_log_flushes--;
6088 			continue;
6089 		}
6090 
6091 		ut_ad(0);
6092 	}
6093 
6094 	space->n_pending_flushes--;
6095 
6096 	mutex_exit(&fil_system->mutex);
6097 }
6098 
6099 /** Flush to disk the writes in file spaces of the given type
6100 possibly cached by the OS.
6101 @param[in]	purpose	FIL_TYPE_TABLESPACE or FIL_TYPE_LOG */
6102 void
fil_flush_file_spaces(fil_type_t purpose)6103 fil_flush_file_spaces(
6104 	fil_type_t	purpose)
6105 {
6106 	fil_space_t*	space;
6107 	ulint*		space_ids;
6108 	ulint		n_space_ids;
6109 
6110 	ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_LOG);
6111 
6112 	mutex_enter(&fil_system->mutex);
6113 
6114 	n_space_ids = UT_LIST_GET_LEN(fil_system->unflushed_spaces);
6115 	if (n_space_ids == 0) {
6116 
6117 		mutex_exit(&fil_system->mutex);
6118 		return;
6119 	}
6120 
6121 	/* Assemble a list of space ids to flush.  Previously, we
6122 	traversed fil_system->unflushed_spaces and called UT_LIST_GET_NEXT()
6123 	on a space that was just removed from the list by fil_flush().
6124 	Thus, the space could be dropped and the memory overwritten. */
6125 	space_ids = static_cast<ulint*>(
6126 		ut_malloc_nokey(n_space_ids * sizeof(*space_ids)));
6127 
6128 	n_space_ids = 0;
6129 
6130 	for (space = UT_LIST_GET_FIRST(fil_system->unflushed_spaces);
6131 	     space;
6132 	     space = UT_LIST_GET_NEXT(unflushed_spaces, space)) {
6133 
6134 		if (space->purpose == purpose
6135 		    && !space->stop_new_ops
6136 		    && !space->is_being_truncated) {
6137 
6138 			space_ids[n_space_ids++] = space->id;
6139 		}
6140 	}
6141 
6142 	mutex_exit(&fil_system->mutex);
6143 
6144 	/* Flush the spaces.  It will not hurt to call fil_flush() on
6145 	a non-existing space id. */
6146 	for (ulint i = 0; i < n_space_ids; i++) {
6147 
6148 		fil_flush(space_ids[i]);
6149 	}
6150 
6151 	ut_free(space_ids);
6152 }
6153 
6154 /** Functor to validate the file node list of a tablespace. */
6155 struct	Check {
6156 	/** Total size of file nodes visited so far */
6157 	ulint	size;
6158 	/** Total number of open files visited so far */
6159 	ulint	n_open;
6160 
6161 	/** Constructor */
CheckCheck6162 	Check() : size(0), n_open(0) {}
6163 
6164 	/** Visit a file node
6165 	@param[in]	elem	file node to visit */
operator ()Check6166 	void	operator()(const fil_node_t* elem)
6167 	{
6168 		ut_a(elem->is_open || !elem->n_pending);
6169 		n_open += elem->is_open;
6170 		size += elem->size;
6171 	}
6172 
6173 	/** Validate a tablespace.
6174 	@param[in]	space	tablespace to validate
6175 	@return		number of open file nodes */
validateCheck6176 	static ulint validate(const fil_space_t* space)
6177 	{
6178 		ut_ad(mutex_own(&fil_system->mutex));
6179 		Check	check;
6180 		ut_list_validate(space->chain, check);
6181 		ut_a(space->size == check.size);
6182 		return(check.n_open);
6183 	}
6184 };
6185 
6186 /******************************************************************//**
6187 Checks the consistency of the tablespace cache.
6188 @return true if ok */
6189 bool
fil_validate(void)6190 fil_validate(void)
6191 /*==============*/
6192 {
6193 	fil_space_t*	space;
6194 	fil_node_t*	fil_node;
6195 	ulint		n_open		= 0;
6196 
6197 	mutex_enter(&fil_system->mutex);
6198 
6199 	/* Look for spaces in the hash table */
6200 
6201 	for (ulint i = 0; i < hash_get_n_cells(fil_system->spaces); i++) {
6202 
6203 		for (space = static_cast<fil_space_t*>(
6204 				HASH_GET_FIRST(fil_system->spaces, i));
6205 		     space != 0;
6206 		     space = static_cast<fil_space_t*>(
6207 				HASH_GET_NEXT(hash, space))) {
6208 
6209 			n_open += Check::validate(space);
6210 		}
6211 	}
6212 
6213 	ut_a(fil_system->n_open == n_open);
6214 
6215 	UT_LIST_CHECK(fil_system->LRU);
6216 
6217 	for (fil_node = UT_LIST_GET_FIRST(fil_system->LRU);
6218 	     fil_node != 0;
6219 	     fil_node = UT_LIST_GET_NEXT(LRU, fil_node)) {
6220 
6221 		ut_a(fil_node->n_pending == 0);
6222 		ut_a(!fil_node->being_extended);
6223 		ut_a(fil_node->is_open);
6224 		ut_a(fil_space_belongs_in_lru(fil_node->space));
6225 	}
6226 
6227 	mutex_exit(&fil_system->mutex);
6228 
6229 	return(true);
6230 }
6231 
6232 /********************************************************************//**
6233 Returns true if file address is undefined.
6234 @return true if undefined */
6235 bool
fil_addr_is_null(fil_addr_t addr)6236 fil_addr_is_null(
6237 /*=============*/
6238 	fil_addr_t	addr)	/*!< in: address */
6239 {
6240 	return(addr.page == FIL_NULL);
6241 }
6242 
6243 /********************************************************************//**
6244 Get the predecessor of a file page.
6245 @return FIL_PAGE_PREV */
6246 ulint
fil_page_get_prev(const byte * page)6247 fil_page_get_prev(
6248 /*==============*/
6249 	const byte*	page)	/*!< in: file page */
6250 {
6251 	return(mach_read_from_4(page + FIL_PAGE_PREV));
6252 }
6253 
6254 /********************************************************************//**
6255 Get the successor of a file page.
6256 @return FIL_PAGE_NEXT */
6257 ulint
fil_page_get_next(const byte * page)6258 fil_page_get_next(
6259 /*==============*/
6260 	const byte*	page)	/*!< in: file page */
6261 {
6262 	return(mach_read_from_4(page + FIL_PAGE_NEXT));
6263 }
6264 
6265 /*********************************************************************//**
6266 Sets the file page type. */
6267 void
fil_page_set_type(byte * page,ulint type)6268 fil_page_set_type(
6269 /*==============*/
6270 	byte*	page,	/*!< in/out: file page */
6271 	ulint	type)	/*!< in: type */
6272 {
6273 	ut_ad(page);
6274 
6275 	mach_write_to_2(page + FIL_PAGE_TYPE, type);
6276 }
6277 
6278 #ifndef UNIV_HOTBACKUP
6279 /** Reset the page type.
6280 Data files created before MySQL 5.1 may contain garbage in FIL_PAGE_TYPE.
6281 In MySQL 3.23.53, only undo log pages and index pages were tagged.
6282 Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE.
6283 @param[in]	page_id	page number
6284 @param[in,out]	page	page with invalid FIL_PAGE_TYPE
6285 @param[in]	type	expected page type
6286 @param[in,out]	mtr	mini-transaction */
6287 void
fil_page_reset_type(const page_id_t & page_id,byte * page,ulint type,mtr_t * mtr)6288 fil_page_reset_type(
6289 	const page_id_t&	page_id,
6290 	byte*			page,
6291 	ulint			type,
6292 	mtr_t*			mtr)
6293 {
6294 	ib::info()
6295 		<< "Resetting invalid page " << page_id << " type "
6296 		<< fil_page_get_type(page) << " to " << type << ".";
6297 	mlog_write_ulint(page + FIL_PAGE_TYPE, type, MLOG_2BYTES, mtr);
6298 }
6299 #endif /* !UNIV_HOTBACKUP */
6300 
6301 /****************************************************************//**
6302 Closes the tablespace memory cache. */
6303 void
fil_close(void)6304 fil_close(void)
6305 /*===========*/
6306 {
6307 	hash_table_free(fil_system->spaces);
6308 
6309 	hash_table_free(fil_system->name_hash);
6310 
6311 	ut_a(UT_LIST_GET_LEN(fil_system->LRU) == 0);
6312 	ut_a(UT_LIST_GET_LEN(fil_system->unflushed_spaces) == 0);
6313 	ut_a(UT_LIST_GET_LEN(fil_system->space_list) == 0);
6314 
6315 	mutex_free(&fil_system->mutex);
6316 
6317 	ut_free(fil_system);
6318 	fil_system = NULL;
6319 }
6320 
6321 #ifndef UNIV_HOTBACKUP
6322 /********************************************************************//**
6323 Initializes a buffer control block when the buf_pool is created. */
6324 static
6325 void
fil_buf_block_init(buf_block_t * block,byte * frame)6326 fil_buf_block_init(
6327 /*===============*/
6328 	buf_block_t*	block,		/*!< in: pointer to control block */
6329 	byte*		frame)		/*!< in: pointer to buffer frame */
6330 {
6331 	UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE);
6332 
6333 	block->frame = frame;
6334 
6335 	block->page.io_fix = BUF_IO_NONE;
6336 	/* There are assertions that check for this. */
6337 	block->page.buf_fix_count = 1;
6338 	block->page.state = BUF_BLOCK_READY_FOR_USE;
6339 
6340 	page_zip_des_init(&block->page.zip);
6341 }
6342 
/** Parameters of one pass over a tablespace file, filled in by
fil_tablespace_iterate() and consumed by fil_iterate(). */
struct fil_iterator_t {
	pfs_os_file_t	file;			/*!< File handle */
	const char*	filepath;		/*!< File path name */
	os_offset_t	start;			/*!< From where to start
						(byte offset in the file) */
	os_offset_t	end;			/*!< Where to stop (byte
						offset in the file) */
	os_offset_t	file_size;		/*!< File size in bytes */
	ulint		page_size;		/*!< Page size; set from the
						physical page size of the
						tablespace */
	ulint		n_io_buffers;		/*!< Number of pages to use
						for IO */
	byte*		io_buffer;		/*!< Buffer to use for IO */
	byte*		encryption_key;		/*!< Encryption key; NULL
						means no encryption info is
						attached to the IO requests */
	byte*		encryption_iv;		/*!< Encryption iv */
	size_t		block_size;		/*!< FS Block Size */
};
6357 
6358 /********************************************************************//**
6359 TODO: This can be made parallel trivially by chunking up the file and creating
6360 a callback per thread. Main benefit will be to use multiple CPUs for
6361 checksums and compressed tables. We have to do compressed tables block by
6362 block right now. Secondly we need to decompress/compress and copy too much
6363 of data. These are CPU intensive.
6364 
6365 Iterate over all the pages in the tablespace.
6366 @param iter Tablespace iterator
6367 @param block block to use for IO
6368 @param callback Callback to inspect and update page contents
6369 @retval DB_SUCCESS or error code */
6370 static
6371 dberr_t
fil_iterate(const fil_iterator_t & iter,buf_block_t * block,PageCallback & callback)6372 fil_iterate(
6373 /*========*/
6374 	const fil_iterator_t&	iter,
6375 	buf_block_t*		block,
6376 	PageCallback&		callback)
6377 {
6378 	os_offset_t		offset;
6379 	ulint			page_no = 0;
6380 	ulint			space_id = callback.get_space_id();
6381 	ulint			n_bytes = iter.n_io_buffers * iter.page_size;
6382 
6383 	ut_ad(!srv_read_only_mode);
6384 
6385 	/* For old style compressed tables we do a lot of useless copying
6386 	for non-index pages. Unfortunately, it is required by
6387 	buf_zip_decompress() */
6388 
6389 	ulint	read_type = IORequest::READ;
6390 	ulint	write_type = IORequest::WRITE;
6391 
6392 	for (offset = iter.start; offset < iter.end; offset += n_bytes) {
6393 
6394 		byte*	io_buffer = iter.io_buffer;
6395 
6396 		block->frame = io_buffer;
6397 
6398 		if (callback.get_page_size().is_compressed()) {
6399 			page_zip_des_init(&block->page.zip);
6400 			page_zip_set_size(&block->page.zip, iter.page_size);
6401 
6402 			block->page.size.copy_from(
6403 				page_size_t(iter.page_size,
6404 					    univ_page_size.logical(),
6405 					    true));
6406 
6407 			block->page.zip.data = block->frame + UNIV_PAGE_SIZE;
6408 			ut_d(block->page.zip.m_external = true);
6409 			ut_ad(iter.page_size
6410 			      == callback.get_page_size().physical());
6411 
6412 			/* Zip IO is done in the compressed page buffer. */
6413 			io_buffer = block->page.zip.data;
6414 		} else {
6415 			io_buffer = iter.io_buffer;
6416 		}
6417 
6418 		/* We have to read the exact number of bytes. Otherwise the
6419 		InnoDB IO functions croak on failed reads. */
6420 
6421 		n_bytes = static_cast<ulint>(
6422 			ut_min(static_cast<os_offset_t>(n_bytes),
6423 			       iter.end - offset));
6424 
6425 		ut_ad(n_bytes > 0);
6426 		ut_ad(!(n_bytes % iter.page_size));
6427 
6428 		dberr_t		err;
6429 		IORequest	read_request(read_type);
6430 		read_request.block_size(iter.block_size);
6431 
6432 		/* For encrypted table, set encryption information. */
6433 		if (iter.encryption_key != NULL && offset != 0) {
6434 			read_request.encryption_key(iter.encryption_key,
6435 						    ENCRYPTION_KEY_LEN,
6436 						    iter.encryption_iv);
6437 			read_request.encryption_algorithm(Encryption::AES);
6438 		}
6439 
6440 		err = os_file_read(
6441 			read_request, iter.file, io_buffer, offset,
6442 			(ulint) n_bytes);
6443 
6444 		if (err != DB_SUCCESS) {
6445 
6446 			ib::error() << "os_file_read() failed";
6447 
6448 			return(err);
6449 		}
6450 
6451 		bool		updated = false;
6452 		os_offset_t	page_off = offset;
6453 		ulint		n_pages_read = (ulint) n_bytes / iter.page_size;
6454 
6455 		for (ulint i = 0; i < n_pages_read; ++i) {
6456 
6457 			buf_block_set_file_page(
6458 				block, page_id_t(space_id, page_no++));
6459 
6460 			if ((err = callback(page_off, block)) != DB_SUCCESS) {
6461 
6462 				return(err);
6463 
6464 			} else if (!updated) {
6465 				updated = buf_block_get_state(block)
6466 					== BUF_BLOCK_FILE_PAGE;
6467 			}
6468 
6469 			buf_block_set_state(block, BUF_BLOCK_NOT_USED);
6470 			buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE);
6471 
6472 			page_off += iter.page_size;
6473 			block->frame += iter.page_size;
6474 		}
6475 
6476 		IORequest	write_request(write_type);
6477 		write_request.block_size(iter.block_size);
6478 
6479 		/* For encrypted table, set encryption information. */
6480 		if (iter.encryption_key != NULL && offset != 0) {
6481 			write_request.encryption_key(iter.encryption_key,
6482 						     ENCRYPTION_KEY_LEN,
6483 						     iter.encryption_iv);
6484 			write_request.encryption_algorithm(Encryption::AES);
6485 		}
6486 
6487 		/* A page was updated in the set, write back to disk.
6488 		Note: We don't have the compression algorithm, we write
6489 		out the imported file as uncompressed. */
6490 
6491 		if (updated
6492 		    && (err = os_file_write(
6493 				write_request,
6494 				iter.filepath, iter.file, io_buffer,
6495 				offset, (ulint) n_bytes)) != DB_SUCCESS) {
6496 
6497 			/* This is not a hard error */
6498 			if (err == DB_IO_NO_PUNCH_HOLE) {
6499 
6500 				err = DB_SUCCESS;
6501 				write_type &= ~IORequest::PUNCH_HOLE;
6502 
6503 			} else {
6504 				ib::error() << "os_file_write() failed";
6505 
6506 				return(err);
6507 			}
6508 		}
6509 	}
6510 
6511 	return(DB_SUCCESS);
6512 }
6513 
6514 /********************************************************************//**
6515 Iterate over all the pages in the tablespace.
6516 @param table the table definiton in the server
6517 @param n_io_buffers number of blocks to read and write together
6518 @param callback functor that will do the page updates
6519 @return DB_SUCCESS or error code */
6520 dberr_t
fil_tablespace_iterate(dict_table_t * table,ulint n_io_buffers,PageCallback & callback)6521 fil_tablespace_iterate(
6522 /*===================*/
6523 	dict_table_t*	table,
6524 	ulint		n_io_buffers,
6525 	PageCallback&	callback)
6526 {
6527 	dberr_t		err;
6528 	pfs_os_file_t	file;
6529 	char*		filepath;
6530 	bool		success;
6531 
6532 	ut_a(n_io_buffers > 0);
6533 	ut_ad(!srv_read_only_mode);
6534 
6535 	DBUG_EXECUTE_IF("ib_import_trigger_corruption_1",
6536 			return(DB_CORRUPTION););
6537 
6538 	/* Make sure the data_dir_path is set. */
6539 	dict_get_and_save_data_dir_path(table, false);
6540 
6541 	if (DICT_TF_HAS_DATA_DIR(table->flags)) {
6542 		ut_a(table->data_dir_path);
6543 
6544 		filepath = fil_make_filepath(
6545 			table->data_dir_path, table->name.m_name, IBD, true);
6546 	} else {
6547 		filepath = fil_make_filepath(
6548 			NULL, table->name.m_name, IBD, false);
6549 	}
6550 
6551 	if (filepath == NULL) {
6552 		return(DB_OUT_OF_MEMORY);
6553 	}
6554 
6555 	file = os_file_create_simple_no_error_handling(
6556 		innodb_data_file_key, filepath,
6557 		OS_FILE_OPEN, OS_FILE_READ_WRITE, srv_read_only_mode, &success);
6558 
6559 	DBUG_EXECUTE_IF("fil_tablespace_iterate_failure",
6560 	{
6561 		static bool once;
6562 
6563 		if (!once || ut_rnd_interval(0, 10) == 5) {
6564 			once = true;
6565 			success = false;
6566 			os_file_close(file);
6567 		}
6568 	});
6569 
6570 	if (!success) {
6571 		/* The following call prints an error message */
6572 		os_file_get_last_error(true);
6573 
6574 		ib::error() << "Trying to import a tablespace, but could not"
6575 			" open the tablespace file " << filepath;
6576 
6577 		ut_free(filepath);
6578 
6579 		return(DB_TABLESPACE_NOT_FOUND);
6580 
6581 	} else {
6582 		err = DB_SUCCESS;
6583 	}
6584 
6585 	/* Set File System Block Size */
6586 	size_t block_size;
6587 	{
6588 		os_file_stat_t stat_info;
6589 
6590 		ut_d(dberr_t err =) os_file_get_status(filepath, &stat_info, false, false);
6591 		ut_ad(err == DB_SUCCESS);
6592 
6593 		block_size = stat_info.block_size;
6594 	}
6595 
6596 	callback.set_file(filepath, file);
6597 
6598 	os_offset_t	file_size = os_file_get_size(file);
6599 	ut_a(file_size != (os_offset_t) -1);
6600 
6601 	/* The block we will use for every physical page */
6602 	buf_block_t*	block;
6603 
6604 	block = reinterpret_cast<buf_block_t*>(ut_zalloc_nokey(sizeof(*block)));
6605 
6606 	mutex_create(LATCH_ID_BUF_BLOCK_MUTEX, &block->mutex);
6607 
6608 	/* Allocate a page to read in the tablespace header, so that we
6609 	can determine the page size and zip size (if it is compressed).
6610 	We allocate an extra page in case it is a compressed table. One
6611 	page is to ensure alignement. */
6612 
6613 	void*	page_ptr = ut_malloc_nokey(3 * UNIV_PAGE_SIZE);
6614 	byte*	page = static_cast<byte*>(ut_align(page_ptr, UNIV_PAGE_SIZE));
6615 
6616 	fil_buf_block_init(block, page);
6617 
6618 	/* Read the first page and determine the page and zip size. */
6619 
6620 	IORequest	request(IORequest::READ);
6621 
6622 	err = os_file_read(request, file, page, 0, UNIV_PAGE_SIZE);
6623 
6624 	if (err != DB_SUCCESS) {
6625 
6626 		err = DB_IO_ERROR;
6627 
6628 	} else if ((err = callback.init(file_size, block)) == DB_SUCCESS) {
6629 		fil_iterator_t	iter;
6630 
6631 		iter.file = file;
6632 		iter.start = 0;
6633 		iter.end = file_size;
6634 		iter.filepath = filepath;
6635 		iter.file_size = file_size;
6636 		iter.n_io_buffers = n_io_buffers;
6637 		iter.page_size = callback.get_page_size().physical();
6638 		iter.block_size = block_size;
6639 
6640 		/* Set encryption info. */
6641 		iter.encryption_key = table->encryption_key;
6642 		iter.encryption_iv = table->encryption_iv;
6643 
6644 		/* Check encryption is matched or not. */
6645 		ulint	space_flags = callback.get_space_flags();
6646 		if (FSP_FLAGS_GET_ENCRYPTION(space_flags)) {
6647 			if (!dict_table_is_encrypted(table)) {
6648 				ib::error() << "Table is not in an encrypted"
6649 					" tablespace, but the data file"
6650                                         " intended for import is an encrypted"
6651 					" tablespace";
6652 				err = DB_IO_NO_ENCRYPT_TABLESPACE;
6653 			} else {
6654 				/* encryption_key must have been populated
6655                                 while reading CFP file. */
6656 				ut_ad(table->encryption_key != NULL &&
6657 				table->encryption_iv != NULL);
6658 
6659 				if (table->encryption_key == NULL ||
6660 					table->encryption_iv == NULL) {
6661 					err = DB_ERROR;
6662 				}
6663 			}
6664 		}
6665 
6666 		if (err == DB_SUCCESS) {
6667 
6668 			/* Compressed pages can't be optimised for block IO
6669 			for now.  We do the IMPORT page by page. */
6670 
6671 			if (callback.get_page_size().is_compressed()) {
6672 				iter.n_io_buffers = 1;
6673 				ut_a(iter.page_size
6674 				     == callback.get_page_size().physical());
6675 			}
6676 
6677 			/** Add an extra page for compressed page scratch
6678 			area. */
6679 			void*	io_buffer = ut_malloc_nokey(
6680 				(2 + iter.n_io_buffers) * UNIV_PAGE_SIZE);
6681 
6682 			iter.io_buffer = static_cast<byte*>(
6683 				ut_align(io_buffer, UNIV_PAGE_SIZE));
6684 
6685 			err = fil_iterate(iter, block, callback);
6686 
6687 			ut_free(io_buffer);
6688 		}
6689 	}
6690 
6691 	if (err == DB_SUCCESS) {
6692 
6693 		ib::info() << "Sync to disk";
6694 
6695 		if (!os_file_flush(file)) {
6696 			ib::info() << "os_file_flush() failed!";
6697 			err = DB_IO_ERROR;
6698 		} else {
6699 			ib::info() << "Sync to disk - done!";
6700 		}
6701 	}
6702 
6703 	os_file_close(file);
6704 
6705 	ut_free(page_ptr);
6706 	ut_free(filepath);
6707 
6708 	mutex_free(&block->mutex);
6709 
6710 	ut_free(block);
6711 
6712 	return(err);
6713 }
6714 #endif /* !UNIV_HOTBACKUP */
6715 
6716 /** Set the tablespace table size.
6717 @param[in]	page	a page belonging to the tablespace */
6718 void
set_page_size(const buf_frame_t * page)6719 PageCallback::set_page_size(
6720 	const buf_frame_t*	page) UNIV_NOTHROW
6721 {
6722 	m_page_size.copy_from(fsp_header_get_page_size(page));
6723 }
6724 
6725 /********************************************************************//**
6726 Delete the tablespace file and any related files like .cfg.
6727 This should not be called for temporary tables.
6728 @param[in] ibd_filepath File path of the IBD tablespace */
6729 void
fil_delete_file(const char * ibd_filepath)6730 fil_delete_file(
6731 /*============*/
6732 	const char*	ibd_filepath)
6733 {
6734 	/* Force a delete of any stale .ibd files that are lying around. */
6735 
6736 	ib::info() << "Deleting " << ibd_filepath;
6737 
6738 	os_file_delete_if_exists(innodb_data_file_key, ibd_filepath, NULL);
6739 
6740 	char*	cfg_filepath = fil_make_filepath(
6741 		ibd_filepath, NULL, CFG, false);
6742 	if (cfg_filepath != NULL) {
6743 		os_file_delete_if_exists(
6744 			innodb_data_file_key, cfg_filepath, NULL);
6745 		ut_free(cfg_filepath);
6746 	}
6747 
6748 	char*	cfp_filepath = fil_make_filepath(
6749 		ibd_filepath, NULL, CFP, false);
6750 	if (cfp_filepath != NULL) {
6751 		os_file_delete_if_exists(
6752 			innodb_data_file_key, cfp_filepath, NULL);
6753 		ut_free(cfp_filepath);
6754 	}
6755 }
6756 
6757 /**
6758 Iterate over all the spaces in the space list and fetch the
6759 tablespace names. It will return a copy of the name that must be
6760 freed by the caller using: delete[].
6761 @return DB_SUCCESS if all OK. */
6762 dberr_t
fil_get_space_names(space_name_list_t & space_name_list)6763 fil_get_space_names(
6764 /*================*/
6765 	space_name_list_t&	space_name_list)
6766 				/*!< in/out: List to append to */
6767 {
6768 	fil_space_t*	space;
6769 	dberr_t		err = DB_SUCCESS;
6770 
6771 	mutex_enter(&fil_system->mutex);
6772 
6773 	for (space = UT_LIST_GET_FIRST(fil_system->space_list);
6774 	     space != NULL;
6775 	     space = UT_LIST_GET_NEXT(space_list, space)) {
6776 
6777 		if (space->purpose == FIL_TYPE_TABLESPACE) {
6778 			ulint	len;
6779 			char*	name;
6780 
6781 			len = ::strlen(space->name);
6782 			name = UT_NEW_ARRAY_NOKEY(char, len + 1);
6783 
6784 			if (name == 0) {
6785 				/* Caller to free elements allocated so far. */
6786 				err = DB_OUT_OF_MEMORY;
6787 				break;
6788 			}
6789 
6790 			memcpy(name, space->name, len);
6791 			name[len] = 0;
6792 
6793 			space_name_list.push_back(name);
6794 		}
6795 	}
6796 
6797 	mutex_exit(&fil_system->mutex);
6798 
6799 	return(err);
6800 }
6801 
6802 #ifndef UNIV_HOTBACKUP
6803 /** Return the next fil_node_t in the current or next fil_space_t.
6804 Once started, the caller must keep calling this until it returns NULL.
6805 fil_space_acquire() and fil_space_release() are invoked here which
6806 blocks a concurrent operation from dropping the tablespace.
6807 @param[in]	prev_node	Pointer to the previous fil_node_t.
6808 If NULL, use the first fil_space_t on fil_system->space_list.
6809 @return pointer to the next fil_node_t.
6810 @retval NULL if this was the last file node */
6811 const fil_node_t*
fil_node_next(const fil_node_t * prev_node)6812 fil_node_next(
6813 	const fil_node_t*	prev_node)
6814 {
6815 	fil_space_t*		space;
6816 	const fil_node_t*	node = prev_node;
6817 
6818 	mutex_enter(&fil_system->mutex);
6819 
6820 	if (node == NULL) {
6821 		space = UT_LIST_GET_FIRST(fil_system->space_list);
6822 
6823 		/* We can trust that space is not NULL because at least the
6824 		system tablespace is always present and loaded first. */
6825 		space->n_pending_ops++;
6826 
6827 		node = UT_LIST_GET_FIRST(space->chain);
6828 		ut_ad(node != NULL);
6829 	} else {
6830 		space = node->space;
6831 		ut_ad(space->n_pending_ops > 0);
6832 		node = UT_LIST_GET_NEXT(chain, node);
6833 
6834 		if (node == NULL) {
6835 			/* Move on to the next fil_space_t */
6836 			space->n_pending_ops--;
6837 			space = UT_LIST_GET_NEXT(space_list, space);
6838 
6839 			/* Skip spaces that are being
6840 			created by fil_ibd_create(),
6841 			or dropped or truncated. */
6842 			while (space != NULL
6843 			       && (UT_LIST_GET_LEN(space->chain) == 0
6844 				   || space->stop_new_ops
6845 				   || space->is_being_truncated)) {
6846 				space = UT_LIST_GET_NEXT(space_list, space);
6847 			}
6848 
6849 			if (space != NULL) {
6850 				space->n_pending_ops++;
6851 				node = UT_LIST_GET_FIRST(space->chain);
6852 				ut_ad(node != NULL);
6853 			}
6854 		}
6855 	}
6856 
6857 	mutex_exit(&fil_system->mutex);
6858 
6859 	return(node);
6860 }
6861 
6862 /** Generate redo log for swapping two .ibd files
6863 @param[in]	old_table	old table
6864 @param[in]	new_table	new table
6865 @param[in]	tmp_name	temporary table name
6866 @param[in,out]	mtr		mini-transaction
6867 @return innodb error code */
6868 dberr_t
fil_mtr_rename_log(const dict_table_t * old_table,const dict_table_t * new_table,const char * tmp_name,mtr_t * mtr)6869 fil_mtr_rename_log(
6870 	const dict_table_t*	old_table,
6871 	const dict_table_t*	new_table,
6872 	const char*		tmp_name,
6873 	mtr_t*			mtr)
6874 {
6875 	dberr_t	err;
6876 
6877 	bool	old_is_file_per_table =
6878 		!is_system_tablespace(old_table->space)
6879 		&& !DICT_TF_HAS_SHARED_SPACE(old_table->flags);
6880 
6881 	bool	new_is_file_per_table =
6882 		!is_system_tablespace(new_table->space)
6883 		&& !DICT_TF_HAS_SHARED_SPACE(new_table->flags);
6884 
6885 	/* If neither table is file-per-table,
6886 	there will be no renaming of files. */
6887 	if (!old_is_file_per_table && !new_is_file_per_table) {
6888 		return(DB_SUCCESS);
6889 	}
6890 
6891 	const char*	old_dir = DICT_TF_HAS_DATA_DIR(old_table->flags)
6892 		? old_table->data_dir_path
6893 		: NULL;
6894 
6895 	char*	old_path = fil_make_filepath(
6896 		old_dir, old_table->name.m_name, IBD, (old_dir != NULL));
6897 	if (old_path == NULL) {
6898 		return(DB_OUT_OF_MEMORY);
6899 	}
6900 
6901 	if (old_is_file_per_table) {
6902 		char*	tmp_path = fil_make_filepath(
6903 			old_dir, tmp_name, IBD, (old_dir != NULL));
6904 		if (tmp_path == NULL) {
6905 			ut_free(old_path);
6906 			return(DB_OUT_OF_MEMORY);
6907 		}
6908 
6909 		/* Temp filepath must not exist. */
6910 		err = fil_rename_tablespace_check(
6911 			old_table->space, old_path, tmp_path,
6912 			dict_table_is_discarded(old_table));
6913 		if (err != DB_SUCCESS) {
6914 			ut_free(old_path);
6915 			ut_free(tmp_path);
6916 			return(err);
6917 		}
6918 
6919 		fil_name_write_rename(
6920 			old_table->space, 0, old_path, tmp_path, mtr);
6921 
6922 		ut_free(tmp_path);
6923 	}
6924 
6925 	if (new_is_file_per_table) {
6926 		const char*	new_dir = DICT_TF_HAS_DATA_DIR(new_table->flags)
6927 			? new_table->data_dir_path
6928 			: NULL;
6929 		char*	new_path = fil_make_filepath(
6930 				new_dir, new_table->name.m_name,
6931 				IBD, (new_dir != NULL));
6932 		if (new_path == NULL) {
6933 			ut_free(old_path);
6934 			return(DB_OUT_OF_MEMORY);
6935 		}
6936 
6937 		/* Destination filepath must not exist unless this ALTER
6938 		TABLE starts and ends with a file_per-table tablespace. */
6939 		if (!old_is_file_per_table) {
6940 			err = fil_rename_tablespace_check(
6941 				new_table->space, new_path, old_path,
6942 				dict_table_is_discarded(new_table));
6943 			if (err != DB_SUCCESS) {
6944 				ut_free(old_path);
6945 				ut_free(new_path);
6946 				return(err);
6947 			}
6948 		}
6949 
6950 		fil_name_write_rename(
6951 			new_table->space, 0, new_path, old_path, mtr);
6952 
6953 		ut_free(new_path);
6954 	}
6955 
6956 	ut_free(old_path);
6957 
6958 	return(DB_SUCCESS);
6959 }
6960 #endif /* !UNIV_HOTBACKUP */
6961 #ifdef UNIV_DEBUG
6962 /** Check that a tablespace is valid for mtr_commit().
6963 @param[in]	space	persistent tablespace that has been changed */
6964 static
6965 void
fil_space_validate_for_mtr_commit(const fil_space_t * space)6966 fil_space_validate_for_mtr_commit(
6967 	const fil_space_t*	space)
6968 {
6969 	ut_ad(!mutex_own(&fil_system->mutex));
6970 	ut_ad(space != NULL);
6971 	ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
6972 	ut_ad(!is_predefined_tablespace(space->id));
6973 
6974 	/* We are serving mtr_commit(). While there is an active
6975 	mini-transaction, we should have !space->stop_new_ops. This is
6976 	guaranteed by meta-data locks or transactional locks, or
6977 	dict_operation_lock (X-lock in DROP, S-lock in purge).
6978 
6979 	However, a file I/O thread can invoke change buffer merge
6980 	while fil_check_pending_operations() is waiting for operations
6981 	to quiesce. This is not a problem, because
6982 	ibuf_merge_or_delete_for_page() would call
6983 	fil_space_acquire() before mtr_start() and
6984 	fil_space_release() after mtr_commit(). This is why
6985 	n_pending_ops should not be zero if stop_new_ops is set. */
6986 	ut_ad(!space->stop_new_ops
6987 	      || space->is_being_truncated /* TRUNCATE sets stop_new_ops */
6988 	      || space->n_pending_ops > 0);
6989 }
6990 #endif /* UNIV_DEBUG */
6991 
6992 /** Write a MLOG_FILE_NAME record for a persistent tablespace.
6993 @param[in]	space	tablespace
6994 @param[in,out]	mtr	mini-transaction */
6995 static
6996 void
fil_names_write(const fil_space_t * space,mtr_t * mtr)6997 fil_names_write(
6998 	const fil_space_t*	space,
6999 	mtr_t*			mtr)
7000 {
7001 	ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
7002 	fil_name_write(space, 0, UT_LIST_GET_FIRST(space->chain), mtr);
7003 }
7004 
7005 /** Note that a non-predefined persistent tablespace has been modified
7006 by redo log.
7007 @param[in,out]	space	tablespace */
7008 void
fil_names_dirty(fil_space_t * space)7009 fil_names_dirty(
7010 	fil_space_t*	space)
7011 {
7012 	ut_ad(log_mutex_own());
7013 	ut_ad(recv_recovery_is_on());
7014 	ut_ad(log_sys->lsn != 0);
7015 	ut_ad(space->max_lsn == 0);
7016 	ut_d(fil_space_validate_for_mtr_commit(space));
7017 
7018 	UT_LIST_ADD_LAST(fil_system->named_spaces, space);
7019 	space->max_lsn = log_sys->lsn;
7020 }
7021 
7022 /** Write MLOG_FILE_NAME records when a non-predefined persistent
7023 tablespace was modified for the first time since the latest
7024 fil_names_clear().
7025 @param[in,out]	space	tablespace
7026 @param[in,out]	mtr	mini-transaction */
7027 void
fil_names_dirty_and_write(fil_space_t * space,mtr_t * mtr)7028 fil_names_dirty_and_write(
7029 	fil_space_t*	space,
7030 	mtr_t*		mtr)
7031 {
7032 	ut_ad(log_mutex_own());
7033 	ut_d(fil_space_validate_for_mtr_commit(space));
7034 	ut_ad(space->max_lsn == log_sys->lsn);
7035 
7036 	UT_LIST_ADD_LAST(fil_system->named_spaces, space);
7037 	fil_names_write(space, mtr);
7038 
7039 	DBUG_EXECUTE_IF("fil_names_write_bogus",
7040 			{
7041 				char bogus_name[] = "./test/bogus file.ibd";
7042 				os_normalize_path(bogus_name);
7043 				fil_name_write(
7044 					SRV_LOG_SPACE_FIRST_ID, 0,
7045 					bogus_name, mtr);
7046 			});
7047 }
7048 #ifndef UNIV_HOTBACKUP
7049 /** On a log checkpoint, reset fil_names_dirty_and_write() flags
7050 and write out MLOG_FILE_NAME and MLOG_CHECKPOINT if needed.
7051 @param[in]	lsn		checkpoint LSN
7052 @param[in]	do_write	whether to always write MLOG_CHECKPOINT
7053 @return whether anything was written to the redo log
7054 @retval false	if no flags were set and nothing written
7055 @retval true	if anything was written to the redo log */
7056 bool
fil_names_clear(lsn_t lsn,bool do_write)7057 fil_names_clear(
7058 	lsn_t	lsn,
7059 	bool	do_write)
7060 {
7061 	mtr_t	mtr;
7062 	ulint	mtr_checkpoint_size = LOG_CHECKPOINT_FREE_PER_THREAD;
7063 
7064 	DBUG_EXECUTE_IF(
7065 		"increase_mtr_checkpoint_size",
7066 		mtr_checkpoint_size = 75 * 1024;
7067 		);
7068 
7069 	ut_ad(log_mutex_own());
7070 
7071 	if (log_sys->append_on_checkpoint) {
7072 		mtr_write_log(log_sys->append_on_checkpoint);
7073 		do_write = true;
7074 	}
7075 
7076 	mtr.start();
7077 
7078 	for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system->named_spaces);
7079 	     space != NULL; ) {
7080 		fil_space_t*	next = UT_LIST_GET_NEXT(named_spaces, space);
7081 
7082 		ut_ad(space->max_lsn > 0);
7083 		if (space->max_lsn < lsn) {
7084 			/* The tablespace was last dirtied before the
7085 			checkpoint LSN. Remove it from the list, so
7086 			that if the tablespace is not going to be
7087 			modified any more, subsequent checkpoints will
7088 			avoid calling fil_names_write() on it. */
7089 			space->max_lsn = 0;
7090 			UT_LIST_REMOVE(fil_system->named_spaces, space);
7091 		}
7092 
7093 		/* max_lsn is the last LSN where fil_names_dirty_and_write()
7094 		was called. If we kept track of "min_lsn" (the first LSN
7095 		where max_lsn turned nonzero), we could avoid the
7096 		fil_names_write() call if min_lsn > lsn. */
7097 
7098 		fil_names_write(space, &mtr);
7099 		do_write = true;
7100 
7101 		const mtr_buf_t* mtr_log = mtr_get_log(&mtr);
7102 
7103 		/** If the mtr buffer size exceeds the size of
7104 		LOG_CHECKPOINT_FREE_PER_THREAD then commit the multi record
7105 		mini-transaction, start the new mini-transaction to
7106 		avoid the parsing buffer overflow error during recovery. */
7107 
7108 		if (mtr_log->size() > mtr_checkpoint_size) {
7109 			ut_ad(mtr_log->size() < (RECV_PARSING_BUF_SIZE / 2));
7110 			mtr.commit_checkpoint(lsn, false);
7111 			mtr.start();
7112 		}
7113 
7114 		space = next;
7115 	}
7116 
7117 	if (do_write) {
7118 		mtr.commit_checkpoint(lsn, true);
7119 	} else {
7120 		ut_ad(!mtr.has_modifications());
7121 	}
7122 
7123 	return(do_write);
7124 }
7125 
7126 /** Truncate a single-table tablespace. The tablespace must be cached
7127 in the memory cache.
7128 @param space_id			space id
7129 @param dir_path			directory path
7130 @param tablename		the table name in the usual
7131 				databasename/tablename format of InnoDB
7132 @param flags			tablespace flags
7133 @param trunc_to_default		truncate to default size if tablespace
7134 				is being newly re-initialized.
7135 @return DB_SUCCESS or error */
7136 dberr_t
truncate(ulint space_id,const char * dir_path,const char * tablename,ulint flags,bool trunc_to_default)7137 truncate_t::truncate(
7138 /*=================*/
7139 	ulint		space_id,
7140 	const char*	dir_path,
7141 	const char*	tablename,
7142 	ulint		flags,
7143 	bool		trunc_to_default)
7144 {
7145 	dberr_t		err = DB_SUCCESS;
7146 	char*		path;
7147 	bool		has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags);
7148 
7149 	ut_a(!is_system_tablespace(space_id));
7150 
7151 	if (has_data_dir) {
7152 		ut_ad(dir_path != NULL);
7153 
7154 		path = fil_make_filepath(dir_path, tablename, IBD, true);
7155 
7156 	} else {
7157 		path = fil_make_filepath(NULL, tablename, IBD, false);
7158 	}
7159 
7160 	if (path == NULL) {
7161 		return(DB_OUT_OF_MEMORY);
7162 	}
7163 
7164 	mutex_enter(&fil_system->mutex);
7165 
7166 	fil_space_t*	space = fil_space_get_by_id(space_id);
7167 
7168 	/* The following code must change when InnoDB supports
7169 	multiple datafiles per tablespace. */
7170 	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
7171 
7172 	fil_node_t*	node = UT_LIST_GET_FIRST(space->chain);
7173 
7174 	if (trunc_to_default) {
7175 		space->size = node->size = FIL_IBD_FILE_INITIAL_SIZE;
7176 	}
7177 
7178 	const bool already_open = node->is_open;
7179 
7180 	if (!already_open) {
7181 
7182 		bool	ret;
7183 
7184 		node->handle = os_file_create_simple_no_error_handling(
7185 			innodb_data_file_key, path, OS_FILE_OPEN,
7186 			OS_FILE_READ_WRITE,
7187 			fsp_is_system_temporary(space_id)
7188 			? false : srv_read_only_mode, &ret);
7189 
7190 		if (!ret) {
7191 			ib::error() << "Failed to open tablespace file "
7192 				<< path << ".";
7193 
7194 			ut_free(path);
7195 
7196 			return(DB_ERROR);
7197 		}
7198 
7199 		node->is_open = true;
7200 	}
7201 
7202 	os_offset_t	trunc_size = trunc_to_default
7203 		? FIL_IBD_FILE_INITIAL_SIZE
7204 		: space->size;
7205 
7206 	const bool success = os_file_truncate(
7207 		path, node->handle, trunc_size * UNIV_PAGE_SIZE);
7208 
7209 	if (!success) {
7210 		ib::error() << "Cannot truncate file " << path
7211 			<< " in TRUNCATE TABLESPACE.";
7212 		err = DB_ERROR;
7213 	}
7214 
7215 	space->stop_new_ops = false;
7216 	space->is_being_truncated = false;
7217 
7218 	/* If we opened the file in this function, close it. */
7219 	if (!already_open) {
7220 		bool	closed = os_file_close(node->handle);
7221 
7222 		if (!closed) {
7223 
7224 			ib::error() << "Failed to close tablespace file "
7225 				<< path << ".";
7226 
7227 			err = DB_ERROR;
7228 		} else {
7229 			node->is_open = false;
7230 		}
7231 	}
7232 
7233 	mutex_exit(&fil_system->mutex);
7234 
7235 	ut_free(path);
7236 
7237 	return(err);
7238 }
7239 #endif /* !UNIV_HOTBACKUP */
7240 
7241 /**
7242 Note that the file system where the file resides doesn't support PUNCH HOLE.
7243 Called from AIO handlers when IO returns DB_IO_NO_PUNCH_HOLE
7244 @param[in,out]	node		Node to set */
7245 void
fil_no_punch_hole(fil_node_t * node)7246 fil_no_punch_hole(fil_node_t* node)
7247 {
7248 	node->punch_hole = false;
7249 }
7250 
7251 /** Set the compression type for the tablespace of a table
7252 @param[in]	table		The table that should be compressed
7253 @param[in]	algorithm	Text representation of the algorithm
7254 @return DB_SUCCESS or error code */
7255 dberr_t
fil_set_compression(dict_table_t * table,const char * algorithm)7256 fil_set_compression(
7257 	dict_table_t*	table,
7258 	const char*	algorithm)
7259 {
7260 	ut_ad(table != NULL);
7261 
7262 	/* We don't support Page Compression for the system tablespace,
7263 	the temporary tablespace, or any general tablespace because
7264 	COMPRESSION is set by TABLE DDL, not TABLESPACE DDL. There is
7265 	no other technical reason.  Also, do not use it for missing
7266 	tables or tables with compressed row_format. */
7267 	if (table->ibd_file_missing
7268 	    || !DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_FILE_PER_TABLE)
7269 	    || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY)
7270 	    || page_size_t(table->flags).is_compressed()) {
7271 
7272 		return(DB_IO_NO_PUNCH_HOLE_TABLESPACE);
7273 	}
7274 
7275 	dberr_t		err;
7276 	Compression	compression;
7277 
7278 	if (algorithm == NULL || strlen(algorithm) == 0) {
7279 
7280 #ifndef UNIV_DEBUG
7281 		compression.m_type = Compression::NONE;
7282 #else
7283 		/* This is a Debug tool for setting compression on all
7284 		compressible tables not otherwise specified. */
7285 		switch (srv_debug_compress) {
7286 		case Compression::LZ4:
7287 		case Compression::ZLIB:
7288 		case Compression::NONE:
7289 
7290 			compression.m_type =
7291 				static_cast<Compression::Type>(
7292 					srv_debug_compress);
7293 			break;
7294 
7295 		default:
7296 			compression.m_type = Compression::NONE;
7297 		}
7298 
7299 #endif /* UNIV_DEBUG */
7300 
7301 		err = DB_SUCCESS;
7302 
7303 	} else {
7304 
7305 		err = Compression::check(algorithm, &compression);
7306 	}
7307 
7308 	fil_space_t*	space = fil_space_get(table->space);
7309 
7310 	if (space == NULL) {
7311 		return(DB_NOT_FOUND);
7312 	}
7313 
7314 	space->compression_type = compression.m_type;
7315 
7316 	if (space->compression_type != Compression::NONE) {
7317 
7318 		const fil_node_t* node;
7319 
7320 		node = UT_LIST_GET_FIRST(space->chain);
7321 
7322 		if (!node->punch_hole) {
7323 
7324 			return(DB_IO_NO_PUNCH_HOLE_FS);
7325 		}
7326 	}
7327 
7328 	return(err);
7329 }
7330 
7331 /** Get the compression algorithm for a tablespace.
7332 @param[in]	space_id	Space ID to check
7333 @return the compression algorithm */
7334 Compression::Type
fil_get_compression(ulint space_id)7335 fil_get_compression(
7336 	ulint	space_id)
7337 {
7338 	fil_space_t*	space = fil_space_get(space_id);
7339 
7340 	return(space == NULL ? Compression::NONE : space->compression_type);
7341 }
7342 
7343 /** Set the encryption type for the tablespace
7344 @param[in] space_id		Space ID of tablespace for which to set
7345 @param[in] algorithm		Encryption algorithm
7346 @param[in] key			Encryption key
7347 @param[in] iv			Encryption iv
7348 @return DB_SUCCESS or error code */
7349 dberr_t
fil_set_encryption(ulint space_id,Encryption::Type algorithm,byte * key,byte * iv)7350 fil_set_encryption(
7351 	ulint			space_id,
7352 	Encryption::Type	algorithm,
7353 	byte*			key,
7354 	byte*			iv)
7355 {
7356 	ut_ad(!is_system_or_undo_tablespace(space_id));
7357 
7358 	if (is_system_tablespace(space_id)) {
7359 		return(DB_IO_NO_ENCRYPT_TABLESPACE);
7360 	}
7361 
7362 	mutex_enter(&fil_system->mutex);
7363 
7364 	fil_space_t*	space = fil_space_get_by_id(space_id);
7365 
7366 	if (space == NULL) {
7367 		mutex_exit(&fil_system->mutex);
7368 		return(DB_NOT_FOUND);
7369 	}
7370 
7371 	ut_ad(algorithm != Encryption::NONE);
7372 	space->encryption_type = algorithm;
7373 	if (key == NULL) {
7374 		Encryption::random_value(space->encryption_key);
7375 	} else {
7376 		memcpy(space->encryption_key,
7377 		       key, ENCRYPTION_KEY_LEN);
7378 	}
7379 
7380 	space->encryption_klen = ENCRYPTION_KEY_LEN;
7381 	if (iv == NULL) {
7382 		Encryption::random_value(space->encryption_iv);
7383 	} else {
7384 		memcpy(space->encryption_iv,
7385 		       iv, ENCRYPTION_KEY_LEN);
7386 	}
7387 
7388 	mutex_exit(&fil_system->mutex);
7389 
7390 	return(DB_SUCCESS);
7391 }
7392 
7393 /** Rotate the tablespace keys by new master key.
7394 @return true if the re-encrypt suceeds */
7395 bool
fil_encryption_rotate()7396 fil_encryption_rotate()
7397 {
7398 	fil_space_t*	space;
7399 	mtr_t		mtr;
7400 	byte		encrypt_info[ENCRYPTION_INFO_SIZE_V2];
7401 
7402 	for (space = UT_LIST_GET_FIRST(fil_system->space_list);
7403 	     space != NULL; ) {
7404 		/* Skip unencypted tablespaces. */
7405 		if (is_system_or_undo_tablespace(space->id)
7406 		    || fsp_is_system_temporary(space->id)
7407 		    || space->purpose == FIL_TYPE_LOG) {
7408 			space = UT_LIST_GET_NEXT(space_list, space);
7409 			continue;
7410 		}
7411 
7412 		if (space->encryption_type != Encryption::NONE) {
7413 			mtr_start(&mtr);
7414 			mtr.set_named_space(space->id);
7415 
7416 			space = mtr_x_lock_space(space->id, &mtr);
7417 
7418 			memset(encrypt_info, 0, ENCRYPTION_INFO_SIZE_V2);
7419 
7420 			if (!fsp_header_rotate_encryption(space,
7421 							  encrypt_info,
7422 							  &mtr)) {
7423 				mtr_commit(&mtr);
7424 				return(false);
7425 			}
7426 
7427 			mtr_commit(&mtr);
7428 		}
7429 
7430 		space = UT_LIST_GET_NEXT(space_list, space);
7431 		DBUG_EXECUTE_IF("ib_crash_during_rotation_for_encryption",
7432 				DBUG_SUICIDE(););
7433 	}
7434 
7435 	return(true);
7436 }
7437 
7438 /** Build the basic folder name from the path and length provided
7439 @param[in]	path	pathname (may also include the file basename)
7440 @param[in]	len	length of the path, in bytes */
7441 void
make_path(const char * path,size_t len)7442 Folder::make_path(const char* path, size_t len)
7443 {
7444 	if (is_absolute_path(path)) {
7445 		m_folder = mem_strdupl(path, len);
7446 		m_folder_len = len;
7447 	}
7448 	else {
7449 		size_t n = 2 + len + strlen(fil_path_to_mysql_datadir);
7450 		m_folder = static_cast<char*>(ut_malloc_nokey(n));
7451 		m_folder_len = 0;
7452 
7453 		if (path != fil_path_to_mysql_datadir) {
7454 			/* Put the mysqld datadir into m_folder first. */
7455 			ut_ad(fil_path_to_mysql_datadir[0] != '\0');
7456 			m_folder_len = strlen(fil_path_to_mysql_datadir);
7457 			memcpy(m_folder, fil_path_to_mysql_datadir,
7458 			       m_folder_len);
7459 			if (m_folder[m_folder_len - 1] != OS_PATH_SEPARATOR) {
7460 				m_folder[m_folder_len++] = OS_PATH_SEPARATOR;
7461 			}
7462 		}
7463 
7464 		/* Append the path. */
7465 		memcpy(m_folder + m_folder_len, path, len);
7466 		m_folder_len += len;
7467 		m_folder[m_folder_len] = '\0';
7468 	}
7469 
7470 	os_normalize_path(m_folder);
7471 }
7472 
7473 /** Resolve a relative path in m_folder to an absolute path
7474 in m_abs_path setting m_abs_len. */
7475 void
make_abs_path()7476 Folder::make_abs_path()
7477 {
7478 	my_realpath(m_abs_path, m_folder, MYF(0));
7479 	m_abs_len = strlen(m_abs_path);
7480 
7481 	ut_ad(m_abs_len + 1 < sizeof(m_abs_path));
7482 
7483 	/* Folder::related_to() needs a trailing separator. */
7484 	if (m_abs_path[m_abs_len - 1] != OS_PATH_SEPARATOR) {
7485 		m_abs_path[m_abs_len] = OS_PATH_SEPARATOR;
7486 		m_abs_path[++m_abs_len] = '\0';
7487 	}
7488 }
7489 
7490 /** Constructor
7491 @param[in]	path	pathname (may also include the file basename)
7492 @param[in]	len	length of the path, in bytes */
Folder(const char * path,size_t len)7493 Folder::Folder(const char* path, size_t len)
7494 {
7495 	make_path(path, len);
7496 	make_abs_path();
7497 }
7498 
7499 /** Assignment operator
7500 @param[in]	folder	folder string provided */
7501 class Folder&
operator =(const char * path)7502 Folder::operator=(const char* path)
7503 {
7504 	ut_free(m_folder);
7505 	make_path(path, strlen(path));
7506 	make_abs_path();
7507 
7508 	return(*this);
7509 }
7510 
7511 /** Determine if two folders are equal
7512 @param[in]	other	folder to compare to
7513 @return whether the folders are equal */
operator ==(const Folder & other) const7514 bool Folder::operator==(const Folder& other) const
7515 {
7516 	return(m_abs_len == other.m_abs_len
7517 	       && !memcmp(m_abs_path, other.m_abs_path, m_abs_len));
7518 }
7519 
7520 /** Determine if the left folder is the same or an ancestor of
7521 (contains) the right folder.
7522 @param[in]	other	folder to compare to
7523 @return whether this is the same or an ancestor of the other folder. */
operator >=(const Folder & other) const7524 bool Folder::operator>=(const Folder& other) const
7525 {
7526 	return(m_abs_len <= other.m_abs_len
7527 		&& (!memcmp(other.m_abs_path, m_abs_path, m_abs_len)));
7528 }
7529 
7530 /** Determine if the left folder is an ancestor of (contains)
7531 the right folder.
7532 @param[in]	other	folder to compare to
7533 @return whether this is an ancestor of the other folder */
operator >(const Folder & other) const7534 bool Folder::operator>(const Folder& other) const
7535 {
7536 	return(m_abs_len < other.m_abs_len
7537 	       && (!memcmp(other.m_abs_path, m_abs_path, m_abs_len)));
7538 }
7539 
7540 /** Determine if the directory referenced by m_folder exists.
7541 @return whether the directory exists */
7542 bool
exists()7543 Folder::exists()
7544 {
7545 	bool		exists;
7546 	os_file_type_t	type;
7547 
7548 #ifdef _WIN32
7549 	/* Temporarily strip the trailing_separator since it will cause
7550 	_stat64() to fail on Windows unless the path is the root of some
7551 	drive; like "c:\".  _stat64() will fail if it is "c:". */
7552 	size_t	len = strlen(m_abs_path);
7553 	if (m_abs_path[m_abs_len - 1] == OS_PATH_SEPARATOR
7554 	    && m_abs_path[m_abs_len - 2] != ':') {
7555 		m_abs_path[m_abs_len - 1] = '\0';
7556 	}
7557 #endif /* WIN32 */
7558 
7559 	bool ret = os_file_status(m_abs_path, &exists, &type);
7560 
7561 #ifdef _WIN32
7562 	/* Put the separator back on. */
7563 	if (m_abs_path[m_abs_len - 1] == '\0') {
7564 		m_abs_path[m_abs_len - 1] = OS_PATH_SEPARATOR;
7565 	}
7566 #endif /* WIN32 */
7567 
7568 	return(ret && exists && type == OS_FILE_TYPE_DIR);
7569 }
7570 
/* Unit Tests */
#ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH
#define MF  fil_make_filepath
/* Log the path that was just built and release it. fil_make_filepath()
returns heap memory that the caller has to ut_free(), and it may return
NULL, which must not be streamed to the log. */
#define DISPLAY								\
	do {								\
		if (path != NULL) {					\
			ib::info() << path;				\
			ut_free(path);					\
			path = NULL;					\
		} else {						\
			ib::error() << "fil_make_filepath()"		\
				" returned NULL";			\
		}							\
	} while (0)
/** Exercise fil_make_filepath() with a range of directory, name and
extension combinations and write every resulting path to the log. */
void
test_make_filepath()
{
	char* path;
	const char* long_path =
		"this/is/a/very/long/path/including/a/very/"
		"looooooooooooooooooooooooooooooooooooooooooooooooo"
		"oooooooooooooooooooooooooooooooooooooooooooooooooo"
		"oooooooooooooooooooooooooooooooooooooooooooooooooo"
		"oooooooooooooooooooooooooooooooooooooooooooooooooo"
		"oooooooooooooooooooooooooooooooooooooooooooooooooo"
		"oooooooooooooooooooooooooooooooooooooooooooooooooo"
		"oooooooooooooooooooooooooooooooooooooooooooooooooo"
		"oooooooooooooooooooooooooooooooooooooooooooooooooo"
		"oooooooooooooooooooooooooooooooooooooooooooooooooo"
		"oooooooooooooooooooooooooooooooooooooooooooooooong"
		"/folder/name";
	path = MF("/this/is/a/path/with/a/filename", NULL, IBD, false); DISPLAY;
	path = MF("/this/is/a/path/with/a/filename", NULL, ISL, false); DISPLAY;
	path = MF("/this/is/a/path/with/a/filename", NULL, CFG, false); DISPLAY;
	path = MF("/this/is/a/path/with/a/filename", NULL, CFP, false); DISPLAY;
	path = MF("/this/is/a/path/with/a/filename.ibd", NULL, IBD, false); DISPLAY;
	path = MF("/this/is/a/path/with/a/filename.ibd", NULL, IBD, false); DISPLAY;
	path = MF("/this/is/a/path/with/a/filename.dat", NULL, IBD, false); DISPLAY;
	path = MF(NULL, "tablespacename", NO_EXT, false); DISPLAY;
	path = MF(NULL, "tablespacename", IBD, false); DISPLAY;
	path = MF(NULL, "dbname/tablespacename", NO_EXT, false); DISPLAY;
	path = MF(NULL, "dbname/tablespacename", IBD, false); DISPLAY;
	path = MF(NULL, "dbname/tablespacename", ISL, false); DISPLAY;
	path = MF(NULL, "dbname/tablespacename", CFG, false); DISPLAY;
	path = MF(NULL, "dbname/tablespacename", CFP, false); DISPLAY;
	path = MF(NULL, "dbname\\tablespacename", NO_EXT, false); DISPLAY;
	path = MF(NULL, "dbname\\tablespacename", IBD, false); DISPLAY;
	path = MF("/this/is/a/path", "dbname/tablespacename", IBD, false); DISPLAY;
	path = MF("/this/is/a/path", "dbname/tablespacename", IBD, true); DISPLAY;
	path = MF("./this/is/a/path", "dbname/tablespacename.ibd", IBD, true); DISPLAY;
	path = MF("this\\is\\a\\path", "dbname/tablespacename", IBD, true); DISPLAY;
	path = MF("/this/is/a/path", "dbname\\tablespacename", IBD, true); DISPLAY;
	path = MF(long_path, NULL, IBD, false); DISPLAY;
	path = MF(long_path, "tablespacename", IBD, false); DISPLAY;
	path = MF(long_path, "tablespacename", IBD, true); DISPLAY;
}
#endif /* UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH */
7618 /* @} */
7619 
7620 /** Release the reserved free extents.
7621 @param[in]	n_reserved	number of reserved extents */
7622 void
release_free_extents(ulint n_reserved)7623 fil_space_t::release_free_extents(ulint	n_reserved)
7624 {
7625 	ut_ad(rw_lock_own(&latch, RW_LOCK_X));
7626 
7627 	ut_a(n_reserved_extents >= n_reserved);
7628 	n_reserved_extents -= n_reserved;
7629 }
7630