1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8 
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation.  The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15 
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 GNU General Public License, version 2.0, for more details.
20 
21 You should have received a copy of the GNU General Public License along with
22 this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
24 
25 *****************************************************************************/
26 
27 /**************************************************//**
28 @file fil/fil0fil.cc
29 The tablespace memory cache
30 
31 Created 10/25/1995 Heikki Tuuri
32 *******************************************************/
33 
34 #include "fil0fil.h"
35 
36 #include <debug_sync.h>
37 #include <my_dbug.h>
38 
39 #include "mem0mem.h"
40 #include "hash0hash.h"
41 #include "os0file.h"
42 #include "mach0data.h"
43 #include "buf0buf.h"
44 #include "buf0flu.h"
45 #include "log0recv.h"
46 #include "fsp0fsp.h"
47 #include "srv0srv.h"
48 #include "srv0start.h"
49 #include "mtr0mtr.h"
50 #include "mtr0log.h"
51 #include "dict0dict.h"
52 #include "page0page.h"
53 #include "page0zip.h"
54 #include "trx0sys.h"
55 #include "row0mysql.h"
56 #ifndef UNIV_HOTBACKUP
57 # include "buf0lru.h"
58 # include "ibuf0ibuf.h"
59 # include "sync0sync.h"
60 # include "os0sync.h"
61 #else /* !UNIV_HOTBACKUP */
62 # include "srv0srv.h"
63 static ulint srv_data_read, srv_data_written;
64 #endif /* !UNIV_HOTBACKUP */
65 
66 /*
67 		IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE
68 		=============================================
69 
70 The tablespace cache is responsible for providing fast read/write access to
71 tablespaces and logs of the database. File creation and deletion is done
72 in other modules which know more of the logic of the operation, however.
73 
74 A tablespace consists of a chain of files. The size of the files does not
75 have to be divisible by the database block size, because we may just leave
76 the last incomplete block unused. When a new file is appended to the
77 tablespace, the maximum size of the file is also specified. At the moment,
78 we think that it is best to extend the file to its maximum size already at
79 the creation of the file, because then we can avoid dynamically extending
80 the file when more space is needed for the tablespace.
81 
82 A block's position in the tablespace is specified with a 32-bit unsigned
83 integer. The files in the chain are thought to be catenated, and the block
84 corresponding to an address n is the nth block in the catenated file (where
85 the first block is named the 0th block, and the incomplete block fragments
86 at the end of files are not taken into account). A tablespace can be extended
87 by appending a new file at the end of the chain.
88 
89 Our tablespace concept is similar to the one of Oracle.
90 
91 To acquire more speed in disk transfers, a technique called disk striping is
92 sometimes used. This means that logical block addresses are divided in a
93 round-robin fashion across several disks. Windows NT supports disk striping,
94 so there we do not need to support it in the database. Disk striping is
95 implemented in hardware in RAID disks. We conclude that it is not necessary
96 to implement it in the database. Oracle 7 does not support disk striping,
97 either.
98 
99 Another trick used at some database sites is replacing tablespace files by
100 raw disks, that is, the whole physical disk drive, or a partition of it, is
101 opened as a single file, and it is accessed through byte offsets calculated
102 from the start of the disk or the partition. This is recommended in some
103 books on database tuning to achieve more speed in i/o. Using raw disk
104 certainly prevents the OS from fragmenting disk space, but it is not clear
105 if it really adds speed. We measured on the Pentium 100 MHz + NT + NTFS file
106 system + EIDE Conner disk only a negligible difference in speed when reading
107 from a file, versus reading from a raw disk.
108 
To have fast access to a tablespace or a log file, we put the data structures
in a hash table. Each tablespace and log file is given a unique 32-bit
identifier.
112 
113 Some operating systems do not support many open files at the same time,
114 though NT seems to tolerate at least 900 open files. Therefore, we put the
115 open files in an LRU-list. If we need to open another file, we may close the
116 file at the end of the LRU-list. When an i/o-operation is pending on a file,
117 the file cannot be closed. We take the file nodes with pending i/o-operations
118 out of the LRU-list and keep a count of pending operations. When an operation
119 completes, we decrement the count and return the file node to the LRU-list if
120 the count drops to zero. */
121 
122 /** When mysqld is run, the default directory "." is the mysqld datadir,
123 but in the MySQL Embedded Server Library and mysqlbackup it is not the default
124 directory, and we must set the base file path explicitly */
125 UNIV_INTERN const char*	fil_path_to_mysql_datadir	= ".";
126 
127 /** The number of fsyncs done to the log */
128 UNIV_INTERN ulint	fil_n_log_flushes			= 0;
129 
130 /** Number of pending redo log flushes */
131 UNIV_INTERN ulint	fil_n_pending_log_flushes		= 0;
132 /** Number of pending tablespace flushes */
133 UNIV_INTERN ulint	fil_n_pending_tablespace_flushes	= 0;
134 
135 /** Number of files currently open */
136 UNIV_INTERN ulint	fil_n_file_opened			= 0;
137 
138 /** The null file address */
139 UNIV_INTERN fil_addr_t	fil_addr_null = {FIL_NULL, 0};
140 
141 #ifdef UNIV_PFS_MUTEX
142 /* Key to register fil_system_mutex with performance schema */
143 UNIV_INTERN mysql_pfs_key_t	fil_system_mutex_key;
144 #endif /* UNIV_PFS_MUTEX */
145 
146 #ifdef UNIV_PFS_RWLOCK
147 /* Key to register file space latch with performance schema */
148 UNIV_INTERN mysql_pfs_key_t	fil_space_latch_key;
149 #endif /* UNIV_PFS_RWLOCK */
150 
151 /** File node of a tablespace or the log data space */
152 struct fil_node_t {
153 	fil_space_t*	space;	/*!< backpointer to the space where this node
154 				belongs */
155 	char*		name;	/*!< path to the file */
156 	ibool		open;	/*!< TRUE if file open */
157 	pfs_os_file_t	handle;	/*!< OS handle to the file, if file open */
158 	os_event_t	sync_event;/*!< Condition event to group and
159 				serialize calls to fsync */
160 	ibool		is_raw_disk;/*!< TRUE if the 'file' is actually a raw
161 				device or a raw disk partition */
162 	ulint		size;	/*!< size of the file in database pages, 0 if
163 				not known yet; the possible last incomplete
164 				megabyte may be ignored if space == 0 */
165 	ulint		n_pending;
166 				/*!< count of pending i/o's on this file;
167 				closing of the file is not allowed if
168 				this is > 0 */
169 	ulint		n_pending_flushes;
170 				/*!< count of pending flushes on this file;
171 				closing of the file is not allowed if
172 				this is > 0 */
173 	ibool		being_extended;
174 				/*!< TRUE if the node is currently
175 				being extended. */
176 	ib_int64_t	modification_counter;/*!< when we write to the file we
177 				increment this by one */
178 	ib_int64_t	flush_counter;/*!< up to what
179 				modification_counter value we have
180 				flushed the modifications to disk */
181 	UT_LIST_NODE_T(fil_node_t) chain;
182 				/*!< link field for the file chain */
183 	UT_LIST_NODE_T(fil_node_t) LRU;
184 				/*!< link field for the LRU list */
185 	ulint		magic_n;/*!< FIL_NODE_MAGIC_N */
186 };
187 
188 /** Value of fil_node_t::magic_n */
189 #define	FIL_NODE_MAGIC_N	89389
190 
191 /** Tablespace or log data space: let us call them by a common name space */
192 struct fil_space_t {
193 	char*		name;	/*!< space name = the path to the first file in
194 				it */
195 	ulint		id;	/*!< space id */
196 	ib_int64_t	tablespace_version;
197 				/*!< in DISCARD/IMPORT this timestamp
198 				is used to check if we should ignore
199 				an insert buffer merge request for a
200 				page because it actually was for the
201 				previous incarnation of the space */
202 	ibool		mark;	/*!< this is set to TRUE at database startup if
203 				the space corresponds to a table in the InnoDB
204 				data dictionary; so we can print a warning of
205 				orphaned tablespaces */
206 	ibool		stop_ios;/*!< TRUE if we want to rename the
207 				.ibd file of tablespace and want to
208 				stop temporarily posting of new i/o
209 				requests on the file */
210 	ibool		stop_new_ops;
211 				/*!< we set this TRUE when we start
212 				deleting a single-table tablespace.
213 				When this is set following new ops
214 				are not allowed:
215 				* read IO request
216 				* ibuf merge
217 				* file flush
218 				Note that we can still possibly have
219 				new write operations because we don't
220 				check this flag when doing flush
221 				batches. */
222 	ulint		purpose;/*!< FIL_TABLESPACE, FIL_LOG, or
223 				FIL_ARCH_LOG */
224 	UT_LIST_BASE_NODE_T(fil_node_t) chain;
225 				/*!< base node for the file chain */
226 	ulint		size;	/*!< space size in pages; 0 if a single-table
227 				tablespace whose size we do not know yet;
228 				last incomplete megabytes in data files may be
229 				ignored if space == 0 */
230 	ulint		flags;	/*!< tablespace flags; see
231 				fsp_flags_is_valid(),
232 				fsp_flags_get_zip_size() */
233 	ulint		n_reserved_extents;
234 				/*!< number of reserved free extents for
235 				ongoing operations like B-tree page split */
236 	ulint		n_pending_flushes; /*!< this is positive when flushing
237 				the tablespace to disk; dropping of the
238 				tablespace is forbidden if this is positive */
239 	ulint		n_pending_ops;/*!< this is positive when we
240 				have pending operations against this
241 				tablespace. The pending operations can
242 				be ibuf merges or lock validation code
243 				trying to read a block.
244 				Dropping of the tablespace is forbidden
245 				if this is positive */
246 	hash_node_t	hash;	/*!< hash chain node */
247 	hash_node_t	name_hash;/*!< hash chain the name_hash table */
248 #ifndef UNIV_HOTBACKUP
249 	prio_rw_lock_t	latch;	/*!< latch protecting the file space storage
250 				allocation */
251 #endif /* !UNIV_HOTBACKUP */
252 	UT_LIST_NODE_T(fil_space_t) unflushed_spaces;
253 				/*!< list of spaces with at least one unflushed
254 				file we have written to */
255 	bool		is_in_unflushed_spaces;
256 				/*!< true if this space is currently in
257 				unflushed_spaces */
258 	ibool		is_corrupt;
259 	UT_LIST_NODE_T(fil_space_t) space_list;
260 				/*!< list of all spaces */
261 	ulint		magic_n;/*!< FIL_SPACE_MAGIC_N */
262 };
263 
264 /** Value of fil_space_t::magic_n */
265 #define	FIL_SPACE_MAGIC_N	89472
266 
267 /** The tablespace memory cache; also the totality of logs (the log
268 data space) is stored here; below we talk about tablespaces, but also
269 the ib_logfiles form a 'space' and it is handled here */
270 struct fil_system_t {
271 #ifndef UNIV_HOTBACKUP
272 	ib_mutex_t		mutex;		/*!< The mutex protecting the cache */
273 #endif /* !UNIV_HOTBACKUP */
274 	hash_table_t*	spaces;		/*!< The hash table of spaces in the
275 					system; they are hashed on the space
276 					id */
277 	hash_table_t*	name_hash;	/*!< hash table based on the space
278 					name */
279 	UT_LIST_BASE_NODE_T(fil_node_t) LRU;
280 					/*!< base node for the LRU list of the
281 					most recently used open files with no
282 					pending i/o's; if we start an i/o on
283 					the file, we first remove it from this
284 					list, and return it to the start of
285 					the list when the i/o ends;
286 					log files and the system tablespace are
287 					not put to this list: they are opened
288 					after the startup, and kept open until
289 					shutdown */
290 	UT_LIST_BASE_NODE_T(fil_space_t) unflushed_spaces;
291 					/*!< base node for the list of those
292 					tablespaces whose files contain
293 					unflushed writes; those spaces have
294 					at least one file node where
295 					modification_counter > flush_counter */
296 	ulint		n_open;		/*!< number of files currently open */
297 	ulint		max_n_open;	/*!< n_open is not allowed to exceed
298 					this */
299 	ib_int64_t	modification_counter;/*!< when we write to a file we
300 					increment this by one */
301 	ulint		max_assigned_id;/*!< maximum space id in the existing
302 					tables, or assigned during the time
303 					mysqld has been up; at an InnoDB
304 					startup we scan the data dictionary
305 					and set here the maximum of the
306 					space id's of the tables there */
307 	ib_int64_t	tablespace_version;
308 					/*!< a counter which is incremented for
309 					every space object memory creation;
310 					every space mem object gets a
311 					'timestamp' from this; in DISCARD/
312 					IMPORT this is used to check if we
313 					should ignore an insert buffer merge
314 					request */
315 	UT_LIST_BASE_NODE_T(fil_space_t) space_list;
316 					/*!< list of all file spaces */
317 	ibool		space_id_reuse_warned;
318 					/* !< TRUE if fil_space_create()
319 					has issued a warning about
320 					potential space_id reuse */
321 };
322 
323 /** The tablespace memory cache. This variable is NULL before the module is
324 initialized. */
325 static fil_system_t*	fil_system	= NULL;
326 
/** Determine if (i) is a user tablespace id or not: any id other than 0
for which srv_is_undo_tablespace() is false. The argument is parenthesized
in the expansion so that an expression built from low-precedence operators
(for example a & b) is compared as a whole. The argument is evaluated
twice, so it must not have side effects. */
# define fil_is_user_tablespace_id(i) ((i) != 0 \
				       && !srv_is_undo_tablespace(i))
330 
/** Determine if user has explicitly disabled fsync() for the space (s).
On Windows this is always 0. Elsewhere it holds for a tablespace when the
flush method is SRV_UNIX_O_DIRECT_NO_FSYNC, and for the log when the flush
method is SRV_UNIX_ALL_O_DIRECT. */
#ifdef __WIN__
# define fil_buffering_disabled(s)	(0)
#else /* __WIN__ */
# define fil_buffering_disabled(s)					\
	(								\
	 ((s)->purpose == FIL_TABLESPACE				\
	  && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)	\
	 ||								\
	 ((s)->purpose == FIL_LOG					\
	  && srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT)	\
	)
#endif /* __WIN__ */
342 
#ifdef UNIV_DEBUG
/** Try fil_validate() every this many times */
# define FIL_VALIDATE_SKIP	17

/******************************************************************//**
Runs the costly fil_validate() consistency check of the tablespace cache
only on every FIL_VALIDATE_SKIP-th call, and reports success on the calls
in between.
@return	TRUE if ok or the check was skipped */
static
ibool
fil_validate_skip(void)
/*===================*/
{
	/** Calls left until the next real fil_validate(). The type is
	signed because the unsynchronized decrement below can race. */
	static int	calls_left = FIL_VALIDATE_SKIP;

	/* The counter is updated without any synchronization. The race
	does not matter: the counter only serves as a heuristic for
	reducing how often debug builds pay for fil_validate(). */
	calls_left--;

	if (calls_left <= 0) {
		calls_left = FIL_VALIDATE_SKIP;

		return(fil_validate());
	}

	return(TRUE);
}
#endif /* UNIV_DEBUG */
371 
372 /********************************************************************//**
373 Determines if a file node belongs to the least-recently-used list.
374 @return TRUE if the file belongs to fil_system->LRU mutex. */
375 UNIV_INLINE
376 ibool
fil_space_belongs_in_lru(const fil_space_t * space)377 fil_space_belongs_in_lru(
378 /*=====================*/
379 	const fil_space_t*	space)	/*!< in: file space */
380 {
381 	return(space->purpose == FIL_TABLESPACE
382 	       && fil_is_user_tablespace_id(space->id));
383 }
384 
385 /********************************************************************//**
386 NOTE: you must call fil_mutex_enter_and_prepare_for_io() first!
387 
388 Prepares a file node for i/o. Opens the file if it is closed. Updates the
389 pending i/o's field in the node and the system appropriately. Takes the node
390 off the LRU list if it is in the LRU list. The caller must hold the fil_sys
391 mutex.
392 @return false if the file can't be opened, otherwise true */
393 static
394 bool
395 fil_node_prepare_for_io(
396 /*====================*/
397 	fil_node_t*	node,	/*!< in: file node */
398 	fil_system_t*	system,	/*!< in: tablespace memory cache */
399 	fil_space_t*	space);	/*!< in: space */
400 /********************************************************************//**
401 Updates the data structures when an i/o operation finishes. Updates the
402 pending i/o's field in the node appropriately. */
403 static
404 void
405 fil_node_complete_io(
406 /*=================*/
407 	fil_node_t*	node,	/*!< in: file node */
408 	fil_system_t*	system,	/*!< in: tablespace memory cache */
409 	ulint		type);	/*!< in: OS_FILE_WRITE or OS_FILE_READ; marks
410 				the node as modified if
411 				type == OS_FILE_WRITE */
412 /*******************************************************************//**
413 Frees a space object from the tablespace memory cache. Closes the files in
414 the chain but does not delete them. There must not be any pending i/o's or
415 flushes on the files.
416 @return TRUE on success */
417 static
418 ibool
419 fil_space_free(
420 /*===========*/
421 	ulint		id,		/* in: space id */
422 	ibool		x_latched);	/* in: TRUE if caller has space->latch
423 					in X mode */
424 /********************************************************************//**
425 Reads data from a space to a buffer. Remember that the possible incomplete
426 blocks at the end of file are ignored: they are not taken into account when
427 calculating the byte offset within a space.
428 @return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
429 i/o on a tablespace which does not exist */
430 UNIV_INLINE
431 dberr_t
fil_read(bool sync,ulint space_id,ulint zip_size,ulint block_offset,ulint byte_offset,ulint len,void * buf,void * message)432 fil_read(
433 /*=====*/
434 	bool	sync,		/*!< in: true if synchronous aio is desired */
435 	ulint	space_id,	/*!< in: space id */
436 	ulint	zip_size,	/*!< in: compressed page size in bytes;
437 				0 for uncompressed pages */
438 	ulint	block_offset,	/*!< in: offset in number of blocks */
439 	ulint	byte_offset,	/*!< in: remainder of offset in bytes; in aio
440 				this must be divisible by the OS block size */
441 	ulint	len,		/*!< in: how many bytes to read; this must not
442 				cross a file boundary; in aio this must be a
443 				block size multiple */
444 	void*	buf,		/*!< in/out: buffer where to store data read;
445 				in aio this must be appropriately aligned */
446 	void*	message)	/*!< in: message for aio handler if non-sync
447 				aio used, else ignored */
448 {
449 	return(fil_io(OS_FILE_READ, sync, space_id, zip_size, block_offset,
450 					  byte_offset, len, buf, message));
451 }
452 
453 /********************************************************************//**
454 Writes data to a space from a buffer. Remember that the possible incomplete
455 blocks at the end of file are ignored: they are not taken into account when
456 calculating the byte offset within a space.
457 @return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
458 i/o on a tablespace which does not exist */
459 UNIV_INLINE
460 dberr_t
fil_write(bool sync,ulint space_id,ulint zip_size,ulint block_offset,ulint byte_offset,ulint len,void * buf,void * message)461 fil_write(
462 /*======*/
463 	bool	sync,		/*!< in: true if synchronous aio is desired */
464 	ulint	space_id,	/*!< in: space id */
465 	ulint	zip_size,	/*!< in: compressed page size in bytes;
466 				0 for uncompressed pages */
467 	ulint	block_offset,	/*!< in: offset in number of blocks */
468 	ulint	byte_offset,	/*!< in: remainder of offset in bytes; in aio
469 				this must be divisible by the OS block size */
470 	ulint	len,		/*!< in: how many bytes to write; this must
471 				not cross a file boundary; in aio this must
472 				be a block size multiple */
473 	void*	buf,		/*!< in: buffer from which to write; in aio
474 				this must be appropriately aligned */
475 	void*	message)	/*!< in: message for aio handler if non-sync
476 				aio used, else ignored */
477 {
478 	ut_ad(!srv_read_only_mode);
479 
480 	return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset,
481 					   byte_offset, len, buf, message));
482 }
483 
484 /*******************************************************************//**
485 Returns the table space by a given id, NULL if not found. */
486 UNIV_INLINE
487 fil_space_t*
fil_space_get_by_id(ulint id)488 fil_space_get_by_id(
489 /*================*/
490 	ulint	id)	/*!< in: space id */
491 {
492 	fil_space_t*	space;
493 
494 	ut_ad(mutex_own(&fil_system->mutex));
495 
496 	HASH_SEARCH(hash, fil_system->spaces, id,
497 		    fil_space_t*, space,
498 		    ut_ad(space->magic_n == FIL_SPACE_MAGIC_N),
499 		    space->id == id);
500 
501 	/* The system tablespace must always be found */
502 	ut_ad(space || id != 0 || srv_is_being_started);
503 	return(space);
504 }
505 
506 /*******************************************************************//**
507 Returns the table space by a given name, NULL if not found. */
508 UNIV_INLINE
509 fil_space_t*
fil_space_get_by_name(const char * name)510 fil_space_get_by_name(
511 /*==================*/
512 	const char*	name)	/*!< in: space name */
513 {
514 	fil_space_t*	space;
515 	ulint		fold;
516 
517 	ut_ad(mutex_own(&fil_system->mutex));
518 
519 	fold = ut_fold_string(name);
520 
521 	HASH_SEARCH(name_hash, fil_system->name_hash, fold,
522 		    fil_space_t*, space,
523 		    ut_ad(space->magic_n == FIL_SPACE_MAGIC_N),
524 		    !strcmp(name, space->name));
525 
526 	return(space);
527 }
528 
529 #ifndef UNIV_HOTBACKUP
530 /*******************************************************************//**
531 Returns the version number of a tablespace, -1 if not found.
532 @return version number, -1 if the tablespace does not exist in the
533 memory cache */
534 UNIV_INTERN
535 ib_int64_t
fil_space_get_version(ulint id)536 fil_space_get_version(
537 /*==================*/
538 	ulint	id)	/*!< in: space id */
539 {
540 	fil_space_t*	space;
541 	ib_int64_t	version		= -1;
542 
543 	ut_ad(fil_system);
544 
545 	mutex_enter(&fil_system->mutex);
546 
547 	space = fil_space_get_by_id(id);
548 
549 	if (space) {
550 		version = space->tablespace_version;
551 	}
552 
553 	mutex_exit(&fil_system->mutex);
554 
555 	return(version);
556 }
557 
558 /*******************************************************************//**
559 Returns the latch of a file space.
560 @return	latch protecting storage allocation */
561 UNIV_INTERN
562 prio_rw_lock_t*
fil_space_get_latch(ulint id,ulint * flags)563 fil_space_get_latch(
564 /*================*/
565 	ulint	id,	/*!< in: space id */
566 	ulint*	flags)	/*!< out: tablespace flags */
567 {
568 	fil_space_t*	space;
569 
570 	ut_ad(fil_system);
571 
572 	mutex_enter(&fil_system->mutex);
573 
574 	space = fil_space_get_by_id(id);
575 
576 	ut_a(space);
577 
578 	if (flags) {
579 		*flags = space->flags;
580 	}
581 
582 	mutex_exit(&fil_system->mutex);
583 
584 	return(&(space->latch));
585 }
586 
587 /*******************************************************************//**
588 Returns the type of a file space.
589 @return	FIL_TABLESPACE or FIL_LOG */
590 UNIV_INTERN
591 ulint
fil_space_get_type(ulint id)592 fil_space_get_type(
593 /*===============*/
594 	ulint	id)	/*!< in: space id */
595 {
596 	fil_space_t*	space;
597 
598 	ut_ad(fil_system);
599 
600 	mutex_enter(&fil_system->mutex);
601 
602 	space = fil_space_get_by_id(id);
603 
604 	ut_a(space);
605 
606 	mutex_exit(&fil_system->mutex);
607 
608 	return(space->purpose);
609 }
610 #endif /* !UNIV_HOTBACKUP */
611 
612 /**********************************************************************//**
613 Checks if all the file nodes in a space are flushed. The caller must hold
614 the fil_system mutex.
615 @return	true if all are flushed */
616 static
617 bool
fil_space_is_flushed(fil_space_t * space)618 fil_space_is_flushed(
619 /*=================*/
620 	fil_space_t*	space)	/*!< in: space */
621 {
622 	fil_node_t*	node;
623 
624 	ut_ad(mutex_own(&fil_system->mutex));
625 
626 	node = UT_LIST_GET_FIRST(space->chain);
627 
628 	while (node) {
629 		if (node->modification_counter > node->flush_counter) {
630 
631 			ut_ad(!fil_buffering_disabled(space));
632 			return(false);
633 		}
634 
635 		node = UT_LIST_GET_NEXT(chain, node);
636 	}
637 
638 	return(true);
639 }
640 
641 /*******************************************************************//**
642 Appends a new file to the chain of files of a space. File must be closed.
643 @return pointer to the file name, or NULL on error */
644 UNIV_INTERN
645 char*
fil_node_create(const char * name,ulint size,ulint id,ibool is_raw)646 fil_node_create(
647 /*============*/
648 	const char*	name,	/*!< in: file name (file must be closed) */
649 	ulint		size,	/*!< in: file size in database blocks, rounded
650 				downwards to an integer */
651 	ulint		id,	/*!< in: space id where to append */
652 	ibool		is_raw)	/*!< in: TRUE if a raw device or
653 				a raw disk partition */
654 {
655 	fil_node_t*	node;
656 	fil_space_t*	space;
657 
658 	ut_a(fil_system);
659 	ut_a(name);
660 
661 	mutex_enter(&fil_system->mutex);
662 
663 	node = static_cast<fil_node_t*>(mem_zalloc(sizeof(fil_node_t)));
664 
665 	node->name = mem_strdup(name);
666 
667 	ut_a(!is_raw || srv_start_raw_disk_in_use);
668 
669 	node->sync_event = os_event_create();
670 	node->is_raw_disk = is_raw;
671 	node->size = size;
672 	node->magic_n = FIL_NODE_MAGIC_N;
673 
674 	space = fil_space_get_by_id(id);
675 
676 	if (!space) {
677 		ut_print_timestamp(stderr);
678 		fprintf(stderr,
679 			"  InnoDB: Error: Could not find tablespace %lu for\n"
680 			"InnoDB: file ", (ulong) id);
681 		ut_print_filename(stderr, name);
682 		fputs(" in the tablespace memory cache.\n", stderr);
683 		mem_free(node->name);
684 
685 		mem_free(node);
686 
687 		mutex_exit(&fil_system->mutex);
688 
689 		return(NULL);
690 	}
691 
692 	space->size += size;
693 
694 	node->space = space;
695 
696 	UT_LIST_ADD_LAST(chain, space->chain, node);
697 
698 	if (id < SRV_LOG_SPACE_FIRST_ID && fil_system->max_assigned_id < id) {
699 
700 		fil_system->max_assigned_id = id;
701 	}
702 
703 	mutex_exit(&fil_system->mutex);
704 
705 	return(node->name);
706 }
707 
708 /********************************************************************//**
709 Opens a file of a node of a tablespace. The caller must own the fil_system
710 mutex.
711 @return false if the file can't be opened, otherwise true */
712 static
713 bool
fil_node_open_file(fil_node_t * node,fil_system_t * system,fil_space_t * space)714 fil_node_open_file(
715 /*===============*/
716 	fil_node_t*	node,	/*!< in: file node */
717 	fil_system_t*	system,	/*!< in: tablespace memory cache */
718 	fil_space_t*	space)	/*!< in: space */
719 {
720 	os_offset_t	size_bytes;
721 	ibool		ret;
722 	ibool		success;
723 	byte*		buf2;
724 	byte*		page;
725 	ulint		space_id;
726 	ulint		flags;
727 	ulint		page_size;
728 
729 	ut_ad(mutex_own(&(system->mutex)));
730 	ut_a(node->n_pending == 0);
731 	ut_a(node->open == FALSE);
732 
733 	if (node->size == 0) {
734 		/* It must be a single-table tablespace and we do not know the
735 		size of the file yet. First we open the file in the normal
736 		mode, no async I/O here, for simplicity. Then do some checks,
737 		and close the file again.
738 		NOTE that we could not use the simple file read function
739 		os_file_read() in Windows to read from a file opened for
740 		async I/O! */
741 
742 		node->handle = os_file_create_simple_no_error_handling(
743 			innodb_file_data_key, node->name, OS_FILE_OPEN,
744 			OS_FILE_READ_ONLY, &success);
745 		if (!success) {
746 			/* The following call prints an error message */
747 			os_file_get_last_error(true);
748 
749 			ut_print_timestamp(stderr);
750 
751 			ib_logf(IB_LOG_LEVEL_WARN, "InnoDB: Error: cannot "
752 				"open %s\n. InnoDB: Have you deleted .ibd "
753 				"files under a running mysqld server?\n",
754 				node->name);
755 
756 			return(false);
757 		}
758 
759 		size_bytes = os_file_get_size(node->handle);
760 		ut_a(size_bytes != (os_offset_t) -1);
761 #ifdef UNIV_HOTBACKUP
762 		if (space->id == 0) {
763 			node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
764 			os_file_close(node->handle);
765 			goto add_size;
766 		}
767 #endif /* UNIV_HOTBACKUP */
768 		ut_a(space->purpose != FIL_LOG);
769 		ut_a(fil_is_user_tablespace_id(space->id));
770 
771 		if (size_bytes < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) {
772 			fprintf(stderr,
773 				"InnoDB: Error: the size of single-table"
774 				" tablespace file %s\n"
775 				"InnoDB: is only " UINT64PF ","
776 				" should be at least %lu!\n",
777 				node->name,
778 				size_bytes,
779 				(ulong) (FIL_IBD_FILE_INITIAL_SIZE
780 					 * UNIV_PAGE_SIZE));
781 
782 			ut_a(0);
783 		}
784 
785 		/* Read the first page of the tablespace */
786 
787 		buf2 = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
788 		/* Align the memory for file i/o if we might have O_DIRECT
789 		set */
790 		page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
791 
792 		success = os_file_read(node->handle, page, 0, UNIV_PAGE_SIZE);
793 		space_id = fsp_header_get_space_id(page);
794 		flags = fsp_header_get_flags(page);
795 		page_size = fsp_flags_get_page_size(flags);
796 
797 		ut_free(buf2);
798 
799 		/* Close the file now that we have read the space id from it */
800 
801 		os_file_close(node->handle);
802 
803 		if (UNIV_UNLIKELY(space_id != space->id)) {
804 			fprintf(stderr,
805 				"InnoDB: Error: tablespace id is %lu"
806 				" in the data dictionary\n"
807 				"InnoDB: but in file %s it is %lu!\n",
808 				space->id, node->name, space_id);
809 
810 			ut_error;
811 		}
812 
813 		if (UNIV_UNLIKELY(space_id == ULINT_UNDEFINED
814 				  || space_id == 0)) {
815 			fprintf(stderr,
816 				"InnoDB: Error: tablespace id %lu"
817 				" in file %s is not sensible\n",
818 				(ulong) space_id, node->name);
819 
820 			ut_error;
821 		}
822 
823 		if (UNIV_UNLIKELY(fsp_flags_get_page_size(space->flags)
824 				  != page_size)) {
825 			fprintf(stderr,
826 				"InnoDB: Error: tablespace file %s"
827 				" has page size 0x%lx\n"
828 				"InnoDB: but the data dictionary"
829 				" expects page size 0x%lx!\n",
830 				node->name, flags,
831 				fsp_flags_get_page_size(space->flags));
832 
833 			ut_error;
834 		}
835 
836 		if (UNIV_UNLIKELY(space->flags != flags)) {
837 			fprintf(stderr,
838 				"InnoDB: Error: table flags are 0x%lx"
839 				" in the data dictionary\n"
840 				"InnoDB: but the flags in file %s are 0x%lx!\n",
841 				space->flags, node->name, flags);
842 
843 			ut_error;
844 		}
845 
846 		if (size_bytes >= 1024 * 1024) {
847 			/* Truncate the size to whole megabytes. */
848 			size_bytes = ut_2pow_round(size_bytes, 1024 * 1024);
849 		}
850 
851 		if (!fsp_flags_is_compressed(flags)) {
852 			node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
853 		} else {
854 			node->size = (ulint)
855 				(size_bytes
856 				 / fsp_flags_get_zip_size(flags));
857 		}
858 
859 #ifdef UNIV_HOTBACKUP
860 add_size:
861 #endif /* UNIV_HOTBACKUP */
862 		space->size += node->size;
863 	}
864 
865 	/* printf("Opening file %s\n", node->name); */
866 
867 	/* Open the file for reading and writing, in Windows normally in the
868 	unbuffered async I/O mode, though global variables may make
869 	os_file_create() to fall back to the normal file I/O mode. */
870 
871 	if (space->purpose == FIL_LOG) {
872 		node->handle = os_file_create(innodb_file_log_key,
873 					      node->name, OS_FILE_OPEN,
874 					      OS_FILE_AIO, OS_LOG_FILE,
875 					      &ret);
876 	} else if (node->is_raw_disk) {
877 		node->handle = os_file_create(innodb_file_data_key,
878 					      node->name,
879 					      OS_FILE_OPEN_RAW,
880 					      OS_FILE_AIO, OS_DATA_FILE,
881 						     &ret);
882 	} else {
883 		node->handle = os_file_create(innodb_file_data_key,
884 					      node->name, OS_FILE_OPEN,
885 					      OS_FILE_AIO, OS_DATA_FILE,
886 					      &ret);
887 	}
888 
889 	ut_a(ret);
890 
891 	node->open = TRUE;
892 
893 	system->n_open++;
894 	fil_n_file_opened++;
895 
896 	if (fil_space_belongs_in_lru(space)) {
897 
898 		/* Put the node to the LRU list */
899 		UT_LIST_ADD_FIRST(LRU, system->LRU, node);
900 	}
901 
902 	return(true);
903 }
904 
905 /**********************************************************************//**
906 Closes a file. */
907 static
908 void
fil_node_close_file(fil_node_t * node,fil_system_t * system)909 fil_node_close_file(
910 /*================*/
911 	fil_node_t*	node,	/*!< in: file node */
912 	fil_system_t*	system)	/*!< in: tablespace memory cache */
913 {
914 	ibool	ret;
915 
916 	ut_ad(node && system);
917 	ut_ad(mutex_own(&(system->mutex)));
918 	ut_a(node->open);
919 	ut_a(node->n_pending == 0);
920 	ut_a(node->n_pending_flushes == 0);
921 	ut_a(!node->being_extended);
922 #ifndef UNIV_HOTBACKUP
923 	ut_a(node->modification_counter == node->flush_counter
924 	     || srv_fast_shutdown == 2);
925 #endif /* !UNIV_HOTBACKUP */
926 
927 	ret = os_file_close(node->handle);
928 	ut_a(ret);
929 
930 	/* printf("Closing file %s\n", node->name); */
931 
932 	node->open = FALSE;
933 	ut_a(system->n_open > 0);
934 	system->n_open--;
935 	fil_n_file_opened--;
936 
937 	if (fil_space_belongs_in_lru(node->space)) {
938 
939 		ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
940 
941 		/* The node is in the LRU list, remove it */
942 		UT_LIST_REMOVE(LRU, system->LRU, node);
943 	}
944 }
945 
946 /********************************************************************//**
947 Tries to close a file in the LRU list. The caller must hold the fil_sys
948 mutex.
949 @return TRUE if success, FALSE if should retry later; since i/o's
950 generally complete in < 100 ms, and as InnoDB writes at most 128 pages
951 from the buffer pool in a batch, and then immediately flushes the
952 files, there is a good chance that the next time we find a suitable
953 node from the LRU list */
954 static
955 ibool
fil_try_to_close_file_in_LRU(ibool print_info)956 fil_try_to_close_file_in_LRU(
957 /*=========================*/
958 	ibool	print_info)	/*!< in: if TRUE, prints information why it
959 				cannot close a file */
960 {
961 	fil_node_t*	node;
962 
963 	ut_ad(mutex_own(&fil_system->mutex));
964 
965 	if (print_info) {
966 		fprintf(stderr,
967 			"InnoDB: fil_sys open file LRU len %lu\n",
968 			(ulong) UT_LIST_GET_LEN(fil_system->LRU));
969 	}
970 
971 	for (node = UT_LIST_GET_LAST(fil_system->LRU);
972 	     node != NULL;
973 	     node = UT_LIST_GET_PREV(LRU, node)) {
974 
975 		if (node->modification_counter == node->flush_counter
976 		    && node->n_pending_flushes == 0
977 		    && !node->being_extended) {
978 
979 			fil_node_close_file(node, fil_system);
980 
981 			return(TRUE);
982 		}
983 
984 		if (!print_info) {
985 			continue;
986 		}
987 
988 		if (node->n_pending_flushes > 0) {
989 			fputs("InnoDB: cannot close file ", stderr);
990 			ut_print_filename(stderr, node->name);
991 			fprintf(stderr, ", because n_pending_flushes %lu\n",
992 				(ulong) node->n_pending_flushes);
993 		}
994 
995 		if (node->modification_counter != node->flush_counter) {
996 			fputs("InnoDB: cannot close file ", stderr);
997 			ut_print_filename(stderr, node->name);
998 			fprintf(stderr,
999 				", because mod_count %ld != fl_count %ld\n",
1000 				(long) node->modification_counter,
1001 				(long) node->flush_counter);
1002 
1003 		}
1004 
1005 		if (node->being_extended) {
1006 			fputs("InnoDB: cannot close file ", stderr);
1007 			ut_print_filename(stderr, node->name);
1008 			fprintf(stderr, ", because it is being extended\n");
1009 		}
1010 	}
1011 
1012 	return(FALSE);
1013 }
1014 
1015 /*******************************************************************//**
1016 Reserves the fil_system mutex and tries to make sure we can open at least one
1017 file while holding it. This should be called before calling
1018 fil_node_prepare_for_io(), because that function may need to open a file. */
1019 static
1020 void
fil_mutex_enter_and_prepare_for_io(ulint space_id)1021 fil_mutex_enter_and_prepare_for_io(
1022 /*===============================*/
1023 	ulint	space_id)	/*!< in: space id */
1024 {
1025 	fil_space_t*	space;
1026 	ibool		success;
1027 	ibool		print_info	= FALSE;
1028 	ulint		count		= 0;
1029 	ulint		count2		= 0;
1030 
1031 retry:
1032 	mutex_enter(&fil_system->mutex);
1033 
1034 	if (space_id == 0 || space_id >= SRV_LOG_SPACE_FIRST_ID) {
1035 		/* We keep log files and system tablespace files always open;
1036 		this is important in preventing deadlocks in this module, as
1037 		a page read completion often performs another read from the
1038 		insert buffer. The insert buffer is in tablespace 0, and we
1039 		cannot end up waiting in this function. */
1040 
1041 		return;
1042 	}
1043 
1044 	space = fil_space_get_by_id(space_id);
1045 
1046 	if (space != NULL && space->stop_ios) {
1047 		/* We are going to do a rename file and want to stop new i/o's
1048 		for a while */
1049 
1050 		if (count2 > 20000) {
1051 			fputs("InnoDB: Warning: tablespace ", stderr);
1052 			ut_print_filename(stderr, space->name);
1053 			fprintf(stderr,
1054 				" has i/o ops stopped for a long time %lu\n",
1055 				(ulong) count2);
1056 		}
1057 
1058 		mutex_exit(&fil_system->mutex);
1059 
1060 #ifndef UNIV_HOTBACKUP
1061 
1062 		/* Wake the i/o-handler threads to make sure pending
1063 		i/o's are performed */
1064 		os_aio_simulated_wake_handler_threads();
1065 
1066 		/* The sleep here is just to give IO helper threads a
1067 		bit of time to do some work. It is not required that
1068 		all IO related to the tablespace being renamed must
1069 		be flushed here as we do fil_flush() in
1070 		fil_rename_tablespace() as well. */
1071 		os_thread_sleep(20000);
1072 
1073 #endif /* UNIV_HOTBACKUP */
1074 
1075 		/* Flush tablespaces so that we can close modified
1076 		files in the LRU list */
1077 		fil_flush_file_spaces(FIL_TABLESPACE);
1078 
1079 		os_thread_sleep(20000);
1080 
1081 		count2++;
1082 
1083 		goto retry;
1084 	}
1085 
1086 	if (fil_system->n_open < fil_system->max_n_open) {
1087 
1088 		return;
1089 	}
1090 
1091 	/* If the file is already open, no need to do anything; if the space
1092 	does not exist, we handle the situation in the function which called
1093 	this function */
1094 
1095 	if (!space || UT_LIST_GET_FIRST(space->chain)->open) {
1096 
1097 		return;
1098 	}
1099 
1100 	if (count > 1) {
1101 		print_info = TRUE;
1102 	}
1103 
1104 	/* Too many files are open, try to close some */
1105 close_more:
1106 	success = fil_try_to_close_file_in_LRU(print_info);
1107 
1108 	if (success && fil_system->n_open >= fil_system->max_n_open) {
1109 
1110 		goto close_more;
1111 	}
1112 
1113 	if (fil_system->n_open < fil_system->max_n_open) {
1114 		/* Ok */
1115 
1116 		return;
1117 	}
1118 
1119 	if (count >= 2) {
1120 		ut_print_timestamp(stderr);
1121 		fprintf(stderr,
1122 			"  InnoDB: Warning: too many (%lu) files stay open"
1123 			" while the maximum\n"
1124 			"InnoDB: allowed value would be %lu.\n"
1125 			"InnoDB: You may need to raise the value of"
1126 			" innodb_open_files in\n"
1127 			"InnoDB: my.cnf.\n",
1128 			(ulong) fil_system->n_open,
1129 			(ulong) fil_system->max_n_open);
1130 
1131 		return;
1132 	}
1133 
1134 	mutex_exit(&fil_system->mutex);
1135 
1136 #ifndef UNIV_HOTBACKUP
1137 	/* Wake the i/o-handler threads to make sure pending i/o's are
1138 	performed */
1139 	os_aio_simulated_wake_handler_threads();
1140 
1141 	os_thread_sleep(20000);
1142 #endif
1143 	/* Flush tablespaces so that we can close modified files in the LRU
1144 	list */
1145 
1146 	fil_flush_file_spaces(FIL_TABLESPACE);
1147 
1148 	count++;
1149 
1150 	goto retry;
1151 }
1152 
1153 /*******************************************************************//**
1154 Frees a file node object from a tablespace memory cache. */
1155 static
1156 void
fil_node_free(fil_node_t * node,fil_system_t * system,fil_space_t * space)1157 fil_node_free(
1158 /*==========*/
1159 	fil_node_t*	node,	/*!< in, own: file node */
1160 	fil_system_t*	system,	/*!< in: tablespace memory cache */
1161 	fil_space_t*	space)	/*!< in: space where the file node is chained */
1162 {
1163 	ut_ad(node && system && space);
1164 	ut_ad(mutex_own(&(system->mutex)));
1165 	ut_a(node->magic_n == FIL_NODE_MAGIC_N);
1166 	ut_a(node->n_pending == 0);
1167 	ut_a(!node->being_extended);
1168 
1169 	if (node->open) {
1170 		/* We fool the assertion in fil_node_close_file() to think
1171 		there are no unflushed modifications in the file */
1172 
1173 		node->modification_counter = node->flush_counter;
1174 		os_event_set(node->sync_event);
1175 
1176 		if (fil_buffering_disabled(space)) {
1177 
1178 			ut_ad(!space->is_in_unflushed_spaces);
1179 			ut_ad(fil_space_is_flushed(space));
1180 
1181 		} else if (space->is_in_unflushed_spaces
1182 			   && fil_space_is_flushed(space)) {
1183 
1184 			space->is_in_unflushed_spaces = false;
1185 
1186 			UT_LIST_REMOVE(unflushed_spaces,
1187 				       system->unflushed_spaces,
1188 				       space);
1189 		}
1190 
1191 		fil_node_close_file(node, system);
1192 	}
1193 
1194 	space->size -= node->size;
1195 
1196 	UT_LIST_REMOVE(chain, space->chain, node);
1197 
1198 	os_event_free(node->sync_event);
1199 	mem_free(node->name);
1200 	mem_free(node);
1201 }
1202 
1203 #ifdef UNIV_LOG_ARCHIVE
1204 /****************************************************************//**
1205 Drops files from the start of a file space, so that its size is cut by
1206 the amount given. */
1207 UNIV_INTERN
1208 void
fil_space_truncate_start(ulint id,ulint trunc_len)1209 fil_space_truncate_start(
1210 /*=====================*/
1211 	ulint	id,		/*!< in: space id */
1212 	ulint	trunc_len)	/*!< in: truncate by this much; it is an error
1213 				if this does not equal to the combined size of
1214 				some initial files in the space */
1215 {
1216 	fil_node_t*	node;
1217 	fil_space_t*	space;
1218 
1219 	mutex_enter(&fil_system->mutex);
1220 
1221 	space = fil_space_get_by_id(id);
1222 
1223 	ut_a(space);
1224 
1225 	while (trunc_len > 0) {
1226 		node = UT_LIST_GET_FIRST(space->chain);
1227 
1228 		ut_a(node->size * UNIV_PAGE_SIZE <= trunc_len);
1229 
1230 		trunc_len -= node->size * UNIV_PAGE_SIZE;
1231 
1232 		fil_node_free(node, fil_system, space);
1233 	}
1234 
1235 	mutex_exit(&fil_system->mutex);
1236 }
1237 
1238 /****************************************************************//**
1239 Check is there node in file space with given name. */
1240 UNIV_INTERN
1241 ibool
fil_space_contains_node(ulint id,char * node_name)1242 fil_space_contains_node(
1243 /*====================*/
1244 	ulint	id,		/*!< in: space id */
1245 	char*	node_name)	/*!< in: node name */
1246 {
1247 	fil_node_t*	node;
1248 	fil_space_t*	space;
1249 
1250 	mutex_enter(&fil_system->mutex);
1251 
1252 	space = fil_space_get_by_id(id);
1253 
1254 	ut_a(space);
1255 
1256 	for (node = UT_LIST_GET_FIRST(space->chain); node != NULL;
1257 	     node = UT_LIST_GET_NEXT(chain, node)) {
1258 
1259 		if (ut_strcmp(node->name, node_name) == 0) {
1260 			mutex_exit(&fil_system->mutex);
1261 			return(TRUE);
1262 		}
1263 
1264 	}
1265 
1266 	mutex_exit(&fil_system->mutex);
1267 	return(FALSE);
1268 }
1269 
1270 #endif /* UNIV_LOG_ARCHIVE */
1271 
1272 /*******************************************************************//**
1273 Creates a space memory object and puts it to the 'fil system' hash table.
1274 If there is an error, prints an error message to the .err log.
1275 @return	TRUE if success */
1276 UNIV_INTERN
1277 ibool
fil_space_create(const char * name,ulint id,ulint flags,ulint purpose)1278 fil_space_create(
1279 /*=============*/
1280 	const char*	name,	/*!< in: space name */
1281 	ulint		id,	/*!< in: space id */
1282 	ulint		flags,	/*!< in: tablespace flags */
1283 	ulint		purpose)/*!< in: FIL_TABLESPACE, or FIL_LOG if log */
1284 {
1285 	fil_space_t*	space;
1286 
1287 	DBUG_EXECUTE_IF("fil_space_create_failure", return(false););
1288 
1289 	ut_a(fil_system);
1290 	ut_a(fsp_flags_is_valid(flags));
1291 
1292 	/* Look for a matching tablespace and if found free it. */
1293 	do {
1294 		mutex_enter(&fil_system->mutex);
1295 
1296 		space = fil_space_get_by_name(name);
1297 
1298 		if (space != 0) {
1299 			ib_logf(IB_LOG_LEVEL_WARN,
1300 				"Tablespace '%s' exists in the cache "
1301 				"with id %lu != %lu",
1302 				name, (ulong) space->id, (ulong) id);
1303 
1304 			if (id == 0 || purpose != FIL_TABLESPACE) {
1305 
1306 				mutex_exit(&fil_system->mutex);
1307 
1308 				return(FALSE);
1309 			}
1310 
1311 			ib_logf(IB_LOG_LEVEL_WARN,
1312 				"Freeing existing tablespace '%s' entry "
1313 				"from the cache with id %lu",
1314 				name, (ulong) id);
1315 
1316 			ibool	success = fil_space_free(space->id, FALSE);
1317 			ut_a(success);
1318 
1319 			mutex_exit(&fil_system->mutex);
1320 		}
1321 
1322 	} while (space != 0);
1323 
1324 	space = fil_space_get_by_id(id);
1325 
1326 	if (space != 0) {
1327 		ib_logf(IB_LOG_LEVEL_ERROR,
1328 			"Trying to add tablespace '%s' with id %lu "
1329 			"to the tablespace memory cache, but tablespace '%s' "
1330 			"with id %lu already exists in the cache!",
1331 			name, (ulong) id, space->name, (ulong) space->id);
1332 
1333 		mutex_exit(&fil_system->mutex);
1334 
1335 		return(FALSE);
1336 	}
1337 
1338 	space = static_cast<fil_space_t*>(mem_zalloc(sizeof(*space)));
1339 
1340 	space->name = mem_strdup(name);
1341 	space->id = id;
1342 
1343 	fil_system->tablespace_version++;
1344 	space->tablespace_version = fil_system->tablespace_version;
1345 	space->mark = FALSE;
1346 
1347 	if (purpose == FIL_TABLESPACE && !recv_recovery_on
1348 	    && id > fil_system->max_assigned_id) {
1349 
1350 		if (!fil_system->space_id_reuse_warned) {
1351 			fil_system->space_id_reuse_warned = TRUE;
1352 
1353 			ib_logf(IB_LOG_LEVEL_WARN,
1354 				"Allocated tablespace %lu, old maximum "
1355 				"was %lu",
1356 				(ulong) id,
1357 				(ulong) fil_system->max_assigned_id);
1358 		}
1359 
1360 		fil_system->max_assigned_id = id;
1361 	}
1362 
1363 	space->purpose = purpose;
1364 	space->flags = flags;
1365 
1366 	space->magic_n = FIL_SPACE_MAGIC_N;
1367 
1368 	rw_lock_create(fil_space_latch_key, &space->latch, SYNC_FSP);
1369 
1370 	HASH_INSERT(fil_space_t, hash, fil_system->spaces, id, space);
1371 
1372 	HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash,
1373 		    ut_fold_string(name), space);
1374 	space->is_in_unflushed_spaces = false;
1375 
1376 	space->is_corrupt = FALSE;
1377 
1378 	UT_LIST_ADD_LAST(space_list, fil_system->space_list, space);
1379 
1380 	mutex_exit(&fil_system->mutex);
1381 
1382 	return(TRUE);
1383 }
1384 
1385 /*******************************************************************//**
1386 Assigns a new space id for a new single-table tablespace. This works simply by
1387 incrementing the global counter. If 4 billion id's is not enough, we may need
1388 to recycle id's.
1389 @return	TRUE if assigned, FALSE if not */
1390 UNIV_INTERN
1391 ibool
fil_assign_new_space_id(ulint * space_id)1392 fil_assign_new_space_id(
1393 /*====================*/
1394 	ulint*	space_id)	/*!< in/out: space id */
1395 {
1396 	ulint	id;
1397 	ibool	success;
1398 
1399 	mutex_enter(&fil_system->mutex);
1400 
1401 	id = *space_id;
1402 
1403 	if (id < fil_system->max_assigned_id) {
1404 		id = fil_system->max_assigned_id;
1405 	}
1406 
1407 	id++;
1408 
1409 	if (id > (SRV_LOG_SPACE_FIRST_ID / 2) && (id % 1000000UL == 0)) {
1410 		ut_print_timestamp(stderr);
1411 		fprintf(stderr,
1412 			"InnoDB: Warning: you are running out of new"
1413 			" single-table tablespace id's.\n"
1414 			"InnoDB: Current counter is %lu and it"
1415 			" must not exceed %lu!\n"
1416 			"InnoDB: To reset the counter to zero"
1417 			" you have to dump all your tables and\n"
1418 			"InnoDB: recreate the whole InnoDB installation.\n",
1419 			(ulong) id,
1420 			(ulong) SRV_LOG_SPACE_FIRST_ID);
1421 	}
1422 
1423 	success = (id < SRV_LOG_SPACE_FIRST_ID);
1424 
1425 	if (success) {
1426 		*space_id = fil_system->max_assigned_id = id;
1427 	} else {
1428 		ut_print_timestamp(stderr);
1429 		fprintf(stderr,
1430 			"InnoDB: You have run out of single-table"
1431 			" tablespace id's!\n"
1432 			"InnoDB: Current counter is %lu.\n"
1433 			"InnoDB: To reset the counter to zero you"
1434 			" have to dump all your tables and\n"
1435 			"InnoDB: recreate the whole InnoDB installation.\n",
1436 			(ulong) id);
1437 		*space_id = ULINT_UNDEFINED;
1438 	}
1439 
1440 	mutex_exit(&fil_system->mutex);
1441 
1442 	return(success);
1443 }
1444 
1445 /*******************************************************************//**
1446 Frees a space object from the tablespace memory cache. Closes the files in
1447 the chain but does not delete them. There must not be any pending i/o's or
1448 flushes on the files.
1449 @return	TRUE if success */
1450 static
1451 ibool
fil_space_free(ulint id,ibool x_latched)1452 fil_space_free(
1453 /*===========*/
1454 					/* out: TRUE if success */
1455 	ulint		id,		/* in: space id */
1456 	ibool		x_latched)	/* in: TRUE if caller has space->latch
1457 					in X mode */
1458 {
1459 	fil_space_t*	space;
1460 	fil_space_t*	fnamespace;
1461 
1462 	ut_ad(mutex_own(&fil_system->mutex));
1463 
1464 	space = fil_space_get_by_id(id);
1465 
1466 	if (!space) {
1467 		ut_print_timestamp(stderr);
1468 		fprintf(stderr,
1469 			"  InnoDB: Error: trying to remove tablespace %lu"
1470 			" from the cache but\n"
1471 			"InnoDB: it is not there.\n", (ulong) id);
1472 
1473 		return(FALSE);
1474 	}
1475 
1476 	HASH_DELETE(fil_space_t, hash, fil_system->spaces, id, space);
1477 
1478 	fnamespace = fil_space_get_by_name(space->name);
1479 	ut_a(fnamespace);
1480 	ut_a(space == fnamespace);
1481 
1482 	HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash,
1483 		    ut_fold_string(space->name), space);
1484 
1485 	if (space->is_in_unflushed_spaces) {
1486 
1487 		ut_ad(!fil_buffering_disabled(space));
1488 		space->is_in_unflushed_spaces = false;
1489 
1490 		UT_LIST_REMOVE(unflushed_spaces, fil_system->unflushed_spaces,
1491 			       space);
1492 	}
1493 
1494 	UT_LIST_REMOVE(space_list, fil_system->space_list, space);
1495 
1496 	ut_a(space->magic_n == FIL_SPACE_MAGIC_N);
1497 	ut_a(0 == space->n_pending_flushes);
1498 
1499 	for (fil_node_t* fil_node = UT_LIST_GET_FIRST(space->chain);
1500 	     fil_node != NULL;
1501 	     fil_node = UT_LIST_GET_FIRST(space->chain)) {
1502 
1503 		fil_node_free(fil_node, fil_system, space);
1504 	}
1505 
1506 	ut_a(0 == UT_LIST_GET_LEN(space->chain));
1507 
1508 	if (x_latched) {
1509 		rw_lock_x_unlock(&space->latch);
1510 	}
1511 
1512 	rw_lock_free(&(space->latch));
1513 
1514 	mem_free(space->name);
1515 	mem_free(space);
1516 
1517 	return(TRUE);
1518 }
1519 
1520 /*******************************************************************//**
1521 Returns a pointer to the file_space_t that is in the memory cache
1522 associated with a space id. The caller must lock fil_system->mutex.
1523 @return	file_space_t pointer, NULL if space not found */
1524 UNIV_INLINE
1525 fil_space_t*
fil_space_get_space(ulint id)1526 fil_space_get_space(
1527 /*================*/
1528 	ulint	id)	/*!< in: space id */
1529 {
1530 	fil_space_t*	space;
1531 	fil_node_t*	node;
1532 
1533 	ut_ad(fil_system);
1534 
1535 	space = fil_space_get_by_id(id);
1536 	if (space == NULL) {
1537 		return(NULL);
1538 	}
1539 
1540 	if (space->size == 0 && space->purpose == FIL_TABLESPACE) {
1541 		ut_a(id != 0);
1542 
1543 		mutex_exit(&fil_system->mutex);
1544 
1545 		/* It is possible that the space gets evicted at this point
1546 		before the fil_mutex_enter_and_prepare_for_io() acquires
1547 		the fil_system->mutex. Check for this after completing the
1548 		call to fil_mutex_enter_and_prepare_for_io(). */
1549 		fil_mutex_enter_and_prepare_for_io(id);
1550 
1551 		/* We are still holding the fil_system->mutex. Check if
1552 		the space is still in memory cache. */
1553 		space = fil_space_get_by_id(id);
1554 		if (space == NULL) {
1555 			return(NULL);
1556 		}
1557 
1558 		/* The following code must change when InnoDB supports
1559 		multiple datafiles per tablespace. */
1560 		ut_a(1 == UT_LIST_GET_LEN(space->chain));
1561 
1562 		node = UT_LIST_GET_FIRST(space->chain);
1563 
1564 		/* It must be a single-table tablespace and we have not opened
1565 		the file yet; the following calls will open it and update the
1566 		size fields */
1567 
1568 		if (!fil_node_prepare_for_io(node, fil_system, space)) {
1569 			/* The single-table tablespace can't be opened,
1570 			because the ibd file is missing. */
1571 			return(NULL);
1572 		}
1573 		fil_node_complete_io(node, fil_system, OS_FILE_READ);
1574 	}
1575 
1576 	return(space);
1577 }
1578 
1579 /*******************************************************************//**
1580 Returns the path from the first fil_node_t found for the space ID sent.
1581 The caller is responsible for freeing the memory allocated here for the
1582 value returned.
1583 @return	own: A copy of fil_node_t::path, NULL if space ID is zero
1584 or not found. */
1585 UNIV_INTERN
1586 char*
fil_space_get_first_path(ulint id)1587 fil_space_get_first_path(
1588 /*=====================*/
1589 	ulint		id)	/*!< in: space id */
1590 {
1591 	fil_space_t*	space;
1592 	fil_node_t*	node;
1593 	char*		path;
1594 
1595 	ut_ad(fil_system);
1596 	ut_a(id);
1597 
1598 	fil_mutex_enter_and_prepare_for_io(id);
1599 
1600 	space = fil_space_get_space(id);
1601 
1602 	if (space == NULL) {
1603 		mutex_exit(&fil_system->mutex);
1604 
1605 		return(NULL);
1606 	}
1607 
1608 	ut_ad(mutex_own(&fil_system->mutex));
1609 
1610 	node = UT_LIST_GET_FIRST(space->chain);
1611 
1612 	path = mem_strdup(node->name);
1613 
1614 	mutex_exit(&fil_system->mutex);
1615 
1616 	return(path);
1617 }
1618 
1619 /*******************************************************************//**
1620 Returns the size of the space in pages. The tablespace must be cached in the
1621 memory cache.
1622 @return	space size, 0 if space not found */
1623 UNIV_INTERN
1624 ulint
fil_space_get_size(ulint id)1625 fil_space_get_size(
1626 /*===============*/
1627 	ulint	id)	/*!< in: space id */
1628 {
1629 	fil_space_t*	space;
1630 	ulint		size;
1631 
1632 	ut_ad(fil_system);
1633 	mutex_enter(&fil_system->mutex);
1634 
1635 	space = fil_space_get_space(id);
1636 
1637 	size = space ? space->size : 0;
1638 
1639 	mutex_exit(&fil_system->mutex);
1640 
1641 	return(size);
1642 }
1643 
1644 /*******************************************************************//**
1645 Returns the flags of the space. The tablespace must be cached
1646 in the memory cache.
1647 @return	flags, ULINT_UNDEFINED if space not found */
1648 UNIV_INTERN
1649 ulint
fil_space_get_flags(ulint id)1650 fil_space_get_flags(
1651 /*================*/
1652 	ulint	id)	/*!< in: space id */
1653 {
1654 	fil_space_t*	space;
1655 	ulint		flags;
1656 
1657 	ut_ad(fil_system);
1658 
1659 	if (!id) {
1660 		return(0);
1661 	}
1662 
1663 	mutex_enter(&fil_system->mutex);
1664 
1665 	space = fil_space_get_space(id);
1666 
1667 	if (space == NULL) {
1668 		mutex_exit(&fil_system->mutex);
1669 
1670 		return(ULINT_UNDEFINED);
1671 	}
1672 
1673 	flags = space->flags;
1674 
1675 	mutex_exit(&fil_system->mutex);
1676 
1677 	return(flags);
1678 }
1679 
1680 /*******************************************************************//**
1681 Returns the compressed page size of the space, or 0 if the space
1682 is not compressed. The tablespace must be cached in the memory cache.
1683 @return	compressed page size, ULINT_UNDEFINED if space not found */
1684 UNIV_INTERN
1685 ulint
fil_space_get_zip_size(ulint id)1686 fil_space_get_zip_size(
1687 /*===================*/
1688 	ulint	id)	/*!< in: space id */
1689 {
1690 	ulint	flags;
1691 
1692 	flags = fil_space_get_flags(id);
1693 
1694 	if (flags && flags != ULINT_UNDEFINED) {
1695 
1696 		return(fsp_flags_get_zip_size(flags));
1697 	}
1698 
1699 	return(flags);
1700 }
1701 
1702 /*******************************************************************//**
1703 Checks if the pair space, page_no refers to an existing page in a tablespace
1704 file space. The tablespace must be cached in the memory cache.
1705 @return	TRUE if the address is meaningful */
1706 UNIV_INTERN
1707 ibool
fil_check_adress_in_tablespace(ulint id,ulint page_no)1708 fil_check_adress_in_tablespace(
1709 /*===========================*/
1710 	ulint	id,	/*!< in: space id */
1711 	ulint	page_no)/*!< in: page number */
1712 {
1713 	if (fil_space_get_size(id) > page_no) {
1714 
1715 		return(TRUE);
1716 	}
1717 
1718 	return(FALSE);
1719 }
1720 
1721 /****************************************************************//**
1722 Initializes the tablespace memory cache. */
1723 UNIV_INTERN
1724 void
fil_init(ulint hash_size,ulint max_n_open)1725 fil_init(
1726 /*=====*/
1727 	ulint	hash_size,	/*!< in: hash table size */
1728 	ulint	max_n_open)	/*!< in: max number of open files */
1729 {
1730 	ut_a(fil_system == NULL);
1731 
1732 	ut_a(hash_size > 0);
1733 	ut_a(max_n_open > 0);
1734 
1735 	fil_system = static_cast<fil_system_t*>(
1736 		mem_zalloc(sizeof(fil_system_t)));
1737 
1738 	mutex_create(fil_system_mutex_key,
1739 		     &fil_system->mutex, SYNC_ANY_LATCH);
1740 
1741 	fil_system->spaces = hash_create(hash_size);
1742 	fil_system->name_hash = hash_create(hash_size);
1743 
1744 	UT_LIST_INIT(fil_system->LRU);
1745 
1746 	fil_system->max_n_open = max_n_open;
1747 }
1748 
1749 /*******************************************************************//**
1750 Opens all log files and system tablespace data files. They stay open until the
1751 database server shutdown. This should be called at a server startup after the
1752 space objects for the log and the system tablespace have been created. The
1753 purpose of this operation is to make sure we never run out of file descriptors
1754 if we need to read from the insert buffer or to write to the log. */
1755 UNIV_INTERN
1756 void
fil_open_log_and_system_tablespace_files(void)1757 fil_open_log_and_system_tablespace_files(void)
1758 /*==========================================*/
1759 {
1760 	fil_space_t*	space;
1761 
1762 	mutex_enter(&fil_system->mutex);
1763 
1764 	for (space = UT_LIST_GET_FIRST(fil_system->space_list);
1765 	     space != NULL;
1766 	     space = UT_LIST_GET_NEXT(space_list, space)) {
1767 
1768 		fil_node_t*	node;
1769 
1770 		if (fil_space_belongs_in_lru(space)) {
1771 
1772 			continue;
1773 		}
1774 
1775 		for (node = UT_LIST_GET_FIRST(space->chain);
1776 		     node != NULL;
1777 		     node = UT_LIST_GET_NEXT(chain, node)) {
1778 
1779 			if (!node->open) {
1780 				if (!fil_node_open_file(node, fil_system,
1781 							space)) {
1782 					/* This func is called during server's
1783 					startup. If some file of log or system
1784 					tablespace is missing, the server
1785 					can't start successfully. So we should
1786 					assert for it. */
1787 					ut_a(0);
1788 				}
1789 			}
1790 
1791 			if (fil_system->max_n_open < 10 + fil_system->n_open) {
1792 
1793 				fprintf(stderr,
1794 					"InnoDB: Warning: you must"
1795 					" raise the value of"
1796 					" innodb_open_files in\n"
1797 					"InnoDB: my.cnf! Remember that"
1798 					" InnoDB keeps all log files"
1799 					" and all system\n"
1800 					"InnoDB: tablespace files open"
1801 					" for the whole time mysqld is"
1802 					" running, and\n"
1803 					"InnoDB: needs to open also"
1804 					" some .ibd files if the"
1805 					" file-per-table storage\n"
1806 					"InnoDB: model is used."
1807 					" Current open files %lu,"
1808 					" max allowed"
1809 					" open files %lu.\n",
1810 					(ulong) fil_system->n_open,
1811 					(ulong) fil_system->max_n_open);
1812 			}
1813 		}
1814 	}
1815 
1816 	mutex_exit(&fil_system->mutex);
1817 }
1818 
1819 /*******************************************************************//**
1820 Closes all open files. There must not be any pending i/o's or not flushed
1821 modifications in the files. */
1822 UNIV_INTERN
1823 void
fil_close_all_files(void)1824 fil_close_all_files(void)
1825 /*=====================*/
1826 {
1827 	fil_space_t*	space;
1828 
1829 	// Must check both flags as it's possible for this to be called during
1830 	// server startup with srv_track_changed_pages == true but
1831 	// srv_redo_log_thread_started == false
1832 	if (srv_track_changed_pages && srv_redo_log_thread_started)
1833 		os_event_wait(srv_redo_log_tracked_event);
1834 
1835 	mutex_enter(&fil_system->mutex);
1836 
1837 	space = UT_LIST_GET_FIRST(fil_system->space_list);
1838 
1839 	while (space != NULL) {
1840 		fil_node_t*	node;
1841 		fil_space_t*	prev_space = space;
1842 
1843 		for (node = UT_LIST_GET_FIRST(space->chain);
1844 		     node != NULL;
1845 		     node = UT_LIST_GET_NEXT(chain, node)) {
1846 
1847 			if (node->open) {
1848 				fil_node_close_file(node, fil_system);
1849 			}
1850 		}
1851 
1852 		space = UT_LIST_GET_NEXT(space_list, space);
1853 
1854 		fil_space_free(prev_space->id, FALSE);
1855 	}
1856 
1857 	mutex_exit(&fil_system->mutex);
1858 }
1859 
1860 /*******************************************************************//**
1861 Closes the redo log files. There must not be any pending i/o's or not
1862 flushed modifications in the files. */
1863 UNIV_INTERN
1864 void
fil_close_log_files(bool free)1865 fil_close_log_files(
1866 /*================*/
1867 	bool	free)	/*!< in: whether to free the memory object */
1868 {
1869 	fil_space_t*	space;
1870 
1871 	// Must check both flags as it's possible for this to be called during
1872 	// server startup with srv_track_changed_pages == true but
1873 	// srv_redo_log_thread_started == false
1874 	if (srv_track_changed_pages && srv_redo_log_thread_started)
1875 		os_event_wait(srv_redo_log_tracked_event);
1876 
1877 	mutex_enter(&fil_system->mutex);
1878 
1879 	space = UT_LIST_GET_FIRST(fil_system->space_list);
1880 
1881 	while (space != NULL) {
1882 		fil_node_t*	node;
1883 		fil_space_t*	prev_space = space;
1884 
1885 		if (space->purpose != FIL_LOG) {
1886 			space = UT_LIST_GET_NEXT(space_list, space);
1887 			continue;
1888 		}
1889 
1890 		for (node = UT_LIST_GET_FIRST(space->chain);
1891 		     node != NULL;
1892 		     node = UT_LIST_GET_NEXT(chain, node)) {
1893 
1894 			if (node->open) {
1895 				fil_node_close_file(node, fil_system);
1896 			}
1897 		}
1898 
1899 		space = UT_LIST_GET_NEXT(space_list, space);
1900 
1901 		if (free) {
1902 			fil_space_free(prev_space->id, FALSE);
1903 		}
1904 	}
1905 
1906 	mutex_exit(&fil_system->mutex);
1907 }
1908 
1909 /*******************************************************************//**
1910 Sets the max tablespace id counter if the given number is bigger than the
1911 previous value. */
1912 UNIV_INTERN
1913 void
fil_set_max_space_id_if_bigger(ulint max_id)1914 fil_set_max_space_id_if_bigger(
1915 /*===========================*/
1916 	ulint	max_id)	/*!< in: maximum known id */
1917 {
1918 	if (max_id >= SRV_LOG_SPACE_FIRST_ID) {
1919 		fprintf(stderr,
1920 			"InnoDB: Fatal error: max tablespace id"
1921 			" is too high, %lu\n", (ulong) max_id);
1922 		ut_error;
1923 	}
1924 
1925 	mutex_enter(&fil_system->mutex);
1926 
1927 	if (fil_system->max_assigned_id < max_id) {
1928 
1929 		fil_system->max_assigned_id = max_id;
1930 	}
1931 
1932 	mutex_exit(&fil_system->mutex);
1933 }
1934 
1935 /****************************************************************//**
1936 Writes the flushed lsn and the latest archived log number to the page header
1937 of the first page of a data file of the system tablespace (space 0),
1938 which is uncompressed. */
1939 static MY_ATTRIBUTE((warn_unused_result))
1940 dberr_t
fil_write_lsn_and_arch_no_to_file(ulint space,ulint sum_of_sizes,lsn_t lsn,ulint arch_log_no MY_ATTRIBUTE ((unused)))1941 fil_write_lsn_and_arch_no_to_file(
1942 /*==============================*/
1943 	ulint	space,		/*!< in: space to write to */
1944 	ulint	sum_of_sizes,	/*!< in: combined size of previous files
1945 				in space, in database pages */
1946 	lsn_t	lsn,		/*!< in: lsn to write */
1947 	ulint	arch_log_no MY_ATTRIBUTE((unused)))
1948 				/*!< in: archived log number to write */
1949 {
1950 	byte*	buf1;
1951 	byte*	buf;
1952 	dberr_t	err;
1953 
1954 	buf1 = static_cast<byte*>(mem_alloc(2 * UNIV_PAGE_SIZE));
1955 	buf = static_cast<byte*>(ut_align(buf1, UNIV_PAGE_SIZE));
1956 
1957 	err = fil_read(TRUE, space, 0, sum_of_sizes, 0,
1958 		       UNIV_PAGE_SIZE, buf, NULL);
1959 	if (err == DB_SUCCESS) {
1960 		mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn);
1961 
1962 		err = fil_write(TRUE, space, 0, sum_of_sizes, 0,
1963 				UNIV_PAGE_SIZE, buf, NULL);
1964 	}
1965 
1966 	mem_free(buf1);
1967 
1968 	return(err);
1969 }
1970 
1971 /****************************************************************//**
1972 Writes the flushed lsn and the latest archived log number to the page
1973 header of the first page of each data file in the system tablespace.
1974 @return	DB_SUCCESS or error number */
1975 UNIV_INTERN
1976 dberr_t
fil_write_flushed_lsn_to_data_files(lsn_t lsn,ulint arch_log_no)1977 fil_write_flushed_lsn_to_data_files(
1978 /*================================*/
1979 	lsn_t	lsn,		/*!< in: lsn to write */
1980 	ulint	arch_log_no)	/*!< in: latest archived log file number */
1981 {
1982 	fil_space_t*	space;
1983 	fil_node_t*	node;
1984 	dberr_t		err;
1985 
1986 	mutex_enter(&fil_system->mutex);
1987 
1988 	for (space = UT_LIST_GET_FIRST(fil_system->space_list);
1989 	     space != NULL;
1990 	     space = UT_LIST_GET_NEXT(space_list, space)) {
1991 
1992 		/* We only write the lsn to all existing data files which have
1993 		been open during the lifetime of the mysqld process; they are
1994 		represented by the space objects in the tablespace memory
1995 		cache. Note that all data files in the system tablespace 0
1996 		and the UNDO log tablespaces (if separate) are always open. */
1997 
1998 		if (space->purpose == FIL_TABLESPACE
1999 		    && !fil_is_user_tablespace_id(space->id)) {
2000 			ulint	sum_of_sizes = 0;
2001 
2002 			for (node = UT_LIST_GET_FIRST(space->chain);
2003 			     node != NULL;
2004 			     node = UT_LIST_GET_NEXT(chain, node)) {
2005 
2006 				mutex_exit(&fil_system->mutex);
2007 
2008 				err = fil_write_lsn_and_arch_no_to_file(
2009 					space->id, sum_of_sizes, lsn,
2010 					arch_log_no);
2011 
2012 				if (err != DB_SUCCESS) {
2013 
2014 					return(err);
2015 				}
2016 
2017 				mutex_enter(&fil_system->mutex);
2018 
2019 				sum_of_sizes += node->size;
2020 			}
2021 		}
2022 	}
2023 
2024 	mutex_exit(&fil_system->mutex);
2025 
2026 	return(DB_SUCCESS);
2027 }
2028 
2029 /*******************************************************************//**
2030 Checks the consistency of the first data page of a tablespace
2031 at database startup.
2032 @retval NULL on success, or if innodb_force_recovery is set
2033 @return pointer to an error message string */
2034 static MY_ATTRIBUTE((warn_unused_result))
2035 const char*
fil_check_first_page(const page_t * page)2036 fil_check_first_page(
2037 /*=================*/
2038 	const page_t*	page)		/*!< in: data page */
2039 {
2040 	ulint	space_id;
2041 	ulint	flags;
2042 
2043 	if (srv_force_recovery >= SRV_FORCE_IGNORE_CORRUPT) {
2044 		return(NULL);
2045 	}
2046 
2047 	space_id = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + page);
2048 	flags = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page);
2049 
2050 	if (UNIV_PAGE_SIZE != fsp_flags_get_page_size(flags)) {
2051 		return("innodb-page-size mismatch");
2052 	}
2053 
2054 	if (!space_id && !flags) {
2055 		ulint		nonzero_bytes	= UNIV_PAGE_SIZE;
2056 		const byte*	b		= page;
2057 
2058 		while (!*b && --nonzero_bytes) {
2059 			b++;
2060 		}
2061 
2062 		if (!nonzero_bytes) {
2063 			return("space header page consists of zero bytes");
2064 		}
2065 	}
2066 
2067 	if (buf_page_is_corrupted(
2068 		    false, page, fsp_flags_get_zip_size(flags))) {
2069 		return("checksum mismatch");
2070 	}
2071 
2072 	if (page_get_space_id(page) == space_id
2073 	    && page_get_page_no(page) == 0) {
2074 		return(NULL);
2075 	}
2076 
2077 	return("inconsistent data in space header");
2078 }
2079 
2080 /*******************************************************************//**
2081 Reads the flushed lsn, arch no, space_id and tablespace flag fields from
2082 the first page of a data file at database startup.
2083 @retval NULL on success, or if innodb_force_recovery is set
2084 @return pointer to an error message string */
2085 UNIV_INTERN
2086 const char*
fil_read_first_page(pfs_os_file_t data_file,ibool one_read_already,ulint * flags,ulint * space_id,lsn_t * min_flushed_lsn,lsn_t * max_flushed_lsn)2087 fil_read_first_page(
2088 /*================*/
2089 	pfs_os_file_t	data_file,		/*!< in: open data file */
2090 	ibool		one_read_already,	/*!< in: TRUE if min and max
2091 						parameters below already
2092 						contain sensible data */
2093 	ulint*		flags,			/*!< out: tablespace flags */
2094 	ulint*		space_id,		/*!< out: tablespace ID */
2095 	lsn_t*		min_flushed_lsn,	/*!< out: min of flushed
2096 						lsn values in data files */
2097 	lsn_t*		max_flushed_lsn)	/*!< out: max of flushed
2098 						lsn values in data files */
2099 {
2100 	byte*		buf;
2101 	byte*		page;
2102 	lsn_t		flushed_lsn;
2103 	const char*	check_msg = NULL;
2104 
2105 	buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
2106 
2107 	/* Align the memory for a possible read from a raw device */
2108 
2109 	page = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
2110 
2111 	os_file_read(data_file, page, 0, UNIV_PAGE_SIZE);
2112 
2113 	/* The FSP_HEADER on page 0 is only valid for the first file
2114 	in a tablespace.  So if this is not the first datafile, leave
2115 	*flags and *space_id as they were read from the first file and
2116 	do not validate the first page. */
2117 	if (!one_read_already) {
2118 		*flags = fsp_header_get_flags(page);
2119 		*space_id = fsp_header_get_space_id(page);
2120 
2121 		check_msg = fil_check_first_page(page);
2122 	}
2123 
2124 	flushed_lsn = mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN);
2125 
2126 	ut_free(buf);
2127 
2128 	if (check_msg) {
2129 		return(check_msg);
2130 	}
2131 
2132 	if (!one_read_already) {
2133 		*min_flushed_lsn = flushed_lsn;
2134 		*max_flushed_lsn = flushed_lsn;
2135 
2136 		return(NULL);
2137 	}
2138 
2139 	if (*min_flushed_lsn > flushed_lsn) {
2140 		*min_flushed_lsn = flushed_lsn;
2141 	}
2142 	if (*max_flushed_lsn < flushed_lsn) {
2143 		*max_flushed_lsn = flushed_lsn;
2144 	}
2145 
2146 	return(NULL);
2147 }
2148 
2149 /*================ SINGLE-TABLE TABLESPACES ==========================*/
2150 
2151 #ifndef UNIV_HOTBACKUP
2152 /*******************************************************************//**
2153 Increments the count of pending operation, if space is not being deleted.
2154 @return	TRUE if being deleted, and operation should be skipped */
2155 UNIV_INTERN
2156 ibool
fil_inc_pending_ops(ulint id,ibool print_err)2157 fil_inc_pending_ops(
2158 /*================*/
2159 	ulint	id,		/*!< in: space id */
2160 	ibool	print_err)	/*!< in: need to print error or not */
2161 {
2162 	fil_space_t*	space;
2163 
2164 	mutex_enter(&fil_system->mutex);
2165 
2166 	space = fil_space_get_by_id(id);
2167 
2168 	if (space == NULL) {
2169 		if (print_err) {
2170 			fprintf(stderr,
2171 				"InnoDB: Error: trying to do an operation on a"
2172 				" dropped tablespace %lu\n",
2173 				(ulong) id);
2174 		}
2175 	}
2176 
2177 	if (space == NULL || space->stop_new_ops) {
2178 		mutex_exit(&fil_system->mutex);
2179 
2180 		return(TRUE);
2181 	}
2182 
2183 	space->n_pending_ops++;
2184 
2185 	mutex_exit(&fil_system->mutex);
2186 
2187 	return(FALSE);
2188 }
2189 
2190 /*******************************************************************//**
2191 Decrements the count of pending operations. */
2192 UNIV_INTERN
2193 void
fil_decr_pending_ops(ulint id)2194 fil_decr_pending_ops(
2195 /*=================*/
2196 	ulint	id)	/*!< in: space id */
2197 {
2198 	fil_space_t*	space;
2199 
2200 	mutex_enter(&fil_system->mutex);
2201 
2202 	space = fil_space_get_by_id(id);
2203 
2204 	if (space == NULL) {
2205 		fprintf(stderr,
2206 			"InnoDB: Error: decrementing pending operation"
2207 			" of a dropped tablespace %lu\n",
2208 			(ulong) id);
2209 	}
2210 
2211 	if (space != NULL) {
2212 		space->n_pending_ops--;
2213 	}
2214 
2215 	mutex_exit(&fil_system->mutex);
2216 }
2217 #endif /* !UNIV_HOTBACKUP */
2218 
2219 /********************************************************//**
2220 Creates the database directory for a table if it does not exist yet. */
2221 static
2222 void
fil_create_directory_for_tablename(const char * name)2223 fil_create_directory_for_tablename(
2224 /*===============================*/
2225 	const char*	name)	/*!< in: name in the standard
2226 				'databasename/tablename' format */
2227 {
2228 	const char*	namend;
2229 	char*		path;
2230 	ulint		len;
2231 
2232 	len = strlen(fil_path_to_mysql_datadir);
2233 	namend = strchr(name, '/');
2234 	ut_a(namend);
2235 	path = static_cast<char*>(mem_alloc(len + (namend - name) + 2));
2236 
2237 	memcpy(path, fil_path_to_mysql_datadir, len);
2238 	path[len] = '/';
2239 	memcpy(path + len + 1, name, namend - name);
2240 	path[len + (namend - name) + 1] = 0;
2241 
2242 	srv_normalize_path_for_win(path);
2243 
2244 	ut_a(os_file_create_directory(path, FALSE));
2245 	mem_free(path);
2246 }
2247 
2248 #ifndef UNIV_HOTBACKUP
2249 /********************************************************//**
2250 Writes a log record about an .ibd file create/rename/delete. */
2251 static
2252 void
fil_op_write_log(ulint type,ulint space_id,ulint log_flags,ulint flags,const char * name,const char * new_name,mtr_t * mtr)2253 fil_op_write_log(
2254 /*=============*/
2255 	ulint		type,		/*!< in: MLOG_FILE_CREATE,
2256 					MLOG_FILE_CREATE2,
2257 					MLOG_FILE_DELETE, or
2258 					MLOG_FILE_RENAME */
2259 	ulint		space_id,	/*!< in: space id */
2260 	ulint		log_flags,	/*!< in: redo log flags (stored
2261 					in the page number field) */
2262 	ulint		flags,		/*!< in: compressed page size
2263 					and file format
2264 					if type==MLOG_FILE_CREATE2, or 0 */
2265 	const char*	name,		/*!< in: table name in the familiar
2266 					'databasename/tablename' format, or
2267 					the file path in the case of
2268 					MLOG_FILE_DELETE */
2269 	const char*	new_name,	/*!< in: if type is MLOG_FILE_RENAME,
2270 					the new table name in the
2271 					'databasename/tablename' format */
2272 	mtr_t*		mtr)		/*!< in: mini-transaction handle */
2273 {
2274 	byte*	log_ptr;
2275 	ulint	len;
2276 
2277 	log_ptr = mlog_open(mtr, 11 + 2 + 1);
2278 
2279 	if (!log_ptr) {
2280 		/* Logging in mtr is switched off during crash recovery:
2281 		in that case mlog_open returns NULL */
2282 		return;
2283 	}
2284 
2285 	log_ptr = mlog_write_initial_log_record_for_file_op(
2286 		type, space_id, log_flags, log_ptr, mtr);
2287 	if (type == MLOG_FILE_CREATE2) {
2288 		mach_write_to_4(log_ptr, flags);
2289 		log_ptr += 4;
2290 	}
2291 	/* Let us store the strings as null-terminated for easier readability
2292 	and handling */
2293 
2294 	len = strlen(name) + 1;
2295 
2296 	mach_write_to_2(log_ptr, len);
2297 	log_ptr += 2;
2298 	mlog_close(mtr, log_ptr);
2299 
2300 	mlog_catenate_string(mtr, (byte*) name, len);
2301 
2302 	if (type == MLOG_FILE_RENAME) {
2303 		len = strlen(new_name) + 1;
2304 		log_ptr = mlog_open(mtr, 2 + len);
2305 		ut_a(log_ptr);
2306 		mach_write_to_2(log_ptr, len);
2307 		log_ptr += 2;
2308 		mlog_close(mtr, log_ptr);
2309 
2310 		mlog_catenate_string(mtr, (byte*) new_name, len);
2311 	}
2312 }
2313 #endif
2314 
2315 /*******************************************************************//**
2316 Parses the body of a log record written about an .ibd file operation. That is,
2317 the log record part after the standard (type, space id, page no) header of the
2318 log record.
2319 
2320 If desired, also replays the delete or rename operation if the .ibd file
2321 exists and the space id in it matches. Replays the create operation if a file
2322 at that path does not exist yet. If the database directory for the file to be
2323 created does not exist, then we create the directory, too.
2324 
2325 Note that mysqlbackup --apply-log sets fil_path_to_mysql_datadir to point to
2326 the datadir that we should use in replaying the file operations.
2327 
2328 InnoDB recovery does not replay these fully since it always sets the space id
2329 to zero. But mysqlbackup does replay them.  TODO: If remote tablespaces are
2330 used, mysqlbackup will only create tables in the default directory since
2331 MLOG_FILE_CREATE and MLOG_FILE_CREATE2 only know the tablename, not the path.
2332 
2333 @return end of log record, or NULL if the record was not completely
2334 contained between ptr and end_ptr */
2335 UNIV_INTERN
2336 byte*
fil_op_log_parse_or_replay(byte * ptr,byte * end_ptr,ulint type,ulint space_id,ulint log_flags)2337 fil_op_log_parse_or_replay(
2338 /*=======================*/
2339 	byte*	ptr,		/*!< in: buffer containing the log record body,
2340 				or an initial segment of it, if the record does
2341 				not fir completely between ptr and end_ptr */
2342 	byte*	end_ptr,	/*!< in: buffer end */
2343 	ulint	type,		/*!< in: the type of this log record */
2344 	ulint	space_id,	/*!< in: the space id of the tablespace in
2345 				question, or 0 if the log record should
2346 				only be parsed but not replayed */
2347 	ulint	log_flags)	/*!< in: redo log flags
2348 				(stored in the page number parameter) */
2349 {
2350 	ulint		name_len;
2351 	ulint		new_name_len;
2352 	const char*	name;
2353 	const char*	new_name	= NULL;
2354 	ulint		flags		= 0;
2355 
2356 	if (type == MLOG_FILE_CREATE2) {
2357 		if (end_ptr < ptr + 4) {
2358 
2359 			return(NULL);
2360 		}
2361 
2362 		flags = mach_read_from_4(ptr);
2363 		ptr += 4;
2364 	}
2365 
2366 	if (end_ptr < ptr + 2) {
2367 
2368 		return(NULL);
2369 	}
2370 
2371 	name_len = mach_read_from_2(ptr);
2372 
2373 	ptr += 2;
2374 
2375 	if (end_ptr < ptr + name_len) {
2376 
2377 		return(NULL);
2378 	}
2379 
2380 	name = (const char*) ptr;
2381 
2382 	ptr += name_len;
2383 
2384 	if (type == MLOG_FILE_RENAME) {
2385 		if (end_ptr < ptr + 2) {
2386 
2387 			return(NULL);
2388 		}
2389 
2390 		new_name_len = mach_read_from_2(ptr);
2391 
2392 		ptr += 2;
2393 
2394 		if (end_ptr < ptr + new_name_len) {
2395 
2396 			return(NULL);
2397 		}
2398 
2399 		new_name = (const char*) ptr;
2400 
2401 		ptr += new_name_len;
2402 	}
2403 
2404 	/* We managed to parse a full log record body */
2405 	/*
2406 	printf("Parsed log rec of type %lu space %lu\n"
2407 	"name %s\n", type, space_id, name);
2408 
2409 	if (type == MLOG_FILE_RENAME) {
2410 	printf("new name %s\n", new_name);
2411 	}
2412 	*/
2413 	if (!space_id) {
2414 		return(ptr);
2415 	} else {
2416 		/* Only replay file ops during recovery.  This is a
2417 		release-build assert to minimize any data loss risk by a
2418 		misapplied file operation.  */
2419 		ut_a(recv_recovery_is_on());
2420 	}
2421 
2422 	/* Let us try to perform the file operation, if sensible. Note that
2423 	mysqlbackup has at this stage already read in all space id info to the
2424 	fil0fil.cc data structures.
2425 
2426 	NOTE that our algorithm is not guaranteed to work correctly if there
2427 	were renames of tables during the backup. See mysqlbackup code for more
2428 	on the problem. */
2429 
2430 	switch (type) {
2431 	case MLOG_FILE_DELETE:
2432 		if (fil_tablespace_exists_in_mem(space_id)) {
2433 			dberr_t	err = fil_delete_tablespace(
2434 				space_id, BUF_REMOVE_FLUSH_NO_WRITE);
2435 			ut_a(err == DB_SUCCESS);
2436 		}
2437 
2438 		break;
2439 
2440 	case MLOG_FILE_RENAME:
2441 		/* In order to replay the rename, the following must hold:
2442 		* The new name is not already used.
2443 		* A tablespace is open in memory with the old name.
2444 		* The space ID for that tablepace matches this log entry.
2445 		This will prevent unintended renames during recovery. */
2446 
2447 		if (fil_get_space_id_for_table(new_name) == ULINT_UNDEFINED
2448 		    && space_id == fil_get_space_id_for_table(name)) {
2449 			/* Create the database directory for the new name, if
2450 			it does not exist yet */
2451 			fil_create_directory_for_tablename(new_name);
2452 
2453 			if (!fil_rename_tablespace(name, space_id,
2454 						   new_name, NULL)) {
2455 				ut_error;
2456 			}
2457 		}
2458 
2459 		break;
2460 
2461 	case MLOG_FILE_CREATE:
2462 	case MLOG_FILE_CREATE2:
2463 		if (fil_tablespace_exists_in_mem(space_id)) {
2464 			/* Do nothing */
2465 		} else if (fil_get_space_id_for_table(name)
2466 			   != ULINT_UNDEFINED) {
2467 			/* Do nothing */
2468 		} else if (log_flags & MLOG_FILE_FLAG_TEMP) {
2469 			/* Temporary table, do nothing */
2470 		} else {
2471 			const char*	path = NULL;
2472 
2473 			/* Create the database directory for name, if it does
2474 			not exist yet */
2475 			fil_create_directory_for_tablename(name);
2476 
2477 			if (fil_create_new_single_table_tablespace(
2478 				    space_id, name, path, flags,
2479 				    DICT_TF2_USE_TABLESPACE,
2480 				    FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) {
2481 				ut_error;
2482 			}
2483 		}
2484 
2485 		break;
2486 
2487 	default:
2488 		ut_error;
2489 	}
2490 
2491 	return(ptr);
2492 }
2493 
2494 /*******************************************************************//**
2495 Allocates a file name for the EXPORT/IMPORT config file name.  The
2496 string must be freed by caller with mem_free().
2497 @return own: file name */
2498 static
2499 char*
fil_make_cfg_name(const char * filepath)2500 fil_make_cfg_name(
2501 /*==============*/
2502 	const char*	filepath)	/*!< in: .ibd file name */
2503 {
2504 	char*	cfg_name;
2505 
2506 	/* Create a temporary file path by replacing the .ibd suffix
2507 	with .cfg. */
2508 
2509 	ut_ad(strlen(filepath) > 4);
2510 
2511 	cfg_name = mem_strdup(filepath);
2512 	ut_snprintf(cfg_name + strlen(cfg_name) - 3, 4, "cfg");
2513 	return(cfg_name);
2514 }
2515 
2516 /*******************************************************************//**
2517 Check for change buffer merges.
2518 @return 0 if no merges else count + 1. */
2519 static
2520 ulint
fil_ibuf_check_pending_ops(fil_space_t * space,ulint count)2521 fil_ibuf_check_pending_ops(
2522 /*=======================*/
2523 	fil_space_t*	space,	/*!< in/out: Tablespace to check */
2524 	ulint		count)	/*!< in: number of attempts so far */
2525 {
2526 	ut_ad(mutex_own(&fil_system->mutex));
2527 
2528 	if (space != 0 && space->n_pending_ops != 0) {
2529 
2530 		if (count > 5000) {
2531 			ib_logf(IB_LOG_LEVEL_WARN,
2532 				"Trying to close/delete tablespace "
2533 				"'%s' but there are %lu pending change "
2534 				"buffer merges on it.",
2535 				space->name,
2536 				(ulong) space->n_pending_ops);
2537 		}
2538 
2539 		return(count + 1);
2540 	}
2541 
2542 	return(0);
2543 }
2544 
2545 /*******************************************************************//**
2546 Check for pending IO.
2547 @return 0 if no pending else count + 1. */
2548 static
2549 ulint
fil_check_pending_io(fil_space_t * space,fil_node_t ** node,ulint count)2550 fil_check_pending_io(
2551 /*=================*/
2552 	fil_space_t*	space,	/*!< in/out: Tablespace to check */
2553 	fil_node_t**	node,	/*!< out: Node in space list */
2554 	ulint		count)	/*!< in: number of attempts so far */
2555 {
2556 	ut_ad(mutex_own(&fil_system->mutex));
2557 	ut_a(space->n_pending_ops == 0);
2558 
2559 	/* The following code must change when InnoDB supports
2560 	multiple datafiles per tablespace. */
2561 	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
2562 
2563 	*node = UT_LIST_GET_FIRST(space->chain);
2564 
2565 	if (space->n_pending_flushes > 0 || (*node)->n_pending > 0) {
2566 
2567 		ut_a(!(*node)->being_extended);
2568 
2569 		if (count > 1000) {
2570 			ib_logf(IB_LOG_LEVEL_WARN,
2571 				"Trying to close/delete tablespace '%s' "
2572 				"but there are %lu flushes "
2573 				" and %lu pending i/o's on it.",
2574 				space->name,
2575 				(ulong) space->n_pending_flushes,
2576 				(ulong) (*node)->n_pending);
2577 		}
2578 
2579 		return(count + 1);
2580 	}
2581 
2582 	return(0);
2583 }
2584 
2585 /*******************************************************************//**
2586 Check pending operations on a tablespace.
2587 @return DB_SUCCESS or error failure. */
2588 static
2589 dberr_t
fil_check_pending_operations(ulint id,fil_space_t ** space,char ** path)2590 fil_check_pending_operations(
2591 /*=========================*/
2592 	ulint		id,	/*!< in: space id */
2593 	fil_space_t**	space,	/*!< out: tablespace instance in memory */
2594 	char**		path)	/*!< out/own: tablespace path */
2595 {
2596 	ulint		count = 0;
2597 
2598 	ut_a(id != TRX_SYS_SPACE);
2599 	ut_ad(space);
2600 
2601 	*space = 0;
2602 
2603 	mutex_enter(&fil_system->mutex);
2604 	fil_space_t* sp = fil_space_get_by_id(id);
2605 	if (sp) {
2606 		sp->stop_new_ops = TRUE;
2607 	}
2608 	mutex_exit(&fil_system->mutex);
2609 
2610 	/* Check for pending change buffer merges. */
2611 
2612 	do {
2613 		mutex_enter(&fil_system->mutex);
2614 
2615 		sp = fil_space_get_by_id(id);
2616 
2617 		count = fil_ibuf_check_pending_ops(sp, count);
2618 
2619 		mutex_exit(&fil_system->mutex);
2620 
2621 		if (count > 0) {
2622 			os_thread_sleep(20000);
2623 		}
2624 
2625 	} while (count > 0);
2626 
2627 	/* Check for pending IO. */
2628 
2629 	*path = 0;
2630 
2631 	do {
2632 		mutex_enter(&fil_system->mutex);
2633 
2634 		sp = fil_space_get_by_id(id);
2635 
2636 		if (sp == NULL) {
2637 			mutex_exit(&fil_system->mutex);
2638 			return(DB_TABLESPACE_NOT_FOUND);
2639 		}
2640 
2641 		fil_node_t*	node;
2642 
2643 		count = fil_check_pending_io(sp, &node, count);
2644 
2645 		if (count == 0) {
2646 			*path = mem_strdup(node->name);
2647 		}
2648 
2649 		mutex_exit(&fil_system->mutex);
2650 
2651 		if (count > 0) {
2652 			os_thread_sleep(20000);
2653 		}
2654 
2655 	} while (count > 0);
2656 
2657 	ut_ad(sp);
2658 
2659 	*space = sp;
2660 	return(DB_SUCCESS);
2661 }
2662 
2663 /*******************************************************************//**
2664 Closes a single-table tablespace. The tablespace must be cached in the
2665 memory cache. Free all pages used by the tablespace.
2666 @return	DB_SUCCESS or error */
2667 UNIV_INTERN
2668 dberr_t
fil_close_tablespace(trx_t * trx,ulint id)2669 fil_close_tablespace(
2670 /*=================*/
2671 	trx_t*		trx,	/*!< in/out: Transaction covering the close */
2672 	ulint		id)	/*!< in: space id */
2673 {
2674 	char*		path = 0;
2675 	fil_space_t*	space = 0;
2676 
2677 	ut_a(id != TRX_SYS_SPACE);
2678 
2679 	dberr_t		err = fil_check_pending_operations(id, &space, &path);
2680 
2681 	if (err != DB_SUCCESS) {
2682 		return(err);
2683 	}
2684 
2685 	ut_a(space);
2686 	ut_a(path != 0);
2687 
2688 	rw_lock_x_lock(&space->latch);
2689 
2690 #ifndef UNIV_HOTBACKUP
2691 	/* Invalidate in the buffer pool all pages belonging to the
2692 	tablespace. Since we have set space->stop_new_ops = TRUE, readahead
2693 	or ibuf merge can no longer read more pages of this tablespace to the
2694 	buffer pool. Thus we can clean the tablespace out of the buffer pool
2695 	completely and permanently. The flag stop_new_ops also prevents
2696 	fil_flush() from being applied to this tablespace. */
2697 
2698 	buf_LRU_flush_or_remove_pages(id, BUF_REMOVE_FLUSH_WRITE, trx);
2699 #endif
2700 	mutex_enter(&fil_system->mutex);
2701 
2702 	/* If the free is successful, the X lock will be released before
2703 	the space memory data structure is freed. */
2704 
2705 	if (!fil_space_free(id, TRUE)) {
2706 		rw_lock_x_unlock(&space->latch);
2707 		err = DB_TABLESPACE_NOT_FOUND;
2708 	} else {
2709 		err = DB_SUCCESS;
2710 	}
2711 
2712 	mutex_exit(&fil_system->mutex);
2713 
2714 	/* If it is a delete then also delete any generated files, otherwise
2715 	when we drop the database the remove directory will fail. */
2716 
2717 	char*	cfg_name = fil_make_cfg_name(path);
2718 
2719 	os_file_delete_if_exists(innodb_file_data_key, cfg_name);
2720 
2721 	mem_free(path);
2722 	mem_free(cfg_name);
2723 
2724 	return(err);
2725 }
2726 
2727 /*******************************************************************//**
2728 Deletes a single-table tablespace. The tablespace must be cached in the
2729 memory cache.
2730 @return	DB_SUCCESS or error */
2731 UNIV_INTERN
2732 dberr_t
fil_delete_tablespace(ulint id,buf_remove_t buf_remove)2733 fil_delete_tablespace(
2734 /*==================*/
2735 	ulint		id,		/*!< in: space id */
2736 	buf_remove_t	buf_remove)	/*!< in: specify the action to take
2737 					on the tables pages in the buffer
2738 					pool */
2739 {
2740 	char*		path = 0;
2741 	fil_space_t*	space = 0;
2742 
2743 	ut_a(id != TRX_SYS_SPACE);
2744 
2745 	dberr_t		err = fil_check_pending_operations(id, &space, &path);
2746 
2747 	if (err != DB_SUCCESS) {
2748 
2749 		ib_logf(IB_LOG_LEVEL_ERROR,
2750 			"Cannot delete tablespace %lu because it is not "
2751 			"found in the tablespace memory cache.",
2752 			(ulong) id);
2753 
2754 		return(err);
2755 	}
2756 
2757 	ut_a(space);
2758 	ut_a(path != 0);
2759 
2760 	/* Important: We rely on the data dictionary mutex to ensure
2761 	that a race is not possible here. It should serialize the tablespace
2762 	drop/free. We acquire an X latch only to avoid a race condition
2763 	when accessing the tablespace instance via:
2764 
2765 	  fsp_get_available_space_in_free_extents().
2766 
2767 	There our main motivation is to reduce the contention on the
2768 	dictionary mutex. */
2769 
2770 	rw_lock_x_lock(&space->latch);
2771 
2772 #ifndef UNIV_HOTBACKUP
2773 	/* IMPORTANT: Because we have set space::stop_new_ops there
2774 	can't be any new ibuf merges, reads or flushes. We are here
2775 	because node::n_pending was zero above. However, it is still
2776 	possible to have pending read and write requests:
2777 
2778 	A read request can happen because the reader thread has
2779 	gone through the ::stop_new_ops check in buf_page_init_for_read()
2780 	before the flag was set and has not yet incremented ::n_pending
2781 	when we checked it above.
2782 
2783 	A write request can be issued any time because we don't check
2784 	the ::stop_new_ops flag when queueing a block for write.
2785 
2786 	We deal with pending write requests in the following function
2787 	where we'd minimally evict all dirty pages belonging to this
2788 	space from the flush_list. Not that if a block is IO-fixed
2789 	we'll wait for IO to complete.
2790 
2791 	To deal with potential read requests by checking the
2792 	::stop_new_ops flag in fil_io() */
2793 
2794 	buf_LRU_flush_or_remove_pages(id, buf_remove, 0);
2795 
2796 #endif /* !UNIV_HOTBACKUP */
2797 
2798 	/* If it is a delete then also delete any generated files, otherwise
2799 	when we drop the database the remove directory will fail. */
2800 	{
2801 		char*	cfg_name = fil_make_cfg_name(path);
2802 		os_file_delete_if_exists(innodb_file_data_key, cfg_name);
2803 		mem_free(cfg_name);
2804 	}
2805 
2806 	/* Delete the link file pointing to the ibd file we are deleting. */
2807 	if (FSP_FLAGS_HAS_DATA_DIR(space->flags)) {
2808 		fil_delete_link_file(space->name);
2809 	}
2810 
2811 	mutex_enter(&fil_system->mutex);
2812 
2813 	/* Double check the sanity of pending ops after reacquiring
2814 	the fil_system::mutex. */
2815 	if (fil_space_get_by_id(id)) {
2816 		ut_a(space->n_pending_ops == 0);
2817 		ut_a(UT_LIST_GET_LEN(space->chain) == 1);
2818 		fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
2819 		ut_a(node->n_pending == 0);
2820 	}
2821 
2822 	if (!fil_space_free(id, TRUE)) {
2823 		err = DB_TABLESPACE_NOT_FOUND;
2824 	}
2825 
2826 	mutex_exit(&fil_system->mutex);
2827 
2828 	if (err != DB_SUCCESS) {
2829 		rw_lock_x_unlock(&space->latch);
2830 	} else if (!os_file_delete(innodb_file_data_key, path)
2831 		   && !os_file_delete_if_exists(innodb_file_data_key, path)) {
2832 
2833 		/* Note: This is because we have removed the
2834 		tablespace instance from the cache. */
2835 
2836 		err = DB_IO_ERROR;
2837 	}
2838 
2839 	if (err == DB_SUCCESS) {
2840 #ifndef UNIV_HOTBACKUP
2841 		/* Write a log record about the deletion of the .ibd
2842 		file, so that mysqlbackup can replay it in the
2843 		--apply-log phase. We use a dummy mtr and the familiar
2844 		log write mechanism. */
2845 		mtr_t		mtr;
2846 
2847 		/* When replaying the operation in mysqlbackup, do not try
2848 		to write any log record */
2849 		mtr_start(&mtr);
2850 
2851 		fil_op_write_log(MLOG_FILE_DELETE, id, 0, 0, path, NULL, &mtr);
2852 		mtr_commit(&mtr);
2853 #endif
2854 		err = DB_SUCCESS;
2855 	}
2856 
2857 	mem_free(path);
2858 
2859 	return(err);
2860 }
2861 
2862 /*******************************************************************//**
2863 Returns TRUE if a single-table tablespace is being deleted.
2864 @return TRUE if being deleted */
2865 UNIV_INTERN
2866 ibool
fil_tablespace_is_being_deleted(ulint id)2867 fil_tablespace_is_being_deleted(
2868 /*============================*/
2869 	ulint		id)	/*!< in: space id */
2870 {
2871 	fil_space_t*	space;
2872 	ibool		is_being_deleted;
2873 
2874 	mutex_enter(&fil_system->mutex);
2875 
2876 	space = fil_space_get_by_id(id);
2877 
2878 	ut_a(space != NULL);
2879 
2880 	is_being_deleted = space->stop_new_ops;
2881 
2882 	mutex_exit(&fil_system->mutex);
2883 
2884 	return(is_being_deleted);
2885 }
2886 
2887 #ifndef UNIV_HOTBACKUP
2888 /*******************************************************************//**
2889 Discards a single-table tablespace. The tablespace must be cached in the
2890 memory cache. Discarding is like deleting a tablespace, but
2891 
2892  1. We do not drop the table from the data dictionary;
2893 
2894  2. We remove all insert buffer entries for the tablespace immediately;
2895     in DROP TABLE they are only removed gradually in the background;
2896 
2897  3. Free all the pages in use by the tablespace.
2898 @return	DB_SUCCESS or error */
2899 UNIV_INTERN
2900 dberr_t
fil_discard_tablespace(ulint id)2901 fil_discard_tablespace(
2902 /*===================*/
2903 	ulint	id)	/*!< in: space id */
2904 {
2905 	dberr_t	err;
2906 
2907 	switch (err = fil_delete_tablespace(id, BUF_REMOVE_ALL_NO_WRITE)) {
2908 	case DB_SUCCESS:
2909 		break;
2910 
2911 	case DB_IO_ERROR:
2912 		ib_logf(IB_LOG_LEVEL_WARN,
2913 			"While deleting tablespace %lu in DISCARD TABLESPACE."
2914 			" File rename/delete failed: %s",
2915 			(ulong) id, ut_strerr(err));
2916 		break;
2917 
2918 	case DB_TABLESPACE_NOT_FOUND:
2919 		ib_logf(IB_LOG_LEVEL_WARN,
2920 			"Cannot delete tablespace %lu in DISCARD "
2921 			"TABLESPACE. %s",
2922 			(ulong) id, ut_strerr(err));
2923 		break;
2924 
2925 	default:
2926 		ut_error;
2927 	}
2928 
2929 	/* Remove all insert buffer entries for the tablespace */
2930 
2931 	ibuf_delete_for_discarded_space(id);
2932 
2933 	return(err);
2934 }
2935 #endif /* !UNIV_HOTBACKUP */
2936 
2937 /*******************************************************************//**
2938 Renames the memory cache structures of a single-table tablespace.
2939 @return	TRUE if success */
2940 static
2941 ibool
fil_rename_tablespace_in_mem(fil_space_t * space,fil_node_t * node,const char * new_name,const char * new_path)2942 fil_rename_tablespace_in_mem(
2943 /*=========================*/
2944 	fil_space_t*	space,	/*!< in: tablespace memory object */
2945 	fil_node_t*	node,	/*!< in: file node of that tablespace */
2946 	const char*	new_name,	/*!< in: new name */
2947 	const char*	new_path)	/*!< in: new file path */
2948 {
2949 	fil_space_t*	space2;
2950 	const char*	old_name	= space->name;
2951 
2952 	ut_ad(mutex_own(&fil_system->mutex));
2953 
2954 	space2 = fil_space_get_by_name(old_name);
2955 	if (space != space2) {
2956 		fputs("InnoDB: Error: cannot find ", stderr);
2957 		ut_print_filename(stderr, old_name);
2958 		fputs(" in tablespace memory cache\n", stderr);
2959 
2960 		return(FALSE);
2961 	}
2962 
2963 	space2 = fil_space_get_by_name(new_name);
2964 	if (space2 != NULL) {
2965 		fputs("InnoDB: Error: ", stderr);
2966 		ut_print_filename(stderr, new_name);
2967 		fputs(" is already in tablespace memory cache\n", stderr);
2968 
2969 		return(FALSE);
2970 	}
2971 
2972 	HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash,
2973 		    ut_fold_string(space->name), space);
2974 	mem_free(space->name);
2975 	mem_free(node->name);
2976 
2977 	space->name = mem_strdup(new_name);
2978 	node->name = mem_strdup(new_path);
2979 
2980 	HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash,
2981 		    ut_fold_string(new_name), space);
2982 	return(TRUE);
2983 }
2984 
2985 /*******************************************************************//**
2986 Allocates a file name for a single-table tablespace. The string must be freed
2987 by caller with mem_free().
2988 @return	own: file name */
2989 UNIV_INTERN
2990 char*
fil_make_ibd_name(const char * name,bool is_full_path)2991 fil_make_ibd_name(
2992 /*==============*/
2993 	const char*	name,		/*!< in: table name or a dir path */
2994 	bool		is_full_path)	/*!< in: TRUE if it is a dir path */
2995 {
2996 	char*	filename;
2997 	ulint	namelen		= strlen(name);
2998 	ulint	dirlen		= strlen(fil_path_to_mysql_datadir);
2999 	ulint	pathlen		= dirlen + namelen + sizeof "/.ibd";
3000 
3001 	filename = static_cast<char*>(mem_alloc(pathlen));
3002 
3003 	if (is_full_path) {
3004 		memcpy(filename, name, namelen);
3005 		memcpy(filename + namelen, ".ibd", sizeof ".ibd");
3006 	} else {
3007 		ut_snprintf(filename, pathlen, "%s/%s.ibd",
3008 			fil_path_to_mysql_datadir, name);
3009 
3010 	}
3011 
3012 	srv_normalize_path_for_win(filename);
3013 
3014 	return(filename);
3015 }
3016 
3017 /*******************************************************************//**
3018 Allocates a file name for a tablespace ISL file (InnoDB Symbolic Link).
3019 The string must be freed by caller with mem_free().
3020 @return	own: file name */
3021 UNIV_INTERN
3022 char*
fil_make_isl_name(const char * name)3023 fil_make_isl_name(
3024 /*==============*/
3025 	const char*	name)	/*!< in: table name */
3026 {
3027 	char*	filename;
3028 	ulint	namelen		= strlen(name);
3029 	ulint	dirlen		= strlen(fil_path_to_mysql_datadir);
3030 	ulint	pathlen		= dirlen + namelen + sizeof "/.isl";
3031 
3032 	filename = static_cast<char*>(mem_alloc(pathlen));
3033 
3034 	ut_snprintf(filename, pathlen, "%s/%s.isl",
3035 		fil_path_to_mysql_datadir, name);
3036 
3037 	srv_normalize_path_for_win(filename);
3038 
3039 	return(filename);
3040 }
3041 
3042 /** Test if a tablespace file can be renamed to a new filepath by checking
3043 if that the old filepath exists and the new filepath does not exist.
3044 @param[in]	space_id	tablespace id
3045 @param[in]	old_path	old filepath
3046 @param[in]	new_path	new filepath
3047 @param[in]	is_discarded	whether the tablespace is discarded
3048 @return innodb error code */
3049 dberr_t
fil_rename_tablespace_check(ulint space_id,const char * old_path,const char * new_path,bool is_discarded)3050 fil_rename_tablespace_check(
3051 	ulint		space_id,
3052 	const char*	old_path,
3053 	const char*	new_path,
3054 	bool		is_discarded)
3055 {
3056 	ulint	exists = false;
3057 	os_file_type_t	ftype;
3058 
3059 	if (!is_discarded
3060 	    && os_file_status(old_path, &exists, &ftype)
3061 	    && !exists) {
3062 		ib_logf(IB_LOG_LEVEL_ERROR,
3063 			"Cannot rename '%s' to '%s' for space ID %lu"
3064 			" because the source file does not exist.",
3065 			old_path, new_path, space_id);
3066 
3067 		return(DB_TABLESPACE_NOT_FOUND);
3068 	}
3069 
3070 	exists = false;
3071 	if (!os_file_status(new_path, &exists, &ftype) || exists) {
3072 		ib_logf(IB_LOG_LEVEL_ERROR,
3073 			"Cannot rename '%s' to '%s' for space ID %lu"
3074 			" because the target file exists."
3075 			" Remove the target file and try again.",
3076 			old_path, new_path, space_id);
3077 
3078 		return(DB_TABLESPACE_EXISTS);
3079 	}
3080 
3081 	return(DB_SUCCESS);
3082 }
3083 
/*******************************************************************//**
Renames a single-table tablespace. The tablespace must be cached in the
tablespace memory cache.

The function loops (label "retry") until the single file node of the space
has no pending i/o, no pending flushes, is not being extended and has no
unflushed modifications; it then closes the file if it is open, renames the
in-memory structures, and finally renames the file with os_file_rename().
If the file rename fails, the in-memory rename is reverted. After more than
25000 passes through the loop the function gives up and returns FALSE.
On success, and when recv_recovery_on is not set, an MLOG_FILE_RENAME
record is written (not compiled under UNIV_HOTBACKUP).
@return	TRUE if success */
UNIV_INTERN
ibool
fil_rename_tablespace(
/*==================*/
	const char*	old_name_in,	/*!< in: old table name in the
					standard databasename/tablename
					format of InnoDB, or NULL if we
					do the rename based on the space
					id only */
	ulint		id,		/*!< in: space id */
	const char*	new_name,	/*!< in: new table name in the
					standard databasename/tablename
					format of InnoDB */
	const char*	new_path_in)	/*!< in: new full datafile path
					if the tablespace is remotely
					located, or NULL if it is located
					in the normal data directory. */
{
	ibool		success;
	fil_space_t*	space;
	fil_node_t*	node;
	ulint		count		= 0;	/* number of passes through
						the retry loop */
	char*		new_path;
	char*		old_name;
	char*		old_path;
	const char*	not_given	= "(name not specified)";

	/* Space id 0 is rejected outright. */
	ut_a(id != 0);

retry:
	count++;

	/* Print a progress warning on every 1000th pass. */
	if (!(count % 1000)) {
		ut_print_timestamp(stderr);
		fputs("  InnoDB: Warning: problems renaming ", stderr);
		ut_print_filename(stderr,
				  old_name_in ? old_name_in : not_given);
		fputs(" to ", stderr);
		ut_print_filename(stderr, new_name);
		fprintf(stderr, ", %lu iterations\n", (ulong) count);
	}

	mutex_enter(&fil_system->mutex);

	space = fil_space_get_by_id(id);

	/* Debug-only failure injection: pretend the space was not found. */
	DBUG_EXECUTE_IF("fil_rename_tablespace_failure_1", space = NULL; );

	if (space == NULL) {
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Cannot find space id %lu in the tablespace "
			"memory cache, though the table '%s' in a "
			"rename operation should have that id.",
			(ulong) id, old_name_in ? old_name_in : not_given);
		mutex_exit(&fil_system->mutex);

		return(FALSE);
	}

	/* Give up after too many passes; undo the stop_ios flag that an
	earlier pass set. */
	if (count > 25000) {
		space->stop_ios = FALSE;
		mutex_exit(&fil_system->mutex);

		return(FALSE);
	}

	/* We temporarily close the .ibd file because we do not trust that
	operating systems can rename an open file. For the closing we have to
	wait until there are no pending i/o's or flushes on the file. */

	/* NOTE(review): stop_ios presumably keeps other threads from
	starting new i/o on this space while we wait; its consumers are
	not visible in this function. */
	space->stop_ios = TRUE;

	/* The following code must change when InnoDB supports
	multiple datafiles per tablespace. */
	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
	node = UT_LIST_GET_FIRST(space->chain);

	if (node->n_pending > 0
	    || node->n_pending_flushes > 0
	    || node->being_extended) {
		/* There are pending i/o's or flushes or the file is
		currently being extended, sleep for a while and
		retry */

		mutex_exit(&fil_system->mutex);

		os_thread_sleep(20000);

		goto retry;

	} else if (node->modification_counter > node->flush_counter) {
		/* Flush the space */

		/* The mutex is released before sleeping and flushing;
		the state is re-examined from scratch on the next pass. */
		mutex_exit(&fil_system->mutex);

		os_thread_sleep(20000);

		fil_flush(id);

		goto retry;

	} else if (node->open) {
		/* Close the file */

		fil_node_close_file(node, fil_system);
	}

	/* Check that the old name in the space is right */

	/* Private copies of the old name and path are taken here because
	the in-memory rename below frees space->name and node->name; the
	copies are needed for a possible revert and for the log record. */
	if (old_name_in) {
		old_name = mem_strdup(old_name_in);
		ut_a(strcmp(space->name, old_name) == 0);
	} else {
		old_name = mem_strdup(space->name);
	}
	old_path = mem_strdup(node->name);

	/* Rename the tablespace and the node in the memory cache */
	new_path = new_path_in ? mem_strdup(new_path_in)
		: fil_make_ibd_name(new_name, false);

	success = fil_rename_tablespace_in_mem(
		space, node, new_name, new_path);

	if (success) {

		/* Debug-only failure injection: skip the file rename and
		force the failure path below. */
		DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2",
			goto skip_second_rename; );

		success = os_file_rename(
			innodb_file_data_key, old_path, new_path);

		DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2",
skip_second_rename:
			success = FALSE; );

		if (!success) {
			/* We have to revert the changes we made
			to the tablespace memory cache */

			ut_a(fil_rename_tablespace_in_mem(
					space, node, old_name, old_path));
		}
	}

	space->stop_ios = FALSE;

	mutex_exit(&fil_system->mutex);

#ifndef UNIV_HOTBACKUP
	/* Log the rename in its own mini-transaction, unless
	recv_recovery_on is set. */
	if (success && !recv_recovery_on) {
		mtr_t		mtr;

		mtr_start(&mtr);

		fil_op_write_log(MLOG_FILE_RENAME, id, 0, 0, old_name, new_name,
				 &mtr);
		mtr_commit(&mtr);
	}
#endif /* !UNIV_HOTBACKUP */

	/* The cache holds its own copies of the names, so the local
	strings are released on every path that reaches here. */
	mem_free(new_path);
	mem_free(old_path);
	mem_free(old_name);

	return(success);
}
3255 
3256 /*******************************************************************//**
3257 Creates a new InnoDB Symbolic Link (ISL) file.  It is always created
3258 under the 'datadir' of MySQL. The datadir is the directory of a
3259 running mysqld program. We can refer to it by simply using the path '.'.
3260 @return	DB_SUCCESS or error code */
3261 UNIV_INTERN
3262 dberr_t
fil_create_link_file(const char * tablename,const char * filepath)3263 fil_create_link_file(
3264 /*=================*/
3265 	const char*	tablename,	/*!< in: tablename */
3266 	const char*	filepath)	/*!< in: pathname of tablespace */
3267 {
3268 	dberr_t		err = DB_SUCCESS;
3269 	char*		link_filepath;
3270 	char*		prev_filepath = fil_read_link_file(tablename);
3271 
3272 	ut_ad(!srv_read_only_mode);
3273 
3274 	if (prev_filepath) {
3275 		/* Truncate will call this with an existing
3276 		link file which contains the same filepath. */
3277 		if (0 == strcmp(prev_filepath, filepath)) {
3278 			mem_free(prev_filepath);
3279 			return(DB_SUCCESS);
3280 		}
3281 		mem_free(prev_filepath);
3282 	}
3283 
3284 	link_filepath = fil_make_isl_name(tablename);
3285 
3286 	/** Check if the file already exists. */
3287 	FILE*                   file = NULL;
3288 	ibool                   exists;
3289 	os_file_type_t          ftype;
3290 
3291 	bool success = os_file_status(link_filepath, &exists, &ftype);
3292 
3293 	ulint error = 0;
3294 	if (success && !exists) {
3295 		file = fopen(link_filepath, "w");
3296 		if (file == NULL) {
3297 			/* This call will print its own error message */
3298 			error = os_file_get_last_error(true);
3299 		}
3300 	} else {
3301 		error = OS_FILE_ALREADY_EXISTS;
3302 	}
3303 	if (error != 0) {
3304 
3305 		ut_print_timestamp(stderr);
3306 		fputs("  InnoDB: Cannot create file ", stderr);
3307 		ut_print_filename(stderr, link_filepath);
3308 		fputs(".\n", stderr);
3309 
3310 		if (error == OS_FILE_ALREADY_EXISTS) {
3311 			fputs("InnoDB: The link file: ", stderr);
3312 			ut_print_filename(stderr, filepath);
3313 			fputs(" already exists.\n", stderr);
3314 			err = DB_TABLESPACE_EXISTS;
3315 
3316 		} else if (error == OS_FILE_DISK_FULL) {
3317 			err = DB_OUT_OF_FILE_SPACE;
3318 
3319 		} else {
3320 			err = DB_ERROR;
3321 		}
3322 
3323 		/* file is not open, no need to close it. */
3324 		mem_free(link_filepath);
3325 		return(err);
3326 	}
3327 
3328 	ulint rbytes = fwrite(filepath, 1, strlen(filepath), file);
3329 	if (rbytes != strlen(filepath)) {
3330 		os_file_get_last_error(true);
3331 		ib_logf(IB_LOG_LEVEL_ERROR,
3332 			"cannot write link file "
3333 			 "%s",filepath);
3334 		err = DB_ERROR;
3335 	}
3336 
3337 	/* Close the file, we only need it at startup */
3338 	fclose(file);
3339 
3340 	mem_free(link_filepath);
3341 
3342 	return(err);
3343 }
3344 
3345 /*******************************************************************//**
3346 Deletes an InnoDB Symbolic Link (ISL) file. */
3347 UNIV_INTERN
3348 void
fil_delete_link_file(const char * tablename)3349 fil_delete_link_file(
3350 /*=================*/
3351 	const char*	tablename)	/*!< in: name of table */
3352 {
3353 	char* link_filepath = fil_make_isl_name(tablename);
3354 
3355 	os_file_delete_if_exists(innodb_file_data_key, link_filepath);
3356 
3357 	mem_free(link_filepath);
3358 }
3359 
3360 /*******************************************************************//**
3361 Reads an InnoDB Symbolic Link (ISL) file.
3362 It is always created under the 'datadir' of MySQL.  The name is of the
3363 form {databasename}/{tablename}. and the isl file is expected to be in a
3364 '{databasename}' directory called '{tablename}.isl'. The caller must free
3365 the memory of the null-terminated path returned if it is not null.
3366 @return	own: filepath found in link file, NULL if not found. */
3367 UNIV_INTERN
3368 char*
fil_read_link_file(const char * name)3369 fil_read_link_file(
3370 /*===============*/
3371 	const char*	name)		/*!< in: tablespace name */
3372 {
3373 	char*		filepath = NULL;
3374 	char*		link_filepath;
3375 	FILE*		file = NULL;
3376 
3377 	/* The .isl file is in the 'normal' tablespace location. */
3378 	link_filepath = fil_make_isl_name(name);
3379 
3380 	file = fopen(link_filepath, "r+b");
3381 
3382 	mem_free(link_filepath);
3383 
3384 	if (file) {
3385 		filepath = static_cast<char*>(mem_alloc(OS_FILE_MAX_PATH));
3386 
3387 		os_file_read_string(file, filepath, OS_FILE_MAX_PATH);
3388 		fclose(file);
3389 
3390 		if (strlen(filepath)) {
3391 			/* Trim whitespace from end of filepath */
3392 			ulint lastch = strlen(filepath) - 1;
3393 			while (lastch > 4 && filepath[lastch] <= 0x20) {
3394 				filepath[lastch--] = 0x00;
3395 			}
3396 			srv_normalize_path_for_win(filepath);
3397 		}
3398 	}
3399 
3400 	return(filepath);
3401 }
3402 
3403 /*******************************************************************//**
3404 Opens a handle to the file linked to in an InnoDB Symbolic Link file.
3405 @return	TRUE if remote linked tablespace file is found and opened. */
3406 UNIV_INTERN
3407 ibool
fil_open_linked_file(const char * tablename,char ** remote_filepath,pfs_os_file_t * remote_file)3408 fil_open_linked_file(
3409 /*===============*/
3410 	const char*	tablename,	/*!< in: database/tablename */
3411 	char**		remote_filepath,/*!< out: remote filepath */
3412 	pfs_os_file_t*	remote_file)	/*!< out: remote file handle */
3413 
3414 {
3415 	ibool		success;
3416 
3417 	*remote_filepath = fil_read_link_file(tablename);
3418 	if (*remote_filepath == NULL) {
3419 		return(FALSE);
3420 	}
3421 
3422 	/* The filepath provided is different from what was
3423 	found in the link file. */
3424 	*remote_file = os_file_create_simple_no_error_handling(
3425 		innodb_file_data_key, *remote_filepath,
3426 		OS_FILE_OPEN, OS_FILE_READ_ONLY,
3427 		&success);
3428 
3429 	if (!success) {
3430 		char*	link_filepath = fil_make_isl_name(tablename);
3431 
3432 		/* The following call prints an error message */
3433 		os_file_get_last_error(true);
3434 
3435 		ib_logf(IB_LOG_LEVEL_ERROR,
3436 			"A link file was found named '%s' "
3437 			"but the linked tablespace '%s' "
3438 			"could not be opened.",
3439 			link_filepath, *remote_filepath);
3440 
3441 		mem_free(link_filepath);
3442 		mem_free(*remote_filepath);
3443 		*remote_filepath = NULL;
3444 	}
3445 
3446 	return(success);
3447 }
3448 
/*******************************************************************//**
Creates a new single-table tablespace to a database directory of MySQL.
Database directories are under the 'datadir' of MySQL. The datadir is the
directory of a running mysqld program. We can refer to it by simply the
path '.'. Tables created with CREATE TEMPORARY TABLE we place in the temp
dir of the mysqld server.

Steps, in order: build the file path; create the file; extend it to 'size'
pages; write and flush the first page containing the space id and flags;
create the ISL link file when the flags carry DATA DIRECTORY; register the
space and its file node in the tablespace cache; write a MLOG_FILE_CREATE
or MLOG_FILE_CREATE2 log record (not compiled under UNIV_HOTBACKUP).
On any failure after the file was created, the file is closed and deleted.

@return	DB_SUCCESS or error code */
UNIV_INTERN
dberr_t
fil_create_new_single_table_tablespace(
/*===================================*/
	ulint		space_id,	/*!< in: space id */
	const char*	tablename,	/*!< in: the table name in the usual
					databasename/tablename format
					of InnoDB */
	const char*	dir_path,	/*!< in: NULL or a dir path */
	ulint		flags,		/*!< in: tablespace flags */
	ulint		flags2,		/*!< in: table flags2 */
	ulint		size)		/*!< in: the initial size of the
					tablespace file in pages,
					must be >= FIL_IBD_FILE_INITIAL_SIZE */
{
	pfs_os_file_t	file;

	ibool		ret;
	dberr_t		err;
	byte*		buf2;	/* unaligned allocation backing 'page' */
	byte*		page;	/* page-size aligned pointer into buf2 */
	char*		path;	/* own: path of the .ibd file */
	ibool		success;
	/* TRUE if a table is created with CREATE TEMPORARY TABLE */
	bool		is_temp = !!(flags2 & DICT_TF2_TEMPORARY);
	bool		has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags);

	ut_a(space_id > 0);
	ut_ad(!srv_read_only_mode);
	ut_a(space_id < SRV_LOG_SPACE_FIRST_ID);
	ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE);
	ut_a(fsp_flags_is_valid(flags));

	/* Choose the file path: temporary tables use dir_path as the
	full path prefix, DATA DIRECTORY tables go under dir_path, all
	others go under the datadir. */
	if (is_temp) {
		/* Temporary table filepath */
		ut_ad(dir_path);
		path = fil_make_ibd_name(dir_path, true);
	} else if (has_data_dir) {
		ut_ad(dir_path);
		path = os_file_make_remote_pathname(dir_path, tablename, "ibd");

		/* Since this tablespace file will be created in a
		remote directory, let's create the subdirectories
		in the path, if they are not there already. */
		success = os_file_create_subdirs_if_needed(path);
		if (!success) {
			err = DB_ERROR;
			goto error_exit_3;
		}
	} else {
		path = fil_make_ibd_name(tablename, false);
	}

	file = os_file_create(
		innodb_file_data_key, path,
		OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT,
		OS_FILE_NORMAL,
		OS_DATA_FILE,
		&ret);

	if (ret == FALSE) {
		/* The following call will print an error message */
		ulint	error = os_file_get_last_error(true);

		ib_logf(IB_LOG_LEVEL_ERROR,
			"Cannot create file '%s'\n", path);

		if (error == OS_FILE_ALREADY_EXISTS) {
			ib_logf(IB_LOG_LEVEL_ERROR,
				"The file '%s' already exists though the "
				"corresponding table did not exist "
				"in the InnoDB data dictionary. "
				"Have you moved InnoDB .ibd files "
				"around without using the SQL commands "
				"DISCARD TABLESPACE and IMPORT TABLESPACE, "
				"or did mysqld crash in the middle of "
				"CREATE TABLE? "
				"You can resolve the problem by removing "
				"the file '%s' under the 'datadir' of MySQL.",
				path, path);

			err = DB_TABLESPACE_EXISTS;
			goto error_exit_3;
		}

		if (error == OS_FILE_DISK_FULL) {
			err = DB_OUT_OF_FILE_SPACE;
			goto error_exit_3;
		}

		err = DB_ERROR;
		goto error_exit_3;
	}

	/* From here on the file exists and is open, so failures leave
	through error_exit_2 (or error_exit_1), which close and delete it. */
	ret = os_file_set_size(path, file, size * UNIV_PAGE_SIZE);

	if (!ret) {
		err = DB_OUT_OF_FILE_SPACE;
		goto error_exit_2;
	}

	/* printf("Creating tablespace %s id %lu\n", path, space_id); */

	/* We have to write the space id to the file immediately and flush the
	file to disk. This is because in crash recovery we must be aware what
	tablespaces exist and what are their space id's, so that we can apply
	the log records to the right file. It may take quite a while until
	buffer pool flush algorithms write anything to the file and flush it to
	disk. If we would not write here anything, the file would be filled
	with zeros from the call of os_file_set_size(), until a buffer pool
	flush would write to it. */

	/* Three pages are allocated so that, after aligning, there is
	room for 'page' and for the compressed frame at
	page + UNIV_PAGE_SIZE used below. */
	buf2 = static_cast<byte*>(ut_malloc(3 * UNIV_PAGE_SIZE));
	/* Align the memory for file i/o if we might have O_DIRECT set */
	page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));

	memset(page, '\0', UNIV_PAGE_SIZE);

	/* Add the UNIV_PAGE_SIZE to the table flags and write them to the
	tablespace header. */
	flags = fsp_flags_set_page_size(flags, UNIV_PAGE_SIZE);
	fsp_header_init_fields(page, space_id, flags);
	mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id);

	if (!(fsp_flags_is_compressed(flags))) {
		/* Uncompressed: write the page itself at offset 0. */
		buf_flush_init_for_writing(page, NULL, 0);
		ret = os_file_write(path, file, page, 0, UNIV_PAGE_SIZE);
	} else {
		/* Compressed: prepare a page_zip descriptor whose data
		lives right after 'page', and write zip_size bytes of it. */
		page_zip_des_t	page_zip;
		ulint		zip_size;

		zip_size = fsp_flags_get_zip_size(flags);

		page_zip_set_size(&page_zip, zip_size);
		page_zip.data = page + UNIV_PAGE_SIZE;
#ifdef UNIV_DEBUG
		page_zip.m_start =
#endif /* UNIV_DEBUG */
			page_zip.m_end = page_zip.m_nonempty =
			page_zip.n_blobs = 0;
		buf_flush_init_for_writing(page, &page_zip, 0);
		ret = os_file_write(path, file, page_zip.data, 0, zip_size);
	}

	/* The buffer is released before the write result is examined,
	so it is freed on both the success and the failure path. */
	ut_free(buf2);

	if (!ret) {
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Could not write the first page to tablespace "
			"'%s'", path);

		err = DB_ERROR;
		goto error_exit_2;
	}

	ret = os_file_flush(file);

	if (!ret) {
		ib_logf(IB_LOG_LEVEL_ERROR,
			"File flush of tablespace '%s' failed", path);
		err = DB_ERROR;
		goto error_exit_2;
	}

	if (has_data_dir) {
		/* Now that the IBD file is created, make the ISL file. */
		err = fil_create_link_file(tablename, path);
		if (err != DB_SUCCESS) {
			/* Leaves via error_exit_2, so the link file
			clean-up under error_exit_1 is not run here. */
			goto error_exit_2;
		}
	}

	/* Register the space and its single file node in the cache. */
	success = fil_space_create(tablename, space_id, flags, FIL_TABLESPACE);
	if (!success || !fil_node_create(path, size, space_id, FALSE)) {
		err = DB_ERROR;
		goto error_exit_1;
	}

#ifndef UNIV_HOTBACKUP
	{
		mtr_t		mtr;
		ulint		mlog_file_flag = 0;

		if (is_temp) {
			mlog_file_flag |= MLOG_FILE_FLAG_TEMP;
		}

		mtr_start(&mtr);

		/* Non-zero flags select the CREATE2 record type. */
		fil_op_write_log(flags
				 ? MLOG_FILE_CREATE2
				 : MLOG_FILE_CREATE,
				 space_id, mlog_file_flag, flags,
				 tablename, NULL, &mtr);

		mtr_commit(&mtr);
	}
#endif
	err = DB_SUCCESS;

	/* Error code is set.  Cleanup the various variables used.
	These labels reflect the order in which variables are assigned or
	actions are done. The success path falls through all of them:
	the file handle is always closed and 'path' always freed, while
	the deletions only happen when err != DB_SUCCESS. */
error_exit_1:
	/* Remove the link file created above, on failure only. */
	if (has_data_dir && err != DB_SUCCESS) {
		fil_delete_link_file(tablename);
	}
error_exit_2:
	/* Close the data file; on failure also delete it. */
	os_file_close(file);
	if (err != DB_SUCCESS) {
		os_file_delete(innodb_file_data_key, path);
	}
error_exit_3:
	/* Only the path string needs releasing. */
	mem_free(path);

	return(err);
}
3674 
3675 #ifndef UNIV_HOTBACKUP
3676 /********************************************************************//**
3677 Report information about a bad tablespace. */
3678 static
3679 void
fil_report_bad_tablespace(const char * filepath,const char * check_msg,ulint found_id,ulint found_flags,ulint expected_id,ulint expected_flags)3680 fil_report_bad_tablespace(
3681 /*======================*/
3682 	const char*	filepath,	/*!< in: filepath */
3683 	const char*	check_msg,	/*!< in: fil_check_first_page() */
3684 	ulint		found_id,	/*!< in: found space ID */
3685 	ulint		found_flags,	/*!< in: found flags */
3686 	ulint		expected_id,	/*!< in: expected space id */
3687 	ulint		expected_flags)	/*!< in: expected flags */
3688 {
3689 	if (check_msg) {
3690 		ib_logf(IB_LOG_LEVEL_ERROR,
3691 			"Error %s in file '%s',"
3692 			"tablespace id=%lu, flags=%lu. "
3693 			"Please refer to "
3694 			REFMAN "innodb-troubleshooting-datadict.html "
3695 			"for how to resolve the issue.",
3696 			check_msg, filepath,
3697 			(ulong) expected_id, (ulong) expected_flags);
3698 		return;
3699 	}
3700 
3701 	ib_logf(IB_LOG_LEVEL_ERROR,
3702 		"In file '%s', tablespace id and flags are %lu and %lu, "
3703 		"but in the InnoDB data dictionary they are %lu and %lu. "
3704 		"Have you moved InnoDB .ibd files around without using the "
3705 		"commands DISCARD TABLESPACE and IMPORT TABLESPACE? "
3706 		"Please refer to "
3707 		REFMAN "innodb-troubleshooting-datadict.html "
3708 		"for how to resolve the issue.",
3709 		filepath, (ulong) found_id, (ulong) found_flags,
3710 		(ulong) expected_id, (ulong) expected_flags);
3711 }
3712 
3713 /********************************************************************//**
3714 Tries to open a single-table tablespace and optionally checks that the
3715 space id in it is correct. If this does not succeed, print an error message
3716 to the .err log. This function is used to open a tablespace when we start
3717 mysqld after the dictionary has been booted, and also in IMPORT TABLESPACE.
3718 
3719 NOTE that we assume this operation is used either at the database startup
3720 or under the protection of the dictionary mutex, so that two users cannot
3721 race here. This operation does not leave the file associated with the
3722 tablespace open, but closes it after we have looked at the space id in it.
3723 
3724 If the validate boolean is set, we read the first page of the file and
3725 check that the space id in the file is what we expect. We assume that
3726 this function runs much faster if no check is made, since accessing the
3727 file inode probably is much faster (the OS caches them) than accessing
3728 the first page of the file.  This boolean may be initially FALSE, but if
3729 a remote tablespace is found it will be changed to true.
3730 
3731 If the fix_dict boolean is set, then it is safe to use an internal SQL
3732 statement to update the dictionary tables if they are incorrect.
3733 
3734 @return	DB_SUCCESS or error code */
3735 UNIV_INTERN
3736 dberr_t
fil_open_single_table_tablespace(bool validate,bool fix_dict,ulint id,ulint flags,const char * tablename,const char * path_in)3737 fil_open_single_table_tablespace(
3738 /*=============================*/
3739 	bool		validate,	/*!< in: Do we validate tablespace? */
3740 	bool		fix_dict,	/*!< in: Can we fix the dictionary? */
3741 	ulint		id,		/*!< in: space id */
3742 	ulint		flags,		/*!< in: tablespace flags */
3743 	const char*	tablename,	/*!< in: table name in the
3744 					databasename/tablename format */
3745 	const char*	path_in)	/*!< in: tablespace filepath */
3746 {
3747 	dberr_t		err = DB_SUCCESS;
3748 	bool		dict_filepath_same_as_default = false;
3749 	bool		link_file_found = false;
3750 	bool		link_file_is_bad = false;
3751 	fsp_open_info	def;
3752 	fsp_open_info	dict;
3753 	fsp_open_info	remote;
3754 	ulint		tablespaces_found = 0;
3755 	ulint		valid_tablespaces_found = 0;
3756 
3757 #ifdef UNIV_SYNC_DEBUG
3758 	ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
3759 #endif /* UNIV_SYNC_DEBUG */
3760 	ut_ad(!fix_dict || mutex_own(&(dict_sys->mutex)));
3761 
3762 	if (!fsp_flags_is_valid(flags)) {
3763 		return(DB_CORRUPTION);
3764 	}
3765 
3766 	/* If the tablespace was relocated, we do not
3767 	compare the DATA_DIR flag */
3768 	ulint mod_flags = flags & ~FSP_FLAGS_MASK_DATA_DIR;
3769 
3770 	memset(&def, 0, sizeof(def));
3771 	memset(&dict, 0, sizeof(dict));
3772 	memset(&remote, 0, sizeof(remote));
3773 
3774 	/* Discover the correct filepath.  We will always look for an ibd
3775 	in the default location. If it is remote, it should not be here. */
3776 	def.filepath = fil_make_ibd_name(tablename, false);
3777 
3778 	/* The path_in was read from SYS_DATAFILES. */
3779 	if (path_in) {
3780 		if (strcmp(def.filepath, path_in)) {
3781 			dict.filepath = mem_strdup(path_in);
3782 			/* possibility of multiple files. */
3783 			validate = true;
3784 		} else {
3785 			dict_filepath_same_as_default = true;
3786 		}
3787 	}
3788 
3789 	link_file_found = fil_open_linked_file(
3790 		tablename, &remote.filepath, &remote.file);
3791 	remote.success = link_file_found;
3792 	if (remote.success) {
3793 		/* possibility of multiple files. */
3794 		validate = true;
3795 		tablespaces_found++;
3796 
3797 		/* A link file was found. MySQL does not allow a DATA
3798 		DIRECTORY to be be the same as the default filepath. */
3799 		ut_a(strcmp(def.filepath, remote.filepath));
3800 
3801 		/* If there was a filepath found in SYS_DATAFILES,
3802 		we hope it was the same as this remote.filepath found
3803 		in the ISL file. */
3804 		if (dict.filepath
3805 		    && (0 == strcmp(dict.filepath, remote.filepath))) {
3806 			remote.success = FALSE;
3807 			os_file_close(remote.file);
3808 			mem_free(remote.filepath);
3809 			remote.filepath = NULL;
3810 			tablespaces_found--;
3811 		}
3812 	}
3813 
3814 	/* Attempt to open the tablespace at other possible filepaths. */
3815 	if (dict.filepath) {
3816 		dict.file = os_file_create_simple_no_error_handling(
3817 			innodb_file_data_key, dict.filepath, OS_FILE_OPEN,
3818 			OS_FILE_READ_ONLY, &dict.success);
3819 		if (dict.success) {
3820 			/* possibility of multiple files. */
3821 			validate = true;
3822 			tablespaces_found++;
3823 		}
3824 	}
3825 
3826 	/* Always look for a file at the default location. */
3827 	ut_a(def.filepath);
3828 	def.file = os_file_create_simple_no_error_handling(
3829 		innodb_file_data_key, def.filepath, OS_FILE_OPEN,
3830 		OS_FILE_READ_ONLY, &def.success);
3831 	if (def.success) {
3832 		tablespaces_found++;
3833 	}
3834 
3835 	/*  We have now checked all possible tablespace locations and
3836 	have a count of how many we found.  If things are normal, we
3837 	only found 1. */
3838 	if (!validate && tablespaces_found == 1) {
3839 		goto skip_validate;
3840 	}
3841 
3842 	/* Read the first page of the datadir tablespace, if found. */
3843 	if (def.success) {
3844 		def.check_msg = fil_read_first_page(
3845 			def.file, FALSE, &def.flags, &def.id,
3846 			&def.lsn, &def.lsn);
3847 		def.valid = !def.check_msg;
3848 
3849 		/* Validate this single-table-tablespace with SYS_TABLES,
3850 		but do not compare the DATA_DIR flag, in case the
3851 		tablespace was relocated. */
3852 		if (def.valid && def.id == id
3853 		    && (def.flags & ~FSP_FLAGS_MASK_DATA_DIR) == mod_flags) {
3854 			valid_tablespaces_found++;
3855 		} else {
3856 			def.valid = false;
3857 			/* Do not use this tablespace. */
3858 			fil_report_bad_tablespace(
3859 				def.filepath, def.check_msg, def.id,
3860 				def.flags, id, flags);
3861 		}
3862 	}
3863 
3864 	/* Read the first page of the remote tablespace */
3865 	if (remote.success) {
3866 		remote.check_msg = fil_read_first_page(
3867 			remote.file, FALSE, &remote.flags, &remote.id,
3868 			&remote.lsn, &remote.lsn);
3869 		remote.valid = !remote.check_msg;
3870 
3871 		/* Validate this single-table-tablespace with SYS_TABLES,
3872 		but do not compare the DATA_DIR flag, in case the
3873 		tablespace was relocated. */
3874 		if (remote.valid && remote.id == id
3875 		    && (remote.flags & ~FSP_FLAGS_MASK_DATA_DIR) == mod_flags) {
3876 			valid_tablespaces_found++;
3877 		} else {
3878 			remote.valid = false;
3879 			/* Do not use this linked tablespace. */
3880 			fil_report_bad_tablespace(
3881 				remote.filepath, remote.check_msg, remote.id,
3882 				remote.flags, id, flags);
3883 			link_file_is_bad = true;
3884 		}
3885 	}
3886 
3887 	/* Read the first page of the datadir tablespace, if found. */
3888 	if (dict.success) {
3889 		dict.check_msg = fil_read_first_page(
3890 			dict.file, FALSE, &dict.flags, &dict.id,
3891 			&dict.lsn, &dict.lsn);
3892 		dict.valid = !dict.check_msg;
3893 
3894 		/* Validate this single-table-tablespace with SYS_TABLES,
3895 		but do not compare the DATA_DIR flag, in case the
3896 		tablespace was relocated. */
3897 		if (dict.valid && dict.id == id
3898 		    && (dict.flags & ~FSP_FLAGS_MASK_DATA_DIR) == mod_flags) {
3899 			valid_tablespaces_found++;
3900 		} else {
3901 			dict.valid = false;
3902 			/* Do not use this tablespace. */
3903 			fil_report_bad_tablespace(
3904 				dict.filepath, dict.check_msg, dict.id,
3905 				dict.flags, id, flags);
3906 		}
3907 	}
3908 
3909 	/* Make sense of these three possible locations.
3910 	First, bail out if no tablespace files were found. */
3911 	if (valid_tablespaces_found == 0) {
3912 		/* The following call prints an error message */
3913 		os_file_get_last_error(true);
3914 
3915 		ib_logf(IB_LOG_LEVEL_ERROR,
3916 			"Could not find a valid tablespace file for '%s'. "
3917 			"See " REFMAN "innodb-troubleshooting-datadict.html "
3918 			"for how to resolve the issue.",
3919 			tablename);
3920 
3921 		err = DB_CORRUPTION;
3922 
3923 		goto cleanup_and_exit;
3924 	}
3925 
3926 	/* Do not open any tablespaces if more than one tablespace with
3927 	the correct space ID and flags were found. */
3928 	if (tablespaces_found > 1) {
3929 		ib_logf(IB_LOG_LEVEL_ERROR,
3930 			"A tablespace for %s has been found in "
3931 			"multiple places;", tablename);
3932 		if (def.success) {
3933 			ib_logf(IB_LOG_LEVEL_ERROR,
3934 				"Default location; %s, LSN=" LSN_PF
3935 				", Space ID=%lu, Flags=%lu",
3936 				def.filepath, def.lsn,
3937 				(ulong) def.id, (ulong) def.flags);
3938 		}
3939 		if (remote.success) {
3940 			ib_logf(IB_LOG_LEVEL_ERROR,
3941 				"Remote location; %s, LSN=" LSN_PF
3942 				", Space ID=%lu, Flags=%lu",
3943 				remote.filepath, remote.lsn,
3944 				(ulong) remote.id, (ulong) remote.flags);
3945 		}
3946 		if (dict.success) {
3947 			ib_logf(IB_LOG_LEVEL_ERROR,
3948 				"Dictionary location; %s, LSN=" LSN_PF
3949 				", Space ID=%lu, Flags=%lu",
3950 				dict.filepath, dict.lsn,
3951 				(ulong) dict.id, (ulong) dict.flags);
3952 		}
3953 
3954 		/* Force-recovery will allow some tablespaces to be
3955 		skipped by REDO if there was more than one file found.
3956 		Unlike during the REDO phase of recovery, we now know
3957 		if the tablespace is valid according to the dictionary,
3958 		which was not available then. So if we did not force
3959 		recovery and there is only one good tablespace, ignore
3960 		any bad tablespaces. */
3961 		if (valid_tablespaces_found > 1 || srv_force_recovery > 0) {
3962 			ib_logf(IB_LOG_LEVEL_ERROR,
3963 				"Will not open the tablespace for '%s'",
3964 				tablename);
3965 
3966 			if (def.success != def.valid
3967 			    || dict.success != dict.valid
3968 			    || remote.success != remote.valid) {
3969 				err = DB_CORRUPTION;
3970 			} else {
3971 				err = DB_ERROR;
3972 			}
3973 			goto cleanup_and_exit;
3974 		}
3975 
3976 		/* There is only one valid tablespace found and we did
3977 		not use srv_force_recovery during REDO.  Use this one
3978 		tablespace and clean up invalid tablespace pointers */
3979 		if (def.success && !def.valid) {
3980 			def.success = false;
3981 			os_file_close(def.file);
3982 			tablespaces_found--;
3983 		}
3984 		if (dict.success && !dict.valid) {
3985 			dict.success = false;
3986 			os_file_close(dict.file);
3987 			/* Leave dict.filepath so that SYS_DATAFILES
3988 			can be corrected below. */
3989 			tablespaces_found--;
3990 		}
3991 		if (remote.success && !remote.valid) {
3992 			remote.success = false;
3993 			os_file_close(remote.file);
3994 			mem_free(remote.filepath);
3995 			remote.filepath = NULL;
3996 			tablespaces_found--;
3997 		}
3998 	}
3999 
4000 	/* At this point, there should be only one filepath. */
4001 	ut_a(tablespaces_found == 1);
4002 	ut_a(valid_tablespaces_found == 1);
4003 
4004 	/* Only fix the dictionary at startup when there is only one thread.
4005 	Calls to dict_load_table() can be done while holding other latches. */
4006 	if (!fix_dict) {
4007 		goto skip_validate;
4008 	}
4009 
4010 	/* We may need to change what is stored in SYS_DATAFILES or
4011 	SYS_TABLESPACES or adjust the link file.
4012 	Since a failure to update SYS_TABLESPACES or SYS_DATAFILES does
4013 	not prevent opening and using the single_table_tablespace either
4014 	this time or the next, we do not check the return code or fail
4015 	to open the tablespace. But dict_update_filepath() will issue a
4016 	warning to the log. */
4017 	if (dict.filepath) {
4018 		if (remote.success) {
4019 			dict_update_filepath(id, remote.filepath);
4020 		} else if (def.success) {
4021 			dict_update_filepath(id, def.filepath);
4022 			if (link_file_is_bad) {
4023 				fil_delete_link_file(tablename);
4024 			}
4025 		} else if (!link_file_found || link_file_is_bad) {
4026 			ut_ad(dict.success);
4027 			/* Fix the link file if we got our filepath
4028 			from the dictionary but a link file did not
4029 			exist or it did not point to a valid file. */
4030 			fil_delete_link_file(tablename);
4031 			fil_create_link_file(tablename, dict.filepath);
4032 		}
4033 
4034 	} else if (remote.success && dict_filepath_same_as_default) {
4035 		dict_update_filepath(id, remote.filepath);
4036 
4037 	} else if (remote.success && path_in == NULL) {
4038 		/* SYS_DATAFILES record for this space ID was not found. */
4039 		dict_insert_tablespace_and_filepath(
4040 			id, tablename, remote.filepath, flags);
4041 	}
4042 
4043 skip_validate:
4044 	if (err != DB_SUCCESS) {
4045 		; // Don't load the tablespace into the cache
4046 	} else if (!fil_space_create(tablename, id, flags, FIL_TABLESPACE)) {
4047 		err = DB_ERROR;
4048 	} else {
4049 		/* We do not measure the size of the file, that is why
4050 		we pass the 0 below */
4051 
4052 		if (!fil_node_create(remote.success ? remote.filepath :
4053 				     dict.success ? dict.filepath :
4054 				     def.filepath, 0, id, FALSE)) {
4055 			err = DB_ERROR;
4056 		}
4057 	}
4058 
4059 cleanup_and_exit:
4060 	if (remote.success) {
4061 		os_file_close(remote.file);
4062 	}
4063 	if (remote.filepath) {
4064 		mem_free(remote.filepath);
4065 	}
4066 	if (dict.success) {
4067 		os_file_close(dict.file);
4068 	}
4069 	if (dict.filepath) {
4070 		mem_free(dict.filepath);
4071 	}
4072 	if (def.success) {
4073 		os_file_close(def.file);
4074 	}
4075 	mem_free(def.filepath);
4076 
4077 	return(err);
4078 }
4079 #endif /* !UNIV_HOTBACKUP */
4080 
4081 #ifdef UNIV_HOTBACKUP
4082 /*******************************************************************//**
4083 Allocates a file name for an old version of a single-table tablespace.
4084 The string must be freed by caller with mem_free()!
4085 @return	own: file name */
4086 static
4087 char*
fil_make_ibbackup_old_name(const char * name)4088 fil_make_ibbackup_old_name(
4089 /*=======================*/
4090 	const char*	name)		/*!< in: original file name */
4091 {
4092 	static const char suffix[] = "_ibbackup_old_vers_";
4093 	char*	path;
4094 	ulint	len	= strlen(name);
4095 
4096 	path = static_cast<char*>(mem_alloc(len + (15 + sizeof suffix)));
4097 
4098 	memcpy(path, name, len);
4099 	memcpy(path + len, suffix, (sizeof suffix) - 1);
4100 	ut_sprintf_timestamp_without_extra_chars(
4101 		path + len + ((sizeof suffix) - 1));
4102 	return(path);
4103 }
4104 #endif /* UNIV_HOTBACKUP */
4105 
4106 
4107 /*******************************************************************//**
4108 Determine the space id of the given file descriptor by reading a few
4109 pages from the beginning of the .ibd file.
4110 @return true if space id was successfully identified, or false. */
4111 static
4112 bool
fil_user_tablespace_find_space_id(fsp_open_info * fsp)4113 fil_user_tablespace_find_space_id(
4114 /*==============================*/
4115 	fsp_open_info*	fsp)	/* in/out: contains file descriptor, which is
4116 				used as input.  contains space_id, which is
4117 				the output */
4118 {
4119 	bool		st;
4120 	os_offset_t	file_size;
4121 
4122 	file_size = os_file_get_size(fsp->file);
4123 
4124 	if (file_size == (os_offset_t) -1) {
4125 		ib_logf(IB_LOG_LEVEL_ERROR, "Could not get file size: %s",
4126 			fsp->filepath);
4127 		return(false);
4128 	}
4129 
4130 	/* Assuming a page size, read the space_id from each page and store it
4131 	in a map.  Find out which space_id is agreed on by majority of the
4132 	pages.  Choose that space_id. */
4133 	for (ulint page_size = UNIV_ZIP_SIZE_MIN;
4134 	     page_size <= UNIV_PAGE_SIZE_MAX; page_size <<= 1) {
4135 
4136 		/* map[space_id] = count of pages */
4137 		std::map<ulint, ulint> verify;
4138 
4139 		ulint page_count = 64;
4140 		ulint valid_pages = 0;
4141 
4142 		/* Adjust the number of pages to analyze based on file size */
4143 		while ((page_count * page_size) > file_size) {
4144 			--page_count;
4145 		}
4146 
4147 		ib_logf(IB_LOG_LEVEL_INFO, "Page size:%lu Pages to analyze:"
4148 			"%lu", page_size, page_count);
4149 
4150 		byte* buf = static_cast<byte*>(ut_malloc(2*page_size));
4151 		byte* page = static_cast<byte*>(ut_align(buf, page_size));
4152 
4153 		for (ulint j = 0; j < page_count; ++j) {
4154 
4155 			st = os_file_read(fsp->file, page, (j* page_size), page_size);
4156 
4157 			if (!st) {
4158 				ib_logf(IB_LOG_LEVEL_INFO,
4159 					"READ FAIL: page_no:%lu", j);
4160 				continue;
4161 			}
4162 
4163 			bool uncompressed_ok = false;
4164 
4165 			/* For uncompressed pages, the page size must be equal
4166 			to UNIV_PAGE_SIZE. */
4167 			if (page_size == UNIV_PAGE_SIZE) {
4168 				uncompressed_ok = !buf_page_is_corrupted(
4169 					false, page, 0);
4170 			}
4171 
4172 			bool compressed_ok = !buf_page_is_corrupted(
4173 				false, page, page_size);
4174 
4175 			if (uncompressed_ok || compressed_ok) {
4176 
4177 				ulint space_id = mach_read_from_4(page
4178 					+ FIL_PAGE_SPACE_ID);
4179 
4180 				if (space_id > 0) {
4181 					ib_logf(IB_LOG_LEVEL_INFO,
4182 						"VALID: space:%lu "
4183 						"page_no:%lu page_size:%lu",
4184 						space_id, j, page_size);
4185 					verify[space_id]++;
4186 					++valid_pages;
4187 				}
4188 			}
4189 		}
4190 
4191 		ut_free(buf);
4192 
4193 		ib_logf(IB_LOG_LEVEL_INFO, "Page size: %lu, Possible space_id "
4194 			"count:%lu", page_size, (ulint) verify.size());
4195 
4196 		const ulint pages_corrupted = 3;
4197 		for (ulint missed = 0; missed <= pages_corrupted; ++missed) {
4198 
4199 			for (std::map<ulint, ulint>::iterator
4200 			     m = verify.begin(); m != verify.end(); ++m ) {
4201 
4202 				ib_logf(IB_LOG_LEVEL_INFO, "space_id:%lu, "
4203 					"Number of pages matched: %lu/%lu "
4204 					"(%lu)", m->first, m->second,
4205 					valid_pages, page_size);
4206 
4207 				if (m->second == (valid_pages - missed)) {
4208 
4209 					ib_logf(IB_LOG_LEVEL_INFO,
4210 						"Chosen space:%lu\n", m->first);
4211 
4212 					fsp->id = m->first;
4213 					return(true);
4214 				}
4215 			}
4216 
4217 		}
4218 	}
4219 
4220 	return(false);
4221 }
4222 
4223 /*******************************************************************//**
4224 Finds the given page_no of the given space id from the double write buffer,
4225 and copies it to the corresponding .ibd file.
4226 @return true if copy was successful, or false. */
4227 bool
fil_user_tablespace_restore_page(fsp_open_info * fsp,ulint page_no)4228 fil_user_tablespace_restore_page(
4229 /*==============================*/
4230 	fsp_open_info*	fsp,		/* in: contains space id and .ibd
4231 					file information */
4232 	ulint		page_no)	/* in: page_no to obtain from double
4233 					write buffer */
4234 {
4235 	bool	err;
4236 	ulint	flags;
4237 	ulint	zip_size;
4238 	ulint	page_size;
4239 	ulint	buflen;
4240 	byte*	page;
4241 
4242 	ib_logf(IB_LOG_LEVEL_INFO, "Restoring page %lu of tablespace %lu",
4243 		page_no, fsp->id);
4244 
4245 	// find if double write buffer has page_no of given space id
4246 	page = recv_sys->dblwr.find_page(fsp->id, page_no);
4247 
4248 	if (!page) {
4249                 ib_logf(IB_LOG_LEVEL_WARN, "Doublewrite does not have "
4250 			"page_no=%lu of space: %lu", page_no, fsp->id);
4251 		err = false;
4252 		goto out;
4253 	}
4254 
4255         flags = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page);
4256 	zip_size = fsp_flags_get_zip_size(flags);
4257 	page_size = fsp_flags_get_page_size(flags);
4258 
4259 	ut_ad(page_no == page_get_page_no(page));
4260 
4261 	buflen = zip_size ? zip_size: page_size;
4262 
4263 	ib_logf(IB_LOG_LEVEL_INFO, "Writing %lu bytes into file: %s",
4264 		buflen, fsp->filepath);
4265 
4266 	err = os_file_write(fsp->filepath, fsp->file, page,
4267 			    (zip_size ? zip_size : page_size) * page_no,
4268 			    buflen);
4269 
4270 	os_file_flush(fsp->file);
4271 out:
4272 	return(err);
4273 }
4274 
4275 /********************************************************************//**
4276 Opens an .ibd file and adds the associated single-table tablespace to the
4277 InnoDB fil0fil.cc data structures.
4278 Set fsp->success to TRUE if tablespace is valid, FALSE if not. */
4279 static
4280 void
fil_validate_single_table_tablespace(const char * tablename,fsp_open_info * fsp)4281 fil_validate_single_table_tablespace(
4282 /*=================================*/
4283 	const char*	tablename,	/*!< in: database/tablename */
4284 	fsp_open_info*	fsp)		/*!< in/out: tablespace info */
4285 {
4286 	bool restore_attempted = false;
4287 
4288 check_first_page:
4289 	fsp->success = TRUE;
4290 	if (const char* check_msg = fil_read_first_page(
4291 		    fsp->file, FALSE, &fsp->flags, &fsp->id,
4292 		    &fsp->lsn, &fsp->lsn)) {
4293 		ib_logf(IB_LOG_LEVEL_ERROR,
4294 			"%s in tablespace %s (table %s)",
4295 			check_msg, fsp->filepath, tablename);
4296 		fsp->success = FALSE;
4297 	}
4298 
4299 	if (!fsp->success) {
4300 		if (!restore_attempted) {
4301 			if (!fil_user_tablespace_find_space_id(fsp)) {
4302 				return;
4303 			}
4304 			restore_attempted = true;
4305 
4306 			if (fsp->id > 0
4307 			    && !fil_user_tablespace_restore_page(fsp, 0)) {
4308 				return;
4309 			}
4310 			goto check_first_page;
4311 		}
4312 		return;
4313 	}
4314 
4315 	if (fsp->id == ULINT_UNDEFINED || fsp->id == 0) {
4316 		ib_logf(IB_LOG_LEVEL_ERROR,
4317 			"Tablespace is not sensible;"
4318 			" Table: %s  Space ID: %lu  Filepath: %s\n",
4319 		tablename, (ulong) fsp->id, fsp->filepath);
4320 		fsp->success = FALSE;
4321 		return;
4322 	}
4323 
4324 	mutex_enter(&fil_system->mutex);
4325 	fil_space_t* space = fil_space_get_by_id(fsp->id);
4326 	mutex_exit(&fil_system->mutex);
4327 	if (space != NULL) {
4328 		char* prev_filepath = fil_space_get_first_path(fsp->id);
4329 
4330 		ib_logf(IB_LOG_LEVEL_ERROR,
4331 			"Attempted to open a previously opened tablespace. "
4332 			"Previous tablespace %s uses space ID: %lu at "
4333 			"filepath: %s. Cannot open tablespace %s which uses "
4334 			"space ID: %lu at filepath: %s",
4335 			space->name, (ulong) space->id, prev_filepath,
4336 			tablename, (ulong) fsp->id, fsp->filepath);
4337 
4338 		mem_free(prev_filepath);
4339 		fsp->success = FALSE;
4340 		return;
4341 	}
4342 
4343 	fsp->success = TRUE;
4344 }
4345 
4346 
4347 /********************************************************************//**
4348 Opens an .ibd file and adds the associated single-table tablespace to the
4349 InnoDB fil0fil.cc data structures. */
4350 static
4351 void
fil_load_single_table_tablespace(const char * dbname,const char * filename)4352 fil_load_single_table_tablespace(
4353 /*=============================*/
4354 	const char*	dbname,		/*!< in: database name */
4355 	const char*	filename)	/*!< in: file name (not a path),
4356 					including the .ibd or .isl extension */
4357 {
4358 	char*		tablename;
4359 	ulint		tablename_len;
4360 	ulint		dbname_len = strlen(dbname);
4361 	ulint		filename_len = strlen(filename);
4362 	fsp_open_info	def;
4363 	fsp_open_info	remote;
4364 	os_offset_t	size;
4365 #ifdef UNIV_HOTBACKUP
4366 	fil_space_t*	space;
4367 #endif
4368 
4369 	memset(&def, 0, sizeof(def));
4370 	memset(&remote, 0, sizeof(remote));
4371 
4372 	/* The caller assured that the extension is ".ibd" or ".isl". */
4373 	ut_ad(0 == memcmp(filename + filename_len - 4, ".ibd", 4)
4374 	      || 0 == memcmp(filename + filename_len - 4, ".isl", 4));
4375 
4376 	/* Build up the tablename in the standard form database/table. */
4377 	tablename = static_cast<char*>(
4378 		mem_alloc(dbname_len + filename_len + 2));
4379 
4380 	/* When lower_case_table_names = 2 it is possible that the
4381 	dbname is in upper case ,but while storing it in fil_space_t
4382 	we must convert it into lower case */
4383 	sprintf(tablename, "%s" , dbname);
4384 	tablename[dbname_len] = '\0';
4385 
4386         if (lower_case_file_system) {
4387                 dict_casedn_str(tablename);
4388         }
4389 
4390 	sprintf(tablename+dbname_len,"/%s",filename);
4391 	tablename_len = strlen(tablename) - strlen(".ibd");
4392 	tablename[tablename_len] = '\0';
4393 
4394 	/* There may be both .ibd and .isl file in the directory.
4395 	And it is possible that the .isl file refers to a different
4396 	.ibd file.  If so, we open and compare them the first time
4397 	one of them is sent to this function.  So if this table has
4398 	already been loaded, there is nothing to do.*/
4399 	mutex_enter(&fil_system->mutex);
4400 	if (fil_space_get_by_name(tablename)) {
4401 		mem_free(tablename);
4402 		mutex_exit(&fil_system->mutex);
4403 		return;
4404 	}
4405 	mutex_exit(&fil_system->mutex);
4406 
4407 	/* Build up the filepath of the .ibd tablespace in the datadir.
4408 	This must be freed independent of def.success. */
4409 	def.filepath = fil_make_ibd_name(tablename, false);
4410 
4411 #ifdef __WIN__
4412 # ifndef UNIV_HOTBACKUP
4413 	/* If lower_case_table_names is 0 or 2, then MySQL allows database
4414 	directory names with upper case letters. On Windows, all table and
4415 	database names in InnoDB are internally always in lower case. Put the
4416 	file path to lower case, so that we are consistent with InnoDB's
4417 	internal data dictionary. */
4418 
4419 	dict_casedn_str(def.filepath);
4420 # endif /* !UNIV_HOTBACKUP */
4421 #endif
4422 
4423 	/* Check for a link file which locates a remote tablespace. */
4424 	remote.success = fil_open_linked_file(
4425 		tablename, &remote.filepath, &remote.file);
4426 
4427 	/* Read the first page of the remote tablespace */
4428 	if (remote.success) {
4429 		fil_validate_single_table_tablespace(tablename, &remote);
4430 		if (!remote.success) {
4431 			os_file_close(remote.file);
4432 			mem_free(remote.filepath);
4433 		}
4434 	}
4435 
4436 
4437 	/* Try to open the tablespace in the datadir. */
4438 	def.file = os_file_create_simple_no_error_handling(
4439 		innodb_file_data_key, def.filepath, OS_FILE_OPEN,
4440 		OS_FILE_READ_WRITE, &def.success);
4441 
4442 	/* Read the first page of the remote tablespace */
4443 	if (def.success) {
4444 		fil_validate_single_table_tablespace(tablename, &def);
4445 		if (!def.success) {
4446 			os_file_close(def.file);
4447 		}
4448 	}
4449 
4450 	if (!def.success && !remote.success) {
4451 		/* The following call prints an error message */
4452 		os_file_get_last_error(true);
4453 		fprintf(stderr,
4454 			"InnoDB: Error: could not open single-table"
4455 			" tablespace file %s\n", def.filepath);
4456 
4457 		if (!strncmp(filename,
4458 			     tmp_file_prefix, tmp_file_prefix_length)) {
4459 			/* Ignore errors for #sql tablespaces. */
4460 			mem_free(tablename);
4461 			if (remote.filepath) {
4462 				mem_free(remote.filepath);
4463 			}
4464 			if (def.filepath) {
4465 				mem_free(def.filepath);
4466 			}
4467 			return;
4468 		}
4469 no_good_file:
4470 		fprintf(stderr,
4471 			"InnoDB: We do not continue the crash recovery,"
4472 			" because the table may become\n"
4473 			"InnoDB: corrupt if we cannot apply the log"
4474 			" records in the InnoDB log to it.\n"
4475 			"InnoDB: To fix the problem and start mysqld:\n"
4476 			"InnoDB: 1) If there is a permission problem"
4477 			" in the file and mysqld cannot\n"
4478 			"InnoDB: open the file, you should"
4479 			" modify the permissions.\n"
4480 			"InnoDB: 2) If the table is not needed, or you"
4481 			" can restore it from a backup,\n"
4482 			"InnoDB: then you can remove the .ibd file,"
4483 			" and InnoDB will do a normal\n"
4484 			"InnoDB: crash recovery and ignore that table.\n"
4485 			"InnoDB: 3) If the file system or the"
4486 			" disk is broken, and you cannot remove\n"
4487 			"InnoDB: the .ibd file, you can set"
4488 			" innodb_force_recovery > 0 in my.cnf\n"
4489 			"InnoDB: and force InnoDB to continue crash"
4490 			" recovery here.\n");
4491 will_not_choose:
4492 		mem_free(tablename);
4493 		if (remote.filepath) {
4494 			mem_free(remote.filepath);
4495 		}
4496 		if (def.filepath) {
4497 			mem_free(def.filepath);
4498 		}
4499 
4500 		if (srv_force_recovery > 0) {
4501 			ib_logf(IB_LOG_LEVEL_INFO,
4502 				"innodb_force_recovery was set to %lu. "
4503 				"Continuing crash recovery even though we "
4504 				"cannot access the .ibd file of this table.",
4505 				srv_force_recovery);
4506 			return;
4507 		}
4508 
4509 		exit(1);
4510 	}
4511 
4512 	if (def.success && remote.success) {
4513 		ib_logf(IB_LOG_LEVEL_ERROR,
4514 			"Tablespaces for %s have been found in two places;\n"
4515 			"Location 1: SpaceID: %lu  LSN: %lu  File: %s\n"
4516 			"Location 2: SpaceID: %lu  LSN: %lu  File: %s\n"
4517 			"You must delete one of them.",
4518 			tablename, (ulong) def.id, (ulong) def.lsn,
4519 			def.filepath, (ulong) remote.id, (ulong) remote.lsn,
4520 			remote.filepath);
4521 
4522 		def.success = FALSE;
4523 		os_file_close(def.file);
4524 		os_file_close(remote.file);
4525 		goto will_not_choose;
4526 	}
4527 
4528 	/* At this point, only one tablespace is open */
4529 	ut_a(def.success == !remote.success);
4530 
4531 	fsp_open_info*	fsp = def.success ? &def : &remote;
4532 
4533 	/* Get and test the file size. */
4534 	size = os_file_get_size(fsp->file);
4535 
4536 	if (size == (os_offset_t) -1) {
4537 		/* The following call prints an error message */
4538 		os_file_get_last_error(true);
4539 
4540 		ib_logf(IB_LOG_LEVEL_ERROR,
4541 			"could not measure the size of single-table "
4542 			"tablespace file %s", fsp->filepath);
4543 
4544 		os_file_close(fsp->file);
4545 		goto no_good_file;
4546 	}
4547 
4548 	/* Every .ibd file is created >= 4 pages in size. Smaller files
4549 	cannot be ok. */
4550 	ulong minimum_size = FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE;
4551 	if (size < minimum_size) {
4552 #ifndef UNIV_HOTBACKUP
4553 		ib_logf(IB_LOG_LEVEL_ERROR,
4554 			"The size of single-table tablespace file %s "
4555 			"is only " UINT64PF ", should be at least %lu!",
4556 			fsp->filepath, size, minimum_size);
4557 		os_file_close(fsp->file);
4558 		goto no_good_file;
4559 #else
4560 		fsp->id = ULINT_UNDEFINED;
4561 		fsp->flags = 0;
4562 #endif /* !UNIV_HOTBACKUP */
4563 	}
4564 
4565 #ifdef UNIV_HOTBACKUP
4566 	if (fsp->id == ULINT_UNDEFINED || fsp->id == 0) {
4567 		char*	new_path;
4568 
4569 		fprintf(stderr,
4570 			"InnoDB: Renaming tablespace %s of id %lu,\n"
4571 			"InnoDB: to %s_ibbackup_old_vers_<timestamp>\n"
4572 			"InnoDB: because its size %" PRId64 " is too small"
4573 			" (< 4 pages 16 kB each),\n"
4574 			"InnoDB: or the space id in the file header"
4575 			" is not sensible.\n"
4576 			"InnoDB: This can happen in an mysqlbackup run,"
4577 			" and is not dangerous.\n",
4578 			fsp->filepath, fsp->id, fsp->filepath, size);
4579 		os_file_close(fsp->file);
4580 
4581 		new_path = fil_make_ibbackup_old_name(fsp->filepath);
4582 
4583 		bool	success = os_file_rename(
4584 			innodb_file_data_key, fsp->filepath, new_path);
4585 
4586 		ut_a(success);
4587 
4588 		mem_free(new_path);
4589 
4590 		goto func_exit_after_close;
4591 	}
4592 
4593 	/* A backup may contain the same space several times, if the space got
4594 	renamed at a sensitive time. Since it is enough to have one version of
4595 	the space, we rename the file if a space with the same space id
4596 	already exists in the tablespace memory cache. We rather rename the
4597 	file than delete it, because if there is a bug, we do not want to
4598 	destroy valuable data. */
4599 
4600 	mutex_enter(&fil_system->mutex);
4601 
4602 	space = fil_space_get_by_id(fsp->id);
4603 
4604 	if (space) {
4605 		char*	new_path;
4606 
4607 		fprintf(stderr,
4608 			"InnoDB: Renaming tablespace %s of id %lu,\n"
4609 			"InnoDB: to %s_ibbackup_old_vers_<timestamp>\n"
4610 			"InnoDB: because space %s with the same id\n"
4611 			"InnoDB: was scanned earlier. This can happen"
4612 			" if you have renamed tables\n"
4613 			"InnoDB: during an mysqlbackup run.\n",
4614 			fsp->filepath, fsp->id, fsp->filepath,
4615 			space->name);
4616 		os_file_close(fsp->file);
4617 
4618 		new_path = fil_make_ibbackup_old_name(fsp->filepath);
4619 
4620 		mutex_exit(&fil_system->mutex);
4621 
4622 		bool	success = os_file_rename(
4623 			innodb_file_data_key, fsp->filepath, new_path);
4624 
4625 		ut_a(success);
4626 
4627 		mem_free(new_path);
4628 
4629 		goto func_exit_after_close;
4630 	}
4631 	mutex_exit(&fil_system->mutex);
4632 #endif /* UNIV_HOTBACKUP */
4633 	ibool file_space_create_success = fil_space_create(
4634 		tablename, fsp->id, fsp->flags, FIL_TABLESPACE);
4635 
4636 	if (!file_space_create_success) {
4637 		if (srv_force_recovery > 0) {
4638 			fprintf(stderr,
4639 				"InnoDB: innodb_force_recovery was set"
4640 				" to %lu. Continuing crash recovery\n"
4641 				"InnoDB: even though the tablespace"
4642 				" creation of this table failed.\n",
4643 				srv_force_recovery);
4644 			goto func_exit;
4645 		}
4646 
4647 		/* Exit here with a core dump, stack, etc. */
4648 		ut_a(file_space_create_success);
4649 	}
4650 
4651 	/* We do not use the size information we have about the file, because
4652 	the rounding formula for extents and pages is somewhat complex; we
4653 	let fil_node_open() do that task. */
4654 
4655 	if (!fil_node_create(fsp->filepath, 0, fsp->id, FALSE)) {
4656 		ut_error;
4657 	}
4658 
4659 func_exit:
4660 	os_file_close(fsp->file);
4661 
4662 #ifdef UNIV_HOTBACKUP
4663 func_exit_after_close:
4664 #else
4665 	ut_ad(!mutex_own(&fil_system->mutex));
4666 #endif
4667 	mem_free(tablename);
4668 	if (remote.success) {
4669 		mem_free(remote.filepath);
4670 	}
4671 	mem_free(def.filepath);
4672 }
4673 
4674 /***********************************************************************//**
4675 A fault-tolerant function that tries to read the next file name in the
4676 directory. We retry 100 times if os_file_readdir_next_file() returns -1. The
4677 idea is to read as much good data as we can and jump over bad data.
4678 @return 0 if ok, -1 if error even after the retries, 1 if at the end
4679 of the directory */
4680 static
4681 int
fil_file_readdir_next_file(dberr_t * err,const char * dirname,os_file_dir_t dir,os_file_stat_t * info)4682 fil_file_readdir_next_file(
4683 /*=======================*/
4684 	dberr_t*	err,	/*!< out: this is set to DB_ERROR if an error
4685 				was encountered, otherwise not changed */
4686 	const char*	dirname,/*!< in: directory name or path */
4687 	os_file_dir_t	dir,	/*!< in: directory stream */
4688 	os_file_stat_t*	info)	/*!< in/out: buffer where the
4689 				info is returned */
4690 {
4691 	for (ulint i = 0; i < 100; i++) {
4692 		int	ret = os_file_readdir_next_file(dirname, dir, info);
4693 
4694 		if (ret != -1) {
4695 
4696 			return(ret);
4697 		}
4698 
4699 		ib_logf(IB_LOG_LEVEL_ERROR,
4700 			"os_file_readdir_next_file() returned -1 in "
4701 			"directory %s, crash recovery may have failed "
4702 			"for some .ibd files!", dirname);
4703 
4704 		*err = DB_ERROR;
4705 	}
4706 
4707 	return(-1);
4708 }
4709 
4710 /********************************************************************//**
4711 At the server startup, if we need crash recovery, scans the database
4712 directories under the MySQL datadir, looking for .ibd files. Those files are
4713 single-table tablespaces. We need to know the space id in each of them so that
4714 we know into which file we should look to check the contents of a page stored
4715 in the doublewrite buffer, also to know where to apply log records where the
4716 space id is != 0.
4717 @return	DB_SUCCESS or error number */
4718 UNIV_INTERN
4719 dberr_t
fil_load_single_table_tablespaces(void)4720 fil_load_single_table_tablespaces(void)
4721 /*===================================*/
4722 {
4723 	int		ret;
4724 	char*		dbpath		= NULL;
4725 	ulint		dbpath_len	= 100;
4726 	os_file_dir_t	dir;
4727 	os_file_dir_t	dbdir;
4728 	os_file_stat_t	dbinfo;
4729 	os_file_stat_t	fileinfo;
4730 	dberr_t		err		= DB_SUCCESS;
4731 
4732 	/* The datadir of MySQL is always the default directory of mysqld */
4733 
4734 	dir = os_file_opendir(fil_path_to_mysql_datadir, TRUE);
4735 
4736 	if (dir == NULL) {
4737 
4738 		return(DB_ERROR);
4739 	}
4740 
4741 	dbpath = static_cast<char*>(mem_alloc(dbpath_len));
4742 
4743 	/* Scan all directories under the datadir. They are the database
4744 	directories of MySQL. */
4745 
4746 	ret = fil_file_readdir_next_file(&err, fil_path_to_mysql_datadir, dir,
4747 					 &dbinfo);
4748 	while (ret == 0) {
4749 		ulint len;
4750 		/* printf("Looking at %s in datadir\n", dbinfo.name); */
4751 
4752 		if (dbinfo.type == OS_FILE_TYPE_FILE
4753 		    || dbinfo.type == OS_FILE_TYPE_UNKNOWN) {
4754 
4755 			goto next_datadir_item;
4756 		}
4757 
4758 		/* We found a symlink or a directory; try opening it to see
4759 		if a symlink is a directory */
4760 
4761 		len = strlen(fil_path_to_mysql_datadir)
4762 			+ strlen (dbinfo.name) + 2;
4763 		if (len > dbpath_len) {
4764 			dbpath_len = len;
4765 
4766 			if (dbpath) {
4767 				mem_free(dbpath);
4768 			}
4769 
4770 			dbpath = static_cast<char*>(mem_alloc(dbpath_len));
4771 		}
4772 		ut_snprintf(dbpath, dbpath_len,
4773 			    "%s/%s", fil_path_to_mysql_datadir, dbinfo.name);
4774 		srv_normalize_path_for_win(dbpath);
4775 
4776 		dbdir = os_file_opendir(dbpath, FALSE);
4777 
4778 		if (dbdir != NULL) {
4779 
4780 			/* We found a database directory; loop through it,
4781 			looking for possible .ibd files in it */
4782 
4783 			ret = fil_file_readdir_next_file(&err, dbpath, dbdir,
4784 							 &fileinfo);
4785 			while (ret == 0) {
4786 
4787 				if (fileinfo.type == OS_FILE_TYPE_DIR) {
4788 
4789 					goto next_file_item;
4790 				}
4791 
4792 				/* We found a symlink or a file */
4793 				if (strlen(fileinfo.name) > 4
4794 				    && (0 == strcmp(fileinfo.name
4795 						   + strlen(fileinfo.name) - 4,
4796 						   ".ibd")
4797 					|| 0 == strcmp(fileinfo.name
4798 						   + strlen(fileinfo.name) - 4,
4799 						   ".isl"))) {
4800 					/* The name ends in .ibd or .isl;
4801 					try opening the file */
4802 					fil_load_single_table_tablespace(
4803 						dbinfo.name, fileinfo.name);
4804 				}
4805 next_file_item:
4806 				ret = fil_file_readdir_next_file(&err,
4807 								 dbpath, dbdir,
4808 								 &fileinfo);
4809 			}
4810 
4811 			if (0 != os_file_closedir(dbdir)) {
4812 				fputs("InnoDB: Warning: could not"
4813 				      " close database directory ", stderr);
4814 				ut_print_filename(stderr, dbpath);
4815 				putc('\n', stderr);
4816 
4817 				err = DB_ERROR;
4818 			}
4819 		}
4820 
4821 next_datadir_item:
4822 		ret = fil_file_readdir_next_file(&err,
4823 						 fil_path_to_mysql_datadir,
4824 						 dir, &dbinfo);
4825 	}
4826 
4827 	mem_free(dbpath);
4828 
4829 	if (0 != os_file_closedir(dir)) {
4830 		fprintf(stderr,
4831 			"InnoDB: Error: could not close MySQL datadir\n");
4832 
4833 		return(DB_ERROR);
4834 	}
4835 
4836 	return(err);
4837 }
4838 
4839 /*******************************************************************//**
4840 Returns TRUE if a single-table tablespace does not exist in the memory cache,
4841 or is being deleted there.
4842 @return	TRUE if does not exist or is being deleted */
4843 UNIV_INTERN
4844 ibool
fil_tablespace_deleted_or_being_deleted_in_mem(ulint id,ib_int64_t version)4845 fil_tablespace_deleted_or_being_deleted_in_mem(
4846 /*===========================================*/
4847 	ulint		id,	/*!< in: space id */
4848 	ib_int64_t	version)/*!< in: tablespace_version should be this; if
4849 				you pass -1 as the value of this, then this
4850 				parameter is ignored */
4851 {
4852 	fil_space_t*	space;
4853 
4854 	ut_ad(fil_system);
4855 
4856 	mutex_enter(&fil_system->mutex);
4857 
4858 	space = fil_space_get_by_id(id);
4859 
4860 	if (space == NULL || space->stop_new_ops) {
4861 		mutex_exit(&fil_system->mutex);
4862 
4863 		return(TRUE);
4864 	}
4865 
4866 	if (version != ((ib_int64_t)-1)
4867 	    && space->tablespace_version != version) {
4868 		mutex_exit(&fil_system->mutex);
4869 
4870 		return(TRUE);
4871 	}
4872 
4873 	mutex_exit(&fil_system->mutex);
4874 
4875 	return(FALSE);
4876 }
4877 
4878 /*******************************************************************//**
4879 Returns TRUE if a single-table tablespace exists in the memory cache.
4880 @return	TRUE if exists */
4881 UNIV_INTERN
4882 ibool
fil_tablespace_exists_in_mem(ulint id)4883 fil_tablespace_exists_in_mem(
4884 /*=========================*/
4885 	ulint	id)	/*!< in: space id */
4886 {
4887 	fil_space_t*	space;
4888 
4889 	ut_ad(fil_system);
4890 
4891 	mutex_enter(&fil_system->mutex);
4892 
4893 	space = fil_space_get_by_id(id);
4894 
4895 	mutex_exit(&fil_system->mutex);
4896 
4897 	return(space != NULL);
4898 }
4899 
4900 /*******************************************************************//**
4901 Report that a tablespace for a table was not found. */
4902 static
4903 void
fil_report_missing_tablespace(const char * name,ulint space_id)4904 fil_report_missing_tablespace(
4905 /*===========================*/
4906 	const char*	name,			/*!< in: table name */
4907 	ulint		space_id)		/*!< in: table's space id */
4908 {
4909 	char index_name[MAX_FULL_NAME_LEN + 1];
4910 
4911 	innobase_format_name(index_name, sizeof(index_name), name, TRUE);
4912 
4913 	ib_logf(IB_LOG_LEVEL_ERROR,
4914 		"Table %s in the InnoDB data dictionary has tablespace id %lu, "
4915 		"but tablespace with that id or name does not exist. Have "
4916 		"you deleted or moved .ibd files? This may also be a table "
4917 		"created with CREATE TEMPORARY TABLE whose .ibd and .frm "
4918 		"files MySQL automatically removed, but the table still "
4919 		"exists in the InnoDB internal data dictionary.",
4920 		name, space_id);
4921 }
4922 
4923 /*******************************************************************//**
4924 Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory
4925 cache. Note that if we have not done a crash recovery at the database startup,
4926 there may be many tablespaces which are not yet in the memory cache.
4927 @return	TRUE if a matching tablespace exists in the memory cache */
4928 UNIV_INTERN
4929 ibool
fil_space_for_table_exists_in_mem(ulint id,const char * name,ibool mark_space,ibool print_error_if_does_not_exist,bool adjust_space,mem_heap_t * heap,table_id_t table_id)4930 fil_space_for_table_exists_in_mem(
4931 /*==============================*/
4932 	ulint		id,		/*!< in: space id */
4933 	const char*	name,		/*!< in: table name used in
4934 					fil_space_create().  Either the
4935 					standard 'dbname/tablename' format
4936 					or table->dir_path_of_temp_table */
4937 	ibool		mark_space,	/*!< in: in crash recovery, at database
4938 					startup we mark all spaces which have
4939 					an associated table in the InnoDB
4940 					data dictionary, so that
4941 					we can print a warning about orphaned
4942 					tablespaces */
4943 	ibool		print_error_if_does_not_exist,
4944 					/*!< in: print detailed error
4945 					information to the .err log if a
4946 					matching tablespace is not found from
4947 					memory */
4948 	bool		adjust_space,	/*!< in: whether to adjust space id
4949 					when find table space mismatch */
4950 	mem_heap_t*	heap,		/*!< in: heap memory */
4951 	table_id_t	table_id)	/*!< in: table id */
4952 {
4953 	fil_space_t*	fnamespace;
4954 	fil_space_t*	space;
4955 
4956 	ut_ad(fil_system);
4957 
4958 	mutex_enter(&fil_system->mutex);
4959 
4960 	/* Look if there is a space with the same id */
4961 
4962 	space = fil_space_get_by_id(id);
4963 
4964 	/* Look if there is a space with the same name; the name is the
4965 	directory path from the datadir to the file */
4966 
4967 	fnamespace = fil_space_get_by_name(name);
4968 	if (space && space == fnamespace) {
4969 		/* Found */
4970 
4971 		if (mark_space) {
4972 			space->mark = TRUE;
4973 		}
4974 
4975 		mutex_exit(&fil_system->mutex);
4976 
4977 		return(TRUE);
4978 	}
4979 
4980 	/* Info from "fnamespace" comes from the ibd file itself, it can
4981 	be different from data obtained from System tables since it is
4982 	not transactional. If adjust_space is set, and the mismatching
4983 	space are between a user table and its temp table, we shall
4984 	adjust the ibd file name according to system table info */
4985 	if (adjust_space
4986 	    && space != NULL
4987 	    && row_is_mysql_tmp_table_name(space->name)
4988 	    && !row_is_mysql_tmp_table_name(name)) {
4989 
4990 		mutex_exit(&fil_system->mutex);
4991 
4992 		DBUG_EXECUTE_IF("ib_crash_before_adjust_fil_space",
4993 				DBUG_SUICIDE(););
4994 
4995 		if (fnamespace) {
4996 			char*	tmp_name;
4997 
4998 			tmp_name = dict_mem_create_temporary_tablename(
4999 				heap, name, table_id);
5000 
5001 			fil_rename_tablespace(fnamespace->name, fnamespace->id,
5002 					      tmp_name, NULL);
5003 		}
5004 
5005 		DBUG_EXECUTE_IF("ib_crash_after_adjust_one_fil_space",
5006 				DBUG_SUICIDE(););
5007 
5008 		fil_rename_tablespace(space->name, id, name, NULL);
5009 
5010 		DBUG_EXECUTE_IF("ib_crash_after_adjust_fil_space",
5011 				DBUG_SUICIDE(););
5012 
5013 		mutex_enter(&fil_system->mutex);
5014 		fnamespace = fil_space_get_by_name(name);
5015 		ut_ad(space == fnamespace);
5016 		mutex_exit(&fil_system->mutex);
5017 
5018 		return(TRUE);
5019 	}
5020 
5021 	if (!print_error_if_does_not_exist) {
5022 
5023 		mutex_exit(&fil_system->mutex);
5024 
5025 		return(FALSE);
5026 	}
5027 
5028 	if (space == NULL) {
5029 		if (fnamespace == NULL) {
5030 			if (print_error_if_does_not_exist) {
5031 				fil_report_missing_tablespace(name, id);
5032 			}
5033 		} else {
5034 			ut_print_timestamp(stderr);
5035 			fputs("  InnoDB: Error: table ", stderr);
5036 			ut_print_filename(stderr, name);
5037 			fprintf(stderr, "\n"
5038 				"InnoDB: in InnoDB data dictionary has"
5039 				" tablespace id %lu,\n"
5040 				"InnoDB: but a tablespace with that id"
5041 				" does not exist. There is\n"
5042 				"InnoDB: a tablespace of name %s and id %lu,"
5043 				" though. Have\n"
5044 				"InnoDB: you deleted or moved .ibd files?\n",
5045 				(ulong) id, fnamespace->name,
5046 				(ulong) fnamespace->id);
5047 		}
5048 error_exit:
5049 		fputs("InnoDB: Please refer to\n"
5050 		      "InnoDB: " REFMAN "innodb-troubleshooting-datadict.html\n"
5051 		      "InnoDB: for how to resolve the issue.\n", stderr);
5052 
5053 		mutex_exit(&fil_system->mutex);
5054 
5055 		return(FALSE);
5056 	}
5057 
5058 	if (0 != strcmp(space->name, name)) {
5059 		ut_print_timestamp(stderr);
5060 		fputs("  InnoDB: Error: table ", stderr);
5061 		ut_print_filename(stderr, name);
5062 		fprintf(stderr, "\n"
5063 			"InnoDB: in InnoDB data dictionary has"
5064 			" tablespace id %lu,\n"
5065 			"InnoDB: but the tablespace with that id"
5066 			" has name %s.\n"
5067 			"InnoDB: Have you deleted or moved .ibd files?\n",
5068 			(ulong) id, space->name);
5069 
5070 		if (fnamespace != NULL) {
5071 			fputs("InnoDB: There is a tablespace"
5072 			      " with the right name\n"
5073 			      "InnoDB: ", stderr);
5074 			ut_print_filename(stderr, fnamespace->name);
5075 			fprintf(stderr, ", but its id is %lu.\n",
5076 				(ulong) fnamespace->id);
5077 		}
5078 
5079 		goto error_exit;
5080 	}
5081 
5082 	mutex_exit(&fil_system->mutex);
5083 
5084 	return(FALSE);
5085 }
5086 
5087 /*******************************************************************//**
5088 Checks if a single-table tablespace for a given table name exists in the
5089 tablespace memory cache.
5090 @return	space id, ULINT_UNDEFINED if not found */
5091 UNIV_INTERN
5092 ulint
fil_get_space_id_for_table(const char * tablename)5093 fil_get_space_id_for_table(
5094 /*=======================*/
5095 	const char*	tablename)	/*!< in: table name in the standard
5096 				'databasename/tablename' format */
5097 {
5098 	fil_space_t*	fnamespace;
5099 	ulint		id		= ULINT_UNDEFINED;
5100 
5101 	ut_ad(fil_system);
5102 
5103 	mutex_enter(&fil_system->mutex);
5104 
5105 	/* Look if there is a space with the same name. */
5106 
5107 	fnamespace = fil_space_get_by_name(tablename);
5108 
5109 	if (fnamespace) {
5110 		id = fnamespace->id;
5111 	}
5112 
5113 	mutex_exit(&fil_system->mutex);
5114 
5115 	return(id);
5116 }
5117 
5118 /**********************************************************************//**
5119 Tries to extend a data file so that it would accommodate the number of pages
5120 given. The tablespace must be cached in the memory cache. If the space is big
5121 enough already, does nothing.
5122 @return	TRUE if success */
5123 UNIV_INTERN
5124 ibool
fil_extend_space_to_desired_size(ulint * actual_size,ulint space_id,ulint size_after_extend)5125 fil_extend_space_to_desired_size(
5126 /*=============================*/
5127 	ulint*	actual_size,	/*!< out: size of the space after extension;
5128 				if we ran out of disk space this may be lower
5129 				than the desired size */
5130 	ulint	space_id,	/*!< in: space id */
5131 	ulint	size_after_extend)/*!< in: desired size in pages after the
5132 				extension; if the current space size is bigger
5133 				than this already, the function does nothing */
5134 {
5135 	fil_node_t*	node;
5136 	fil_space_t*	space;
5137 	byte*		buf2;
5138 	byte*		buf;
5139 	ulint		buf_size;
5140 	ulint		start_page_no;
5141 	ulint		file_start_page_no;
5142 	ulint		page_size;
5143 	ulint		pages_added;
5144 	ibool		success;
5145 
5146 	ut_ad(!srv_read_only_mode);
5147 
5148 retry:
5149 	pages_added = 0;
5150 	success = TRUE;
5151 
5152 	fil_mutex_enter_and_prepare_for_io(space_id);
5153 
5154 	space = fil_space_get_by_id(space_id);
5155 	ut_a(space);
5156 
5157 	if (space->size >= size_after_extend) {
5158 		/* Space already big enough */
5159 
5160 		*actual_size = space->size;
5161 
5162 		mutex_exit(&fil_system->mutex);
5163 
5164 		return(TRUE);
5165 	}
5166 
5167 	page_size = fsp_flags_get_zip_size(space->flags);
5168 	if (!page_size) {
5169 		page_size = UNIV_PAGE_SIZE;
5170 	}
5171 
5172 	node = UT_LIST_GET_LAST(space->chain);
5173 
5174 	if (!node->being_extended) {
5175 		/* Mark this node as undergoing extension. This flag
5176 		is used by other threads to wait for the extension
5177 		opereation to finish. */
5178 		node->being_extended = TRUE;
5179 	} else {
5180 		/* Another thread is currently extending the file. Wait
5181 		for it to finish.
5182 		It'd have been better to use event driven mechanism but
5183 		the entire module is peppered with polling stuff. */
5184 		mutex_exit(&fil_system->mutex);
5185 		os_thread_sleep(100000);
5186 		goto retry;
5187 	}
5188 
5189 	if (!fil_node_prepare_for_io(node, fil_system, space)) {
5190 		/* The tablespace data file, such as .ibd file, is missing */
5191 		node->being_extended = false;
5192 		mutex_exit(&fil_system->mutex);
5193 
5194 		return(false);
5195 	}
5196 
5197 	/* At this point it is safe to release fil_system mutex. No
5198 	other thread can rename, delete or close the file because
5199 	we have set the node->being_extended flag. */
5200 	mutex_exit(&fil_system->mutex);
5201 
5202 	start_page_no = space->size;
5203 	file_start_page_no = space->size - node->size;
5204 
5205 #ifdef HAVE_POSIX_FALLOCATE
5206 	if (srv_use_posix_fallocate) {
5207 
5208 		os_offset_t	start_offset = file_start_page_no * page_size;
5209 		os_offset_t	end_offset
5210 			= (size_after_extend - file_start_page_no) * page_size;
5211 
5212 		success = (os_file_allocate(node->handle, start_offset,
5213 					    end_offset) == 0);
5214 		if (!success)
5215 		{
5216 			ib_logf(IB_LOG_LEVEL_ERROR,
5217 				"preallocating file space for file \'%s\' "
5218 				"failed.  Current size " INT64PF
5219 				", len " INT64PF ", desired size " INT64PF,
5220 				node->name, start_offset, end_offset,
5221 				start_offset + end_offset);
5222 		}
5223 		mutex_enter(&fil_system->mutex);
5224 		if (success) {
5225 			node->size += (size_after_extend - start_page_no);
5226 			space->size += (size_after_extend - start_page_no);
5227 			os_has_said_disk_full = FALSE;
5228 		}
5229 		node->being_extended = FALSE;
5230 		fil_node_complete_io(node, fil_system, OS_FILE_READ);
5231 		goto complete_io;
5232 	}
5233 #endif
5234 
5235 	/* Extend at most 64 pages at a time */
5236 	buf_size = ut_min(64, size_after_extend - start_page_no) * page_size;
5237 	buf2 = static_cast<byte*>(mem_alloc(buf_size + page_size));
5238 	buf = static_cast<byte*>(ut_align(buf2, page_size));
5239 
5240 	memset(buf, 0, buf_size);
5241 
5242 	while (start_page_no < size_after_extend) {
5243 		ulint		n_pages
5244 			= ut_min(buf_size / page_size,
5245 				 size_after_extend - start_page_no);
5246 
5247 		os_offset_t	offset
5248 			= ((os_offset_t) (start_page_no - file_start_page_no))
5249 			* page_size;
5250 #ifdef UNIV_HOTBACKUP
5251 		success = os_file_write(node->name, node->handle, buf,
5252 					offset, page_size * n_pages);
5253 #else
5254 		success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC,
5255 				 node->name, node->handle, buf,
5256 				 offset, page_size * n_pages,
5257 				 NULL, NULL, space_id, NULL, false);
5258 #endif /* UNIV_HOTBACKUP */
5259 		if (success) {
5260 			os_has_said_disk_full = FALSE;
5261 		} else {
5262 			/* Let us measure the size of the file to determine
5263 			how much we were able to extend it */
5264 			os_offset_t	size;
5265 
5266 			size = os_file_get_size(node->handle);
5267 			ut_a(size != (os_offset_t) -1);
5268 
5269 			n_pages = ((ulint) (size / page_size))
5270 				- node->size - pages_added;
5271 
5272 			pages_added += n_pages;
5273 			break;
5274 		}
5275 
5276 		start_page_no += n_pages;
5277 		pages_added += n_pages;
5278 	}
5279 
5280 	mem_free(buf2);
5281 
5282 	mutex_enter(&fil_system->mutex);
5283 
5284 	ut_a(node->being_extended);
5285 
5286 	space->size += pages_added;
5287 	node->size += pages_added;
5288 	node->being_extended = FALSE;
5289 
5290 	fil_node_complete_io(node, fil_system, OS_FILE_WRITE);
5291 
5292 #ifdef HAVE_POSIX_FALLOCATE
5293 complete_io:
5294 #endif
5295 
5296 	*actual_size = space->size;
5297 
5298 #ifndef UNIV_HOTBACKUP
5299 	if (space_id == 0) {
5300 		ulint pages_per_mb = (1024 * 1024) / page_size;
5301 
5302 		/* Keep the last data file size info up to date, rounded to
5303 		full megabytes */
5304 
5305 		srv_data_file_sizes[srv_n_data_files - 1]
5306 			= (node->size / pages_per_mb) * pages_per_mb;
5307 	}
5308 #endif /* !UNIV_HOTBACKUP */
5309 
5310 	/*
5311 	printf("Extended %s to %lu, actual size %lu pages\n", space->name,
5312 	size_after_extend, *actual_size); */
5313 	mutex_exit(&fil_system->mutex);
5314 
5315 	fil_flush(space_id);
5316 
5317 	return(success);
5318 }
5319 
#ifdef UNIV_HOTBACKUP
/********************************************************************//**
Extends all tablespaces to the size stored in the space header. During the
mysqlbackup --apply-log phase we extended the spaces on-demand so that log
records could be applied, but that may have left spaces still too small
compared to the size stored in the space header. For every space in
fil_system->space_list the first page is read, the size is taken from it
with fsp_get_size_low(), and the space is extended to that size. A failed
extension is reported to stderr and then trips an assertion. */
UNIV_INTERN
void
fil_extend_tablespaces_to_stored_len(void)
/*======================================*/
{
	fil_space_t*	space;
	byte*		buf;
	ulint		actual_size;
	ulint		size_in_header;
	dberr_t		error;
	ibool		success;

	/* mem_alloc() is cast explicitly, as everywhere else in this file;
	a void pointer does not convert implicitly to byte* in C++. */
	buf = static_cast<byte*>(mem_alloc(UNIV_PAGE_SIZE));

	mutex_enter(&fil_system->mutex);

	space = UT_LIST_GET_FIRST(fil_system->space_list);

	while (space) {
		ut_a(space->purpose == FIL_TABLESPACE);

		mutex_exit(&fil_system->mutex); /* no need to protect with a
					      mutex, because this is a
					      single-threaded operation */
		error = fil_read(TRUE, space->id,
				 fsp_flags_get_zip_size(space->flags),
				 0, 0, UNIV_PAGE_SIZE, buf, NULL);
		ut_a(error == DB_SUCCESS);

		size_in_header = fsp_get_size_low(buf);

		success = fil_extend_space_to_desired_size(
			&actual_size, space->id, size_in_header);
		if (!success) {
			/* The sizes are cast to ulong so that the arguments
			match the %lu conversions. */
			fprintf(stderr,
				"InnoDB: Error: could not extend the"
				" tablespace of %s\n"
				"InnoDB: to the size stored in header,"
				" %lu pages;\n"
				"InnoDB: size after extension %lu pages\n"
				"InnoDB: Check that you have free disk space"
				" and retry!\n",
				space->name, (ulong) size_in_header,
				(ulong) actual_size);

			/* success is FALSE here, so this always fires */
			ut_a(success);
		}

		mutex_enter(&fil_system->mutex);

		space = UT_LIST_GET_NEXT(space_list, space);
	}

	mutex_exit(&fil_system->mutex);

	mem_free(buf);
}
#endif
5382 
5383 /*========== RESERVE FREE EXTENTS (for a B-tree split, for example) ===*/
5384 
5385 /*******************************************************************//**
5386 Tries to reserve free extents in a file space.
5387 @return	TRUE if succeed */
5388 UNIV_INTERN
5389 ibool
fil_space_reserve_free_extents(ulint id,ulint n_free_now,ulint n_to_reserve)5390 fil_space_reserve_free_extents(
5391 /*===========================*/
5392 	ulint	id,		/*!< in: space id */
5393 	ulint	n_free_now,	/*!< in: number of free extents now */
5394 	ulint	n_to_reserve)	/*!< in: how many one wants to reserve */
5395 {
5396 	fil_space_t*	space;
5397 	ibool		success;
5398 
5399 	ut_ad(fil_system);
5400 
5401 	mutex_enter(&fil_system->mutex);
5402 
5403 	space = fil_space_get_by_id(id);
5404 
5405 	ut_a(space);
5406 
5407 	if (space->n_reserved_extents + n_to_reserve > n_free_now) {
5408 		success = FALSE;
5409 	} else {
5410 		space->n_reserved_extents += n_to_reserve;
5411 		success = TRUE;
5412 	}
5413 
5414 	mutex_exit(&fil_system->mutex);
5415 
5416 	return(success);
5417 }
5418 
5419 /*******************************************************************//**
5420 Releases free extents in a file space. */
5421 UNIV_INTERN
5422 void
fil_space_release_free_extents(ulint id,ulint n_reserved)5423 fil_space_release_free_extents(
5424 /*===========================*/
5425 	ulint	id,		/*!< in: space id */
5426 	ulint	n_reserved)	/*!< in: how many one reserved */
5427 {
5428 	fil_space_t*	space;
5429 
5430 	ut_ad(fil_system);
5431 
5432 	mutex_enter(&fil_system->mutex);
5433 
5434 	space = fil_space_get_by_id(id);
5435 
5436 	ut_a(space);
5437 	ut_a(space->n_reserved_extents >= n_reserved);
5438 
5439 	space->n_reserved_extents -= n_reserved;
5440 
5441 	mutex_exit(&fil_system->mutex);
5442 }
5443 
5444 /*******************************************************************//**
5445 Gets the number of reserved extents. If the database is silent, this number
5446 should be zero. */
5447 UNIV_INTERN
5448 ulint
fil_space_get_n_reserved_extents(ulint id)5449 fil_space_get_n_reserved_extents(
5450 /*=============================*/
5451 	ulint	id)		/*!< in: space id */
5452 {
5453 	fil_space_t*	space;
5454 	ulint		n;
5455 
5456 	ut_ad(fil_system);
5457 
5458 	mutex_enter(&fil_system->mutex);
5459 
5460 	space = fil_space_get_by_id(id);
5461 
5462 	ut_a(space);
5463 
5464 	n = space->n_reserved_extents;
5465 
5466 	mutex_exit(&fil_system->mutex);
5467 
5468 	return(n);
5469 }
5470 
5471 /*============================ FILE I/O ================================*/
5472 
5473 /********************************************************************//**
5474 NOTE: you must call fil_mutex_enter_and_prepare_for_io() first!
5475 
5476 Prepares a file node for i/o. Opens the file if it is closed. Updates the
5477 pending i/o's field in the node and the system appropriately. Takes the node
5478 off the LRU list if it is in the LRU list. The caller must hold the fil_sys
5479 mutex.
5480 @return false if the file can't be opened, otherwise true */
5481 static
5482 bool
fil_node_prepare_for_io(fil_node_t * node,fil_system_t * system,fil_space_t * space)5483 fil_node_prepare_for_io(
5484 /*====================*/
5485 	fil_node_t*	node,	/*!< in: file node */
5486 	fil_system_t*	system,	/*!< in: tablespace memory cache */
5487 	fil_space_t*	space)	/*!< in: space */
5488 {
5489 	ut_ad(node && system && space);
5490 	ut_ad(mutex_own(&(system->mutex)));
5491 
5492 	if (system->n_open > system->max_n_open + 5) {
5493 		ut_print_timestamp(stderr);
5494 		fprintf(stderr,
5495 			"  InnoDB: Warning: open files %lu"
5496 			" exceeds the limit %lu\n",
5497 			(ulong) system->n_open,
5498 			(ulong) system->max_n_open);
5499 	}
5500 
5501 	if (node->open == FALSE) {
5502 		/* File is closed: open it */
5503 		ut_a(node->n_pending == 0);
5504 
5505 		if (!fil_node_open_file(node, system, space)) {
5506 			return(false);
5507 		}
5508 	}
5509 
5510 	if (node->n_pending == 0 && fil_space_belongs_in_lru(space)) {
5511 		/* The node is in the LRU list, remove it */
5512 
5513 		ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
5514 
5515 		UT_LIST_REMOVE(LRU, system->LRU, node);
5516 	}
5517 
5518 	node->n_pending++;
5519 
5520 	return(true);
5521 }
5522 
5523 /********************************************************************//**
5524 Updates the data structures when an i/o operation finishes. Updates the
5525 pending i/o's field in the node appropriately. */
5526 static
5527 void
fil_node_complete_io(fil_node_t * node,fil_system_t * system,ulint type)5528 fil_node_complete_io(
5529 /*=================*/
5530 	fil_node_t*	node,	/*!< in: file node */
5531 	fil_system_t*	system,	/*!< in: tablespace memory cache */
5532 	ulint		type)	/*!< in: OS_FILE_WRITE or OS_FILE_READ; marks
5533 				the node as modified if
5534 				type == OS_FILE_WRITE */
5535 {
5536 	ut_ad(node);
5537 	ut_ad(system);
5538 	ut_ad(mutex_own(&(system->mutex)));
5539 
5540 	ut_a(node->n_pending > 0);
5541 
5542 	node->n_pending--;
5543 
5544 	if (type == OS_FILE_WRITE) {
5545 		ut_ad(!srv_read_only_mode);
5546 		system->modification_counter++;
5547 		node->modification_counter = system->modification_counter;
5548 
5549 		if (fil_buffering_disabled(node->space)) {
5550 
5551 			/* We don't need to keep track of unflushed
5552 			changes as user has explicitly disabled
5553 			buffering. */
5554 			ut_ad(!node->space->is_in_unflushed_spaces);
5555 			node->flush_counter = node->modification_counter;
5556 
5557 		} else if (!node->space->is_in_unflushed_spaces) {
5558 
5559 			node->space->is_in_unflushed_spaces = true;
5560 			UT_LIST_ADD_FIRST(unflushed_spaces,
5561 					  system->unflushed_spaces,
5562 					  node->space);
5563 		}
5564 	}
5565 
5566 	if (node->n_pending == 0 && fil_space_belongs_in_lru(node->space)) {
5567 
5568 		/* The node must be put back to the LRU list */
5569 		UT_LIST_ADD_FIRST(LRU, system->LRU, node);
5570 	}
5571 }
5572 
5573 /********************************************************************//**
5574 Report information about an invalid page access. */
5575 static
5576 void
fil_report_invalid_page_access(ulint block_offset,ulint space_id,const char * space_name,ulint byte_offset,ulint len,ulint type)5577 fil_report_invalid_page_access(
5578 /*===========================*/
5579 	ulint		block_offset,	/*!< in: block offset */
5580 	ulint		space_id,	/*!< in: space id */
5581 	const char*	space_name,	/*!< in: space name */
5582 	ulint		byte_offset,	/*!< in: byte offset */
5583 	ulint		len,		/*!< in: I/O length */
5584 	ulint		type)		/*!< in: I/O type */
5585 {
5586 	fprintf(stderr,
5587 		"InnoDB: Error: trying to access page number %lu"
5588 		" in space %lu,\n"
5589 		"InnoDB: space name %s,\n"
5590 		"InnoDB: which is outside the tablespace bounds.\n"
5591 		"InnoDB: Byte offset %lu, len %lu, i/o type %lu.\n"
5592 		"InnoDB: If you get this error at mysqld startup,"
5593 		" please check that\n"
5594 		"InnoDB: your my.cnf matches the ibdata files"
5595 		" that you have in the\n"
5596 		"InnoDB: MySQL server.\n",
5597 		(ulong) block_offset, (ulong) space_id, space_name,
5598 		(ulong) byte_offset, (ulong) len, (ulong) type);
5599 }
5600 
5601 /********************************************************************//**
5602 Reads or writes data. This operation is asynchronous (aio).
5603 @return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
5604 i/o on a tablespace which does not exist */
5605 UNIV_INTERN
5606 dberr_t
_fil_io(ulint type,bool sync,ulint space_id,ulint zip_size,ulint block_offset,ulint byte_offset,ulint len,void * buf,void * message,trx_t * trx,bool should_buffer)5607 _fil_io(
5608 /*===*/
5609 	ulint	type,		/*!< in: OS_FILE_READ or OS_FILE_WRITE,
5610 				ORed to OS_FILE_LOG, if a log i/o
5611 				and ORed to OS_AIO_SIMULATED_WAKE_LATER
5612 				if simulated aio and we want to post a
5613 				batch of i/os; NOTE that a simulated batch
5614 				may introduce hidden chances of deadlocks,
5615 				because i/os are not actually handled until
5616 				all have been posted: use with great
5617 				caution! */
5618 	bool	sync,		/*!< in: true if synchronous aio is desired */
5619 	ulint	space_id,	/*!< in: space id */
5620 	ulint	zip_size,	/*!< in: compressed page size in bytes;
5621 				0 for uncompressed pages */
5622 	ulint	block_offset,	/*!< in: offset in number of blocks */
5623 	ulint	byte_offset,	/*!< in: remainder of offset in bytes; in
5624 				aio this must be divisible by the OS block
5625 				size */
5626 	ulint	len,		/*!< in: how many bytes to read or write; this
5627 				must not cross a file boundary; in aio this
5628 				must be a block size multiple */
5629 	void*	buf,		/*!< in/out: buffer where to store read data
5630 				or from where to write; in aio this must be
5631 				appropriately aligned */
5632 	void*	message,	/*!< in: message for aio handler if non-sync
5633 				aio used, else ignored */
5634 	trx_t*	trx,
5635 	bool	should_buffer)	/*!< in: whether to buffer an aio request.
5636 				AIO read ahead uses this. If you plan to
5637 				use this parameter, make sure you remember
5638 				to call os_aio_dispatch_read_array_submit()
5639 				when you're ready to commit all your requests.*/
5640 {
5641 	ulint		mode;
5642 	fil_space_t*	space;
5643 	fil_node_t*	node;
5644 	ibool		ret;
5645 	ulint		is_log;
5646 	ulint		wake_later;
5647 	os_offset_t	offset;
5648 	ibool		ignore_nonexistent_pages;
5649 
5650 	is_log = type & OS_FILE_LOG;
5651 	type = type & ~OS_FILE_LOG;
5652 
5653 	wake_later = type & OS_AIO_SIMULATED_WAKE_LATER;
5654 	type = type & ~OS_AIO_SIMULATED_WAKE_LATER;
5655 
5656 	ignore_nonexistent_pages = type & BUF_READ_IGNORE_NONEXISTENT_PAGES;
5657 	type &= ~BUF_READ_IGNORE_NONEXISTENT_PAGES;
5658 
5659 	ut_ad(byte_offset < UNIV_PAGE_SIZE);
5660 	ut_ad(!zip_size || !byte_offset);
5661 	ut_ad(ut_is_2pow(zip_size));
5662 	ut_ad(buf);
5663 	ut_ad(len > 0);
5664 	ut_ad(UNIV_PAGE_SIZE == (ulong)(1 << UNIV_PAGE_SIZE_SHIFT));
5665 #if (1 << UNIV_PAGE_SIZE_SHIFT_MAX) != UNIV_PAGE_SIZE_MAX
5666 # error "(1 << UNIV_PAGE_SIZE_SHIFT_MAX) != UNIV_PAGE_SIZE_MAX"
5667 #endif
5668 #if (1 << UNIV_PAGE_SIZE_SHIFT_MIN) != UNIV_PAGE_SIZE_MIN
5669 # error "(1 << UNIV_PAGE_SIZE_SHIFT_MIN) != UNIV_PAGE_SIZE_MIN"
5670 #endif
5671 	ut_ad(fil_validate_skip());
5672 #ifndef UNIV_HOTBACKUP
5673 # ifndef UNIV_LOG_DEBUG
5674 	/* ibuf bitmap pages must be read in the sync aio mode: */
5675 	ut_ad(recv_no_ibuf_operations
5676 	      || type == OS_FILE_WRITE
5677 	      || !ibuf_bitmap_page(zip_size, block_offset)
5678 	      || sync
5679 	      || is_log);
5680 # endif /* UNIV_LOG_DEBUG */
5681 	if (sync) {
5682 		mode = OS_AIO_SYNC;
5683 	} else if (is_log) {
5684 		mode = OS_AIO_LOG;
5685 	} else if (type == OS_FILE_READ
5686 		   && !recv_no_ibuf_operations
5687 		   && ibuf_page(space_id, zip_size, block_offset, NULL)) {
5688 		mode = OS_AIO_IBUF;
5689 	} else {
5690 		mode = OS_AIO_NORMAL;
5691 	}
5692 #else /* !UNIV_HOTBACKUP */
5693 	ut_a(sync);
5694 	mode = OS_AIO_SYNC;
5695 #endif /* !UNIV_HOTBACKUP */
5696 
5697 	if (type == OS_FILE_READ) {
5698 		srv_stats.data_read.add(len);
5699 	} else if (type == OS_FILE_WRITE) {
5700 		ut_ad(!srv_read_only_mode);
5701 		srv_stats.data_written.add(len);
5702 	}
5703 
5704 	/* Reserve the fil_system mutex and make sure that we can open at
5705 	least one file while holding it, if the file is not already open */
5706 
5707 	fil_mutex_enter_and_prepare_for_io(space_id);
5708 
5709 	space = fil_space_get_by_id(space_id);
5710 
5711 	/* If we are deleting a tablespace we don't allow async read operations
5712 	on that. However, we do allow write and sync read operations */
5713 	if (space == 0
5714 	    || (type == OS_FILE_READ && !sync && space->stop_new_ops)) {
5715 		mutex_exit(&fil_system->mutex);
5716 
5717 		ib_logf(IB_LOG_LEVEL_ERROR,
5718 			"Trying to do i/o to a tablespace which does "
5719 			"not exist. i/o type %lu, space id %lu, "
5720 			"page no. %lu, i/o length %lu bytes",
5721 			(ulong) type, (ulong) space_id, (ulong) block_offset,
5722 			(ulong) len);
5723 
5724 		return(DB_TABLESPACE_DELETED);
5725 	}
5726 
5727 	ut_ad(mode != OS_AIO_IBUF || space->purpose == FIL_TABLESPACE);
5728 
5729 	node = UT_LIST_GET_FIRST(space->chain);
5730 
5731 	for (;;) {
5732 		if (node == NULL) {
5733 			if (ignore_nonexistent_pages) {
5734 				mutex_exit(&fil_system->mutex);
5735 				return(DB_ERROR);
5736 			}
5737 
5738 			fil_report_invalid_page_access(
5739 				block_offset, space_id, space->name,
5740 				byte_offset, len, type);
5741 
5742 			ut_error;
5743 
5744 		} else if (fil_is_user_tablespace_id(space->id)
5745 			   && node->size == 0) {
5746 
5747 			/* We do not know the size of a single-table tablespace
5748 			before we open the file */
5749 			break;
5750 		} else if (node->size > block_offset) {
5751 			/* Found! */
5752 			break;
5753 		} else {
5754 			block_offset -= node->size;
5755 			node = UT_LIST_GET_NEXT(chain, node);
5756 		}
5757 	}
5758 
5759 	/* Open file if closed */
5760 	if (!fil_node_prepare_for_io(node, fil_system, space)) {
5761 		if (space->purpose == FIL_TABLESPACE
5762 		    && fil_is_user_tablespace_id(space->id)) {
5763 			mutex_exit(&fil_system->mutex);
5764 
5765 			ib_logf(IB_LOG_LEVEL_ERROR,
5766 				"Trying to do i/o to a tablespace which "
5767 				"exists without .ibd data file. "
5768 				"i/o type %lu, space id %lu, page no %lu, "
5769 				"i/o length %lu bytes",
5770 				(ulong) type, (ulong) space_id,
5771 				(ulong) block_offset, (ulong) len);
5772 
5773 			return(DB_TABLESPACE_DELETED);
5774 		}
5775 
5776 		/* The tablespace is for log. Currently, we just assert here
5777 		to prevent handling errors along the way fil_io returns.
5778 		Also, if the log files are missing, it would be hard to
5779 		promise the server can continue running. */
5780 		ut_a(0);
5781 	}
5782 
5783 	/* Check that at least the start offset is within the bounds of a
5784 	single-table tablespace, including rollback tablespaces. */
5785 	if (UNIV_UNLIKELY(node->size <= block_offset)
5786 	    && space->id != 0 && space->purpose == FIL_TABLESPACE) {
5787 
5788 		fil_report_invalid_page_access(
5789 			block_offset, space_id, space->name, byte_offset,
5790 			len, type);
5791 
5792 		ut_error;
5793 	}
5794 
5795 	/* Now we have made the changes in the data structures of fil_system */
5796 	mutex_exit(&fil_system->mutex);
5797 
5798 	/* Calculate the low 32 bits and the high 32 bits of the file offset */
5799 
5800 	if (!zip_size) {
5801 		offset = ((os_offset_t) block_offset << UNIV_PAGE_SIZE_SHIFT)
5802 			+ byte_offset;
5803 
5804 		ut_a(node->size - block_offset
5805 		     >= ((byte_offset + len + (UNIV_PAGE_SIZE - 1))
5806 			 / UNIV_PAGE_SIZE));
5807 	} else {
5808 		ulint	zip_size_shift;
5809 		switch (zip_size) {
5810 		case 1024: zip_size_shift = 10; break;
5811 		case 2048: zip_size_shift = 11; break;
5812 		case 4096: zip_size_shift = 12; break;
5813 		case 8192: zip_size_shift = 13; break;
5814 		case 16384: zip_size_shift = 14; break;
5815 		default: ut_error;
5816 		}
5817 		offset = ((os_offset_t) block_offset << zip_size_shift)
5818 			+ byte_offset;
5819 		ut_a(node->size - block_offset
5820 		     >= (len + (zip_size - 1)) / zip_size);
5821 	}
5822 
5823 	/* Do aio */
5824 
5825 	ut_a(byte_offset % OS_MIN_LOG_BLOCK_SIZE == 0);
5826 	ut_a((len % OS_MIN_LOG_BLOCK_SIZE) == 0);
5827 
5828 #ifndef UNIV_HOTBACKUP
5829 	if (UNIV_UNLIKELY(space->is_corrupt && srv_pass_corrupt_table)) {
5830 
5831 		/* should ignore i/o for the crashed space */
5832 		if (srv_pass_corrupt_table == 1 ||
5833 		    type == OS_FILE_WRITE) {
5834 
5835 			mutex_enter(&fil_system->mutex);
5836 			fil_node_complete_io(node, fil_system, type);
5837 			mutex_exit(&fil_system->mutex);
5838 			if (mode == OS_AIO_NORMAL) {
5839 				ut_a(space->purpose == FIL_TABLESPACE);
5840 				buf_page_io_complete(static_cast<buf_page_t *>
5841 						     (message));
5842 			}
5843 		}
5844 
5845 		if (srv_pass_corrupt_table == 1 && type == OS_FILE_READ) {
5846 
5847 			return(DB_TABLESPACE_DELETED);
5848 
5849 		} else if (type == OS_FILE_WRITE) {
5850 
5851 			return(DB_SUCCESS);
5852 		}
5853 	}
5854 
5855 	/* Queue the aio request */
5856 	ret = os_aio(type, mode | wake_later, node->name, node->handle, buf,
5857 		offset, len, node, message, space_id, trx, should_buffer);
5858 
5859 #else
5860 	/* In mysqlbackup do normal i/o, not aio */
5861 	if (type == OS_FILE_READ) {
5862 		ret = os_file_read(node->handle, buf, offset, len);
5863 	} else {
5864 		ut_ad(!srv_read_only_mode);
5865 		ret = os_file_write(node->name, node->handle, buf,
5866 				    offset, len);
5867 	}
5868 #endif /* !UNIV_HOTBACKUP */
5869 	ut_a(ret);
5870 
5871 	if (mode == OS_AIO_SYNC) {
5872 		/* The i/o operation is already completed when we return from
5873 		os_aio: */
5874 
5875 		mutex_enter(&fil_system->mutex);
5876 
5877 		fil_node_complete_io(node, fil_system, type);
5878 
5879 		mutex_exit(&fil_system->mutex);
5880 
5881 		ut_ad(fil_validate_skip());
5882 	}
5883 
5884 	return(DB_SUCCESS);
5885 }
5886 
5887 #ifndef UNIV_HOTBACKUP
5888 /**********************************************************************//**
5889 Waits for an aio operation to complete. This function is used to write the
5890 handler for completed requests. The aio array of pending requests is divided
5891 into segments (see os0file.cc for more info). The thread specifies which
5892 segment it wants to wait for. */
5893 UNIV_INTERN
5894 void
fil_aio_wait(ulint segment)5895 fil_aio_wait(
5896 /*=========*/
5897 	ulint	segment)	/*!< in: the number of the segment in the aio
5898 				array to wait for */
5899 {
5900 	ibool		ret;
5901 	fil_node_t*	fil_node;
5902 	void*		message;
5903 	ulint		type;
5904 	ulint		space_id = 0;
5905 
5906 	ut_ad(fil_validate_skip());
5907 
5908 	if (srv_use_native_aio) {
5909 		srv_set_io_thread_op_info(segment, "native aio handle");
5910 #ifdef WIN_ASYNC_IO
5911 		ret = os_aio_windows_handle(
5912 			segment, 0, &fil_node, &message, &type, &space_id);
5913 #elif defined(LINUX_NATIVE_AIO)
5914 		ret = os_aio_linux_handle(
5915 			segment, &fil_node, &message, &type, &space_id);
5916 #else
5917 		ut_error;
5918 		ret = 0; /* Eliminate compiler warning */
5919 #endif /* WIN_ASYNC_IO */
5920 	} else {
5921 		srv_set_io_thread_op_info(segment, "simulated aio handle");
5922 
5923 		ret = os_aio_simulated_handle(
5924 			segment, &fil_node, &message, &type, &space_id);
5925 	}
5926 
5927 	ut_a(ret);
5928 	if (fil_node == NULL) {
5929 		ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
5930 		return;
5931 	}
5932 
5933 	srv_set_io_thread_op_info(segment, "complete io for fil node");
5934 
5935 	mutex_enter(&fil_system->mutex);
5936 
5937 	fil_node_complete_io(fil_node, fil_system, type);
5938 
5939 	mutex_exit(&fil_system->mutex);
5940 
5941 	ut_ad(fil_validate_skip());
5942 
5943 	/* Do the i/o handling */
5944 	/* IMPORTANT: since i/o handling for reads will read also the insert
5945 	buffer in tablespace 0, you have to be very careful not to introduce
5946 	deadlocks in the i/o system. We keep tablespace 0 data files always
5947 	open, and use a special i/o thread to serve insert buffer requests. */
5948 
5949 	if (fil_node->space->purpose == FIL_TABLESPACE) {
5950 		srv_set_io_thread_op_info(segment, "complete io for buf page");
5951 		buf_page_io_complete(static_cast<buf_page_t*>(message));
5952 	} else {
5953 		srv_set_io_thread_op_info(segment, "complete io for log");
5954 		log_io_complete(static_cast<log_group_t*>(message));
5955 	}
5956 }
5957 #endif /* UNIV_HOTBACKUP */
5958 
5959 /**********************************************************************//**
5960 Flushes to disk possible writes cached by the OS. If the space does not exist
5961 or is being dropped, does not do anything. */
5962 UNIV_INTERN
5963 void
fil_flush(ulint space_id)5964 fil_flush(
5965 /*======*/
5966 	ulint	space_id)	/*!< in: file space id (this can be a group of
5967 				log files or a tablespace of the database) */
5968 {
5969 	fil_space_t*	space;
5970 	fil_node_t*	node;
5971 	pfs_os_file_t	file;
5972 
5973 
5974 	mutex_enter(&fil_system->mutex);
5975 
5976 	space = fil_space_get_by_id(space_id);
5977 
5978 	if (!space || space->stop_new_ops) {
5979 		mutex_exit(&fil_system->mutex);
5980 
5981 		return;
5982 	}
5983 
5984 	if (fil_buffering_disabled(space)) {
5985 
5986 		/* No need to flush. User has explicitly disabled
5987 		buffering. */
5988 		ut_ad(!space->is_in_unflushed_spaces);
5989 		ut_ad(fil_space_is_flushed(space));
5990 		ut_ad(space->n_pending_flushes == 0);
5991 
5992 #ifdef UNIV_DEBUG
5993 		for (node = UT_LIST_GET_FIRST(space->chain);
5994 		     node != NULL;
5995 		     node = UT_LIST_GET_NEXT(chain, node)) {
5996 			ut_ad(node->modification_counter
5997 			      == node->flush_counter);
5998 			ut_ad(node->n_pending_flushes == 0);
5999 		}
6000 #endif /* UNIV_DEBUG */
6001 
6002 		mutex_exit(&fil_system->mutex);
6003 		return;
6004 	}
6005 
6006 	space->n_pending_flushes++;	/*!< prevent dropping of the space while
6007 					we are flushing */
6008 	for (node = UT_LIST_GET_FIRST(space->chain);
6009 	     node != NULL;
6010 	     node = UT_LIST_GET_NEXT(chain, node)) {
6011 
6012 		ib_int64_t old_mod_counter = node->modification_counter;;
6013 
6014 		if (old_mod_counter <= node->flush_counter) {
6015 			continue;
6016 		}
6017 
6018 		ut_a(node->open);
6019 
6020 		if (space->purpose == FIL_TABLESPACE) {
6021 			fil_n_pending_tablespace_flushes++;
6022 		} else {
6023 			fil_n_pending_log_flushes++;
6024 			fil_n_log_flushes++;
6025 		}
6026 #ifdef __WIN__
6027 		if (node->is_raw_disk) {
6028 
6029 			goto skip_flush;
6030 		}
6031 #endif /* __WIN__ */
6032 retry:
6033 		if (node->n_pending_flushes > 0) {
6034 			/* We want to avoid calling os_file_flush() on
6035 			the file twice at the same time, because we do
6036 			not know what bugs OS's may contain in file
6037 			i/o */
6038 
6039 			ib_int64_t sig_count =
6040 				os_event_reset(node->sync_event);
6041 
6042 			mutex_exit(&fil_system->mutex);
6043 
6044 			os_event_wait_low(node->sync_event, sig_count);
6045 
6046 			mutex_enter(&fil_system->mutex);
6047 
6048 			if (node->flush_counter >= old_mod_counter) {
6049 
6050 				goto skip_flush;
6051 			}
6052 
6053 			goto retry;
6054 		}
6055 
6056 		ut_a(node->open);
6057 		file = node->handle;
6058 		node->n_pending_flushes++;
6059 
6060 		mutex_exit(&fil_system->mutex);
6061 
6062 		os_file_flush(file);
6063 
6064 		mutex_enter(&fil_system->mutex);
6065 
6066 		os_event_set(node->sync_event);
6067 
6068 		node->n_pending_flushes--;
6069 skip_flush:
6070 		if (node->flush_counter < old_mod_counter) {
6071 			node->flush_counter = old_mod_counter;
6072 
6073 			if (space->is_in_unflushed_spaces
6074 			    && fil_space_is_flushed(space)) {
6075 
6076 				space->is_in_unflushed_spaces = false;
6077 
6078 				UT_LIST_REMOVE(
6079 					unflushed_spaces,
6080 					fil_system->unflushed_spaces,
6081 					space);
6082 			}
6083 		}
6084 
6085 		if (space->purpose == FIL_TABLESPACE) {
6086 			fil_n_pending_tablespace_flushes--;
6087 		} else {
6088 			fil_n_pending_log_flushes--;
6089 		}
6090 	}
6091 
6092 	space->n_pending_flushes--;
6093 
6094 	mutex_exit(&fil_system->mutex);
6095 }
6096 
6097 /**********************************************************************//**
6098 Flushes to disk the writes in file spaces of the given type possibly cached by
6099 the OS. */
6100 UNIV_INTERN
6101 void
fil_flush_file_spaces(ulint purpose)6102 fil_flush_file_spaces(
6103 /*==================*/
6104 	ulint	purpose)	/*!< in: FIL_TABLESPACE, FIL_LOG */
6105 {
6106 	fil_space_t*	space;
6107 	ulint*		space_ids;
6108 	ulint		n_space_ids;
6109 	ulint		i;
6110 
6111 	mutex_enter(&fil_system->mutex);
6112 
6113 	n_space_ids = UT_LIST_GET_LEN(fil_system->unflushed_spaces);
6114 	if (n_space_ids == 0) {
6115 
6116 		mutex_exit(&fil_system->mutex);
6117 		return;
6118 	}
6119 
6120 	/* Assemble a list of space ids to flush.  Previously, we
6121 	traversed fil_system->unflushed_spaces and called UT_LIST_GET_NEXT()
6122 	on a space that was just removed from the list by fil_flush().
6123 	Thus, the space could be dropped and the memory overwritten. */
6124 	space_ids = static_cast<ulint*>(
6125 		mem_alloc(n_space_ids * sizeof *space_ids));
6126 
6127 	n_space_ids = 0;
6128 
6129 	for (space = UT_LIST_GET_FIRST(fil_system->unflushed_spaces);
6130 	     space;
6131 	     space = UT_LIST_GET_NEXT(unflushed_spaces, space)) {
6132 
6133 		if (space->purpose == purpose && !space->stop_new_ops) {
6134 
6135 			space_ids[n_space_ids++] = space->id;
6136 		}
6137 	}
6138 
6139 	mutex_exit(&fil_system->mutex);
6140 
6141 	/* Flush the spaces.  It will not hurt to call fil_flush() on
6142 	a non-existing space id. */
6143 	for (i = 0; i < n_space_ids; i++) {
6144 
6145 		fil_flush(space_ids[i]);
6146 	}
6147 
6148 	mem_free(space_ids);
6149 }
6150 
/** Functor to validate the space list. It is passed to UT_LIST_VALIDATE()
by fil_validate() for the node chain of each tablespace. */
struct	Check {
	/** Asserts that a file node with pending operations is open.
	@param elem	file node being checked */
	void	operator()(const fil_node_t* elem)
	{
		ut_a(elem->open || !elem->n_pending);
	}
};
6158 
6159 /******************************************************************//**
6160 Checks the consistency of the tablespace cache.
6161 @return	TRUE if ok */
6162 UNIV_INTERN
6163 ibool
fil_validate(void)6164 fil_validate(void)
6165 /*==============*/
6166 {
6167 	fil_space_t*	space;
6168 	fil_node_t*	fil_node;
6169 	ulint		n_open		= 0;
6170 	ulint		i;
6171 
6172 	mutex_enter(&fil_system->mutex);
6173 
6174 	/* Look for spaces in the hash table */
6175 
6176 	for (i = 0; i < hash_get_n_cells(fil_system->spaces); i++) {
6177 
6178 		for (space = static_cast<fil_space_t*>(
6179 				HASH_GET_FIRST(fil_system->spaces, i));
6180 		     space != 0;
6181 		     space = static_cast<fil_space_t*>(
6182 			     	HASH_GET_NEXT(hash, space))) {
6183 
6184 			UT_LIST_VALIDATE(
6185 				chain, fil_node_t, space->chain, Check());
6186 
6187 			for (fil_node = UT_LIST_GET_FIRST(space->chain);
6188 			     fil_node != 0;
6189 			     fil_node = UT_LIST_GET_NEXT(chain, fil_node)) {
6190 
6191 				if (fil_node->n_pending > 0) {
6192 					ut_a(fil_node->open);
6193 				}
6194 
6195 				if (fil_node->open) {
6196 					n_open++;
6197 				}
6198 			}
6199 		}
6200 	}
6201 
6202 	ut_a(fil_system->n_open == n_open);
6203 
6204 	UT_LIST_CHECK(LRU, fil_node_t, fil_system->LRU);
6205 
6206 	for (fil_node = UT_LIST_GET_FIRST(fil_system->LRU);
6207 	     fil_node != 0;
6208 	     fil_node = UT_LIST_GET_NEXT(LRU, fil_node)) {
6209 
6210 		ut_a(fil_node->n_pending == 0);
6211 		ut_a(!fil_node->being_extended);
6212 		ut_a(fil_node->open);
6213 		ut_a(fil_space_belongs_in_lru(fil_node->space));
6214 	}
6215 
6216 	mutex_exit(&fil_system->mutex);
6217 
6218 	return(TRUE);
6219 }
6220 
6221 /********************************************************************//**
6222 Returns TRUE if file address is undefined.
6223 @return	TRUE if undefined */
6224 UNIV_INTERN
6225 ibool
fil_addr_is_null(fil_addr_t addr)6226 fil_addr_is_null(
6227 /*=============*/
6228 	fil_addr_t	addr)	/*!< in: address */
6229 {
6230 	return(addr.page == FIL_NULL);
6231 }
6232 
6233 /********************************************************************//**
6234 Get the predecessor of a file page.
6235 @return	FIL_PAGE_PREV */
6236 UNIV_INTERN
6237 ulint
fil_page_get_prev(const byte * page)6238 fil_page_get_prev(
6239 /*==============*/
6240 	const byte*	page)	/*!< in: file page */
6241 {
6242 	return(mach_read_from_4(page + FIL_PAGE_PREV));
6243 }
6244 
6245 /********************************************************************//**
6246 Get the successor of a file page.
6247 @return	FIL_PAGE_NEXT */
6248 UNIV_INTERN
6249 ulint
fil_page_get_next(const byte * page)6250 fil_page_get_next(
6251 /*==============*/
6252 	const byte*	page)	/*!< in: file page */
6253 {
6254 	return(mach_read_from_4(page + FIL_PAGE_NEXT));
6255 }
6256 
6257 /*********************************************************************//**
6258 Sets the file page type. */
6259 UNIV_INTERN
6260 void
fil_page_set_type(byte * page,ulint type)6261 fil_page_set_type(
6262 /*==============*/
6263 	byte*	page,	/*!< in/out: file page */
6264 	ulint	type)	/*!< in: type */
6265 {
6266 	ut_ad(page);
6267 
6268 	mach_write_to_2(page + FIL_PAGE_TYPE, type);
6269 }
6270 
6271 /*********************************************************************//**
6272 Gets the file page type.
6273 @return type; NOTE that if the type has not been written to page, the
6274 return value not defined */
6275 UNIV_INTERN
6276 ulint
fil_page_get_type(const byte * page)6277 fil_page_get_type(
6278 /*==============*/
6279 	const byte*	page)	/*!< in: file page */
6280 {
6281 	ut_ad(page);
6282 
6283 	return(mach_read_from_2(page + FIL_PAGE_TYPE));
6284 }
6285 
6286 /****************************************************************//**
6287 Closes the tablespace memory cache. */
6288 UNIV_INTERN
6289 void
fil_close(void)6290 fil_close(void)
6291 /*===========*/
6292 {
6293 	mutex_free(&fil_system->mutex);
6294 
6295 	hash_table_free(fil_system->spaces);
6296 
6297 	hash_table_free(fil_system->name_hash);
6298 
6299 	ut_a(UT_LIST_GET_LEN(fil_system->LRU) == 0);
6300 	ut_a(UT_LIST_GET_LEN(fil_system->unflushed_spaces) == 0);
6301 	ut_a(UT_LIST_GET_LEN(fil_system->space_list) == 0);
6302 
6303 	mem_free(fil_system);
6304 
6305 	fil_system = NULL;
6306 }
6307 
6308 /********************************************************************//**
6309 Initializes a buffer control block when the buf_pool is created. */
6310 static
6311 void
fil_buf_block_init(buf_block_t * block,byte * frame)6312 fil_buf_block_init(
6313 /*===============*/
6314 	buf_block_t*	block,		/*!< in: pointer to control block */
6315 	byte*		frame)		/*!< in: pointer to buffer frame */
6316 {
6317 	UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE);
6318 
6319 	block->frame = frame;
6320 
6321 	block->page.io_fix = BUF_IO_NONE;
6322 	/* There are assertions that check for this. */
6323 	block->page.buf_fix_count = 1;
6324 	block->page.state = BUF_BLOCK_READY_FOR_USE;
6325 
6326 	page_zip_des_init(&block->page.zip);
6327 }
6328 
/** Describes one pass over a tablespace file for fil_iterate(). It is
filled in by fil_tablespace_iterate(). */
struct fil_iterator_t {
	pfs_os_file_t	file;			/*!< File handle used for the
						reads and writes */
	const char*	filepath;		/*!< File path name, passed to
						os_file_write() */
	os_offset_t	start;			/*!< Byte offset from where to
						start */
	os_offset_t	end;			/*!< Byte offset where to stop
						(exclusive) */
	os_offset_t	file_size;		/*!< File size in bytes */
	ulint		page_size;		/*!< Page size in bytes */
	ulint		n_io_buffers;		/*!< Number of pages to use
						for IO, i.e. pages read and
						written per batch */
	byte*		io_buffer;		/*!< Buffer to use for IO */
};
6340 
6341 /********************************************************************//**
6342 TODO: This can be made parallel trivially by chunking up the file and creating
6343 a callback per thread. . Main benefit will be to use multiple CPUs for
6344 checksums and compressed tables. We have to do compressed tables block by
6345 block right now. Secondly we need to decompress/compress and copy too much
6346 of data. These are CPU intensive.
6347 
6348 Iterate over all the pages in the tablespace.
6349 @param iter - Tablespace iterator
6350 @param block - block to use for IO
6351 @param callback - Callback to inspect and update page contents
6352 @retval DB_SUCCESS or error code */
6353 static
6354 dberr_t
fil_iterate(const fil_iterator_t & iter,buf_block_t * block,PageCallback & callback)6355 fil_iterate(
6356 /*========*/
6357 	const fil_iterator_t&	iter,
6358 	buf_block_t*		block,
6359 	PageCallback&		callback)
6360 {
6361 	os_offset_t		offset;
6362 	ulint			page_no = 0;
6363 	ulint			space_id = callback.get_space_id();
6364 	ulint			n_bytes = iter.n_io_buffers * iter.page_size;
6365 
6366 	ut_ad(!srv_read_only_mode);
6367 
6368 	/* TODO: For compressed tables we do a lot of useless
6369 	copying for non-index pages. Unfortunately, it is
6370 	required by buf_zip_decompress() */
6371 
6372 	for (offset = iter.start; offset < iter.end; offset += n_bytes) {
6373 
6374 		byte*		io_buffer = iter.io_buffer;
6375 
6376 		block->frame = io_buffer;
6377 
6378 		if (callback.get_zip_size() > 0) {
6379 			page_zip_des_init(&block->page.zip);
6380 			page_zip_set_size(&block->page.zip, iter.page_size);
6381 			block->page.zip.data = block->frame + UNIV_PAGE_SIZE;
6382 			ut_d(block->page.zip.m_external = true);
6383 			ut_ad(iter.page_size == callback.get_zip_size());
6384 
6385 			/* Zip IO is done in the compressed page buffer. */
6386 			io_buffer = block->page.zip.data;
6387 		} else {
6388 			io_buffer = iter.io_buffer;
6389 		}
6390 
6391 		/* We have to read the exact number of bytes. Otherwise the
6392 		InnoDB IO functions croak on failed reads. */
6393 
6394 		n_bytes = static_cast<ulint>(
6395 			ut_min(static_cast<os_offset_t>(n_bytes),
6396 			       iter.end - offset));
6397 
6398 		ut_ad(n_bytes > 0);
6399 		ut_ad(!(n_bytes % iter.page_size));
6400 
6401 		if (!os_file_read(iter.file, io_buffer, offset,
6402 				  (ulint) n_bytes)) {
6403 
6404 			ib_logf(IB_LOG_LEVEL_ERROR, "os_file_read() failed");
6405 
6406 			return(DB_IO_ERROR);
6407 		}
6408 
6409 		bool		updated = false;
6410 		os_offset_t	page_off = offset;
6411 		ulint		n_pages_read = (ulint) n_bytes / iter.page_size;
6412 
6413 		for (ulint i = 0; i < n_pages_read; ++i) {
6414 
6415 			buf_block_set_file_page(block, space_id, page_no++);
6416 
6417 			dberr_t	err;
6418 
6419 			if ((err = callback(page_off, block)) != DB_SUCCESS) {
6420 
6421 				return(err);
6422 
6423 			} else if (!updated) {
6424 				updated = buf_block_get_state(block)
6425 					== BUF_BLOCK_FILE_PAGE;
6426 			}
6427 
6428 			buf_block_set_state(block, BUF_BLOCK_NOT_USED);
6429 			buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE);
6430 
6431 			page_off += iter.page_size;
6432 			block->frame += iter.page_size;
6433 		}
6434 
6435 		/* A page was updated in the set, write back to disk. */
6436 		if (updated
6437 		    && !os_file_write(
6438 				iter.filepath, iter.file, io_buffer,
6439 				offset, (ulint) n_bytes)) {
6440 
6441 			ib_logf(IB_LOG_LEVEL_ERROR, "os_file_write() failed");
6442 
6443 			return(DB_IO_ERROR);
6444 		}
6445 	}
6446 
6447 	return(DB_SUCCESS);
6448 }
6449 
6450 /********************************************************************//**
6451 Iterate over all the pages in the tablespace.
6452 @param table - the table definiton in the server
6453 @param n_io_buffers - number of blocks to read and write together
6454 @param callback - functor that will do the page updates
6455 @return	DB_SUCCESS or error code */
6456 UNIV_INTERN
6457 dberr_t
fil_tablespace_iterate(dict_table_t * table,ulint n_io_buffers,PageCallback & callback)6458 fil_tablespace_iterate(
6459 /*===================*/
6460 	dict_table_t*	table,
6461 	ulint		n_io_buffers,
6462 	PageCallback&	callback)
6463 {
6464 	dberr_t		err;
6465 	pfs_os_file_t	file;
6466 	char*		filepath;
6467 
6468 	ut_a(n_io_buffers > 0);
6469 	ut_ad(!srv_read_only_mode);
6470 
6471 	DBUG_EXECUTE_IF("ib_import_trigger_corruption_1",
6472 			return(DB_CORRUPTION););
6473 
6474 	if (DICT_TF_HAS_DATA_DIR(table->flags)) {
6475 		dict_get_and_save_data_dir_path(table, false);
6476 		ut_a(table->data_dir_path);
6477 
6478 		filepath = os_file_make_remote_pathname(
6479 			table->data_dir_path, table->name, "ibd");
6480 	} else {
6481 		filepath = fil_make_ibd_name(table->name, false);
6482 	}
6483 
6484 	{
6485 		ibool	success;
6486 
6487 		file = os_file_create_simple_no_error_handling(
6488 			innodb_file_data_key, filepath,
6489 			OS_FILE_OPEN, OS_FILE_READ_WRITE, &success);
6490 
6491 		DBUG_EXECUTE_IF("fil_tablespace_iterate_failure",
6492 		{
6493 			static bool once;
6494 
6495 			if (!once || ut_rnd_interval(0, 10) == 5) {
6496 				once = true;
6497 				success = FALSE;
6498 				os_file_close(file);
6499 			}
6500 		});
6501 
6502 		if (!success) {
6503 			/* The following call prints an error message */
6504 			os_file_get_last_error(true);
6505 
6506 			ib_logf(IB_LOG_LEVEL_ERROR,
6507 				"Trying to import a tablespace, but could not "
6508 				"open the tablespace file %s", filepath);
6509 
6510 			mem_free(filepath);
6511 
6512 			return(DB_TABLESPACE_NOT_FOUND);
6513 
6514 		} else {
6515 			err = DB_SUCCESS;
6516 		}
6517 	}
6518 
6519 	callback.set_file(filepath, file);
6520 
6521 	os_offset_t	file_size = os_file_get_size(file);
6522 	ut_a(file_size != (os_offset_t) -1);
6523 
6524 	/* The block we will use for every physical page */
6525 	buf_block_t	block;
6526 
6527 	memset(&block, 0x0, sizeof(block));
6528 
6529 	/* Allocate a page to read in the tablespace header, so that we
6530 	can determine the page size and zip_size (if it is compressed).
6531 	We allocate an extra page in case it is a compressed table. One
6532 	page is to ensure alignement. */
6533 
6534 	void*	page_ptr = mem_alloc(3 * UNIV_PAGE_SIZE);
6535 	byte*	page = static_cast<byte*>(ut_align(page_ptr, UNIV_PAGE_SIZE));
6536 
6537 	fil_buf_block_init(&block, page);
6538 
6539 	/* Read the first page and determine the page and zip size. */
6540 
6541 	if (!os_file_read(file, page, 0, UNIV_PAGE_SIZE)) {
6542 
6543 		err = DB_IO_ERROR;
6544 
6545 	} else if ((err = callback.init(file_size, &block)) == DB_SUCCESS) {
6546 		fil_iterator_t	iter;
6547 
6548 		iter.file = file;
6549 		iter.start = 0;
6550 		iter.end = file_size;
6551 		iter.filepath = filepath;
6552 		iter.file_size = file_size;
6553 		iter.n_io_buffers = n_io_buffers;
6554 		iter.page_size = callback.get_page_size();
6555 
6556 		/* Compressed pages can't be optimised for block IO for now.
6557 		We do the IMPORT page by page. */
6558 
6559 		if (callback.get_zip_size() > 0) {
6560 			iter.n_io_buffers = 1;
6561 			ut_a(iter.page_size == callback.get_zip_size());
6562 		}
6563 
6564 		/** Add an extra page for compressed page scratch area. */
6565 
6566 		void*	io_buffer = mem_alloc(
6567 			(2 + iter.n_io_buffers) * UNIV_PAGE_SIZE);
6568 
6569 		iter.io_buffer = static_cast<byte*>(
6570 			ut_align(io_buffer, UNIV_PAGE_SIZE));
6571 
6572 		err = fil_iterate(iter, &block, callback);
6573 
6574 		mem_free(io_buffer);
6575 	}
6576 
6577 	if (err == DB_SUCCESS) {
6578 
6579 		ib_logf(IB_LOG_LEVEL_INFO, "Sync to disk");
6580 
6581 		if (!os_file_flush(file)) {
6582 			ib_logf(IB_LOG_LEVEL_INFO, "os_file_flush() failed!");
6583 			err = DB_IO_ERROR;
6584 		} else {
6585 			ib_logf(IB_LOG_LEVEL_INFO, "Sync to disk - done!");
6586 		}
6587 	}
6588 
6589 	os_file_close(file);
6590 
6591 	mem_free(page_ptr);
6592 	mem_free(filepath);
6593 
6594 	return(err);
6595 }
6596 
6597 /**
6598 Set the tablespace compressed table size.
6599 @return DB_SUCCESS if it is valie or DB_CORRUPTION if not */
6600 dberr_t
set_zip_size(const buf_frame_t * page)6601 PageCallback::set_zip_size(const buf_frame_t* page) UNIV_NOTHROW
6602 {
6603 	m_zip_size = fsp_header_get_zip_size(page);
6604 
6605 	if (!ut_is_2pow(m_zip_size) || m_zip_size > UNIV_ZIP_SIZE_MAX) {
6606 		return(DB_CORRUPTION);
6607 	}
6608 
6609 	return(DB_SUCCESS);
6610 }
6611 
6612 /********************************************************************//**
6613 Delete the tablespace file and any related files like .cfg.
6614 This should not be called for temporary tables. */
6615 UNIV_INTERN
6616 void
fil_delete_file(const char * ibd_name)6617 fil_delete_file(
6618 /*============*/
6619 	const char*	ibd_name)	/*!< in: filepath of the ibd
6620 					tablespace */
6621 {
6622 	/* Force a delete of any stale .ibd files that are lying around. */
6623 
6624 	ib_logf(IB_LOG_LEVEL_INFO, "Deleting %s", ibd_name);
6625 
6626 	os_file_delete_if_exists(innodb_file_data_key, ibd_name);
6627 
6628 	char*	cfg_name = fil_make_cfg_name(ibd_name);
6629 
6630 	os_file_delete_if_exists(innodb_file_data_key, cfg_name);
6631 
6632 	mem_free(cfg_name);
6633 }
6634 
6635 /*************************************************************************
6636 Return local hash table informations. */
6637 
6638 ulint
fil_system_hash_cells(void)6639 fil_system_hash_cells(void)
6640 /*=======================*/
6641 {
6642        if (fil_system) {
6643                return (fil_system->spaces->n_cells
6644                        + fil_system->name_hash->n_cells);
6645        } else {
6646                return 0;
6647        }
6648 }
6649 
6650 ulint
fil_system_hash_nodes(void)6651 fil_system_hash_nodes(void)
6652 /*=======================*/
6653 {
6654        if (fil_system) {
6655                return (UT_LIST_GET_LEN(fil_system->space_list)
6656                        * (sizeof(fil_space_t) + MEM_BLOCK_HEADER_SIZE));
6657        } else {
6658                return 0;
6659        }
6660 }
6661 
6662 /**
6663 Iterate over all the spaces in the space list and fetch the
6664 tablespace names. It will return a copy of the name that must be
6665 freed by the caller using: delete[].
6666 @return DB_SUCCESS if all OK. */
6667 UNIV_INTERN
6668 dberr_t
fil_get_space_names(space_name_list_t & space_name_list)6669 fil_get_space_names(
6670 /*================*/
6671 	space_name_list_t&	space_name_list)
6672 				/*!< in/out: List to append to */
6673 {
6674 	fil_space_t*	space;
6675 	dberr_t		err = DB_SUCCESS;
6676 
6677 	mutex_enter(&fil_system->mutex);
6678 
6679 	for (space = UT_LIST_GET_FIRST(fil_system->space_list);
6680 	     space != NULL;
6681 	     space = UT_LIST_GET_NEXT(space_list, space)) {
6682 
6683 		if (space->purpose == FIL_TABLESPACE) {
6684 			ulint	len;
6685 			char*	name;
6686 
6687 			len = strlen(space->name);
6688 			name = new(std::nothrow) char[len + 1];
6689 
6690 			if (name == 0) {
6691 				/* Caller to free elements allocated so far. */
6692 				err = DB_OUT_OF_MEMORY;
6693 				break;
6694 			}
6695 
6696 			memcpy(name, space->name, len);
6697 			name[len] = 0;
6698 
6699 			space_name_list.push_back(name);
6700 		}
6701 	}
6702 
6703 	mutex_exit(&fil_system->mutex);
6704 
6705 	return(err);
6706 }
6707 
6708 /** Generate redo log for swapping two .ibd files
6709 @param[in]	old_table	old table
6710 @param[in]	new_table	new table
6711 @param[in]	tmp_name	temporary table name
6712 @param[in,out]	mtr		mini-transaction
6713 @return innodb error code */
6714 UNIV_INTERN
6715 dberr_t
fil_mtr_rename_log(const dict_table_t * old_table,const dict_table_t * new_table,const char * tmp_name,mtr_t * mtr)6716 fil_mtr_rename_log(
6717 	const dict_table_t*	old_table,
6718 	const dict_table_t*	new_table,
6719 	const char*		tmp_name,
6720 	mtr_t*			mtr)
6721 {
6722 	dberr_t	err = DB_SUCCESS;
6723 	char*	old_path;
6724 
6725 	/* If neither table is file-per-table,
6726 	there will be no renaming of files. */
6727 	if (old_table->space == TRX_SYS_SPACE
6728 	    && new_table->space == TRX_SYS_SPACE) {
6729 		return(DB_SUCCESS);
6730 	}
6731 
6732 	if (DICT_TF_HAS_DATA_DIR(old_table->flags)) {
6733 		old_path = os_file_make_remote_pathname(
6734 			old_table->data_dir_path, old_table->name, "ibd");
6735 	} else {
6736 		old_path = fil_make_ibd_name(old_table->name, false);
6737 	}
6738 	if (old_path == NULL) {
6739 		return(DB_OUT_OF_MEMORY);
6740 	}
6741 
6742 	if (old_table->space != TRX_SYS_SPACE) {
6743 		char*	tmp_path;
6744 
6745 		if (DICT_TF_HAS_DATA_DIR(old_table->flags)) {
6746 			tmp_path = os_file_make_remote_pathname(
6747 				old_table->data_dir_path, tmp_name, "ibd");
6748 		}
6749 		else {
6750 			tmp_path = fil_make_ibd_name(tmp_name, false);
6751 		}
6752 
6753 		if (tmp_path == NULL) {
6754 			mem_free(old_path);
6755 			return(DB_OUT_OF_MEMORY);
6756 		}
6757 
6758 		/* Temp filepath must not exist. */
6759 		err = fil_rename_tablespace_check(
6760 			old_table->space, old_path, tmp_path,
6761 			dict_table_is_discarded(old_table));
6762 		mem_free(tmp_path);
6763 		if (err != DB_SUCCESS) {
6764 			mem_free(old_path);
6765 			return(err);
6766 		}
6767 
6768 		fil_op_write_log(MLOG_FILE_RENAME, old_table->space,
6769 				 0, 0, old_table->name, tmp_name, mtr);
6770 	}
6771 
6772 	if (new_table->space != TRX_SYS_SPACE) {
6773 
6774 		/* Destination filepath must not exist unless this ALTER
6775 		TABLE starts and ends with a file_per-table tablespace. */
6776 		if (old_table->space == TRX_SYS_SPACE) {
6777 			char*	new_path = NULL;
6778 
6779 			if (DICT_TF_HAS_DATA_DIR(new_table->flags)) {
6780 				new_path = os_file_make_remote_pathname(
6781 					new_table->data_dir_path,
6782 					new_table->name, "ibd");
6783 			}
6784 			else {
6785 				new_path = fil_make_ibd_name(
6786 					new_table->name, false);
6787 			}
6788 
6789 			if (new_path == NULL) {
6790 				mem_free(old_path);
6791 				return(DB_OUT_OF_MEMORY);
6792 			}
6793 
6794 			err = fil_rename_tablespace_check(
6795 				new_table->space, new_path, old_path,
6796 				dict_table_is_discarded(new_table));
6797 			mem_free(new_path);
6798 			if (err != DB_SUCCESS) {
6799 				mem_free(old_path);
6800 				return(err);
6801 			}
6802 		}
6803 
6804 		fil_op_write_log(MLOG_FILE_RENAME, new_table->space,
6805 				 0, 0, new_table->name, old_table->name, mtr);
6806 
6807 	}
6808 
6809 	mem_free(old_path);
6810 
6811 	return(err);
6812 }
6813 
6814 /*************************************************************************
6815 functions to access is_corrupt flag of fil_space_t*/
6816 
6817 void
fil_space_set_corrupt(ulint space_id)6818 fil_space_set_corrupt(
6819 /*==================*/
6820 	ulint	space_id)
6821 {
6822 	fil_space_t*	space;
6823 
6824 	mutex_enter(&fil_system->mutex);
6825 
6826 	space = fil_space_get_by_id(space_id);
6827 
6828 	if (space) {
6829 		space->is_corrupt = TRUE;
6830 	}
6831 
6832 	mutex_exit(&fil_system->mutex);
6833 }
6834