1 /* mdbx_chk.c - memory-mapped database check tool */
2
3 /*
4 * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>
5 * and other libmdbx authors: please see AUTHORS file.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted only as authorized by the OpenLDAP
10 * Public License.
11 *
12 * A copy of this license is available in the file LICENSE in the
13 * top-level directory of the distribution or, alternatively, at
14 * <http://www.OpenLDAP.org/license.html>. */
15
16 #ifdef _MSC_VER
17 #if _MSC_VER > 1800
18 #pragma warning(disable : 4464) /* relative include path contains '..' */
19 #endif
20 #pragma warning(disable : 4996) /* The POSIX name is deprecated... */
21 #endif /* _MSC_VER (warnings) */
22
23 #define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */
24 /*
25 * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>
26 * and other libmdbx authors: please see AUTHORS file.
27 * All rights reserved.
28 *
29 * Redistribution and use in source and binary forms, with or without
30 * modification, are permitted only as authorized by the OpenLDAP
31 * Public License.
32 *
33 * A copy of this license is available in the file LICENSE in the
34 * top-level directory of the distribution or, alternatively, at
35 * <http://www.OpenLDAP.org/license.html>. */
36
37 #define MDBX_BUILD_SOURCERY facaa40d3bb34698b2ba800e2fe225773e3941040aef7dc92580b74ad840e798_v0_11_2_0_gd47eed0
38 #ifdef MDBX_CONFIG_H
39 #include MDBX_CONFIG_H
40 #endif
41
42 #define LIBMDBX_INTERNALS
43 #ifdef xMDBX_TOOLS
44 #define MDBX_DEPRECATED
45 #endif /* xMDBX_TOOLS */
46
47 #ifdef xMDBX_ALLOY
48 /* Amalgamated build */
49 #define MDBX_INTERNAL_FUNC static
50 #define MDBX_INTERNAL_VAR static
51 #else
52 /* Non-amalgamated build */
53 #define MDBX_INTERNAL_FUNC
54 #define MDBX_INTERNAL_VAR extern
55 #endif /* xMDBX_ALLOY */
56
57 /** Disables using GNU/Linux libc extensions.
58 * \ingroup build_option
59 * \note This option couldn't be moved to the options.h since dependent
60 * control macros/defines should be prepared before including the options.h */
61 #ifndef MDBX_DISABLE_GNU_SOURCE
62 #define MDBX_DISABLE_GNU_SOURCE 0
63 #endif
64 #if MDBX_DISABLE_GNU_SOURCE
65 #undef _GNU_SOURCE
66 #elif (defined(__linux__) || defined(__gnu_linux__)) && !defined(_GNU_SOURCE)
67 #define _GNU_SOURCE
68 #endif /* MDBX_DISABLE_GNU_SOURCE */
69
70 /*----------------------------------------------------------------------------*/
71
72 /* Should be defined before any includes */
73 #ifndef _FILE_OFFSET_BITS
74 #define _FILE_OFFSET_BITS 64
75 #endif
76
77 #ifdef __APPLE__
78 #define _DARWIN_C_SOURCE
79 #endif
80
81 #ifdef _MSC_VER
82 #if _MSC_FULL_VER < 190024234
83 /* Actually libmdbx was not tested with compilers older than 19.00.24234 (Visual
84 * Studio 2015 Update 3). But you could remove this #error and try to continue
85 * at your own risk. In such a case please don't raise issues related ONLY to
86 * old compilers.
87 *
88 * NOTE:
89 * Unfortunately, there are several different builds of "Visual Studio" that
90 * are called "Visual Studio 2015 Update 3".
91 *
92 * The 190024234 is used here because it is minimal version of Visual Studio
93 * that was used for build and testing libmdbx in recent years. Soon this
94 * value will be increased to 19.0.24241.7, since build and testing using
95 * "Visual Studio 2015" will be performed only at https://ci.appveyor.com.
96 *
97 * Please ask Microsoft (but not us) for information about version differences
98 * and how to and where you can obtain the latest "Visual Studio 2015" build
99 * with all fixes.
100 */
101 #error \
102 "At least \"Microsoft C/C++ Compiler\" version 19.00.24234 (Visual Studio 2015 Update 3) is required."
103 #endif
104 #ifndef _CRT_SECURE_NO_WARNINGS
105 #define _CRT_SECURE_NO_WARNINGS
106 #endif /* _CRT_SECURE_NO_WARNINGS */
107 #if _MSC_VER > 1800
108 #pragma warning(disable : 4464) /* relative include path contains '..' */
109 #endif
110 #if _MSC_VER > 1913
111 #pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation... \
112 */
113 #endif
114 #pragma warning(disable : 4710) /* 'xyz': function not inlined */
115 #pragma warning(disable : 4711) /* function 'xyz' selected for automatic \
116 inline expansion */
117 #pragma warning( \
118 disable : 4201) /* nonstandard extension used : nameless struct / union */
119 #pragma warning(disable : 4702) /* unreachable code */
120 #pragma warning(disable : 4706) /* assignment within conditional expression */
121 #pragma warning(disable : 4127) /* conditional expression is constant */
122 #pragma warning(disable : 4324) /* 'xyz': structure was padded due to \
123 alignment specifier */
124 #pragma warning(disable : 4310) /* cast truncates constant value */
125 #pragma warning( \
126 disable : 4820) /* bytes padding added after data member for alignment */
127 #pragma warning(disable : 4548) /* expression before comma has no effect; \
128 expected expression with side - effect */
129 #pragma warning(disable : 4366) /* the result of the unary '&' operator may be \
130 unaligned */
131 #pragma warning(disable : 4200) /* nonstandard extension used: zero-sized \
132 array in struct/union */
133 #pragma warning(disable : 4204) /* nonstandard extension used: non-constant \
134 aggregate initializer */
135 #pragma warning( \
136 disable : 4505) /* unreferenced local function has been removed */
137 #endif /* _MSC_VER (warnings) */
138
139 #include "mdbx.h"
140 /*
141 * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>
142 * and other libmdbx authors: please see AUTHORS file.
143 * All rights reserved.
144 *
145 * Redistribution and use in source and binary forms, with or without
146 * modification, are permitted only as authorized by the OpenLDAP
147 * Public License.
148 *
149 * A copy of this license is available in the file LICENSE in the
150 * top-level directory of the distribution or, alternatively, at
151 * <http://www.OpenLDAP.org/license.html>.
152 */
153
154 /* *INDENT-OFF* */
155 /* clang-format off */
156
157 #ifndef __GNUC_PREREQ
158 # if defined(__GNUC__) && defined(__GNUC_MINOR__)
159 # define __GNUC_PREREQ(maj, min) \
160 ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
161 # else
162 # define __GNUC_PREREQ(maj, min) (0)
163 # endif
164 #endif /* __GNUC_PREREQ */
165
166 #ifndef __CLANG_PREREQ
167 # ifdef __clang__
168 # define __CLANG_PREREQ(maj,min) \
169 ((__clang_major__ << 16) + __clang_minor__ >= ((maj) << 16) + (min))
170 # else
171 # define __CLANG_PREREQ(maj,min) (0)
172 # endif
173 #endif /* __CLANG_PREREQ */
174
175 #ifndef __GLIBC_PREREQ
176 # if defined(__GLIBC__) && defined(__GLIBC_MINOR__)
177 # define __GLIBC_PREREQ(maj, min) \
178 ((__GLIBC__ << 16) + __GLIBC_MINOR__ >= ((maj) << 16) + (min))
179 # else
180 # define __GLIBC_PREREQ(maj, min) (0)
181 # endif
182 #endif /* __GLIBC_PREREQ */
183
184 #ifndef __has_warning
185 # define __has_warning(x) (0)
186 #endif
187
188 #ifndef __has_include
189 # define __has_include(x) (0)
190 #endif
191
192 #if __has_feature(thread_sanitizer)
193 # define __SANITIZE_THREAD__ 1
194 #endif
195
196 #if __has_feature(address_sanitizer)
197 # define __SANITIZE_ADDRESS__ 1
198 #endif
199
200 /*----------------------------------------------------------------------------*/
201
202 #ifndef __extern_C
203 # ifdef __cplusplus
204 # define __extern_C extern "C"
205 # else
206 # define __extern_C
207 # endif
208 #endif /* __extern_C */
209
/* Provide a `nullptr` spelling for plain C and for pre-C++11 compilers
 * (MSVC is excluded from the C++ case), but only when nobody has already
 * supplied one as a macro. The parentheses matter: without them the
 * `!defined(nullptr)` guard applied to the C case only and an existing
 * definition could be silently redefined. */
#if !defined(nullptr) &&                                                       \
    (!defined(__cplusplus) || (__cplusplus < 201103L && !defined(_MSC_VER)))
#  define nullptr NULL
#endif
213
214 /*----------------------------------------------------------------------------*/
215
216 #ifndef __always_inline
217 # if defined(__GNUC__) || __has_attribute(__always_inline__)
218 # define __always_inline __inline __attribute__((__always_inline__))
219 # elif defined(_MSC_VER)
220 # define __always_inline __forceinline
221 # else
222 # define __always_inline
223 # endif
224 #endif /* __always_inline */
225
226 #ifndef __noinline
227 # if defined(__GNUC__) || __has_attribute(__noinline__)
228 # define __noinline __attribute__((__noinline__))
229 # elif defined(_MSC_VER)
230 # define __noinline __declspec(noinline)
231 # else
232 # define __noinline
233 # endif
234 #endif /* __noinline */
235
236 #ifndef __must_check_result
237 # if defined(__GNUC__) || __has_attribute(__warn_unused_result__)
238 # define __must_check_result __attribute__((__warn_unused_result__))
239 # else
240 # define __must_check_result
241 # endif
242 #endif /* __must_check_result */
243
244 #if !defined(__noop) && !defined(_MSC_VER)
245 # define __noop(...) do {} while(0)
246 #endif /* __noop */
247
/* __fallthrough: annotation for an intentional fall-through between switch
 * cases. Picks the C++ [[fallthrough]] attribute, the GNU
 * __attribute__((__fallthrough__)), or [[clang::fallthrough]] depending on the
 * compiler checks below, and expands to nothing when none of them match.
 * NOTE(review): `__clang__ > 4` looks like it was meant to be
 * `__clang_major__ > 4` (clang is understood to define __clang__ as 1, which
 * would make that comparison always false) - confirm before changing, since
 * it alters which branch clang C++ builds take. */
248 #ifndef __fallthrough
249 # if defined(__cplusplus) && (__has_cpp_attribute(fallthrough) && \
250 (!defined(__clang__) || __clang__ > 4)) || __cplusplus >= 201703L
251 # define __fallthrough [[fallthrough]]
252 # elif __GNUC_PREREQ(8, 0) && defined(__cplusplus) && __cplusplus >= 201103L
253 # define __fallthrough [[fallthrough]]
254 # elif __GNUC_PREREQ(7, 0) && \
255 (!defined(__LCC__) || (__LCC__ == 124 && __LCC_MINOR__ >= 12) || \
256 (__LCC__ == 125 && __LCC_MINOR__ >= 5) || (__LCC__ >= 126))
257 # define __fallthrough __attribute__((__fallthrough__))
258 # elif defined(__clang__) && defined(__cplusplus) && __cplusplus >= 201103L &&\
259 __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough")
260 # define __fallthrough [[clang::fallthrough]]
261 # else
262 # define __fallthrough
263 # endif
264 #endif /* __fallthrough */
265
266 #ifndef __unreachable
267 # if __GNUC_PREREQ(4,5) || __has_builtin(__builtin_unreachable)
268 # define __unreachable() __builtin_unreachable()
269 # elif defined(_MSC_VER)
270 # define __unreachable() __assume(0)
271 # else
272 # define __unreachable() __noop()
273 # endif
274 #endif /* __unreachable */
275
276 #ifndef __prefetch
277 # if defined(__GNUC__) || defined(__clang__) || __has_builtin(__builtin_prefetch)
278 # define __prefetch(ptr) __builtin_prefetch(ptr)
279 # else
280 # define __prefetch(ptr) __noop(ptr)
281 # endif
282 #endif /* __prefetch */
283
284 #ifndef __nothrow
285 # if defined(__cplusplus)
286 # if __cplusplus < 201703L
287 # define __nothrow throw()
288 # else
289 # define __nothrow noexcept(true)
290 # endif /* __cplusplus */
291 # elif defined(__GNUC__) || __has_attribute(__nothrow__)
292 # define __nothrow __attribute__((__nothrow__))
293 # elif defined(_MSC_VER) && defined(__cplusplus)
294 # define __nothrow __declspec(nothrow)
295 # else
296 # define __nothrow
297 # endif
298 #endif /* __nothrow */
299
300 #ifndef __hidden
301 # if defined(__GNUC__) || __has_attribute(__visibility__)
302 # define __hidden __attribute__((__visibility__("hidden")))
303 # else
304 # define __hidden
305 # endif
306 #endif /* __hidden */
307
308 #ifndef __optimize
309 # if defined(__OPTIMIZE__)
310 # if (defined(__GNUC__) && !defined(__clang__)) || __has_attribute(__optimize__)
311 # define __optimize(ops) __attribute__((__optimize__(ops)))
312 # else
313 # define __optimize(ops)
314 # endif
315 # else
316 # define __optimize(ops)
317 # endif
318 #endif /* __optimize */
319
/* __hot: marks a function as frequently executed. Only active in optimized
 * builds (__OPTIMIZE__); otherwise it expands to nothing.
 *  - e2k: hot attribute plus __optimize(3);
 *  - clang on Linux WITHOUT a `hot` attribute but with `section`: place the
 *    function into a dedicated section instead;
 *  - GNU-compatible compilers / anything with a `hot` attribute: use it;
 *  - otherwise: just request O3 via __optimize.
 * The attribute probe must spell `__hot__` in full: the former `__hot_` never
 * matched, so clang on Linux always took the section fallback even when the
 * real attribute was available. */
#ifndef __hot
#  if defined(__OPTIMIZE__)
#    if defined(__e2k__)
#      define __hot __attribute__((__hot__)) __optimize(3)
#    elif defined(__clang__) && !__has_attribute(__hot__) \
          && __has_attribute(__section__) && (defined(__linux__) || defined(__gnu_linux__))
       /* just put frequently used functions in separate section */
#      define __hot __attribute__((__section__("text.hot"))) __optimize("O3")
#    elif defined(__GNUC__) || __has_attribute(__hot__)
#      define __hot __attribute__((__hot__)) __optimize("O3")
#    else
#      define __hot __optimize("O3")
#    endif
#  else
#    define __hot
#  endif
#endif /* __hot */
337
338 #ifndef __cold
339 # if defined(__OPTIMIZE__)
340 # if defined(__e2k__)
341 # define __cold __attribute__((__cold__)) __optimize(1)
342 # elif defined(__clang__) && !__has_attribute(cold) \
343 && __has_attribute(__section__) && (defined(__linux__) || defined(__gnu_linux__))
344 /* just put infrequently used functions in separate section */
345 # define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os")
346 # elif defined(__GNUC__) || __has_attribute(cold)
347 # define __cold __attribute__((__cold__)) __optimize("Os")
348 # else
349 # define __cold __optimize("Os")
350 # endif
351 # else
352 # define __cold
353 # endif
354 #endif /* __cold */
355
356 #ifndef __flatten
357 # if defined(__OPTIMIZE__) && (defined(__GNUC__) || __has_attribute(__flatten__))
358 # define __flatten __attribute__((__flatten__))
359 # else
360 # define __flatten
361 # endif
362 #endif /* __flatten */
363
364 #ifndef likely
365 # if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
366 # define likely(cond) __builtin_expect(!!(cond), 1)
367 # else
368 # define likely(x) (!!(x))
369 # endif
370 #endif /* likely */
371
372 #ifndef unlikely
373 # if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
374 # define unlikely(cond) __builtin_expect(!!(cond), 0)
375 # else
376 # define unlikely(x) (!!(x))
377 # endif
378 #endif /* unlikely */
379
380 #ifndef __anonymous_struct_extension__
381 # if defined(__GNUC__)
382 # define __anonymous_struct_extension__ __extension__
383 # else
384 # define __anonymous_struct_extension__
385 # endif
386 #endif /* __anonymous_struct_extension__ */
387
388 #ifndef __Wpedantic_format_voidptr
389 MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline const void*
__Wpedantic_format_voidptr(const void * ptr)390 __Wpedantic_format_voidptr(const void* ptr) {return ptr;}
391 # define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG)
392 #endif /* __Wpedantic_format_voidptr */
393
394 /*----------------------------------------------------------------------------*/
395
396 #if defined(MDBX_USE_VALGRIND)
397 # include <valgrind/memcheck.h>
398 # ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE
399 /* LY: available since Valgrind 3.10 */
400 # define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s)
401 # define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s)
402 # endif
403 #elif !defined(RUNNING_ON_VALGRIND)
404 # define VALGRIND_CREATE_MEMPOOL(h,r,z)
405 # define VALGRIND_DESTROY_MEMPOOL(h)
406 # define VALGRIND_MEMPOOL_TRIM(h,a,s)
407 # define VALGRIND_MEMPOOL_ALLOC(h,a,s)
408 # define VALGRIND_MEMPOOL_FREE(h,a)
409 # define VALGRIND_MEMPOOL_CHANGE(h,a,b,s)
410 # define VALGRIND_MAKE_MEM_NOACCESS(a,s)
411 # define VALGRIND_MAKE_MEM_DEFINED(a,s)
412 # define VALGRIND_MAKE_MEM_UNDEFINED(a,s)
413 # define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s)
414 # define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s)
415 # define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a,s) (0)
416 # define VALGRIND_CHECK_MEM_IS_DEFINED(a,s) (0)
417 # define RUNNING_ON_VALGRIND (0)
418 #endif /* MDBX_USE_VALGRIND */
419
420 #ifdef __SANITIZE_ADDRESS__
421 # include <sanitizer/asan_interface.h>
422 #elif !defined(ASAN_POISON_MEMORY_REGION)
423 # define ASAN_POISON_MEMORY_REGION(addr, size) \
424 ((void)(addr), (void)(size))
425 # define ASAN_UNPOISON_MEMORY_REGION(addr, size) \
426 ((void)(addr), (void)(size))
427 #endif /* __SANITIZE_ADDRESS__ */
428
429 /*----------------------------------------------------------------------------*/
430
/* ARRAY_LENGTH(array): element count of a true array (never a pointer).
 * The C++ flavour goes through a helper template that only accepts array
 * references; the C flavour is the sizeof ratio. The argument is
 * parenthesized so that expressions such as `*ptr_to_array` index correctly. */
#ifndef ARRAY_LENGTH
#  ifdef __cplusplus
template <typename T, size_t N>
char (&__ArraySizeHelper(T (&array)[N]))[N];
#    define ARRAY_LENGTH(array) (sizeof(::__ArraySizeHelper(array)))
#  else
#    define ARRAY_LENGTH(array) (sizeof(array) / sizeof((array)[0]))
#  endif
#endif /* ARRAY_LENGTH */

/* ARRAY_END(array): pointer one past the last element of a true array. */
#ifndef ARRAY_END
#  define ARRAY_END(array) (&(array)[ARRAY_LENGTH(array)])
#endif /* ARRAY_END */
444
445 #define CONCAT(a,b) a##b
446 #define XCONCAT(a,b) CONCAT(a,b)
447
448 #ifndef offsetof
449 # define offsetof(type, member) __builtin_offsetof(type, member)
450 #endif /* offsetof */
451
452 #ifndef container_of
453 # define container_of(ptr, type, member) \
454 ((type *)((char *)(ptr) - offsetof(type, member)))
455 #endif /* container_of */
456
/* Packs four byte-sized values into one 32-bit tag; `a` lands in the most
 * significant byte and `d` in the least significant one. Every operand is
 * converted to uint32_t before being combined. */
#define MDBX_TETRAD(a, b, c, d)                                                \
  ((uint32_t)(a) << 24 | (uint32_t)(b) << 16 | (uint32_t)(c) << 8 |            \
   (uint32_t)(d))

/* Same as MDBX_TETRAD, taking the values from the first four elements of
 * `str`. The argument is parenthesized so any pointer expression works. */
#define MDBX_STRING_TETRAD(str)                                                \
  MDBX_TETRAD((str)[0], (str)[1], (str)[2], (str)[3])
461
462 #define FIXME "FIXME: " __FILE__ ", " MDBX_STRINGIFY(__LINE__)
463
464 #ifndef STATIC_ASSERT_MSG
465 # if defined(static_assert)
466 # define STATIC_ASSERT_MSG(expr, msg) static_assert(expr, msg)
467 # elif defined(_STATIC_ASSERT)
468 # define STATIC_ASSERT_MSG(expr, msg) _STATIC_ASSERT(expr)
469 # elif defined(_MSC_VER)
470 # include <crtdbg.h>
471 # define STATIC_ASSERT_MSG(expr, msg) _STATIC_ASSERT(expr)
472 # elif (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) \
473 || __has_feature(c_static_assert)
474 # define STATIC_ASSERT_MSG(expr, msg) _Static_assert(expr, msg)
475 # else
476 # define STATIC_ASSERT_MSG(expr, msg) switch (0) {case 0:case (expr):;}
477 # endif
478 #endif /* STATIC_ASSERT */
479
480 #ifndef STATIC_ASSERT
481 # define STATIC_ASSERT(expr) STATIC_ASSERT_MSG(expr, #expr)
482 #endif
483
484 /* *INDENT-ON* */
485 /* clang-format on */
486
487 #if defined(__GNUC__) && !__GNUC_PREREQ(4, 2)
488 /* Actually libmdbx was not tested with compilers older than GCC 4.2.
489 * But you could ignore this warning at your own risk.
490 * In such a case please don't raise issues related ONLY to old compilers.
491 */
492 #warning "libmdbx required GCC >= 4.2"
493 #endif
494
495 #if defined(__clang__) && !__CLANG_PREREQ(3, 8)
496 /* Actually libmdbx was not tested with CLANG older than 3.8.
497 * But you could ignore this warning at your own risk.
498 * In such a case please don't raise issues related ONLY to old compilers.
499 */
500 #warning "libmdbx required CLANG >= 3.8"
501 #endif
502
503 #if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12)
504 /* Actually libmdbx was not tested with something older than glibc 2.12.
505 * But you could ignore this warning at your own risk.
506 * In such a case please don't raise issues related ONLY to old systems.
507 */
508 #warning "libmdbx was only tested with GLIBC >= 2.12."
509 #endif
510
511 #ifdef __SANITIZE_THREAD__
512 #warning \
513 "libmdbx don't compatible with ThreadSanitizer, you will get a lot of false-positive issues."
514 #endif /* __SANITIZE_THREAD__ */
515
516 #if __has_warning("-Wnested-anon-types")
517 #if defined(__clang__)
518 #pragma clang diagnostic ignored "-Wnested-anon-types"
519 #elif defined(__GNUC__)
520 #pragma GCC diagnostic ignored "-Wnested-anon-types"
521 #else
522 #pragma warning disable "nested-anon-types"
523 #endif
524 #endif /* -Wnested-anon-types */
525
526 #if __has_warning("-Wconstant-logical-operand")
527 #if defined(__clang__)
528 #pragma clang diagnostic ignored "-Wconstant-logical-operand"
529 #elif defined(__GNUC__)
530 #pragma GCC diagnostic ignored "-Wconstant-logical-operand"
531 #else
532 #pragma warning disable "constant-logical-operand"
533 #endif
534 #endif /* -Wconstant-logical-operand */
535
536 #if defined(__LCC__) && (__LCC__ <= 121)
537 /* bug #2798 */
538 #pragma diag_suppress alignment_reduction_ignored
539 #elif defined(__ICC)
540 #pragma warning(disable : 3453 1366)
541 #elif __has_warning("-Walignment-reduction-ignored")
542 #if defined(__clang__)
543 #pragma clang diagnostic ignored "-Walignment-reduction-ignored"
544 #elif defined(__GNUC__)
545 #pragma GCC diagnostic ignored "-Walignment-reduction-ignored"
546 #else
547 #pragma warning disable "alignment-reduction-ignored"
548 #endif
549 #endif /* -Walignment-reduction-ignored */
550
551 #ifdef __cplusplus
552 extern "C" {
553 #endif
554
555 /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */
556
557 /*
558 * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>
559 * and other libmdbx authors: please see AUTHORS file.
560 * All rights reserved.
561 *
562 * Redistribution and use in source and binary forms, with or without
563 * modification, are permitted only as authorized by the OpenLDAP
564 * Public License.
565 *
566 * A copy of this license is available in the file LICENSE in the
567 * top-level directory of the distribution or, alternatively, at
568 * <http://www.OpenLDAP.org/license.html>.
569 */
570
571
572 /*----------------------------------------------------------------------------*/
573 /* Microsoft compiler generates a lot of warnings for its own includes... */
574
575 #ifdef _MSC_VER
576 #pragma warning(push, 1)
577 #pragma warning(disable : 4548) /* expression before comma has no effect; \
578 expected expression with side - effect */
579 #pragma warning(disable : 4530) /* C++ exception handler used, but unwind \
580 * semantics are not enabled. Specify /EHsc */
581 #pragma warning(disable : 4577) /* 'noexcept' used with no exception handling \
582 * mode specified; termination on exception is \
583 * not guaranteed. Specify /EHsc */
584 #endif /* _MSC_VER (warnings) */
585
586 #if defined(_WIN32) || defined(_WIN64)
587 #if !defined(_CRT_SECURE_NO_WARNINGS)
588 #define _CRT_SECURE_NO_WARNINGS
589 #endif /* _CRT_SECURE_NO_WARNINGS */
590 #if !defined(_NO_CRT_STDIO_INLINE) && MDBX_BUILD_SHARED_LIBRARY && \
591 !defined(xMDBX_TOOLS) && MDBX_WITHOUT_MSVC_CRT
592 #define _NO_CRT_STDIO_INLINE
593 #endif
594 #elif !defined(_POSIX_C_SOURCE)
595 #define _POSIX_C_SOURCE 200809L
596 #endif /* Windows */
597
598 /*----------------------------------------------------------------------------*/
599 /* C99 includes */
600 #include <inttypes.h>
601 #include <stddef.h>
602 #include <stdint.h>
603 #include <stdlib.h>
604
605 #include <assert.h>
606 #include <fcntl.h>
607 #include <limits.h>
608 #include <stdio.h>
609 #include <string.h>
610 #include <time.h>
611
612 /* C11 stdalign.h */
613 #if __has_include(<stdalign.h>)
614 #include <stdalign.h>
615 #elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
616 #define alignas(N) _Alignas(N)
617 #elif defined(_MSC_VER)
618 #define alignas(N) __declspec(align(N))
619 #elif __has_attribute(__aligned__) || defined(__GNUC__)
620 #define alignas(N) __attribute__((__aligned__(N)))
621 #else
622 #error "FIXME: Required _alignas() or equivalent."
623 #endif
624
625 /*----------------------------------------------------------------------------*/
626 /* Systems includes */
627
628 #ifdef __APPLE__
629 #include <TargetConditionals.h>
630 #endif /* Apple OSX & iOS */
631
632 #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
633 defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \
634 defined(__APPLE__) || defined(__MACH__)
635 #include <sys/cdefs.h>
636 #include <sys/mount.h>
637 #include <sys/sysctl.h>
638 #include <sys/types.h>
639 #if defined(__FreeBSD__) || defined(__DragonFly__)
640 #include <vm/vm_param.h>
641 #elif defined(__OpenBSD__) || defined(__NetBSD__)
642 #include <uvm/uvm_param.h>
643 #else
644 #define SYSCTL_LEGACY_NONCONST_MIB
645 #endif
646 #ifndef __MACH__
647 #include <sys/vmmeter.h>
648 #endif
649 #else
650 #include <malloc.h>
651 #if !(defined(__sun) || defined(__SVR4) || defined(__svr4__) || \
652 defined(_WIN32) || defined(_WIN64))
653 #include <mntent.h>
654 #endif /* !Solaris */
655 #endif /* !xBSD */
656
657 #if defined(__FreeBSD__) || __has_include(<malloc_np.h>)
658 #include <malloc_np.h>
659 #endif
660
661 #if defined(__APPLE__) || defined(__MACH__) || __has_include(<malloc/malloc.h>)
662 #include <malloc/malloc.h>
663 #endif /* MacOS */
664
665 #if defined(__MACH__)
666 #include <mach/host_info.h>
667 #include <mach/mach_host.h>
668 #include <mach/mach_port.h>
669 #include <uuid/uuid.h>
670 #endif
671
672 #if defined(__linux__) || defined(__gnu_linux__)
673 #include <sched.h>
674 #include <sys/sendfile.h>
675 #include <sys/statfs.h>
676 #endif /* Linux */
677
678 #ifndef _XOPEN_SOURCE
679 #define _XOPEN_SOURCE 0
680 #endif
681
682 #ifndef _XOPEN_SOURCE_EXTENDED
683 #define _XOPEN_SOURCE_EXTENDED 0
684 #else
685 #include <utmpx.h>
686 #endif /* _XOPEN_SOURCE_EXTENDED */
687
688 #if defined(__sun) || defined(__SVR4) || defined(__svr4__)
689 #include <kstat.h>
690 #include <sys/mnttab.h>
691 /* On Solaris, it's easier to add a missing prototype rather than find a
692 * combination of #defines that break nothing. */
693 __extern_C key_t ftok(const char *, int);
694 #endif /* SunOS/Solaris */
695
696 #if defined(_WIN32) || defined(_WIN64)
697 #ifndef _WIN32_WINNT
698 #define _WIN32_WINNT 0x0601 /* Windows 7 */
699 #elif _WIN32_WINNT < 0x0500
700 #error At least 'Windows 2000' API is required for libmdbx.
701 #endif /* _WIN32_WINNT */
702 #if (defined(__MINGW32__) || defined(__MINGW64__)) && \
703 !defined(__USE_MINGW_ANSI_STDIO)
704 #define __USE_MINGW_ANSI_STDIO 1
705 #endif /* MinGW */
706 #ifndef WIN32_LEAN_AND_MEAN
707 #define WIN32_LEAN_AND_MEAN
708 #endif /* WIN32_LEAN_AND_MEAN */
709 #include <excpt.h>
710 #include <tlhelp32.h>
711 #include <windows.h>
712 #include <winnt.h>
713 #include <winternl.h>
714 #define HAVE_SYS_STAT_H
715 #define HAVE_SYS_TYPES_H
716 typedef HANDLE mdbx_thread_t;
717 typedef unsigned mdbx_thread_key_t;
718 #define MAP_FAILED NULL
719 #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? ((uint64_t)(v) >> 32) : 0))
720 #define THREAD_CALL WINAPI
721 #define THREAD_RESULT DWORD
/* Windows flavour of the condition pair: one mutex handle plus two event
 * handles. NOTE(review): presumably the two events stand in for the two
 * condition variables of the POSIX flavour - confirm against the users. */
722 typedef struct {
723 HANDLE mutex;
724 HANDLE event[2];
725 } mdbx_condpair_t;
726 typedef CRITICAL_SECTION mdbx_fastmutex_t;
727
728 #if !defined(_MSC_VER) && !defined(__try)
729 /* *INDENT-OFF* */
730 /* clang-format off */
731 #define __try
732 #define __except(COND) if(false)
733 /* *INDENT-ON* */
734 /* clang-format on */
735 #endif /* stub for MSVC's __try/__except */
736
737 #if MDBX_WITHOUT_MSVC_CRT
738
739 #ifndef mdbx_malloc
/* CRT-free malloc(): requests `bytes` from the default process heap with no
 * allocation flags and returns HeapAlloc()'s result unchanged. */
static inline void *mdbx_malloc(size_t bytes) {
  void *const block = HeapAlloc(GetProcessHeap(), 0, bytes);
  return block;
}
743 #endif /* mdbx_malloc */
744
745 #ifndef mdbx_calloc
mdbx_calloc(size_t nelem,size_t size)746 static inline void *mdbx_calloc(size_t nelem, size_t size) {
747 return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, nelem * size);
748 }
749 #endif /* mdbx_calloc */
750
751 #ifndef mdbx_realloc
/* CRT-free realloc(): resizes `ptr` to `bytes` on the default process heap.
 * A null `ptr` is routed to a fresh HeapAlloc() rather than HeapReAlloc(). */
static inline void *mdbx_realloc(void *ptr, size_t bytes) {
  if (!ptr)
    return HeapAlloc(GetProcessHeap(), 0, bytes);
  return HeapReAlloc(GetProcessHeap(), 0, ptr, bytes);
}
756 #endif /* mdbx_realloc */
757
758 #ifndef mdbx_free
/* CRT-free free(): returns `ptr` to the default process heap. The result of
 * HeapFree() is intentionally not reported, matching free()'s void contract. */
static inline void mdbx_free(void *ptr) {
  (void)HeapFree(GetProcessHeap(), 0, ptr);
}
760 #endif /* mdbx_free */
761
762 #else /* MDBX_WITHOUT_MSVC_CRT */
763
764 #define mdbx_malloc malloc
765 #define mdbx_calloc calloc
766 #define mdbx_realloc realloc
767 #define mdbx_free free
768 #define mdbx_strdup _strdup
769
770 #endif /* MDBX_WITHOUT_MSVC_CRT */
771
772 #ifndef snprintf
773 #define snprintf _snprintf /* ntdll */
774 #endif
775
776 #ifndef vsnprintf
777 #define vsnprintf _vsnprintf /* ntdll */
778 #endif
779
780 #else /*----------------------------------------------------------------------*/
781
782 #include <unistd.h>
783 #if !defined(_POSIX_MAPPED_FILES) || _POSIX_MAPPED_FILES < 1
784 #error "libmdbx requires the _POSIX_MAPPED_FILES feature"
785 #endif /* _POSIX_MAPPED_FILES */
786
787 #include <pthread.h>
788 #include <semaphore.h>
789 #include <signal.h>
790 #include <sys/file.h>
791 #include <sys/ipc.h>
792 #include <sys/mman.h>
793 #include <sys/param.h>
794 #include <sys/stat.h>
795 #include <sys/statvfs.h>
796 #include <sys/uio.h>
797 typedef pthread_t mdbx_thread_t;
798 typedef pthread_key_t mdbx_thread_key_t;
799 #define INVALID_HANDLE_VALUE (-1)
800 #define THREAD_CALL
801 #define THREAD_RESULT void *
/* POSIX flavour of the condition pair: one pthread mutex together with two
 * pthread condition variables. NOTE(review): presumably both conditions are
 * waited on under `mutex` - confirm against the users. */
802 typedef struct {
803 pthread_mutex_t mutex;
804 pthread_cond_t cond[2];
805 } mdbx_condpair_t;
806 typedef pthread_mutex_t mdbx_fastmutex_t;
807 #define mdbx_malloc malloc
808 #define mdbx_calloc calloc
809 #define mdbx_realloc realloc
810 #define mdbx_free free
811 #define mdbx_strdup strdup
812 #endif /* Platform */
813
814 #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
815 /* malloc_usable_size() already provided */
816 #elif defined(__APPLE__)
817 #define malloc_usable_size(ptr) malloc_size(ptr)
818 #elif defined(_MSC_VER) && !MDBX_WITHOUT_MSVC_CRT
819 #define malloc_usable_size(ptr) _msize(ptr)
820 #endif /* malloc_usable_size */
821
822 #ifdef __ANDROID_API__
823 #include <android/log.h>
824 #if __ANDROID_API__ >= 21
825 #include <sys/sendfile.h>
826 #endif
827 #endif /* Android */
828
829 /* *INDENT-OFF* */
830 /* clang-format off */
831 #if defined(HAVE_SYS_STAT_H) || __has_include(<sys/stat.h>)
832 #include <sys/stat.h>
833 #endif
834 #if defined(HAVE_SYS_TYPES_H) || __has_include(<sys/types.h>)
835 #include <sys/types.h>
836 #endif
837 #if defined(HAVE_SYS_FILE_H) || __has_include(<sys/file.h>)
838 #include <sys/file.h>
839 #endif
840 /* *INDENT-ON* */
841 /* clang-format on */
842
843 #ifndef SSIZE_MAX
844 #define SSIZE_MAX INTPTR_MAX
845 #endif
846
847 #if !defined(MADV_DODUMP) && defined(MADV_CORE)
848 #define MADV_DODUMP MADV_CORE
849 #endif /* MADV_CORE -> MADV_DODUMP */
850
851 #if !defined(MADV_DONTDUMP) && defined(MADV_NOCORE)
852 #define MADV_DONTDUMP MADV_NOCORE
853 #endif /* MADV_NOCORE -> MADV_DONTDUMP */
854
/* Collapse the many compiler-specific x86 macros into two neutral ones:
 * __ia32__ for any x86 or x86-64 target and __amd64__ for x86-64 only.
 * All alternatives are joined with logical `||` (one of them used to be a
 * bitwise `|`, which happened to give the same result), and the duplicated
 * __INTEL__ test is listed once. */
#if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \
    defined(i486) || defined(__i486) || defined(__i486__) ||                   \
    defined(i586) || defined(__i586) || defined(__i586__) || defined(i686) ||  \
    defined(__i686) || defined(__i686__) || defined(_M_IX86) ||                \
    defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) ||            \
    defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) ||          \
    defined(__amd64__) || defined(__amd64) || defined(_M_X64) ||               \
    defined(_M_AMD64) || defined(__IA32__)
#ifndef __ia32__
/* LY: define neutral __ia32__ for x86 and x86-64 */
#define __ia32__ 1
#endif /* __ia32__ */
#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) ||       \
                            defined(__amd64) || defined(_M_X64))
/* LY: define trusty __amd64__ for all AMD64/x86-64 arch */
#define __amd64__ 1
#endif /* __amd64__ */
#endif /* all x86 */
873
874 #if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
875 #error \
876 "Sanity checking failed: Two's complement, reasonably sized integer types"
877 #endif
878
879 #if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
880 #define MDBX_WORDBITS 64
881 #else
882 #define MDBX_WORDBITS 32
883 #endif /* MDBX_WORDBITS */
884
885 /*----------------------------------------------------------------------------*/
886 /* Compiler's includes for builtins/intrinsics */
887
888 #if defined(_MSC_VER) || defined(__INTEL_COMPILER)
889 #include <intrin.h>
890 #elif __GNUC_PREREQ(4, 4) || defined(__clang__)
891 #if defined(__ia32__) || defined(__e2k__)
892 #include <x86intrin.h>
893 #endif /* __ia32__ */
894 #if defined(__ia32__)
895 #include <cpuid.h>
896 #endif /* __ia32__ */
897 #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
898 #include <mbarrier.h>
899 #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \
900 (defined(HP_IA64) || defined(__ia64))
901 #include <machine/sys/inline.h>
902 #elif defined(__IBMC__) && defined(__powerpc)
903 #include <atomic.h>
904 #elif defined(_AIX)
905 #include <builtins.h>
906 #include <sys/atomic_op.h>
907 #elif (defined(__osf__) && defined(__DECC)) || defined(__alpha)
908 #include <c_asm.h>
909 #include <machine/builtins.h>
910 #elif defined(__MWERKS__)
911 /* CodeWarrior - troubles ? */
912 #pragma gcc_extensions
913 #elif defined(__SNC__)
914 /* Sony PS3 - troubles ? */
915 #elif defined(__hppa__) || defined(__hppa)
916 #include <machine/inline.h>
917 #else
918 #error Unsupported C compiler, please use GNU C 4.4 or newer
919 #endif /* Compiler */
920
921 /*----------------------------------------------------------------------------*/
922 /* Byteorder */
923
/* Provide __BYTE_ORDER__, __ORDER_LITTLE_ENDIAN__ and __ORDER_BIG_ENDIAN__
 * when the compiler does not predefine all three of them:
 *  1. include whichever system endian header is available;
 *  2. map the __BYTE_ORDER or _BYTE_ORDER family from that header, if any;
 *  3. otherwise define the constants as 1234/4321 and pick the order from
 *     well-known architecture/OS macros, failing the build if none match. */
924 #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \
925 !defined(__ORDER_BIG_ENDIAN__)
926
927 /* *INDENT-OFF* */
928 /* clang-format off */
929 #if defined(__GLIBC__) || defined(__GNU_LIBRARY__) || defined(__ANDROID_API__) || \
930 defined(HAVE_ENDIAN_H) || __has_include(<endian.h>)
931 #include <endian.h>
932 #elif defined(__APPLE__) || defined(__MACH__) || defined(__OpenBSD__) || \
933 defined(HAVE_MACHINE_ENDIAN_H) || __has_include(<machine/endian.h>)
934 #include <machine/endian.h>
935 #elif defined(HAVE_SYS_ISA_DEFS_H) || __has_include(<sys/isa_defs.h>)
936 #include <sys/isa_defs.h>
937 #elif (defined(HAVE_SYS_TYPES_H) && defined(HAVE_SYS_ENDIAN_H)) || \
938 (__has_include(<sys/types.h>) && __has_include(<sys/endian.h>))
939 #include <sys/endian.h>
940 #include <sys/types.h>
941 #elif defined(__bsdi__) || defined(__DragonFly__) || defined(__FreeBSD__) || \
942 defined(__NetBSD__) || \
943 defined(HAVE_SYS_PARAM_H) || __has_include(<sys/param.h>)
944 #include <sys/param.h>
945 #endif /* OS */
946 /* *INDENT-ON* */
947 /* clang-format on */
948
/* Step 2: reuse the libc-style names when a header above supplied them. */
949 #if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN)
950 #define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN
951 #define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN
952 #define __BYTE_ORDER__ __BYTE_ORDER
953 #elif defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)
954 #define __ORDER_LITTLE_ENDIAN__ _LITTLE_ENDIAN
955 #define __ORDER_BIG_ENDIAN__ _BIG_ENDIAN
956 #define __BYTE_ORDER__ _BYTE_ORDER
957 #else
/* Step 3: no usable header macros, so guess from target macros. */
958 #define __ORDER_LITTLE_ENDIAN__ 1234
959 #define __ORDER_BIG_ENDIAN__ 4321
960
961 #if defined(__LITTLE_ENDIAN__) || \
962 (defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN)) || \
963 defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) || \
964 defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) || \
965 defined(_M_ARM) || defined(_M_ARM64) || defined(__e2k__) || \
966 defined(__elbrus_4c__) || defined(__elbrus_8c__) || defined(__bfin__) || \
967 defined(__BFIN__) || defined(__ia64__) || defined(_IA64) || \
968 defined(__IA64__) || defined(__ia64) || defined(_M_IA64) || \
969 defined(__itanium__) || defined(__ia32__) || defined(__CYGWIN__) || \
970 defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) || \
971 defined(__WINDOWS__)
972 #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
973
974 #elif defined(__BIG_ENDIAN__) || \
975 (defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN)) || \
976 defined(__ARMEB__) || defined(__THUMBEB__) || defined(__AARCH64EB__) || \
977 defined(__MIPSEB__) || defined(_MIPSEB) || defined(__MIPSEB) || \
978 defined(__m68k__) || defined(M68000) || defined(__hppa__) || \
979 defined(__hppa) || defined(__HPPA__) || defined(__sparc__) || \
980 defined(__sparc) || defined(__370__) || defined(__THW_370__) || \
981 defined(__s390__) || defined(__s390x__) || defined(__SYSC_ZARCH__)
982 #define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__
983
984 #else
985 #error __BYTE_ORDER__ should be defined.
986 #endif /* Arch */
987
988 #endif
989 #endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */
990
991 /* Get the size of a memory page for the system.
992 * This is the basic size that the platform's memory manager uses, and is
993 * fundamental to the use of memory-mapped files. */
994 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t
mdbx_syspagesize(void)995 mdbx_syspagesize(void) {
996 #if defined(_WIN32) || defined(_WIN64)
997 SYSTEM_INFO si;
998 GetSystemInfo(&si);
999 return si.dwPageSize;
1000 #else
1001 return sysconf(_SC_PAGE_SIZE);
1002 #endif
1003 }
1004
/* Descriptor of one memory mapping: the base address (viewable through the
 * anonymous union as an untyped pointer, a byte pointer or a pointer to
 * struct MDBX_lockinfo), the file handle and the size bookkeeping. */
1005 typedef struct mdbx_mmap_param {
1006 union {
1007 void *address;
1008 uint8_t *dxb;
1009 struct MDBX_lockinfo *lck;
1010 };
1011 mdbx_filehandle_t fd;
1012 size_t limit; /* mapping length, but NOT a size of file nor DB */
1013 size_t current; /* mapped region size, i.e. the size of file and DB */
1014 uint64_t filesize /* in-process cache of a file size */;
1015 #if defined(_WIN32) || defined(_WIN64)
1016 HANDLE section; /* memory-mapped section handle */
1017 #endif
1018 } mdbx_mmap_t;
1019
/* 128-bit value that can be accessed either as two 64-bit words (x, y)
 * or as four 32-bit words (a, b, c, d) sharing the same storage. */
1020 typedef union bin128 {
1021 __anonymous_struct_extension__ struct { uint64_t x, y; };
1022 __anonymous_struct_extension__ struct { uint32_t a, b, c, d; };
1023 } bin128_t;
1024
/* Windows only: storage for a slim reader/writer lock. The union overlays
 * a pair of volatile long counters with the native RTL_SRWLOCK.
 * NOTE(review): presumably the counters back a fallback implementation for
 * systems without native SRW locks - confirm against the lock routines. */
1025 #if defined(_WIN32) || defined(_WIN64)
1026 typedef union MDBX_srwlock {
1027 struct {
1028 long volatile readerCount;
1029 long volatile writerCount;
1030 };
1031 RTL_SRWLOCK native;
1032 } MDBX_srwlock;
1033 #endif /* Windows */
1034
1035 #ifndef __cplusplus
1036
/* Forward declarations of the jitter helpers; both take a single `tiny` flag.
 * NOTE(review): judging only by the names, they inject timing jitter for
 * testing - confirm against the definitions, which are not visible here. */
1037 MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny);
1038 MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny);
1039
1040 /*----------------------------------------------------------------------------*/
1041 /* Atomics */
1042
/* Choose the atomics facility and define MDBX_HAVE_C11ATOMICS when a
 * standard atomics header is used.
 * NOTE: this chain sits inside the surrounding `#ifndef __cplusplus` region,
 * so the first (`defined(__cplusplus)`) branch can never be taken here. */
1043 #if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && (__has_include(<cstdatomic>) || __has_extension(cxx_atomic))
1044 #include <cstdatomic>
1045 #define MDBX_HAVE_C11ATOMICS
1046 #elif !defined(__cplusplus) && \
1047 (__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \
1048 !defined(__STDC_NO_ATOMICS__) && \
1049 (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \
1050 !(defined(__GNUC__) || defined(__clang__)))
1051 #include <stdatomic.h>
1052 #define MDBX_HAVE_C11ATOMICS
/* GCC/Clang without <stdatomic.h>: nothing is included in this branch. */
1053 #elif defined(__GNUC__) || defined(__clang__)
/* MSVC: silence conversion warnings and request the Interlocked intrinsics. */
1054 #elif defined(_MSC_VER)
1055 #pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */
1056 #pragma warning(disable : 4133) /* 'function': incompatible types - from \
1057 'size_t' to 'LONGLONG' */
1058 #pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \
1059 'std::size_t', possible loss of data */
1060 #pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \
1061 'long', possible loss of data */
1062 #pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange)
1063 #pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64)
1064 #elif defined(__APPLE__)
1065 #include <libkern/OSAtomic.h>
1066 #else
1067 #error FIXME atomic-ops
1068 #endif
1069
1070 /*----------------------------------------------------------------------------*/
1071 /* Memory/Compiler barriers, cache coherence */
1072
/* Include <sys/cachectl.h> when it is detectable via __has_include, and
 * unconditionally for the MIPS-related targets listed below. */
1073 #if __has_include(<sys/cachectl.h>)
1074 #include <sys/cachectl.h>
1075 #elif defined(__mips) || defined(__mips__) || defined(__mips64) || \
1076 defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \
1077 defined(__MWERKS__) || defined(__sgi)
1078 /* MIPS should have explicit cache control */
1079 #include <sys/cachectl.h>
1080 #endif
1081
mdbx_compiler_barrier(void)1082 MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) {
1083 #if defined(__clang__) || defined(__GNUC__)
1084 __asm__ __volatile__("" ::: "memory");
1085 #elif defined(_MSC_VER)
1086 _ReadWriteBarrier();
1087 #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */
1088 __memory_barrier();
1089 #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
1090 __compiler_barrier();
1091 #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \
1092 (defined(HP_IA64) || defined(__ia64))
1093 _Asm_sched_fence(/* LY: no-arg meaning 'all expect ALU', e.g. 0x3D3D */);
1094 #elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) || \
1095 defined(__ppc64__) || defined(__powerpc64__)
1096 __fence();
1097 #else
1098 #error "Could not guess the kind of compiler, please report to us."
1099 #endif
1100 }
1101
mdbx_memory_barrier(void)1102 MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) {
1103 #ifdef MDBX_HAVE_C11ATOMICS
1104 atomic_thread_fence(memory_order_seq_cst);
1105 #elif defined(__ATOMIC_SEQ_CST)
1106 #ifdef __clang__
1107 __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
1108 #else
1109 __atomic_thread_fence(__ATOMIC_SEQ_CST);
1110 #endif
1111 #elif defined(__clang__) || defined(__GNUC__)
1112 __sync_synchronize();
1113 #elif defined(_WIN32) || defined(_WIN64)
1114 MemoryBarrier();
1115 #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */
1116 #if defined(__ia32__)
1117 _mm_mfence();
1118 #else
1119 __mf();
1120 #endif
1121 #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
1122 __machine_rw_barrier();
1123 #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \
1124 (defined(HP_IA64) || defined(__ia64))
1125 _Asm_mf();
1126 #elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) || \
1127 defined(__ppc64__) || defined(__powerpc64__)
1128 __lwsync();
1129 #else
1130 #error "Could not guess the kind of compiler, please report to us."
1131 #endif
1132 }
1133
1134 /*----------------------------------------------------------------------------*/
1135 /* libc compatibility stuff */
1136
1137 #if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) && \
1138 (defined(_GNU_SOURCE) || defined(_BSD_SOURCE))
1139 #define mdbx_asprintf asprintf
1140 #define mdbx_vasprintf vasprintf
1141 #else
1142 MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC
1143 MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...);
1144 MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap);
1145 #endif
1146
1147 /*----------------------------------------------------------------------------*/
1148 /* OS abstraction layer stuff */
1149
/* Upper bound on the number of bytes passed to a single write call:
 * 0x01000000 on Windows, 0x3fff0000 elsewhere. */
#if !defined(_WIN32) && !defined(_WIN64)
#define MAX_WRITE UINT32_C(0x3fff0000)
#else
#define MAX_WRITE UINT32_C(0x01000000)
#endif
1156
/* Linux-only globals, declared here and defined elsewhere: the kernel
 * version as a 32-bit value and a flag for running on WSL1.
 * NOTE(review): the encoding of the version value is not visible here. */
1157 #if defined(__linux__) || defined(__gnu_linux__)
1158 MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version;
1159 MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */;
1160 #endif /* Linux */
1161
/* Declared only when mdbx_strdup is not already provided as a macro. */
1162 #ifndef mdbx_strdup
1163 LIBMDBX_API char *mdbx_strdup(const char *str);
1164 #endif
1165
mdbx_get_errno(void)1166 MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) {
1167 #if defined(_WIN32) || defined(_WIN64)
1168 DWORD rc = GetLastError();
1169 #else
1170 int rc = errno;
1171 #endif
1172 return rc;
1173 }
1174
/* Aligned allocation helpers; each prototype is skipped when the name is
 * already provided as a macro. The allocated pointer is returned through
 * `result`, the int return value carries the status. */
1175 #ifndef mdbx_memalign_alloc
1176 MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes,
1177 void **result);
1178 #endif
1179 #ifndef mdbx_memalign_free
1180 MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr);
1181 #endif
1182
/* Operations on mdbx_condpair_t. `part` selects one of the two parts of
 * the pair for signal/wait.
 * NOTE(review): int results are presumably error codes with zero meaning
 * success, as documented for the lck functions further down in this file. */
1183 MDBX_INTERNAL_FUNC int mdbx_condpair_init(mdbx_condpair_t *condpair);
1184 MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair);
1185 MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair);
1186 MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair,
1187 bool part);
1188 MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, bool part);
1189 MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair);
1190
/* Lifecycle and acquire/release operations on mdbx_fastmutex_t. */
1191 MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex);
1192 MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex);
1193 MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex);
1194 MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex);
1195
/* File I/O wrappers over mdbx_filehandle_t. The p-variants take an explicit
 * 64-bit file offset; mdbx_pwritev additionally takes the total number of
 * bytes the caller expects to be written.
 * NOTE(review): int results are presumably error codes - confirm. */
1196 MDBX_INTERNAL_FUNC int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
1197 int iovcnt, uint64_t offset,
1198 size_t expected_written);
1199 MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count,
1200 uint64_t offset);
1201 MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf,
1202 size_t count, uint64_t offset);
1203 MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf,
1204 size_t count);
1205
/* Thread helpers: start `start_routine(arg)` in a new thread whose handle is
 * stored into `*thread`, and join a previously created thread. */
1206 MDBX_INTERNAL_FUNC int
1207 mdbx_thread_create(mdbx_thread_t *thread,
1208 THREAD_RESULT(THREAD_CALL *start_routine)(void *),
1209 void *arg);
1210 MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread);
1211
/* Bit mask passed as `mode_bits` to mdbx_fsync() and mdbx_msync().
 * The non-zero values are distinct powers of two, so they can be OR-ed. */
enum mdbx_syncmode_bits {
  MDBX_SYNC_NONE = 0,
  MDBX_SYNC_DATA = 1 << 0,
  MDBX_SYNC_SIZE = 1 << 1,
  MDBX_SYNC_IODQ = 1 << 2
};
1218
/* File-level operations on mdbx_filehandle_t: sync with a mask of
 * mdbx_syncmode_bits, truncate to a 64-bit length, seek to a 64-bit
 * position, and query the size (stored into `*length`).
 * NOTE(review): int results are presumably error codes - confirm. */
1219 MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd,
1220 const enum mdbx_syncmode_bits mode_bits);
1221 MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length);
1222 MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos);
1223 MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length);
1224
/* Purpose selector passed as the first argument of mdbx_openfile().
 * Enumerators are numbered consecutively starting from zero. */
enum mdbx_openfile_purpose {
  MDBX_OPEN_DXB_READ,  /* 0 */
  MDBX_OPEN_DXB_LAZY,  /* 1 */
  MDBX_OPEN_DXB_DSYNC, /* 2 */
  MDBX_OPEN_LCK,       /* 3 */
  MDBX_OPEN_COPY,      /* 4 */
  MDBX_OPEN_DELETE     /* 5 */
};
1233
/* Opens `pathname` for the given purpose on behalf of `env`; the resulting
 * handle is stored into `*fd`. `unix_mode_bits` carries the permission bits.
 * NOTE(review): how each purpose maps to open flags is not visible here. */
1234 MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose,
1235 const MDBX_env *env, const char *pathname,
1236 mdbx_filehandle_t *fd,
1237 mdbx_mode_t unix_mode_bits);
/* Remaining file helpers: close a handle, remove a file or a directory by
 * path, test whether a handle is a pipe, and lock a file (optionally
 * waiting, per `wait`). */
1238 MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd);
1239 MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname);
1240 MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname);
1241 MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd);
1242 MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait);
1243
/* Bit values for the `options` argument of mdbx_mmap(). */
1244 #define MMAP_OPTION_TRUNCATE 1
1245 #define MMAP_OPTION_SEMAPHORE 2
/* Map / unmap the region described by `map`.
 * NOTE(review): `must` and `limit` presumably correspond to the `current`
 * and `limit` fields of mdbx_mmap_t - confirm against the definition. */
1246 MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map,
1247 const size_t must, const size_t limit,
1248 const unsigned options);
1249 MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map);
/* Flag bits for mdbx_mresize().
 * NOTE(review): presumably combined into its `flags` argument - confirm. */
1250 #define MDBX_MRESIZE_MAY_MOVE 0x00000100
1251 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200
1252 MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map,
1253 size_t size, size_t limit);
/* Windows only: counted array of handles with inline room for 31 entries,
 * exchanged between the suspend and resume calls around a remap.
 * NOTE(review): `limit` is presumably the capacity and `count` the number
 * of used entries - confirm against the implementation. */
1254 #if defined(_WIN32) || defined(_WIN64)
1255 typedef struct {
1256 unsigned limit, count;
1257 HANDLE handles[31];
1258 } mdbx_handle_array_t;
1259 MDBX_INTERNAL_FUNC int
1260 mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array);
1261 MDBX_INTERNAL_FUNC int
1262 mdbx_resume_threads_after_remap(mdbx_handle_array_t *array);
1263 #endif /* Windows */
/* Sync `length` bytes of the mapping starting at `offset`, using a mask of
 * mdbx_syncmode_bits. */
1264 MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset,
1265 size_t length,
1266 enum mdbx_syncmode_bits mode_bits);
/* NOTE(review): presumably checks whether the filesystem behind
 * `handle`/`pathname` is read-only in the context of error `err` - confirm. */
1267 MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle,
1268 const char *pathname, int err);
1269
mdbx_getpid(void)1270 MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) {
1271 STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t));
1272 #if defined(_WIN32) || defined(_WIN64)
1273 return GetCurrentProcessId();
1274 #else
1275 return getpid();
1276 #endif
1277 }
1278
mdbx_thread_self(void)1279 MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) {
1280 mdbx_tid_t thunk;
1281 STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk));
1282 #if defined(_WIN32) || defined(_WIN64)
1283 thunk = GetCurrentThreadId();
1284 #else
1285 thunk = pthread_self();
1286 #endif
1287 return (uintptr_t)thunk;
1288 }
1289
/* Monotonic time helpers: read the clock as a 64-bit value and convert
 * between that representation and 32-bit "16dot16" seconds.
 * NOTE(review): "16dot16" presumably means 16.16 fixed point, and the unit
 * of the monotime value is not visible here - confirm. */
1290 MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void);
1291 MDBX_INTERNAL_FUNC uint64_t
1292 mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16);
1293 MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime);
1294
/* Returns a 128-bit boot identifier of the running system. */
1295 MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void);
1296 /*----------------------------------------------------------------------------*/
1297 /* lck stuff */
1298
1299 /// \brief Initialization of synchronization primitives linked with MDBX_env
1300 /// instance both in LCK-file and within the current process.
1301 /// \param
1302 /// global_uniqueness_flag = true - denotes that there are no other processes
1303 /// working with DB and LCK-file. Thus the function MUST initialize
1304 /// shared synchronization objects in memory-mapped LCK-file.
1305 /// global_uniqueness_flag = false - denotes that at least one process is
1306 /// already working with DB and LCK-file, including the case when DB
1307 /// has already been opened in the current process. Thus the function
1308 /// MUST NOT initialize shared synchronization objects in memory-mapped
1309 /// LCK-file that are already in use.
1310 /// \return Error code or zero on success.
1311 MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env,
1312 MDBX_env *inprocess_neighbor,
1313 int global_uniqueness_flag);
1314
1315 /// \brief Disconnects from shared interprocess objects and destructs
1316 /// synchronization objects linked with MDBX_env instance
1317 /// within the current process.
1318 /// \param
1319 /// inprocess_neighbor = NULL - if the current process does not have other
1320 /// instances of MDBX_env linked with the DB being closed.
1321 /// Thus the function MUST check for other processes working with DB or
1322 /// LCK-file, and keep or destroy shared synchronization objects in
1323 /// memory-mapped LCK-file depending on the result.
1324 /// inprocess_neighbor = not-NULL - pointer to another instance of MDBX_env
1325 /// (any one of them, if there are several) working with DB or LCK-file
1326 /// within the current process. Thus the function MUST NOT try to acquire
1327 /// an exclusive lock and/or destruct shared synchronization objects linked
1328 /// with DB or LCK-file. Moreover, the implementation MUST ensure correct work
1329 /// of other instances of MDBX_env within the current process, e.g.
1330 /// restore POSIX-fcntl locks after the closing of file descriptors.
1331 /// \return Error code (MDBX_PANIC) or zero on success.
1332 MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env,
1333 MDBX_env *inprocess_neighbor);
1334
1335 /// \brief Connects to shared interprocess locking objects and tries to acquire
1336 /// the maximum lock level (shared if exclusive is not available)
1337 /// Depending on implementation or/and platform (Windows) this function may
1338 /// acquire the non-OS super-level lock (e.g. for shared synchronization
1339 /// objects initialization), which will be downgraded to OS-exclusive or
1340 /// shared via explicit calling of mdbx_lck_downgrade().
1341 /// \return
1342 /// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus
1343 /// the current process is the first and only after the last use of DB.
1344 /// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus
1345 /// DB has already been opened and now is used by other processes.
1346 /// Otherwise (not 0 and not -1) - error code.
1347 MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env);
1348
1349 /// \brief Downgrades the level of the initially acquired lock to the
1350 /// operational level specified by argument. The reasons for such downgrade:
1351 /// - unblocking of other processes that are waiting for access, i.e.
1352 /// if (env->me_flags & MDBX_EXCLUSIVE) != 0, then other processes
1353 /// should be made aware that access is unavailable rather than
1354 /// wait for it.
1355 /// - freeing locks that interfere with file operations (especially on Windows)
1356 /// (env->me_flags & MDBX_EXCLUSIVE) == 0 - downgrade to shared lock.
1357 /// (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive
1358 /// operational lock.
1359 /// \return Error code or zero on success
1360 MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env);
1361
1362 /// \brief Locks LCK-file or/and table of readers for (de)registering.
1363 /// \return Error code or zero on success
1364 MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env);
1365
1366 /// \brief Unlocks LCK-file or/and table of readers after (de)registering.
1367 MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env);
1368
1369 /// \brief Acquires lock for DB change (on writing transaction start)
1370 /// Reading transactions will not be blocked.
1371 /// Declared as LIBMDBX_API because it is used in mdbx_chk.
1372 /// \return Error code or zero on success
1373 LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait);
1374
1375 /// \brief Releases lock once DB changes is made (after writing transaction
1376 /// has finished).
1377 /// Declared as LIBMDBX_API because it is used in mdbx_chk.
1378 LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env);
1379
1380 /// \brief Sets alive-flag of reader presence (indicative lock) for PID of
1381 /// the current process. The function does no more than needed for
1382 /// the correct working of mdbx_rpid_check() in other processes.
1383 /// \return Error code or zero on success
1384 MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env);
1385
1386 /// \brief Resets alive-flag of reader presence (indicative lock)
1387 /// for PID of the current process. The function does no more than needed
1388 /// for the correct working of mdbx_rpid_check() in other processes.
1389 /// \return Error code or zero on success
1390 MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env);
1391
1392 /// \brief Checks for reading process status with the given pid with help of
1393 /// alive-flag of presence (indicative lock) or using another way.
1394 /// \return
1395 /// MDBX_RESULT_TRUE (-1) - if the reader process with the given PID is alive
1396 /// and working with DB (indicative lock is present).
1397 /// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent
1398 /// or not working with DB (indicative lock is not present).
1399 /// Otherwise (not 0 and not -1) - error code.
1400 MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid);
1401
1402 #if defined(_WIN32) || defined(_WIN64)
1403
/* Signature shared by all slim reader/writer lock routines, and the five
 * function pointers (init, acquire/release shared, acquire/release
 * exclusive) through which they are called. Per MDBX_INTERNAL_VAR the
 * pointers are `static` in the amalgamated build and `extern` otherwise. */
1404 typedef void(WINAPI *MDBX_srwlock_function)(MDBX_srwlock *);
1405 MDBX_INTERNAL_VAR MDBX_srwlock_function mdbx_srwlock_Init,
1406 mdbx_srwlock_AcquireShared, mdbx_srwlock_ReleaseShared,
1407 mdbx_srwlock_AcquireExclusive, mdbx_srwlock_ReleaseExclusive;
1408
/* Local declarations of file-information types, compiled only when the
 * build targets a Windows version older than Vista (_WIN32_WINNT < 0x0600).
 * NOTE(review): presumably the SDK headers omit them for such targets. */
1409 #if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */
1410 typedef enum _FILE_INFO_BY_HANDLE_CLASS {
1411 FileBasicInfo,
1412 FileStandardInfo,
1413 FileNameInfo,
1414 FileRenameInfo,
1415 FileDispositionInfo,
1416 FileAllocationInfo,
1417 FileEndOfFileInfo,
1418 FileStreamInfo,
1419 FileCompressionInfo,
1420 FileAttributeTagInfo,
1421 FileIdBothDirectoryInfo,
1422 FileIdBothDirectoryRestartInfo,
1423 FileIoPriorityHintInfo,
1424 FileRemoteProtocolInfo,
1425 MaximumFileInfoByHandleClass
1426 } FILE_INFO_BY_HANDLE_CLASS,
1427 *PFILE_INFO_BY_HANDLE_CLASS;
1428
1429 typedef struct _FILE_END_OF_FILE_INFO {
1430 LARGE_INTEGER EndOfFile;
1431 } FILE_END_OF_FILE_INFO, *PFILE_END_OF_FILE_INFO;
1432
/* Bits of FILE_REMOTE_PROTOCOL_INFO.Flags. */
1433 #define REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK 0x00000001
1434 #define REMOTE_PROTOCOL_INFO_FLAG_OFFLINE 0x00000002
1435
1436 typedef struct _FILE_REMOTE_PROTOCOL_INFO {
1437 USHORT StructureVersion;
1438 USHORT StructureSize;
1439 DWORD Protocol;
1440 USHORT ProtocolMajorVersion;
1441 USHORT ProtocolMinorVersion;
1442 USHORT ProtocolRevision;
1443 USHORT Reserved;
1444 DWORD Flags;
1445 struct {
1446 DWORD Reserved[8];
1447 } GenericReserved;
1448 struct {
1449 DWORD Reserved[16];
1450 } ProtocolSpecificReserved;
1451 } FILE_REMOTE_PROTOCOL_INFO, *PFILE_REMOTE_PROTOCOL_INFO;
1452
1453 #endif /* _WIN32_WINNT < 0x0600 (prior to Windows Vista) */
1454
/* Function-pointer types mirroring Win32 file-information APIs, each paired
 * with a pointer variable of that type.
 * NOTE(review): the pointers are presumably resolved at runtime so that the
 * code can run where an API is missing - confirm against the initializer. */
1455 typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)(
1456 _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass,
1457 _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize);
1458 MDBX_INTERNAL_VAR MDBX_GetFileInformationByHandleEx
1459 mdbx_GetFileInformationByHandleEx;
1460
1461 typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)(
1462 _In_ HANDLE hFile, _Out_opt_ LPWSTR lpVolumeNameBuffer,
1463 _In_ DWORD nVolumeNameSize, _Out_opt_ LPDWORD lpVolumeSerialNumber,
1464 _Out_opt_ LPDWORD lpMaximumComponentLength,
1465 _Out_opt_ LPDWORD lpFileSystemFlags,
1466 _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize);
1467 MDBX_INTERNAL_VAR MDBX_GetVolumeInformationByHandleW
1468 mdbx_GetVolumeInformationByHandleW;
1469
1470 typedef DWORD(WINAPI *MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile,
1471 _Out_ LPWSTR lpszFilePath,
1472 _In_ DWORD cchFilePath,
1473 _In_ DWORD dwFlags);
1474 MDBX_INTERNAL_VAR MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW;
1475
1476 typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)(
1477 _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass,
1478 _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize);
1479 MDBX_INTERNAL_VAR MDBX_SetFileInformationByHandle
1480 mdbx_SetFileInformationByHandle;
1481
/* Function-pointer types for further NT / Win32 entry points, each paired
 * with a pointer variable of that type. */
1482 typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)(
1483 IN HANDLE FileHandle, IN OUT HANDLE Event,
1484 IN OUT PVOID /* PIO_APC_ROUTINE */ ApcRoutine, IN OUT PVOID ApcContext,
1485 OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode,
1486 IN OUT PVOID InputBuffer, IN ULONG InputBufferLength,
1487 OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength);
1488 MDBX_INTERNAL_VAR MDBX_NtFsControlFile mdbx_NtFsControlFile;
1489
1490 typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void);
1491 MDBX_INTERNAL_VAR MDBX_GetTickCount64 mdbx_GetTickCount64;
1492
/* Local declaration of WIN32_MEMORY_RANGE_ENTRY for builds that do not
 * target Windows 8 or newer. */
1493 #if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8
1494 typedef struct _WIN32_MEMORY_RANGE_ENTRY {
1495 PVOID VirtualAddress;
1496 SIZE_T NumberOfBytes;
1497 } WIN32_MEMORY_RANGE_ENTRY, *PWIN32_MEMORY_RANGE_ENTRY;
1498 #endif /* Windows 8.x */
1499
1500 typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)(
1501 HANDLE hProcess, ULONG_PTR NumberOfEntries,
1502 PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags);
1503 MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
1504
1505 typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT;
1506
1507 typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle,
1508 IN PLARGE_INTEGER NewSectionSize);
1509 MDBX_INTERNAL_VAR MDBX_NtExtendSection mdbx_NtExtendSection;
1510
mdbx_RunningUnderWine(void)1511 static __inline bool mdbx_RunningUnderWine(void) {
1512 return !mdbx_NtExtendSection;
1513 }
1514
/* Function-pointer type mirroring the Win32 RegGetValueA() signature, and
 * the pointer variable through which it is called. */
1515 typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey,
1516 LPCSTR lpValue, DWORD dwFlags,
1517 LPDWORD pdwType, PVOID pvData,
1518 LPDWORD pcbData);
1519 MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA;
1520
1521 #endif /* Windows */
1522
1523 #endif /* !__cplusplus */
1524
1525 /*----------------------------------------------------------------------------*/
1526
/* For MSVC 2015 and newer (_MSC_VER >= 1900): replace the pointer-sized
 * printf macros with the "I" length modifier and define the size_t
 * variants with the "z" modifier. */
1527 #if defined(_MSC_VER) && _MSC_VER >= 1900
1528 /* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros
1529 * for internal format-args checker. */
1530 #undef PRIuPTR
1531 #undef PRIiPTR
1532 #undef PRIdPTR
1533 #undef PRIxPTR
1534 #define PRIuPTR "Iu"
1535 #define PRIiPTR "Ii"
1536 #define PRIdPTR "Id"
1537 #define PRIxPTR "Ix"
1538 #define PRIuSIZE "zu"
1539 #define PRIiSIZE "zi"
1540 #define PRIdSIZE "zd"
1541 #define PRIxSIZE "zx"
1542 #endif /* fix PRI*PTR for _MSC_VER */
1543
/* Fallback: when no PRI*SIZE macros have been defined yet, alias them to
 * the pointer-sized PRI*PTR format macros. */
#if !defined(PRIuSIZE)
#define PRIxSIZE PRIxPTR
#define PRIdSIZE PRIdPTR
#define PRIiSIZE PRIiPTR
#define PRIuSIZE PRIuPTR
#endif /* PRI*SIZE macros for MSVC */
1550
1551 #ifdef _MSC_VER
1552 #pragma warning(pop)
1553 #endif
1554
/* The anchor symbol name is built by pasting MDBX_BUILD_SOURCERY onto the
 * `mdbx_sourcery_` prefix; tools (xMDBX_TOOLS) declare it as an external
 * library symbol.
 * NOTE(review): presumably referencing it makes linking fail when a tool
 * and the library come from different source builds - confirm. */
1555 #define mdbx_sourcery_anchor XCONCAT(mdbx_sourcery_, MDBX_BUILD_SOURCERY)
1556 #if defined(xMDBX_TOOLS)
1557 extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
1558 #endif
1559
1560 /*******************************************************************************
1561 *******************************************************************************
1562 *******************************************************************************
1563 *
1564 *
1565 * #### ##### ##### # #### # # ####
1566 * # # # # # # # # ## # #
1567 * # # # # # # # # # # # ####
1568 * # # ##### # # # # # # # #
1569 * # # # # # # # # ## # #
1570 * #### # # # #### # # ####
1571 *
1572 *
1573 */
1574
1575 /** \defgroup build_option Build options
1576 * The libmdbx build options.
1577 @{ */
1578
/** Durability choice: fcntl(F_FULLFSYNC), at the cost of a 5-10 times
 * slowdown. */
#define MDBX_OSX_WANNA_DURABILITY 0
/** Speed choice: fsync(), with a chance of data loss on power failure. */
#define MDBX_OSX_WANNA_SPEED 1

/** Selects \ref MDBX_OSX_WANNA_DURABILITY or \ref MDBX_OSX_WANNA_SPEED
 * for OSX & iOS; durability is the default. */
#if !defined(MDBX_OSX_SPEED_INSTEADOF_DURABILITY)
#define MDBX_OSX_SPEED_INSTEADOF_DURABILITY MDBX_OSX_WANNA_DURABILITY
#endif /* MDBX_OSX_SPEED_INSTEADOF_DURABILITY */
1589
/** Controls checking the PID to guard against reuse of a DB environment
 * after fork(). When not set by the user it is chosen automatically and the
 * _CONFIG string is prefixed with "AUTO=". */
#ifdef MDBX_ENV_CHECKPID
#define MDBX_ENV_CHECKPID_CONFIG MDBX_STRINGIFY(MDBX_ENV_CHECKPID)
#else
#if !defined(MADV_DONTFORK) && !defined(_WIN32) && !defined(_WIN64)
#define MDBX_ENV_CHECKPID 1
#else
/* PID check could be omitted:
 *  - on Linux when madvise(MADV_DONTFORK) is available, i.e. after the fork()
 *    mapped pages will not be available for child process.
 *  - in Windows where fork() not available. */
#define MDBX_ENV_CHECKPID 0
#endif
#define MDBX_ENV_CHECKPID_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_ENV_CHECKPID)
#endif /* MDBX_ENV_CHECKPID */
1605
/** Controls checking the transaction owner thread to catch use of a
 * transaction from other threads. Enabled (1) unless set by the user; the
 * _CONFIG string is prefixed with "AUTO=" when the default is used. */
#ifdef MDBX_TXN_CHECKOWNER
#define MDBX_TXN_CHECKOWNER_CONFIG MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER)
#else
#define MDBX_TXN_CHECKOWNER 1
#define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER)
#endif /* MDBX_TXN_CHECKOWNER */
1614
/** Does a system have battery-backed Real-Time Clock or just a fake.
 * Unless set by the user, defaults to 0 on Linux, NetBSD and OpenBSD and to
 * 1 elsewhere; the _CONFIG string is then prefixed with "AUTO=". */
#ifdef MDBX_TRUST_RTC
#define MDBX_TRUST_RTC_CONFIG MDBX_STRINGIFY(MDBX_TRUST_RTC)
#else
#if !(defined(__linux__) || defined(__gnu_linux__) || defined(__NetBSD__) ||   \
      defined(__OpenBSD__))
#define MDBX_TRUST_RTC 1
#else
#define MDBX_TRUST_RTC 0 /* a lot of embedded systems have a fake RTC */
#endif
#define MDBX_TRUST_RTC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TRUST_RTC)
#endif /* MDBX_TRUST_RTC */
1627
/** Controls online database auto-compactification during write-transactions.
 * Defaults to 1; any value other than 0 or 1 is rejected at build time. */
#if !defined(MDBX_ENABLE_REFUND)
#define MDBX_ENABLE_REFUND 1
#elif MDBX_ENABLE_REFUND != 0 && MDBX_ENABLE_REFUND != 1
#error MDBX_ENABLE_REFUND must be defined as 0 or 1
#endif /* MDBX_ENABLE_REFUND */
1634
/** Controls gathering statistics for page operations.
 * Defaults to 1; any value other than 0 or 1 is rejected at build time. */
#if !defined(MDBX_ENABLE_PGOP_STAT)
#define MDBX_ENABLE_PGOP_STAT 1
#elif MDBX_ENABLE_PGOP_STAT != 0 && MDBX_ENABLE_PGOP_STAT != 1
#error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1
#endif /* MDBX_ENABLE_PGOP_STAT */
1641
/** Controls use of POSIX madvise() hints and friends.
 * Defaults to 1; any value other than 0 or 1 is rejected at build time. */
#if !defined(MDBX_ENABLE_MADVISE)
#define MDBX_ENABLE_MADVISE 1
#elif MDBX_ENABLE_MADVISE != 0 && MDBX_ENABLE_MADVISE != 1
#error MDBX_ENABLE_MADVISE must be defined as 0 or 1
#endif /* MDBX_ENABLE_MADVISE */
1648
/** Disables some checks, reducing both the overhead and the probability of
 * detecting database corruption to values closer to those of LMDB.
 * Defaults to 0; any value other than 0 or 1 is rejected at build time. */
#if !defined(MDBX_DISABLE_PAGECHECKS)
#define MDBX_DISABLE_PAGECHECKS 0
#elif MDBX_DISABLE_PAGECHECKS != 0 && MDBX_DISABLE_PAGECHECKS != 1
#error MDBX_DISABLE_PAGECHECKS must be defined as 0 or 1
#endif /* MDBX_DISABLE_PAGECHECKS */
1656
/* Boolean build option, defaults to 1; any value other than 0 or 1 is
 * rejected at build time. */
#if !defined(MDBX_PNL_PREALLOC_FOR_RADIXSORT)
#define MDBX_PNL_PREALLOC_FOR_RADIXSORT 1
#elif MDBX_PNL_PREALLOC_FOR_RADIXSORT != 0 &&                                  \
    MDBX_PNL_PREALLOC_FOR_RADIXSORT != 1
#error MDBX_PNL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1
#endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */
1663
/* Boolean build option, defaults to 1; any value other than 0 or 1 is
 * rejected at build time. */
#if !defined(MDBX_DPL_PREALLOC_FOR_RADIXSORT)
#define MDBX_DPL_PREALLOC_FOR_RADIXSORT 1
#elif MDBX_DPL_PREALLOC_FOR_RADIXSORT != 0 &&                                  \
    MDBX_DPL_PREALLOC_FOR_RADIXSORT != 1
#error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1
#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
1670
/* Basically, this build-option is a TODO placeholder. It should probably be
 * replaced with MDBX_ENABLE_WRITEMAP_SPILLING offering these variants:
 *  0/OFF = Don't track dirty pages at all and don't spill them.
 *          This should be the default on Linux and maybe other systems
 *          (not sure: Darwin/OSX, FreeBSD, Windows 10) where the kernel
 *          provides proper LRU tracking and async writing on-demand.
 *  1/ON  = Lite tracking of dirty pages but with LRU labels and explicit
 *          spilling with msync(MS_ASYNC).
 * Unless set by the user, the option is 1 on Linux and 0 elsewhere; any
 * value other than 0 or 1 is rejected at build time. */
#if !defined(MDBX_FAKE_SPILL_WRITEMAP)
#if !defined(__linux__) && !defined(__gnu_linux__)
#define MDBX_FAKE_SPILL_WRITEMAP 0
#else
#define MDBX_FAKE_SPILL_WRITEMAP 1 /* msync(MS_ASYNC) is no-op on Linux */
#endif
#elif MDBX_FAKE_SPILL_WRITEMAP != 0 && MDBX_FAKE_SPILL_WRITEMAP != 1
#error MDBX_FAKE_SPILL_WRITEMAP must be defined as 0 or 1
#endif /* MDBX_FAKE_SPILL_WRITEMAP */
1688
/** Controls the sort order of internal page number lists.
 * This is a mostly experimental/advanced option, not intended for regular
 * MDBX users.
 * \warning The database format depends on this option, and libmdbx builds
 * made with different values of it are incompatible. */
#if !defined(MDBX_PNL_ASCENDING)
#define MDBX_PNL_ASCENDING 0
#elif MDBX_PNL_ASCENDING != 0 && MDBX_PNL_ASCENDING != 1
#error MDBX_PNL_ASCENDING must be defined as 0 or 1
#endif /* MDBX_PNL_ASCENDING */
1698
/** Avoid dependence on the MSVC CRT and use ntdll.dll instead.
 * Boolean option: must be 0 or 1, enabled (1) unless overridden. */
#if !defined(MDBX_WITHOUT_MSVC_CRT)
#define MDBX_WITHOUT_MSVC_CRT 1
#elif MDBX_WITHOUT_MSVC_CRT != 0 && MDBX_WITHOUT_MSVC_CRT != 1
#error MDBX_WITHOUT_MSVC_CRT must be defined as 0 or 1
#endif /* MDBX_WITHOUT_MSVC_CRT */
1705
/** Size of the buffer used while copying an environment/database file.
 * Defaults to 1 MiB; an explicit value must be a multiple of 65536 within
 * the range 65536..1073741824. */
#if !defined(MDBX_ENVCOPY_WRITEBUF)
#define MDBX_ENVCOPY_WRITEBUF 1048576u
#elif (MDBX_ENVCOPY_WRITEBUF % 65536u) != 0 ||                                 \
    MDBX_ENVCOPY_WRITEBUF < 65536u || MDBX_ENVCOPY_WRITEBUF > 1073741824u
#error MDBX_ENVCOPY_WRITEBUF must be defined in range 65536..1073741824 and be multiple of 65536
#endif /* MDBX_ENVCOPY_WRITEBUF */
1713
/** Forces assertion checking.
 * Boolean option: must be 0 or 1, disabled (0) unless overridden. */
#if !defined(MDBX_FORCE_ASSERTIONS)
#define MDBX_FORCE_ASSERTIONS 0
#elif MDBX_FORCE_ASSERTIONS != 0 && MDBX_FORCE_ASSERTIONS != 1
#error MDBX_FORCE_ASSERTIONS must be defined as 0 or 1
#endif /* MDBX_FORCE_ASSERTIONS */
1720
/** Presumed malloc size overhead for each allocation,
 * used to adjust allocations to be more aligned.
 * Defaults to two pointers' worth of bytes; an explicit value must be
 * a multiple of 4 within 0..64. */
#if defined(MDBX_ASSUME_MALLOC_OVERHEAD)
/* Explicitly configured: just validate the value. */
#if MDBX_ASSUME_MALLOC_OVERHEAD < 0 || MDBX_ASSUME_MALLOC_OVERHEAD > 64 ||     \
    MDBX_ASSUME_MALLOC_OVERHEAD % 4
#error MDBX_ASSUME_MALLOC_OVERHEAD must be defined in range 0..64 and be multiple of 4
#endif
#elif defined(__SIZEOF_POINTER__)
#define MDBX_ASSUME_MALLOC_OVERHEAD (__SIZEOF_POINTER__ * 2u)
#else
#define MDBX_ASSUME_MALLOC_OVERHEAD (sizeof(void *) * 2u)
#endif /* MDBX_ASSUME_MALLOC_OVERHEAD */
1733
/** When MDBX_DEBUG is not set explicitly, derive it from NDEBUG:
 * 0 for builds with NDEBUG defined, 1 otherwise. */
#if !defined(MDBX_DEBUG) && defined(NDEBUG)
#define MDBX_DEBUG 0
#elif !defined(MDBX_DEBUG)
#define MDBX_DEBUG 1
#endif /* MDBX_DEBUG */
1742
/** If defined then enables integration with Valgrind,
 * a memory analyzing tool.
 * The conditional below is intentionally empty: no default is provided,
 * the option is either defined by the builder or left undefined. */
#ifndef MDBX_USE_VALGRIND
#endif /* MDBX_USE_VALGRIND */

/** If defined then enables use of C11 atomics,
 * otherwise their availability is detected automatically.
 * The conditional below is intentionally empty: no default is set here. */
#ifndef MDBX_HAVE_C11ATOMICS
#endif /* MDBX_HAVE_C11ATOMICS */
1752
1753 //------------------------------------------------------------------------------
1754
/* Identifiers of the locking implementations selectable via MDBX_LOCKING.
 * Later code in this file compares MDBX_LOCKING against these values with
 * both `==` and `>`, so the numeric values themselves are significant. */

/** Win32 File Locking API for \ref MDBX_LOCKING */
#define MDBX_LOCKING_WIN32FILES -1

/** SystemV IPC semaphores for \ref MDBX_LOCKING */
#define MDBX_LOCKING_SYSV 5

/** POSIX-1 Shared anonymous semaphores for \ref MDBX_LOCKING */
#define MDBX_LOCKING_POSIX1988 1988

/** POSIX-2001 Shared Mutexes for \ref MDBX_LOCKING */
#define MDBX_LOCKING_POSIX2001 2001

/** POSIX-2008 Robust Mutexes for \ref MDBX_LOCKING */
#define MDBX_LOCKING_POSIX2008 2008

/** BeOS Benaphores, aka Futexes for \ref MDBX_LOCKING */
#define MDBX_LOCKING_BENAPHORE 1995
1772
/** Advanced: Chooses the locking implementation (autodetection by default). */
#if defined(_WIN32) || defined(_WIN64)
/* On Windows the Win32 file-locking API is selected unconditionally. */
#define MDBX_LOCKING MDBX_LOCKING_WIN32FILES
#else
#ifndef MDBX_LOCKING
/* Autodetection: process-shared POSIX mutexes are used when the platform
 * advertises them at the POSIX.1-2001 level (FreeBSD is excluded here). */
#if defined(_POSIX_THREAD_PROCESS_SHARED) &&                                   \
    _POSIX_THREAD_PROCESS_SHARED >= 200112L && !defined(__FreeBSD__)

/* Some platforms define the EOWNERDEAD error code even though they
 * don't support Robust Mutexes. If in doubt, compile with
 * -DMDBX_LOCKING=2001. */
#if defined(EOWNERDEAD) && _POSIX_THREAD_PROCESS_SHARED >= 200809L &&          \
    ((defined(_POSIX_THREAD_ROBUST_PRIO_INHERIT) &&                            \
      _POSIX_THREAD_ROBUST_PRIO_INHERIT > 0) ||                                \
     (defined(_POSIX_THREAD_ROBUST_PRIO_PROTECT) &&                            \
      _POSIX_THREAD_ROBUST_PRIO_PROTECT > 0) ||                                \
     defined(PTHREAD_MUTEX_ROBUST) || defined(PTHREAD_MUTEX_ROBUST_NP)) &&     \
    (!defined(__GLIBC__) ||                                                    \
     __GLIBC_PREREQ(2, 10) /* troubles with Robust mutexes before 2.10 */)
#define MDBX_LOCKING MDBX_LOCKING_POSIX2008
#else
#define MDBX_LOCKING MDBX_LOCKING_POSIX2001
#endif
#elif defined(__sun) || defined(__SVR4) || defined(__svr4__)
#define MDBX_LOCKING MDBX_LOCKING_POSIX1988
#else
#define MDBX_LOCKING MDBX_LOCKING_SYSV
#endif
/* The "AUTO=" prefix marks a value chosen by the autodetection above. */
#define MDBX_LOCKING_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_LOCKING)
#else
/* MDBX_LOCKING was provided by the builder: report it as is. */
#define MDBX_LOCKING_CONFIG MDBX_STRINGIFY(MDBX_LOCKING)
#endif /* MDBX_LOCKING */
#endif /* !Windows */
1805
/** Advanced: Using POSIX OFD-locks (autodetection by default).
 * MDBX_USE_OFDLOCKS_CONFIG carries the textual form of the setting, prefixed
 * with "AUTO=" when the value was autodetected. */
#if defined(MDBX_USE_OFDLOCKS)
#define MDBX_USE_OFDLOCKS_CONFIG MDBX_STRINGIFY(MDBX_USE_OFDLOCKS)
#else
#if !defined(__sun) /* OFD-lock are broken on Solaris */ &&                    \
    !defined(MDBX_SAFE4QEMU) && defined(F_OFD_GETLK) &&                        \
    defined(F_OFD_SETLK) && defined(F_OFD_SETLKW)
#define MDBX_USE_OFDLOCKS 1
#else
#define MDBX_USE_OFDLOCKS 0
#endif
#define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_USE_OFDLOCKS)
#endif /* MDBX_USE_OFDLOCKS */
1819
/** Advanced: Using sendfile() syscall (autodetection by default).
 * Enabled on Android starting from API level 21 and on any other Linux. */
#if !defined(MDBX_USE_SENDFILE)
#if defined(__ANDROID_API__)
#if __ANDROID_API__ >= 21
#define MDBX_USE_SENDFILE 1
#else
#define MDBX_USE_SENDFILE 0
#endif
#elif defined(__linux__) || defined(__gnu_linux__)
#define MDBX_USE_SENDFILE 1
#else
#define MDBX_USE_SENDFILE 0
#endif
#endif /* MDBX_USE_SENDFILE */
1830
1831 /** Advanced: Using copy_file_range() syscall (autodetection by default). */
1832 #ifndef MDBX_USE_COPYFILERANGE
1833 #if __GLIBC_PREREQ(2, 27) && defined(_GNU_SOURCE)
1834 #define MDBX_USE_COPYFILERANGE 1
1835 #else
1836 #define MDBX_USE_COPYFILERANGE 0
1837 #endif
1838 #endif /* MDBX_USE_COPYFILERANGE */
1839
/** Advanced: Using sync_file_range() syscall (autodetection by default).
 * Enabled on Android starting from API level 26, and on any other Linux
 * whose headers provide SYNC_FILE_RANGE_WRITE. */
#if !defined(MDBX_USE_SYNCFILERANGE)
#if defined(__ANDROID_API__)
#if __ANDROID_API__ >= 26
#define MDBX_USE_SYNCFILERANGE 1
#else
#define MDBX_USE_SYNCFILERANGE 0
#endif
#elif (defined(__linux__) || defined(__gnu_linux__)) &&                        \
    defined(SYNC_FILE_RANGE_WRITE)
#define MDBX_USE_SYNCFILERANGE 1
#else
#define MDBX_USE_SYNCFILERANGE 0
#endif
#endif /* MDBX_USE_SYNCFILERANGE */
1850
1851 //------------------------------------------------------------------------------
1852
/* Defaults to 1, except for the listed architectures (and Doxygen runs),
 * for which it is 0. */
#if !defined(MDBX_CPU_WRITEBACK_INCOHERENT)
#if !defined(__ia32__) && !defined(__e2k__) && !defined(__hppa) &&             \
    !defined(__hppa__) && !defined(DOXYGEN)
#define MDBX_CPU_WRITEBACK_INCOHERENT 1
#else
#define MDBX_CPU_WRITEBACK_INCOHERENT 0
#endif
#endif /* MDBX_CPU_WRITEBACK_INCOHERENT */
1861
/* Defaults to 1 on OpenBSD and to 0 everywhere else. */
#if !defined(MDBX_MMAP_INCOHERENT_FILE_WRITE)
#if !defined(__OpenBSD__)
#define MDBX_MMAP_INCOHERENT_FILE_WRITE 0
#else
#define MDBX_MMAP_INCOHERENT_FILE_WRITE 1
#endif
#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */
1869
/* Defaults to 1 when any of the MIPS-related macros below is defined,
 * and to 0 otherwise. */
#if !defined(MDBX_MMAP_INCOHERENT_CPU_CACHE)
#if !(defined(__mips) || defined(__mips__) || defined(__mips64) ||             \
      defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) ||          \
      defined(__MWERKS__) || defined(__sgi))
/* LY: assume no relevant mmap/dcache issues. */
#define MDBX_MMAP_INCOHERENT_CPU_CACHE 0
#else
/* MIPS has cache coherency issues. */
#define MDBX_MMAP_INCOHERENT_CPU_CACHE 1
#endif
#endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */
1881
/* Defaults to 1 when MDBX_WORDBITS is at least 64 (or under Doxygen),
 * otherwise to 0. MDBX_64BIT_ATOMIC_CONFIG carries the textual form of the
 * setting, prefixed with "AUTO=" when the value was autodetected. */
#if defined(MDBX_64BIT_ATOMIC)
#define MDBX_64BIT_ATOMIC_CONFIG MDBX_STRINGIFY(MDBX_64BIT_ATOMIC)
#else
#if defined(DOXYGEN) || MDBX_WORDBITS >= 64
#define MDBX_64BIT_ATOMIC 1
#else
#define MDBX_64BIT_ATOMIC 0
#endif
#define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_ATOMIC)
#endif /* MDBX_64BIT_ATOMIC */
1892
/* Autodetection consults the first *_LLONG_LOCK_FREE macro that is defined
 * (standard, GCC, then Clang flavour) and yields 1 only when its value is
 * greater than 1. Without any of them the result is 1 for MSVC, Apple and
 * Doxygen, and MDBX_64BIT_ATOMIC otherwise. MDBX_64BIT_CAS_CONFIG carries
 * the textual form, prefixed with "AUTO=" when autodetected. */
#if defined(MDBX_64BIT_CAS)
#define MDBX_64BIT_CAS_CONFIG MDBX_STRINGIFY(MDBX_64BIT_CAS)
#else
#ifdef ATOMIC_LLONG_LOCK_FREE
#if ATOMIC_LLONG_LOCK_FREE >= 2
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
#if __GCC_ATOMIC_LLONG_LOCK_FREE >= 2
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(__CLANG_ATOMIC_LLONG_LOCK_FREE)
#if __CLANG_ATOMIC_LLONG_LOCK_FREE >= 2
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(DOXYGEN) || defined(__APPLE__) || defined(_MSC_VER)
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC
#endif
#define MDBX_64BIT_CAS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_CAS)
#endif /* MDBX_64BIT_CAS */
1921
1922 #ifndef MDBX_UNALIGNED_OK
1923 #ifdef _MSC_VER
1924 #define MDBX_UNALIGNED_OK 1 /* avoid MSVC misoptimization */
1925 #elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0)
1926 #define MDBX_UNALIGNED_OK 0 /* expecting optimization is well done */
1927 #elif (defined(__ia32__) || defined(__ARM_FEATURE_UNALIGNED)) && \
1928 !defined(__ALIGNED__)
1929 #define MDBX_UNALIGNED_OK 1
1930 #else
1931 #define MDBX_UNALIGNED_OK 0
1932 #endif
1933 #endif /* MDBX_UNALIGNED_OK */
1934
/* Defaults to SYSTEM_CACHE_ALIGNMENT_SIZE when that macro is available,
 * otherwise to 128 on IA-64 and to 64 elsewhere. */
#if !defined(MDBX_CACHELINE_SIZE)
#ifdef SYSTEM_CACHE_ALIGNMENT_SIZE
#define MDBX_CACHELINE_SIZE SYSTEM_CACHE_ALIGNMENT_SIZE
#elif defined(_M_IA64) || defined(__ia64) || defined(__ia64__)
#define MDBX_CACHELINE_SIZE 128
#else
#define MDBX_CACHELINE_SIZE 64
#endif
#endif /* MDBX_CACHELINE_SIZE */
1944
1945 /** @} end of build options */
1946 /*******************************************************************************
1947 *******************************************************************************
1948 ******************************************************************************/
1949
#ifdef DOXYGEN
/* !!! Actually these are fake definitions          !!!
 * !!! for documentation generation by Doxygen only !!!
 * The macro bodies below are not valid C expressions and are meant to be
 * seen only when DOXYGEN is defined. */

/** Controls enabling of debugging features.
 *
 *  - `MDBX_DEBUG = 0` (by default) Disables any debugging features at all,
 *                     including logging and assertion controls.
 *                     Logging level and corresponding debug flags changing
 *                     by \ref mdbx_setup_debug() will not have effect.
 *  - `MDBX_DEBUG > 0` Enables code for the debugging features (logging,
 *                     assertions checking and internal audit).
 *                     Simultaneously sets the default logging level
 *                     to the `MDBX_DEBUG` value.
 *                     Also enables \ref MDBX_DBG_AUDIT if `MDBX_DEBUG >= 2`.
 *
 * \ingroup build_option */
#define MDBX_DEBUG 0...7

/** Disables using of GNU libc extensions. */
#define MDBX_DISABLE_GNU_SOURCE 0 or 1

#endif /* DOXYGEN */
1973
/* Undefine NDEBUG if debugging is enforced by a non-zero MDBX_DEBUG,
 * so that the standard assert() is not compiled out. */
#if MDBX_DEBUG
#undef NDEBUG
#endif
1978
/*----------------------------------------------------------------------------*/
/* Atomics */

/* Memory-ordering levels accepted by the atomic helpers of this file.
 * When MDBX_HAVE_C11ATOMICS is defined, mo_c11_store() and mo_c11_load()
 * translate them into C11 memory_order values. The helpers also compare
 * these values with `>`, so the declaration order is significant. */
enum MDBX_memory_order {
  mo_Relaxed,              /* memory_order_relaxed */
  mo_AcquireRelease,       /* release for stores, acquire for loads */
  mo_SequentialConsistency /* memory_order_seq_cst */
};
1987
/* 32-bit atomic cell. `weak` gives plain volatile access to the value;
 * `c11a` (present only when MDBX_HAVE_C11ATOMICS is defined) overlays the
 * same storage as a C11 _Atomic object. */
typedef union {
  volatile uint32_t weak;
#ifdef MDBX_HAVE_C11ATOMICS
  volatile _Atomic uint32_t c11a;
#endif /* MDBX_HAVE_C11ATOMICS */
} MDBX_atomic_uint32_t;
1994
/* 64-bit atomic cell. `weak` gives plain volatile access to the whole value.
 * `c11a` is present when C11 atomics are in use and either MDBX_64BIT_CAS or
 * MDBX_64BIT_ATOMIC is non-zero. The `low`/`high` 32-bit halves are present
 * whenever C11 atomics are not in use or one of those two options is zero;
 * their order follows the target byte order so that they overlay `weak`. */
typedef union {
  volatile uint64_t weak;
#if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC)
  volatile _Atomic uint64_t c11a;
#endif
#if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC
  __anonymous_struct_extension__ struct {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    MDBX_atomic_uint32_t low, high;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    MDBX_atomic_uint32_t high, low;
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
  };
#endif
} MDBX_atomic_uint64_t;
2012
2013 #ifdef MDBX_HAVE_C11ATOMICS
2014
2015 /* Crutches for C11 atomic compiler's bugs */
2016 #if defined(__e2k__) && defined(__LCC__) && __LCC__ < /* FIXME */ 127
2017 #define MDBX_c11a_ro(type, ptr) (&(ptr)->weak)
2018 #define MDBX_c11a_rw(type, ptr) (&(ptr)->weak)
2019 #elif defined(__clang__) && __clang__ < 8
2020 #define MDBX_c11a_ro(type, ptr) ((volatile _Atomic(type) *)&(ptr)->c11a)
2021 #define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a)
2022 #else
2023 #define MDBX_c11a_ro(type, ptr) (&(ptr)->c11a)
2024 #define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a)
2025 #endif /* Crutches for C11 atomic compiler's bugs */
2026
mo_c11_store(enum MDBX_memory_order fence)2027 static __always_inline memory_order mo_c11_store(enum MDBX_memory_order fence) {
2028 switch (fence) {
2029 default:
2030 assert(false);
2031 __unreachable();
2032 case mo_Relaxed:
2033 return memory_order_relaxed;
2034 case mo_AcquireRelease:
2035 return memory_order_release;
2036 case mo_SequentialConsistency:
2037 return memory_order_seq_cst;
2038 }
2039 }
2040
mo_c11_load(enum MDBX_memory_order fence)2041 static __always_inline memory_order mo_c11_load(enum MDBX_memory_order fence) {
2042 switch (fence) {
2043 default:
2044 assert(false);
2045 __unreachable();
2046 case mo_Relaxed:
2047 return memory_order_relaxed;
2048 case mo_AcquireRelease:
2049 return memory_order_acquire;
2050 case mo_SequentialConsistency:
2051 return memory_order_seq_cst;
2052 }
2053 }
2054 #endif /* MDBX_HAVE_C11ATOMICS */
2055
2056 #ifndef __cplusplus
2057
2058 MDBX_MAYBE_UNUSED static __always_inline void
mdbx_memory_fence(enum MDBX_memory_order order,bool write)2059 mdbx_memory_fence(enum MDBX_memory_order order, bool write) {
2060 #ifdef MDBX_HAVE_C11ATOMICS
2061 atomic_thread_fence(write ? mo_c11_store(order) : mo_c11_load(order));
2062 #else /* MDBX_HAVE_C11ATOMICS */
2063 mdbx_compiler_barrier();
2064 if (write &&
2065 order > (MDBX_CPU_WRITEBACK_INCOHERENT ? mo_Relaxed : mo_AcquireRelease))
2066 mdbx_memory_barrier();
2067 #endif /* MDBX_HAVE_C11ATOMICS */
2068 }
2069
/* Stores `value` into the 32-bit atomic cell `*p` and returns `value`.
 *
 * With MDBX_HAVE_C11ATOMICS the store is performed by atomic_store_explicit()
 * using the order produced by mo_c11_store(order). Otherwise the value is
 * written through the volatile `weak` member: a compiler barrier precedes the
 * write for any non-relaxed order, and mdbx_memory_fence(order, true)
 * follows it. */
MDBX_MAYBE_UNUSED static __always_inline uint32_t
atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value,
               enum MDBX_memory_order order) {
  /* the union must not be larger than the value it wraps */
  STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4);
#ifdef MDBX_HAVE_C11ATOMICS
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
  atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order));
#else  /* MDBX_HAVE_C11ATOMICS */
  if (order != mo_Relaxed)
    mdbx_compiler_barrier();
  p->weak = value;
  mdbx_memory_fence(order, true);
#endif /* MDBX_HAVE_C11ATOMICS */
  return value;
}
2085
/* Loads and returns the value of the 32-bit atomic cell `*p`.
 *
 * With MDBX_HAVE_C11ATOMICS the load is performed by atomic_load_explicit()
 * using the order produced by mo_c11_load(order). Otherwise
 * mdbx_memory_fence(order, false) is issued first, the value is read through
 * the volatile `weak` member, and a compiler barrier follows the read for any
 * non-relaxed order. */
MDBX_MAYBE_UNUSED static __always_inline uint32_t
atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) {
  /* the union must not be larger than the value it wraps */
  STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4);
#ifdef MDBX_HAVE_C11ATOMICS
  assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p)));
  return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order));
#else  /* MDBX_HAVE_C11ATOMICS */
  mdbx_memory_fence(order, false);
  const uint32_t value = p->weak;
  if (order != mo_Relaxed)
    mdbx_compiler_barrier();
  return value;
#endif /* MDBX_HAVE_C11ATOMICS */
}
2100
2101 #endif /* !__cplusplus */
2102
/*----------------------------------------------------------------------------*/
/* Basic constants and types */

/* A stamp that identifies a file as an MDBX file.
 * There's nothing special about this value other than that it is easily
 * recognizable, and it will reflect any byte order mismatches. */
#define MDBX_MAGIC UINT64_C(/* 56-bit prime */ 0x59659DBDEF4C11)

/* FROZEN: The version number for a database's datafile format. */
#define MDBX_DATA_VERSION 3
/* The version number for a database's lockfile format. */
#define MDBX_LOCK_VERSION 4

/* handle for the DB used to track free pages. */
#define FREE_DBI 0
/* handle for the default DB. */
#define MAIN_DBI 1
/* Number of DBs in metapage (free and main) - also hardcoded elsewhere */
#define CORE_DBS 2

/* Number of meta pages - also hardcoded elsewhere.
 * MIN_PAGENO is defined in terms of this value. */
#define NUM_METAS 3
2125
/* A page number in the database.
 *
 * MDBX uses 32 bit for page numbers. This limits database
 * size up to 2^44 bytes, in case of 4K pages. */
typedef uint32_t pgno_t;
/* Atomic counterpart of pgno_t. */
typedef MDBX_atomic_uint32_t atomic_pgno_t;
/* printf() conversion specifier for pgno_t. */
#define PRIaPGNO PRIu32
/* Upper bound for page numbers (2^31 - 1). */
#define MAX_PAGENO UINT32_C(0x7FFFffff)
/* Lower bound for page numbers: equals the number of meta pages. */
#define MIN_PAGENO NUM_METAS
2135
/* Threshold used to define MAX_TXNID: every valid transaction ID is below
 * this value, i.e. its upper 32 bits are never all ones. */
#define SAFE64_INVALID_THRESHOLD UINT64_C(0xffffFFFF00000000)

/* A transaction ID. */
typedef uint64_t txnid_t;
/* Atomic counterpart of txnid_t. */
typedef MDBX_atomic_uint64_t atomic_txnid_t;
/* printf() conversion specifier for txnid_t.
 * NOTE(review): this is the signed PRIi64 although txnid_t is uint64_t;
 * presumably intentional - confirm before changing. */
#define PRIaTXN PRIi64
#define MIN_TXNID UINT64_C(1)
#define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1)
/* The first NUM_METAS transaction IDs starting at MIN_TXNID end here. */
#define INITIAL_TXNID (MIN_TXNID + NUM_METAS - 1)
#define INVALID_TXNID UINT64_MAX
/* Step between transaction IDs: 1 when MDBX_64BIT_CAS is non-zero,
 * 2 otherwise, unless overridden by the builder.
 * LY: for testing non-atomic 64-bit txnid on 32-bit arches.
 * #define xMDBX_TXNID_STEP (UINT32_MAX / 3) */
#if !defined(xMDBX_TXNID_STEP)
#if !MDBX_64BIT_CAS
#define xMDBX_TXNID_STEP 2u
#else
#define xMDBX_TXNID_STEP 1u
#endif
#endif /* xMDBX_TXNID_STEP */
2155
/* Used for offsets within a single page.
 * Since memory pages are typically 4 or 8KB in size, 12-13 bits,
 * this is plenty. */
typedef uint16_t indx_t;

/* One mebibyte (2^20 bytes) as a size_t. */
#define MEGABYTE ((size_t)1 << 20)
2162
/*----------------------------------------------------------------------------*/
/* Core structures for database and shared memory (i.e. format definition) */
/* 4-byte packing is in effect for the format structures that follow;
 * do not reorder or resize their fields. */
#pragma pack(push, 4)

/* Information about a single database in the environment. */
typedef struct MDBX_db {
  uint16_t md_flags;        /* see mdbx_dbi_open */
  uint16_t md_depth;        /* depth of this tree */
  uint32_t md_xsize;        /* key-size for MDBX_DUPFIXED (LEAF2 pages) */
  pgno_t md_root;           /* the root page of this tree */
  pgno_t md_branch_pages;   /* number of internal pages */
  pgno_t md_leaf_pages;     /* number of leaf pages */
  pgno_t md_overflow_pages; /* number of overflow pages */
  uint64_t md_seq;          /* table sequence counter */
  uint64_t md_entries;      /* number of data items */
  uint64_t md_mod_txnid;    /* txnid of last committed modification */
} MDBX_db;
2180
/* Database size-related parameters (geometry); embedded into MDBX_meta. */
typedef struct MDBX_geo {
  uint16_t grow_pv;   /* datafile growth step as a 16-bit packed (exponential
                         quantized) value */
  uint16_t shrink_pv; /* datafile shrink threshold as a 16-bit packed
                         (exponential quantized) value */
  pgno_t lower;       /* minimal size of datafile in pages */
  pgno_t upper;       /* maximal size of datafile in pages */
  pgno_t now;         /* current size of datafile in pages */
  pgno_t next;        /* first unused page in the datafile,
                         but actually the file may be shorter. */
} MDBX_geo;
2193
/* Meta page content.
 * A meta page is the start point for accessing a database snapshot.
 * NOTE(review): the former wording here ("Pages 0-1 are meta pages.
 * Transaction N writes meta page (N % 2)") appears to predate NUM_METAS == 3;
 * presumably pages 0..NUM_METAS-1 are the meta pages now - confirm the
 * rotation rule against the commit code.
 *
 * The 64-bit quantities below are declared as pairs of uint32_t;
 * META_IS_STEADY reads mm_datasync_sign via unaligned_peek_u64(). */
typedef struct MDBX_meta {
  /* Stamp identifying this as an MDBX file.
   * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */
  uint32_t mm_magic_and_version[2];

  /* txnid that committed this page, the first of a two-phase-update pair */
  uint32_t mm_txnid_a[2];

  uint16_t mm_extra_flags;  /* extra DB flags, zero (nothing) for now */
  uint8_t mm_validator_id;  /* ID of checksum and page validation method,
                             * zero (nothing) for now */
  uint8_t mm_extra_pagehdr; /* extra bytes in the page header,
                             * zero (nothing) for now */

  MDBX_geo mm_geo; /* database size-related parameters */

  MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */
                            /* The size of pages used in this DB */
#define mm_psize mm_dbs[FREE_DBI].md_xsize
  MDBX_canary mm_canary;

/* Values of mm_datasync_sign: NONE and WEAK are the two non-steady states,
 * any greater value is treated as a steady signature. */
#define MDBX_DATASIGN_NONE 0u
#define MDBX_DATASIGN_WEAK 1u
#define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK)
#define META_IS_STEADY(meta)                                                   \
  SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign))
  uint32_t mm_datasync_sign[2];

  /* txnid that committed this page, the second of a two-phase-update pair */
  uint32_t mm_txnid_b[2];

  /* Number of non-meta pages which were put in GC after COW. May be 0 in case
   * DB was previously handled by libmdbx without corresponding feature.
   * This value in couple with mr_snapshot_pages_retired allows fast estimation
   * of "how much reader is restraining GC recycling". */
  uint32_t mm_pages_retired[2];

  /* The analogue /proc/sys/kernel/random/boot_id or similar to determine
   * whether the system was rebooted after the last use of the database files.
   * If there was no reboot, then there is no need to rollback to the last
   * steady sync point. Zeros mean that no relevant information is available
   * from the system. */
  bin128_t mm_bootid;

} MDBX_meta;
2242
/* Byte packing is in effect for the page header below. */
#pragma pack(1)

/* Common header for all page types. The page type depends on mp_flags.
 *
 * P_BRANCH and P_LEAF pages have unsorted 'MDBX_node's at the end, with
 * sorted mp_ptrs[] entries referring to them. Exception: P_LEAF2 pages
 * omit mp_ptrs and pack sorted MDBX_DUPFIXED values after the page header.
 *
 * P_OVERFLOW records occupy one or more contiguous pages where only the
 * first has a page header. They hold the real data of F_BIGDATA nodes.
 *
 * P_SUBP sub-pages are small leaf "pages" with duplicate data.
 * A node with flag F_DUPDATA but not F_SUBDATA contains a sub-page.
 * (Duplicate data can also go in sub-databases, which use normal pages.)
 *
 * P_META pages contain MDBX_meta, the start point of an MDBX snapshot.
 *
 * Each non-metapage up to the first unused page (MDBX_meta.mm_geo.next;
 * the historical name mm_last_pg no longer exists in MDBX_meta) is reachable
 * exactly once in the snapshot: Either used by a database or listed in a GC
 * record. */
typedef struct MDBX_page {
  union {
/* Page-state predicates comparing mp_txnid against the transaction's
 * mt_txnid or mt_front fields. */
#define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid)
#define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid)
#define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid)
#define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front)
#define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front)
    uint64_t mp_txnid;
    struct MDBX_page *mp_next; /* for in-memory list of freed pages */
  };
  uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */
#define P_BRANCH 0x01      /* branch page */
#define P_LEAF 0x02        /* leaf page */
#define P_OVERFLOW 0x04    /* overflow page */
#define P_META 0x08        /* meta page */
#define P_BAD 0x10         /* explicit flag for invalid/bad page */
#define P_LEAF2 0x20       /* for MDBX_DUPFIXED records */
#define P_SUBP 0x40        /* for MDBX_DUPSORT sub-pages */
#define P_SPILLED 0x2000   /* spilled in parent txn */
#define P_LOOSE 0x4000     /* page was dirtied then freed, can be reused */
#define P_FROZEN 0x8000    /* used for retire page with known status */
/* Every bit outside the listed flags */
#define P_ILL_BITS (~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED))
  uint16_t mp_flags;
  union {
    uint32_t mp_pages; /* number of overflow pages */
    __anonymous_struct_extension__ struct {
      indx_t mp_lower; /* lower bound of free space */
      indx_t mp_upper; /* upper bound of free space */
    };
  };
  pgno_t mp_pgno; /* page number */

/* The flexible array member is declared only for C99-or-newer compilers
 * and for MSVC in C mode. */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) ||              \
    (!defined(__cplusplus) && defined(_MSC_VER))
  indx_t mp_ptrs[] /* dynamic size */;
#endif /* C99 */
} MDBX_page;
2299
/* Size of the page header, excluding dynamic data at the end:
 * the offset of mp_ptrs within MDBX_page. */
#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs))

/* Restores the packing that was in effect before `#pragma pack(push, 4)`. */
#pragma pack(pop)
2304
#if MDBX_ENABLE_PGOP_STAT
/* Statistics of page operations overall of all (running, completed and aborted)
 * transactions. Compiled in only when MDBX_ENABLE_PGOP_STAT is non-zero;
 * every counter is a 64-bit atomic cell. */
typedef struct {
  MDBX_atomic_uint64_t newly;   /* Quantity of a new pages added */
  MDBX_atomic_uint64_t cow;     /* Quantity of pages copied for update */
  MDBX_atomic_uint64_t clone;   /* Quantity of parent's dirty pages clones
                                   for nested transactions */
  MDBX_atomic_uint64_t split;   /* Page splits */
  MDBX_atomic_uint64_t merge;   /* Page merges */
  MDBX_atomic_uint64_t spill;   /* Quantity of spilled dirty pages */
  MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */
  MDBX_atomic_uint64_t
      wops; /* Number of explicit write operations (not a pages) to a disk */
} MDBX_pgop_stat_t;
#endif /* MDBX_ENABLE_PGOP_STAT */
2321
/* Selects the inter-process lock type (mdbx_ipclock_t) and a signature
 * constant (MDBX_CLOCK_SIGN) according to the locking implementation chosen
 * by MDBX_LOCKING. Any other MDBX_LOCKING value is a build error. */
#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES
#define MDBX_CLOCK_SIGN UINT32_C(0xF10C)
typedef void mdbx_ipclock_t;
#elif MDBX_LOCKING == MDBX_LOCKING_SYSV

#define MDBX_CLOCK_SIGN UINT32_C(0xF18D)
typedef mdbx_pid_t mdbx_ipclock_t;
/* Provide a stand-in when the platform lacks the EOWNERDEAD error code. */
#ifndef EOWNERDEAD
#define EOWNERDEAD MDBX_RESULT_TRUE
#endif

#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                                \
    MDBX_LOCKING == MDBX_LOCKING_POSIX2008
#define MDBX_CLOCK_SIGN UINT32_C(0x8017)
typedef pthread_mutex_t mdbx_ipclock_t;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988
#define MDBX_CLOCK_SIGN UINT32_C(0xFC29)
typedef sem_t mdbx_ipclock_t;
#else
#error "FIXME"
#endif /* MDBX_LOCKING */
2343
/* Declared only for C builds whose MDBX_LOCKING value is greater than
 * MDBX_LOCKING_SYSV, i.e. the POSIX-based implementations among the cases
 * handled in this file. The definitions are outside this section. */
#if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus)
MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc);
MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc);
#endif /* MDBX_LOCKING */
2348
2349 /* Reader Lock Table
2350 *
2351 * Readers don't acquire any locks for their data access. Instead, they
2352 * simply record their transaction ID in the reader table. The reader
2353 * mutex is needed just to find an empty slot in the reader table. The
2354 * slot's address is saved in thread-specific data so that subsequent
2355 * read transactions started by the same thread need no further locking to
2356 * proceed.
2357 *
2358 * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data.
2359 * No reader table is used if the database is on a read-only filesystem.
2360 *
2361 * Since the database uses multi-version concurrency control, readers don't
2362 * actually need any locking. This table is used to keep track of which
2363 * readers are using data from which old transactions, so that we'll know
2364 * when a particular old transaction is no longer in use. Old transactions
2365 * that have discarded any data pages can then have those pages reclaimed
2366 * for use by a later write transaction.
2367 *
2368 * The lock table is constructed such that reader slots are aligned with the
2369 * processor's cache line size. Any slot is only ever used by one thread.
2370 * This alignment guarantees that there will be no contention or cache
2371 * thrashing as threads update their own slot info, and also eliminates
2372 * any need for locking when accessing a slot.
2373 *
2374 * A writer thread will scan every slot in the table to determine the oldest
2375 * outstanding reader transaction. Any freed pages older than this will be
2376 * reclaimed by the writer. The writer doesn't use any locks when scanning
2377 * this table. This means that there's no guarantee that the writer will
2378 * see the most up-to-date reader info, but that's not required for correct
2379 * operation - all we need is to know the upper bound on the oldest reader,
2380 * we don't care at all about the newest reader. So the only consequence of
2381 * reading stale information here is that old pages might hang around a
2382 * while longer before being reclaimed. That's actually good anyway, because
2383 * the longer we delay reclaiming old pages, the more likely it is that a
2384 * string of contiguous pages can be found after coalescing old pages from
2385 * many old transactions together. */
2386
/* The actual reader record, with cacheline padding.
 * One such record is one slot of the reader table; all fields are atomic
 * cells. */
typedef struct MDBX_reader {
  /* Current Transaction ID when this transaction began, or (txnid_t)-1.
   * Multiple readers that start at the same time will probably have the
   * same ID here. Again, it's not important to exclude them from
   * anything; all we need to know is which version of the DB they
   * started from so we can avoid overwriting any data used in that
   * particular version. */
  MDBX_atomic_uint64_t /* txnid_t */ mr_txnid;

  /* The information we store in a single slot of the reader table.
   * In addition to a transaction ID, we also record the process and
   * thread ID that owns a slot, so that we can detect stale information,
   * e.g. threads or processes that went away without cleaning up.
   *
   * NOTE: We currently don't check for stale records.
   * We simply re-init the table when we know that we're the only process
   * opening the lock file. */

  /* The thread ID of the thread owning this txn. */
  MDBX_atomic_uint64_t mr_tid;

  /* The process ID of the process owning this reader txn. */
  MDBX_atomic_uint32_t mr_pid;

  /* The number of pages used in the reader's MVCC snapshot,
   * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */
  atomic_pgno_t mr_snapshot_pages_used;
  /* Number of retired pages at the time this reader starts transaction. So,
   * at any time the difference mm_pages_retired - mr_snapshot_pages_retired
   * will give the number of pages which this reader restraining from reuse. */
  MDBX_atomic_uint64_t mr_snapshot_pages_retired;
} MDBX_reader;
2420
2421 /* The header for the reader table (a memory-mapped lock file). */
2422 typedef struct MDBX_lockinfo {
2423 /* Stamp identifying this as an MDBX file.
   * It must be set to MDBX_MAGIC with MDBX_LOCK_VERSION. */
2425 uint64_t mti_magic_and_version;
2426
2427 /* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */
2428 uint32_t mti_os_and_format;
2429
2430 /* Flags which environment was opened. */
2431 MDBX_atomic_uint32_t mti_envmode;
2432
2433 /* Threshold of un-synced-with-disk pages for auto-sync feature,
2434 * zero means no-threshold, i.e. auto-sync is disabled. */
2435 atomic_pgno_t mti_autosync_threshold;
2436
2437 /* Low 32-bit of txnid with which meta-pages was synced,
2438 * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */
2439 MDBX_atomic_uint32_t mti_meta_sync_txnid;
2440
2441 /* Period for timed auto-sync feature, i.e. at the every steady checkpoint
2442 * the mti_unsynced_timeout sets to the current_time + mti_autosync_period.
2443 * The time value is represented in a suitable system-dependent form, for
2444 * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC).
2445 * Zero means timed auto-sync is disabled. */
2446 MDBX_atomic_uint64_t mti_autosync_period;
2447
2448 /* Marker to distinguish uniqueness of DB/CLK. */
2449 MDBX_atomic_uint64_t mti_bait_uniqueness;
2450
2451 alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
2452
2453 #if MDBX_ENABLE_PGOP_STAT
2454 /* Statistics of costly ops of all (running, completed and aborted)
2455 * transactions */
2456 MDBX_pgop_stat_t mti_pgop_stat;
2457 #endif /* MDBX_ENABLE_PGOP_STAT*/
2458
2459 alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
2460
2461 /* Write transaction lock. */
2462 #if MDBX_LOCKING > 0
2463 mdbx_ipclock_t mti_wlock;
2464 #endif /* MDBX_LOCKING > 0 */
2465
2466 atomic_txnid_t mti_oldest_reader;
2467
2468 /* Timestamp of the last steady sync. Value is represented in a suitable
2469 * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or
2470 * clock_gettime(CLOCK_MONOTONIC). */
2471 MDBX_atomic_uint64_t mti_sync_timestamp;
2472
2473 /* Number un-synced-with-disk pages for auto-sync feature. */
2474 atomic_pgno_t mti_unsynced_pages;
2475
2476 /* Number of page which was discarded last time by madvise(MADV_FREE). */
2477 atomic_pgno_t mti_discarded_tail;
2478
2479 /* Timestamp of the last readers check. */
2480 MDBX_atomic_uint64_t mti_reader_check_timestamp;
2481
2482 /* Shared anchor for tracking readahead edge and enabled/disabled status. */
2483 pgno_t mti_readahead_anchor;
2484
2485 alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
2486
2487 /* Readeaders registration lock. */
2488 #if MDBX_LOCKING > 0
2489 mdbx_ipclock_t mti_rlock;
2490 #endif /* MDBX_LOCKING > 0 */
2491
2492 /* The number of slots that have been used in the reader table.
2493 * This always records the maximum count, it is not decremented
2494 * when readers release their slots. */
2495 MDBX_atomic_uint32_t mti_numreaders;
2496 MDBX_atomic_uint32_t mti_readers_refresh_flag;
2497
2498 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
2499 (!defined(__cplusplus) && defined(_MSC_VER))
2500 alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
2501 MDBX_reader mti_readers[] /* dynamic size */;
2502 #endif /* C99 */
2503 } MDBX_lockinfo;
2504
/* Lockfile format signature: version, features and field layout.
 *
 * The value mixes MDBX_CLOCK_SIGN with sizeof(MDBX_reader) and with the
 * offsets of selected MDBX_reader / MDBX_lockinfo members, each multiplied by
 * a distinct constant, so a change in the structures' layout yields
 * a different signature. */
#define MDBX_LOCK_FORMAT                                                       \
  (MDBX_CLOCK_SIGN * 27733 + (unsigned)sizeof(MDBX_reader) * 13 +              \
   (unsigned)offsetof(MDBX_reader, mr_snapshot_pages_used) * 251 +             \
   (unsigned)offsetof(MDBX_lockinfo, mti_oldest_reader) * 83 +                 \
   (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 +                    \
   (unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29)

/* Data-file stamp: MDBX_MAGIC in the upper bits, then the PNL sort order
 * (MDBX_PNL_ASCENDING * 64) and MDBX_DATA_VERSION in the low byte. */
#define MDBX_DATA_MAGIC                                                        \
  ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION)

/* Same composition as MDBX_DATA_MAGIC, but with the fixed version number 2
 * instead of MDBX_DATA_VERSION. */
#define MDBX_DATA_MAGIC_LEGACY_COMPAT                                          \
  ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + 2)

/* MDBX_MAGIC with the low byte set to 255 (no PNL-order component). */
#define MDBX_DATA_MAGIC_LEGACY_DEVEL ((MDBX_MAGIC << 8) + 255)

/* Lock-file stamp: MDBX_MAGIC in the upper bits, MDBX_LOCK_VERSION below. */
#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
2522
/* The maximum size of a database page.
 *
 * It is 64K, but value-PAGEHDRSZ must fit in MDBX_page.mp_upper.
 *
 * MDBX will use database pages < OS pages if needed.
 * That causes more I/O in write transactions: The OS must
 * know (read) the whole page before writing a partial page.
 *
 * Note that we don't currently support Huge pages. On Linux,
 * regular data files cannot use Huge pages, and in general
 * Huge pages aren't actually pageable. We rely on the OS
 * demand-pager to read our data and page it out when memory
 * pressure from other processes is high. So until OSs have
 * actual paging support for Huge pages, they're not viable. */
#define MAX_PAGESIZE MDBX_MAX_PAGESIZE
#define MIN_PAGESIZE MDBX_MIN_PAGESIZE

/* Smallest map size: the minimal number of pages of the minimal size. */
#define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO)
/* Upper bound of the map size used when MDBX_WORDBITS < 64;
 * the Windows value is lower than the one used elsewhere. */
#if defined(_WIN32) || defined(_WIN64)
#define MAX_MAPSIZE32 UINT32_C(0x38000000)
#else
#define MAX_MAPSIZE32 UINT32_C(0x7f000000)
#endif
/* Upper bound of the map size used when MDBX_WORDBITS >= 64: the maximal
 * page number times the maximal page size, computed in 64 bits. */
#define MAX_MAPSIZE64 (MAX_PAGENO * (uint64_t)MAX_PAGESIZE)

/* Select the effective map-size limit and the page-list length limit
 * according to the word size of the target. */
#if MDBX_WORDBITS >= 64
#define MAX_MAPSIZE MAX_MAPSIZE64
#define MDBX_PGL_LIMIT ((size_t)MAX_PAGENO)
#else
#define MAX_MAPSIZE MAX_MAPSIZE32
#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE)
#endif /* MDBX_WORDBITS */

/* NOTE(review): presumably the hard upper bound for the number of reader
 * slots - confirm against mdbx_env_set_maxreaders(). */
#define MDBX_READERS_LIMIT 32767
/* NOTE(review): presumably the list length from which radix sort is
 * preferred - confirm against the sorting code. */
#define MDBX_RADIXSORT_THRESHOLD 333
2558
2559 /*----------------------------------------------------------------------------*/
2560
/* A PNL is a Page Number List, a sorted array of IDs.
 * The first element of the array is a counter for how many actual page-numbers
 * are in the list. By default PNLs are sorted in descending order, which
 * allows cutting off the page with the lowest pgno (at the tail) by just
 * truncating the list. The sort order of PNLs is controlled by the
 * MDBX_PNL_ASCENDING build option. */
typedef pgno_t *MDBX_PNL;

/* Order predicates for two neighbouring PNL items, where `first` precedes
 * `last` in the array. Equal values are always treated as DISORDERED, and
 * each predicate is the exact negation of the other. */
#if MDBX_PNL_ASCENDING
#define MDBX_PNL_ORDERED(first, last) ((first) < (last))
#define MDBX_PNL_DISORDERED(first, last) ((first) >= (last))
#else
#define MDBX_PNL_ORDERED(first, last) ((first) > (last))
#define MDBX_PNL_DISORDERED(first, last) ((first) <= (last))
#endif

/* List of txnid, only for MDBX_txn.tw.lifo_reclaimed */
typedef txnid_t *MDBX_TXL;
2578
/* A Dirty-Page list item is a pgno/pointer pair,
 * plus one `unsigned` of auxiliary information. */
typedef struct MDBX_dp {
  MDBX_page *ptr;
  pgno_t pgno;
  union {
    /* The auxiliary word as a whole ... */
    unsigned extra;
    /* ... and the same storage viewed as bit-fields. */
    __anonymous_struct_extension__ struct {
      unsigned multi : 1; /* NOTE(review): presumably marks a multi-page
                             (overflow) item - confirm */
      unsigned lru : 31;  /* NOTE(review): presumably an LRU stamp, see
                             MDBX_txn.tw.dirtylru - confirm */
    };
  };
} MDBX_dp;
2591
/* A DPL (dirty-page list) is a sorted array of MDBX_dp items
 * preceded by a small header. */
typedef struct MDBX_dpl {
  unsigned sorted; /* NOTE(review): presumably the count of leading items
                      known to be sorted - confirm */
  unsigned length; /* NOTE(review): presumably the count of items in use -
                      confirm */
  unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
  /* The items are declared as a flexible array member, which is only
   * available when compiling as C99 or later, or as C with MSVC. */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) ||              \
    (!defined(__cplusplus) && defined(_MSC_VER))
  MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
#endif
} MDBX_dpl;
2602
/* PNL sizes.
 * The *_INITIAL values are the granule minus 2 items and minus the assumed
 * malloc overhead expressed in items. */
#define MDBX_PNL_GRANULATE 1024
#define MDBX_PNL_INITIAL                                                       \
  (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))

/* TXL sizes, computed the same way but in txnid_t items. */
#define MDBX_TXL_GRANULATE 32
#define MDBX_TXL_INITIAL                                                       \
  (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
#define MDBX_TXL_MAX                                                           \
  ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))

/* PNL accessors. `pl` points at the size counter: pl[-1] holds the allocated
 * length, pl[0] the number of items, and the items occupy pl[1]..pl[size].
 * All of these are lvalues/addresses into the list and evaluate `pl` once or
 * twice, so `pl` should be free of side effects. */
#define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1])
#define MDBX_PNL_SIZE(pl) ((pl)[0])
#define MDBX_PNL_FIRST(pl) ((pl)[1])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_SIZE(pl)])
#define MDBX_PNL_BEGIN(pl) (&(pl)[1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1])

/* The items holding the least and the most page number,
 * which depends on the sort order. */
#if MDBX_PNL_ASCENDING
#define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl)
#define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl)
#else
#define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl)
#define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl)
#endif

/* Size in bytes of the counter plus the items (pl[-1] is not included). */
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0)
2631
2632 /*----------------------------------------------------------------------------*/
2633 /* Internal structures */
2634
/* Auxiliary DB info.
 * The information here is mostly static/read-only. There is
 * only a single copy of this record in the environment
 * (see MDBX_env.me_dbxs, which MDBX_txn.mt_dbxs also refers to by type). */
typedef struct MDBX_dbx {
  MDBX_val md_name;                /* name of the database */
  MDBX_cmp_func *md_cmp;           /* function for comparing keys */
  MDBX_cmp_func *md_dcmp;          /* function for comparing data items */
  size_t md_klen_min, md_klen_max; /* min/max key length for the database */
  size_t md_vlen_min,
      md_vlen_max; /* min/max value/data length for the database */
} MDBX_dbx;
2646
/* A database transaction.
 * Every operation requires a transaction handle.
 *
 * The common part is followed by a union holding either the read-only
 * state (`to`) or the write-transaction state (`tw`). The flag macros
 * defined inside the structure body are ordinary file-scope macros; they are
 * only placed next to the fields they describe. */
struct MDBX_txn {
#define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31)
  /* NOTE(review): presumably holds MDBX_MT_SIGNATURE while the handle is
   * valid - confirm against the validation code. */
  uint32_t mt_signature;

  /* Transaction Flags */
  /* mdbx_txn_begin() flags */
#define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE)
#define MDBX_TXN_RW_BEGIN_FLAGS                                                \
  (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY)
  /* Additional flag for mdbx_sync_locked() */
#define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000)

  /* internal txn flags */
#define MDBX_TXN_FINISHED 0x01  /* txn is finished or never began */
#define MDBX_TXN_ERROR 0x02     /* txn is unusable after an error */
#define MDBX_TXN_DIRTY 0x04     /* must write, even if dirty list is empty */
#define MDBX_TXN_SPILLS 0x08    /* txn or a parent has spilled pages */
#define MDBX_TXN_HAS_CHILD 0x10 /* txn has an MDBX_txn.mt_child */
  /* most operations on the txn are currently illegal */
#define MDBX_TXN_BLOCKED                                                       \
  (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_HAS_CHILD)

  /* The union of all internal txn flags. */
#define TXN_FLAGS                                                              \
  (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS |     \
   MDBX_TXN_HAS_CHILD)

  /* Compile-time check: the internal flags must not share bits with the
   * mdbx_txn_begin() flags, and none of them may use MDBX_SHRINK_ALLOWED. */
#if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) ||       \
    ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) &         \
     MDBX_SHRINK_ALLOWED)
#error "Oops, some flags overlapped or wrong"
#endif
  uint32_t mt_flags;

  MDBX_txn *mt_parent; /* parent of a nested txn */
  /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */
  MDBX_txn *mt_child;
  /* Geometry as seen by this txn; see the two aliases below. */
  MDBX_geo mt_geo;
  /* next unallocated page */
#define mt_next_pgno mt_geo.next
  /* corresponding to the current size of datafile */
#define mt_end_pgno mt_geo.now

  /* The ID of this transaction. IDs are integers incrementing from 1.
   * Only committed write transactions increment the ID. If a transaction
   * aborts, the ID may be re-used by the next writer. */
  txnid_t mt_txnid;
  /* NOTE(review): not described in this part of the file - see its users. */
  txnid_t mt_front;

  MDBX_env *mt_env; /* the DB environment */
  /* Array of records for each DB known in the environment. */
  MDBX_dbx *mt_dbxs;
  /* Array of MDBX_db records for each known DB */
  MDBX_db *mt_dbs;
  /* Array of sequence numbers for each DB handle */
  unsigned *mt_dbiseqs;

  /* Transaction DBI Flags */
#define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */
#define DBI_STALE MDBX_DBI_STALE /* Named-DB record is older than txnID */
#define DBI_FRESH MDBX_DBI_FRESH /* Named-DB handle opened in this txn */
#define DBI_CREAT MDBX_DBI_CREAT /* Named-DB handle created in this txn */
#define DBI_VALID 0x10           /* DB handle is valid, see also DB_VALID */
#define DBI_USRVALID 0x20        /* As DB_VALID, but not set for FREE_DBI */
#define DBI_AUDITED 0x40         /* Internal flag for accounting during audit */
  /* Array of flags for each DB */
  uint8_t *mt_dbistate;
  /* Number of DB records in use, or 0 when the txn is finished.
   * This number only ever increments until the txn finishes; we
   * don't decrement it when individual DB handles are closed. */
  MDBX_dbi mt_numdbs;
  size_t mt_owner; /* thread ID that owns this transaction */
  MDBX_canary mt_canary;
  void *mt_userctx; /* User-settable context */

  /* Mode-specific state: `to` for read txns, `tw` for write txns. */
  union {
    struct {
      /* For read txns: This thread/txn's reader table slot, or NULL. */
      MDBX_reader *reader;
    } to;
    struct {
      /* In write txns, array of cursors for each DB */
      MDBX_cursor **cursors;
      pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
      txnid_t last_reclaimed;   /* ID of last used record */
#if MDBX_ENABLE_REFUND
      pgno_t loose_refund_wl /* FIXME: describe */;
#endif /* MDBX_ENABLE_REFUND */
      /* dirtylist room: Dirty array size - dirty pages visible to this txn.
       * Includes ancestor txns' dirty pages not hidden by other txns'
       * dirty/spilled pages. Thus commit(nested txn) has room to merge
       * dirtylist into mt_parent after freeing hidden mt_parent pages. */
      unsigned dirtyroom;
      /* a sequence to spilling dirty page with LRU policy */
      unsigned dirtylru;
      /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
      MDBX_dpl *dirtylist;
      /* The list of reclaimed txns from GC */
      MDBX_TXL lifo_reclaimed;
      /* The list of pages that became unused during this transaction. */
      MDBX_PNL retired_pages;
      /* The list of loose pages that became unused and may be reused
       * in this transaction, linked through `mp_next`. */
      MDBX_page *loose_pages;
      /* Number of loose pages (tw.loose_pages) */
      unsigned loose_count;
      /* The sorted list of dirty pages we temporarily wrote to disk
       * because the dirty list was full. page numbers in here are
       * shifted left by 1, deleted slots have the LSB set. */
      MDBX_PNL spill_pages;
      /* NOTE(review): not described here; by its name it tracks the least
       * position removed from spill_pages - confirm. */
      unsigned spill_least_removed;
    } tw;
  };
};
2762
/* Capacity of the page stack kept inside each cursor, i.e. the number of
 * elements in MDBX_cursor.mc_pg[] and MDBX_cursor.mc_ki[]. */
#if MDBX_WORDBITS >= 64
#define CURSOR_STACK 32
#else
#define CURSOR_STACK 24
#endif

/* Forward declaration: MDBX_cursor refers to its xcursor by pointer. */
struct MDBX_xcursor;
2770
/* Cursors are used for all DB operations.
 * A cursor holds a path of (page pointer, key index) from the DB
 * root to a position in the DB, plus other state. MDBX_DUPSORT
 * cursors include an xcursor to the current data item. Write txns
 * track their cursors and keep them up to date when data moves.
 * Exception: An xcursor's pointer to a P_SUBP page can be stale.
 * (A node with F_DUPDATA but no F_SUBDATA contains a subpage).
 *
 * The signature and flag macros defined inside the structure body are
 * ordinary file-scope macros placed next to the fields they describe. */
struct MDBX_cursor {
  /* Values for mc_signature.
   * NOTE(review): the names suggest live / ready-to-close / waiting-for-end-
   * of-txn states - confirm against the code that assigns them. */
#define MDBX_MC_LIVE UINT32_C(0xFE05D5B1)
#define MDBX_MC_READY4CLOSE UINT32_C(0x2817A047)
#define MDBX_MC_WAIT4EOT UINT32_C(0x90E297A7)
  uint32_t mc_signature;
  /* The database handle this cursor operates on */
  MDBX_dbi mc_dbi;
  /* Next cursor on this DB in this txn */
  MDBX_cursor *mc_next;
  /* Backup of the original cursor if this cursor is a shadow */
  MDBX_cursor *mc_backup;
  /* Context used for databases with MDBX_DUPSORT, otherwise NULL */
  struct MDBX_xcursor *mc_xcursor;
  /* The transaction that owns this cursor */
  MDBX_txn *mc_txn;
  /* The database record for this cursor */
  MDBX_db *mc_db;
  /* The database auxiliary record for this cursor */
  MDBX_dbx *mc_dbx;
  /* The mt_dbistate for this database */
  uint8_t *mc_dbistate;
  unsigned mc_snum; /* number of pushed pages */
  unsigned mc_top;  /* index of top page, normally mc_snum-1 */

  /* Cursor state flags. */
#define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */
#define C_EOF 0x02         /* No more data */
#define C_SUB 0x04         /* Cursor is a sub-cursor */
#define C_DEL 0x08         /* last op was a cursor_del */
#define C_UNTRACK 0x10     /* Un-track cursor when closing */
#define C_RECLAIMING 0x20  /* GC lookup is prohibited */
#define C_GCFREEZE 0x40    /* reclaimed_pglist must not be updated */

  /* Cursor checking flags. */
#define C_COPYING 0x100  /* skip key-value length check (copying simplify) */
#define C_UPDATING 0x200 /* update/rebalance pending */
#define C_RETIRING 0x400 /* refs to child pages may be invalid */
#define C_SKIPORD 0x800  /* don't check keys ordering */

  unsigned mc_flags;              /* the C_xxx state and checking flags above */
  MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */
  indx_t mc_ki[CURSOR_STACK];     /* stack of page indices */
};
2821
/* Context for sorted-dup records.
 * We could have gone to a fully recursive design, with arbitrarily
 * deep nesting of sub-databases. But for now we only handle these
 * levels - main DB, optional sub-DB, sorted-duplicate DB.
 *
 * The structure bundles a complete cursor with its own MDBX_db and MDBX_dbx
 * records, by value. */
typedef struct MDBX_xcursor {
  /* A sub-cursor for traversing the Dup DB */
  MDBX_cursor mx_cursor;
  /* The database record for this Dup DB */
  MDBX_db mx_db;
  /* The auxiliary DB record for this Dup DB */
  MDBX_dbx mx_dbx;
} MDBX_xcursor;
2834
/* A cursor together with its sorted-dup context and a user context pointer,
 * all held by value in one object. `outer` is the first member, so a pointer
 * to the couple and a pointer to its `outer` cursor refer to the same
 * address. */
typedef struct MDBX_cursor_couple {
  MDBX_cursor outer; /* the main cursor */
  void *mc_userctx;  /* User-settable context */
  MDBX_xcursor inner; /* the sorted-dup context (sub-cursor and its records) */
} MDBX_cursor_couple;
2840
/* The database environment.
 *
 * The fields are arranged into a "mostly static" part, a "mostly volatile"
 * part, optional debugging fields, and a trailing stub that is sized to hold
 * an MDBX_lockinfo for the lck-less mode. The macros defined inside the
 * structure body are ordinary file-scope macros placed next to the fields
 * they describe. */
struct MDBX_env {
  /* ----------------------------------------------------- mostly static part */
#define MDBX_ME_SIGNATURE UINT32_C(0x9A899641)
  /* NOTE(review): presumably holds MDBX_ME_SIGNATURE while the handle is
   * valid - confirm against the validation code. */
  MDBX_atomic_uint32_t me_signature;
  /* Failed to update the meta page. Probably an I/O error. */
#define MDBX_FATAL_ERROR UINT32_C(0x80000000)
  /* Some fields are initialized. */
#define MDBX_ENV_ACTIVE UINT32_C(0x20000000)
  /* me_txkey is set */
#define MDBX_ENV_TXKEY UINT32_C(0x10000000)
  /* Legacy MDBX_MAPASYNC (prior v0.9) */
#define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000)
  /* The union of the internal flag bits above
   * (MDBX_DEPRECATED_MAPASYNC is not part of it). */
#define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY)
  uint32_t me_flags;
  mdbx_mmap_t me_dxb_mmap; /* The main data file */
  /* Shortcuts into me_dxb_mmap. */
#define me_map me_dxb_mmap.dxb
#define me_lazy_fd me_dxb_mmap.fd
  /* NOTE(review): presumably a second data-file handle used for synchronous
   * writes, as opposed to me_lazy_fd - confirm. */
  mdbx_filehandle_t me_dsync_fd;
  mdbx_mmap_t me_lck_mmap; /* The lock file */
  /* Shortcut into me_lck_mmap. */
#define me_lfd me_lck_mmap.fd
  struct MDBX_lockinfo *me_lck;

  unsigned me_psize;        /* DB page size, initialized from me_os_psize */
  unsigned me_leaf_nodemax; /* max size of a leaf-node */
  uint8_t me_psize2log;     /* log2 of DB page size */
  int8_t me_stuck_meta; /* recovery-only: target meta page or less than zero */
  uint16_t me_merge_threshold,
      me_merge_threshold_gc;  /* pages emptier than this are candidates for
                                 merging */
  unsigned me_os_psize;       /* OS page size, from mdbx_syspagesize() */
  unsigned me_maxreaders;     /* size of the reader table */
  MDBX_dbi me_maxdbs;         /* size of the DB table */
  uint32_t me_pid;            /* process ID of this env */
  mdbx_thread_key_t me_txkey; /* thread-key for readers */
  char *me_pathname;          /* path to the DB files */
  void *me_pbuf;              /* scratch area for DUPSORT put() */
  MDBX_txn *me_txn0;          /* preallocated write transaction */

  MDBX_dbx *me_dbxs;    /* array of static DB info */
  uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */
  unsigned *me_dbiseqs; /* array of dbi sequence numbers */
  unsigned
      me_maxgc_ov1page;    /* Number of pgno_t fit in a single overflow page */
  uint32_t me_live_reader; /* have liveness lock in reader table */
  void *me_userctx;        /* User-settable context */
  MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */

  /* Tunable options.
   * NOTE(review): the individual limits are not described in this part of
   * the file; see the option setters/getters for their meaning. */
  struct {
    unsigned dp_reserve_limit;
    unsigned rp_augment_limit;
    unsigned dp_limit;
    unsigned dp_initial;
    uint8_t dp_loose_limit;
    uint8_t spill_max_denominator;
    uint8_t spill_min_denominator;
    uint8_t spill_parent4child_denominator;
    unsigned merge_threshold_16dot16_percent;
    union {
      /* All the bits below as a single word. */
      unsigned all;
      /* tracks options with non-auto values but tuned by user */
      struct {
        unsigned dp_limit : 1;
      } non_auto;
    } flags;
  } me_options;

  /* struct me_dbgeo used for accepting db-geo params from user for the new
   * database creation, i.e. when mdbx_env_set_geometry() was called before
   * mdbx_env_open(). */
  struct {
    size_t lower;  /* minimal size of datafile */
    size_t upper;  /* maximal size of datafile */
    size_t now;    /* current size of datafile */
    size_t grow;   /* step to grow datafile */
    size_t shrink; /* threshold to shrink datafile */
  } me_dbgeo;

  /* Present only with the SysV flavour of locking. */
#if MDBX_LOCKING == MDBX_LOCKING_SYSV
  union {
    key_t key;
    int semid;
  } me_sysv_ipc;
#endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */

  /* NOTE(review): by its name, the link to the next environment in a list
   * kept by the lck subsystem - confirm. */
  MDBX_env *me_lcklist_next;

  /* --------------------------------------------------- mostly volatile part */

  MDBX_txn *me_txn; /* current write transaction */
  mdbx_fastmutex_t me_dbi_lock;
  MDBX_dbi me_numdbs; /* number of DBs opened */

  MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */
  unsigned me_dp_reserve_len;
  /* PNL of pages that became unused in a write txn */
  MDBX_PNL me_retired_pages;

  /* The remap guard has a different type on Windows, where one more lock is
   * added as a workaround. */
#if defined(_WIN32) || defined(_WIN64)
  MDBX_srwlock me_remap_guard;
  /* Workaround for LockFileEx and WriteFile multithread bug */
  CRITICAL_SECTION me_windowsbug_lock;
#else
  mdbx_fastmutex_t me_remap_guard;
#endif

  /* -------------------------------------------------------------- debugging */

#if MDBX_DEBUG
  MDBX_assert_func *me_assert_func; /*  Callback for assertion failures */
#endif
#ifdef MDBX_USE_VALGRIND
  int me_valgrind_handle;
#endif
#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
  pgno_t me_poison_edge;
#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */

  /* xMDBX_DEBUG_SPILLING defaults to 0 (off) when not set by the build. */
#ifndef xMDBX_DEBUG_SPILLING
#define xMDBX_DEBUG_SPILLING 0
#endif
#if xMDBX_DEBUG_SPILLING == 2
  unsigned debug_dirtied_est, debug_dirtied_act;
#endif /* xMDBX_DEBUG_SPILLING */

  /* ------------------------------------------------- stub for lck-less mode */
  /* An array of 64-bit atomics whose element count is
   * sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1 divided (rounding down)
   * by the element size. */
  MDBX_atomic_uint64_t
      x_lckless_stub[(sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) /
                     sizeof(MDBX_atomic_uint64_t)];
};
2971
2972 #ifndef __cplusplus
2973 /*----------------------------------------------------------------------------*/
2974 /* Debug and Logging stuff */
2975
/* Initial value for mdbx_runtime_flags, derived from the MDBX_DEBUG build
 * level: MDBX_DBG_ASSERT when MDBX_DEBUG > 0, plus MDBX_DBG_AUDIT when
 * MDBX_DEBUG > 1.
 *
 * The whole replacement list is parenthesized so that the macro keeps its
 * value when embedded into a larger expression (e.g. next to `*`, `&` or a
 * cast), instead of having its leading/trailing operands captured by the
 * neighbouring operators. */
#define MDBX_RUNTIME_FLAGS_INIT                                                \
  (((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT)
2978
/* Global debug/logging state, defined elsewhere. */
/* Bit set of MDBX_DBG_xxx flags; tested by mdbx_assert_enabled(),
 * mdbx_audit_enabled() and mdbx_jitter4testing() in MDBX_DEBUG builds. */
extern uint8_t mdbx_runtime_flags;
/* Current logging threshold; compared against by mdbx_log_enabled(). */
extern uint8_t mdbx_loglevel;
/* NOTE(review): presumably the user-installed log sink used by
 * mdbx_debug_log() - confirm against its definition. */
extern MDBX_debug_func *mdbx_debug_logger;
2982
mdbx_jitter4testing(bool tiny)2983 MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) {
2984 #if MDBX_DEBUG
2985 if (MDBX_DBG_JITTER & mdbx_runtime_flags)
2986 mdbx_osal_jitter(tiny);
2987 #else
2988 (void)tiny;
2989 #endif
2990 }
2991
/* printf-style logging entry point used by the logging macros of this file.
 * `level` is one of the MDBX_LOG_xxx values, `function`/`line` identify the
 * call site (mdbx_debug_extra_print() passes NULL and 0), and `fmt` with the
 * following arguments forms the message. MDBX_PRINTF_ARGS(4, 5) marks
 * argument 4 as the format string and argument 5 as the first variadic
 * one. */
MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5)
    mdbx_debug_log(int level, const char *function, int line, const char *fmt,
                   ...) MDBX_PRINTF_ARGS(4, 5);
/* The same as mdbx_debug_log(), but takes the arguments as a va_list. */
MDBX_INTERNAL_FUNC void mdbx_debug_log_va(int level, const char *function,
                                          int line, const char *fmt,
                                          va_list args);
2998
/* Tells whether messages of the level `msg` pass the current threshold,
 * i.e. whether msg <= mdbx_loglevel; the result is hinted as unlikely.
 * The argument is parenthesized, so a compound expression passed as the
 * level is compared as a whole rather than being split by the precedence
 * of `<=`. */
#define mdbx_log_enabled(msg) unlikely((msg) <= mdbx_loglevel)
3000
/* mdbx_assert_enabled() / mdbx_audit_enabled():
 *  - in MDBX_DEBUG builds both are decided at run time by the corresponding
 *    MDBX_DBG_ASSERT / MDBX_DBG_AUDIT bit of mdbx_runtime_flags;
 *  - otherwise auditing is always off, and assertions are a compile-time
 *    constant: on unless NDEBUG is defined, or when MDBX_FORCE_ASSERTIONS
 *    is non-zero. */
#if MDBX_DEBUG

#define mdbx_assert_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_ASSERT)

#define mdbx_audit_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_AUDIT)

#else /* MDBX_DEBUG */

#define mdbx_audit_enabled() (0)

#if !defined(NDEBUG) || MDBX_FORCE_ASSERTIONS
#define mdbx_assert_enabled() (1)
#else
#define mdbx_assert_enabled() (0)
#endif /* NDEBUG */

#endif /* MDBX_DEBUG */
3018
/* Assertion-failure handler invoked by mdbx_ensure_msg().
 * For non-debug Android builds it is a macro that hands the message and the
 * "func:line" location to __android_log_assert() with the "mdbx" tag; the
 * `env` argument is not used in that case. Otherwise it is a regular
 * function declared here and defined elsewhere. */
#if !MDBX_DEBUG && defined(__ANDROID_API__)
#define mdbx_assert_fail(env, msg, func, line)                                 \
  __android_log_assert(msg, "mdbx", "%s:%u", func, line)
#else
void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func,
                      int line);
#endif
3026
/* Logging macros, one per level. Common properties:
 *  - each expands to a single do { } while (0) statement;
 *  - the message is emitted through mdbx_debug_log() only when
 *    mdbx_log_enabled() holds for the level;
 *  - the EXTRA, TRACE, DEBUG and VERBOSE levels are additionally gated by
 *    the compile-time MDBX_DEBUG value, so the call is dead code when
 *    MDBX_DEBUG is 0;
 *  - `fmt` must be a string literal for the macros that append "\n" by
 *    literal concatenation (all but the two "extra" ones);
 *  - `__VA_ARGS__` follows a comma, so at least one argument after `fmt`
 *    has to be supplied. */

/* EXTRA level, with the call-site location, no newline appended. */
#define mdbx_debug_extra(fmt, ...)                                             \
  do {                                                                         \
    if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_EXTRA))                        \
      mdbx_debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__);    \
  } while (0)

/* EXTRA level, without the call-site location (NULL function, line 0) and
 * with no newline appended. */
#define mdbx_debug_extra_print(fmt, ...)                                       \
  do {                                                                         \
    if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_EXTRA))                        \
      mdbx_debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__);               \
  } while (0)

/* TRACE level. */
#define mdbx_trace(fmt, ...)                                                   \
  do {                                                                         \
    if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_TRACE))                        \
      mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n",             \
                     __VA_ARGS__);                                             \
  } while (0)

/* DEBUG level. */
#define mdbx_debug(fmt, ...)                                                   \
  do {                                                                         \
    if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_DEBUG))                        \
      mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n",             \
                     __VA_ARGS__);                                             \
  } while (0)

/* VERBOSE level. */
#define mdbx_verbose(fmt, ...)                                                 \
  do {                                                                         \
    if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_VERBOSE))                      \
      mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n",           \
                     __VA_ARGS__);                                             \
  } while (0)

/* NOTICE level; available regardless of MDBX_DEBUG. */
#define mdbx_notice(fmt, ...)                                                  \
  do {                                                                         \
    if (mdbx_log_enabled(MDBX_LOG_NOTICE))                                     \
      mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n",            \
                     __VA_ARGS__);                                             \
  } while (0)

/* WARN level; available regardless of MDBX_DEBUG. */
#define mdbx_warning(fmt, ...)                                                 \
  do {                                                                         \
    if (mdbx_log_enabled(MDBX_LOG_WARN))                                       \
      mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n",              \
                     __VA_ARGS__);                                             \
  } while (0)

/* ERROR level; available regardless of MDBX_DEBUG. */
#define mdbx_error(fmt, ...)                                                   \
  do {                                                                         \
    if (mdbx_log_enabled(MDBX_LOG_ERROR))                                      \
      mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n",             \
                     __VA_ARGS__);                                             \
  } while (0)
3080
/* Logs a message at the MDBX_LOG_FATAL level. Unlike the other logging
 * macros there is no mdbx_log_enabled() check: the message is always passed
 * to mdbx_debug_log(), with "\n" appended to the literal `fmt`.
 *
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * the same as the sibling logging macros, so `mdbx_fatal(...);` expands to
 * exactly one statement and may be used as the body of an unbraced
 * if/else. As with the siblings, at least one argument after `fmt` is
 * required. */
#define mdbx_fatal(fmt, ...)                                                   \
  do {                                                                         \
    mdbx_debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); \
  } while (0)
3083
/* Unconditional check: evaluates `expr` exactly once and, when it is false,
 * reports `msg` with the call-site function and line through
 * mdbx_assert_fail(). Not affected by mdbx_assert_enabled(). */
#define mdbx_ensure_msg(env, expr, msg)                                        \
  do {                                                                         \
    if (unlikely(!(expr)))                                                     \
      mdbx_assert_fail(env, msg, __func__, __LINE__);                          \
  } while (0)

/* The same, using the text of the expression itself as the message. */
#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr)

/* assert(3) variant in environment context.
 * `expr` is evaluated only when mdbx_assert_enabled() holds, so it must not
 * carry side effects the program relies on. */
#define mdbx_assert(env, expr)                                                 \
  do {                                                                         \
    if (mdbx_assert_enabled())                                                 \
      mdbx_ensure(env, expr);                                                  \
  } while (0)

/* assert(3) variant in cursor context */
#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr)

/* assert(3) variant in transaction context */
#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr)

/* Outside of the tools (xMDBX_TOOLS not defined) the standard assert() is
 * replaced by mdbx_assert() with a NULL environment. */
#ifndef xMDBX_TOOLS /* Avoid using internal mdbx_assert() */
#undef assert
#define assert(expr) mdbx_assert(NULL, expr)
#endif
3109
3110 /*----------------------------------------------------------------------------*/
3111 /* Cache coherence and mmap invalidation */
3112
/* Expands to a full memory barrier when the build declares the CPU
 * write-back as incoherent (MDBX_CPU_WRITEBACK_INCOHERENT), and to a
 * compiler-only barrier otherwise. */
#if MDBX_CPU_WRITEBACK_INCOHERENT
#define mdbx_flush_incoherent_cpu_writeback() mdbx_memory_barrier()
#else
#define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier()
#endif /* MDBX_CPU_WRITEBACK_INCOHERENT */
3118
3119 MDBX_MAYBE_UNUSED static __inline void
mdbx_flush_incoherent_mmap(void * addr,size_t nbytes,const intptr_t pagesize)3120 mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) {
3121 #if MDBX_MMAP_INCOHERENT_FILE_WRITE
3122 char *const begin = (char *)(-pagesize & (intptr_t)addr);
3123 char *const end =
3124 (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1));
3125 int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? errno : 0;
3126 mdbx_assert(nullptr, err == 0);
3127 (void)err;
3128 #else
3129 (void)pagesize;
3130 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */
3131
3132 #if MDBX_MMAP_INCOHERENT_CPU_CACHE
3133 #ifdef DCACHE
3134 /* MIPS has cache coherency issues.
3135 * Note: for any nbytes >= on-chip cache size, entire is flushed. */
3136 cacheflush(addr, nbytes, DCACHE);
3137 #else
3138 #error "Oops, cacheflush() not available"
3139 #endif /* DCACHE */
3140 #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */
3141
3142 #if !MDBX_MMAP_INCOHERENT_FILE_WRITE && !MDBX_MMAP_INCOHERENT_CPU_CACHE
3143 (void)addr;
3144 (void)nbytes;
3145 #endif
3146 }
3147
3148 /*----------------------------------------------------------------------------*/
3149 /* Internal prototypes */
3150
/* Prototypes of internal functions defined elsewhere.
 * NOTE(review): the summaries below are derived from the names and
 * signatures only - confirm against the definitions. */

/* Presumably scans the reader table for slots of dead processes; `rlocked`
 * looks like "the reader lock is already held" and `dead` like an optional
 * output counter. */
MDBX_INTERNAL_FUNC int mdbx_cleanup_dead_readers(MDBX_env *env, int rlocked,
                                                 int *dead);
/* Presumably allocates a thread key bound to the reader-slot range
 * [begin, end) and stores it into *key. */
MDBX_INTERNAL_FUNC int mdbx_rthc_alloc(mdbx_thread_key_t *key,
                                       MDBX_reader *begin, MDBX_reader *end);
/* Presumably releases a key obtained from mdbx_rthc_alloc(). */
MDBX_INTERNAL_FUNC void mdbx_rthc_remove(const mdbx_thread_key_t key);

/* Presumably the global constructor/destructor and the per-thread
 * destructor of the same facility. */
MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void);
MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void);
MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr);
3160
3161 #endif /* !__cplusplus */
3162
/* True for any return code other than MDBX_RESULT_TRUE and
 * MDBX_RESULT_FALSE. Evaluates `rc` twice. */
#define MDBX_IS_ERROR(rc)                                                      \
  ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE)

/* Internal error codes, not exposed outside libmdbx */
#define MDBX_NO_ROOT (MDBX_LAST_ADDED_ERRCODE + 10)

/* Debugging output value of a cursor DBI: Negative in a sub-cursor. */
#define DDBI(mc)                                                               \
  (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)

/* Key size which fits in a DKBUF (debug key buffer).
 *
 * DKBUF declares a local buffer named _kbuf made of two halves of
 * DKBUF_MAX * 2 + 1 bytes each: DKEY() dumps into the first half and DVAL()
 * into the second, so one key and one value may be formatted for the same
 * log call. Both rely on a DKBUF declaration being in scope. */
#define DKBUF_MAX 511
#define DKBUF char _kbuf[DKBUF_MAX * 4 + 2]
#define DKEY(x) mdbx_dump_val(x, _kbuf, DKBUF_MAX * 2 + 1)
#define DVAL(x) mdbx_dump_val(x, _kbuf + DKBUF_MAX * 2 + 1, DKBUF_MAX * 2 + 1)

/* Debug-only flavours: with MDBX_DEBUG they are the macros above; without it
 * no buffer is declared and both dumps become the constant string "-"
 * (the argument is then not evaluated). */
#if MDBX_DEBUG
#define DKBUF_DEBUG DKBUF
#define DKEY_DEBUG(x) DKEY(x)
#define DVAL_DEBUG(x) DVAL(x)
#else
#define DKBUF_DEBUG ((void)(0))
#define DKEY_DEBUG(x) ("-")
#define DVAL_DEBUG(x) ("-")
#endif
3188
/* An invalid page number.
 * Mainly used to denote an empty tree.
 * The value is pgno_t with all bits set. */
#define P_INVALID (~(pgno_t)0)

/* Test if the flags f are set in a flag word w.
 * True only when every bit of f is set; evaluates f twice. */
#define F_ISSET(w, f) (((w) & (f)) == (f))

/* Round n up to an even number. */
#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */

/* Default size of memory map.
 * This is certainly too small for any actual applications. Apps should
 * always set the size explicitly using mdbx_env_set_geometry(). */
#define DEFAULT_MAPSIZE MEGABYTE

/* Number of slots in the reader table.
 * This value was chosen somewhat arbitrarily. The 61 is a prime number,
 * and such readers plus a couple mutexes fit into single 4KB page.
 * Applications should set the table size using mdbx_env_set_maxreaders(). */
#define DEFAULT_READERS 61

/* Page-type tests on the mp_flags of the page `p`. The LEAF2 and OVERFLOW
 * tests are additionally hinted as unlikely. */
/* Test if a page is a leaf page */
#define IS_LEAF(p) (((p)->mp_flags & P_LEAF) != 0)
/* Test if a page is a LEAF2 page */
#define IS_LEAF2(p) unlikely(((p)->mp_flags & P_LEAF2) != 0)
/* Test if a page is a branch page */
#define IS_BRANCH(p) (((p)->mp_flags & P_BRANCH) != 0)
/* Test if a page is an overflow page */
#define IS_OVERFLOW(p) unlikely(((p)->mp_flags & P_OVERFLOW) != 0)
/* Test if a page is a sub page */
#define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0)

/* The page-type bits of mp_flags (branch, leaf, leaf2, overflow);
 * all other flag bits, including P_SUBP, are masked out. */
#define PAGETYPE(p) ((p)->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW))
3222
/* Header for a single key/data pair within a page.
 * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2.
 * We guarantee 2-byte alignment for 'MDBX_node's.
 *
 * Leaf node flags describe node contents. F_BIGDATA says the node's
 * data part is the page number of an overflow page with actual data.
 * F_DUPDATA and F_SUBDATA can be combined giving duplicate data in
 * a sub-page/sub-database, and named databases (just F_SUBDATA).
 *
 * The order of the header fields depends on the byte order of the target:
 * the same four fields are declared in the opposite order for
 * little-endian and for other targets. The first 32-bit word is shared
 * (as a union) between the data size and a page number. */
typedef struct MDBX_node {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  union {
    uint32_t mn_dsize;
    uint32_t mn_pgno32;
  };
  uint8_t mn_flags; /* see mdbx_node flags */
  uint8_t mn_extra;
  uint16_t mn_ksize; /* key size */
#else
  uint16_t mn_ksize; /* key size */
  uint8_t mn_extra;
  uint8_t mn_flags; /* see mdbx_node flags */
  union {
    uint32_t mn_pgno32;
    uint32_t mn_dsize;
  };
#endif /* __BYTE_ORDER__ */

  /* mdbx_node Flags */
#define F_BIGDATA 0x01 /* data put on overflow page */
#define F_SUBDATA 0x02 /* data is a sub-database */
#define F_DUPDATA 0x04 /* data has duplicates */

  /* valid flags for mdbx_node_add() */
#define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDBX_RESERVE | MDBX_APPEND)

  /* The payload is declared as a flexible array member, which is only
   * available when compiling as C99 or later, or as C with MSVC. */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) ||              \
    (!defined(__cplusplus) && defined(_MSC_VER))
  uint8_t mn_data[] /* key and data are appended here */;
#endif /* C99 */
} MDBX_node;
3263
/* Table flags covered by the "persistent" mask: key/value ordering and
 * layout options. */
#define DB_PERSISTENT_FLAGS                                                    \
  (MDBX_REVERSEKEY | MDBX_DUPSORT | MDBX_INTEGERKEY | MDBX_DUPFIXED |          \
   MDBX_INTEGERDUP | MDBX_REVERSEDUP)

/* mdbx_dbi_open() flags */
#define DB_USABLE_FLAGS (DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE)

#define DB_VALID 0x8000 /* DB handle is valid, for me_dbflags */
#define DB_INTERNAL_FLAGS DB_VALID

/* Compile-time guard: internal bits must not collide with usable ones. */
#if DB_INTERNAL_FLAGS & DB_USABLE_FLAGS
#error "Oops, some flags overlapped or wrong"
#endif
/* Compile-time guard: every persistent flag must also be a usable flag. */
#if DB_PERSISTENT_FLAGS & ~DB_USABLE_FLAGS
#error "Oops, some flags overlapped or wrong"
#endif
3280
/* max number of pages to commit in one writev() call;
 * lowered to IOV_MAX when that macro is defined and smaller than 64 */
#define MDBX_COMMIT_PAGES 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */
#undef MDBX_COMMIT_PAGES
#define MDBX_COMMIT_PAGES IOV_MAX
#endif
3287
/* Three-way comparison of two values:
 *
 *                 / -1, a < b
 *  CMP2INT(a,b) = |  0, a == b
 *                 \  1, a > b
 *
 * Both variants may evaluate each argument more than once, so the
 * arguments must be free of side effects. */
#ifndef __e2k__
/* LY: fast enough on most systems */
#define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b))
#else
/* LY: more parallelable on VLIW Elbrus */
#define CMP2INT(a, b) (((a) > (b)) - ((b) > (a)))
#endif
3302
/* Do not spill pages to disk when the txn is getting full; the operation may fail instead */
3304 #define MDBX_NOSPILL 0x8000
3305
3306 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t
pgno_add(pgno_t base,pgno_t augend)3307 pgno_add(pgno_t base, pgno_t augend) {
3308 assert(base <= MAX_PAGENO);
3309 return (augend < MAX_PAGENO - base) ? base + augend : MAX_PAGENO;
3310 }
3311
3312 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t
pgno_sub(pgno_t base,pgno_t subtrahend)3313 pgno_sub(pgno_t base, pgno_t subtrahend) {
3314 assert(base >= MIN_PAGENO);
3315 return (subtrahend < base - MIN_PAGENO) ? base - subtrahend : MIN_PAGENO;
3316 }
3317
3318 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool
is_powerof2(size_t x)3319 is_powerof2(size_t x) {
3320 return (x & (x - 1)) == 0;
3321 }
3322
3323 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t
floor_powerof2(size_t value,size_t granularity)3324 floor_powerof2(size_t value, size_t granularity) {
3325 assert(is_powerof2(granularity));
3326 return value & ~(granularity - 1);
3327 }
3328
3329 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t
ceil_powerof2(size_t value,size_t granularity)3330 ceil_powerof2(size_t value, size_t granularity) {
3331 return floor_powerof2(value + granularity - 1, granularity);
3332 }
3333
/* Return the index of the single set bit of 'value', i.e. log2(value).
 * The assertions require 'value' to be a non-zero power of two below
 * INT32_MAX. One of three implementations is selected at compile time:
 * a compiler builtin, the MSVC bit-scan intrinsic, or a portable
 * table lookup. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned
log2n_powerof2(size_t value) {
  assert(value > 0 && value < INT32_MAX && is_powerof2(value));
  /* isolating the lowest set bit must reproduce the value itself */
  assert((value & -(int32_t)value) == value);
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
  return __builtin_ctzl(value);
#elif defined(_MSC_VER)
  unsigned long index;
  _BitScanForward(&index, (unsigned long)value);
  return index;
#else
  /* Multiply by a 32-bit de Bruijn constant and use the top five bits of
   * the product as an index into the lookup table. */
  static const uint8_t debruijn_ctz32[32] = {
      0,  1,  28, 2,  29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4,  8,
      31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6,  11, 5,  10, 9};
  return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27];
#endif
}
3351
/* Only a subset of the mdbx_env flags can be changed
 * at runtime. Changing other flags requires closing the
 * environment and re-opening it with the new flags. */
#define ENV_CHANGEABLE_FLAGS                                                   \
  (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC |             \
   MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE)
/* The remaining environment flags, which are not part of the
 * runtime-changeable subset above. */
#define ENV_CHANGELESS_FLAGS                                                   \
  (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \
   MDBX_LIFORECLAIM | MDBX_EXCLUSIVE)
/* Union of both groups. */
#define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS)
3362
#if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS
/* Container for compile-time consistency checks of the DBI limit and the
 * flag masks; the body consists solely of static assertions. */
MDBX_MAYBE_UNUSED static void static_checks(void) {
  /* MDBX_MAX_DBI plus the CORE_DBS built-in tables must equal INT16_MAX. */
  STATIC_ASSERT_MSG(INT16_MAX - CORE_DBS == MDBX_MAX_DBI,
                    "Oops, MDBX_MAX_DBI or CORE_DBS?");
  /* MDBX_DB_ACCEDE and MDBX_CREATE must be the only bits that the table
   * flag masks and the environment flag masks have in common. */
  STATIC_ASSERT_MSG((unsigned)(MDBX_DB_ACCEDE | MDBX_CREATE) ==
                        ((DB_USABLE_FLAGS | DB_INTERNAL_FLAGS) &
                         (ENV_USABLE_FLAGS | ENV_INTERNAL_FLAGS)),
                    "Oops, some flags overlapped or wrong");
  /* Internal environment flags must not collide with the usable ones. */
  STATIC_ASSERT_MSG((ENV_INTERNAL_FLAGS & ENV_USABLE_FLAGS) == 0,
                    "Oops, some flags overlapped or wrong");
}
#endif /* Disabled for MSVC 19.0 (VisualStudio 2015) */
3375
3376 #ifdef __cplusplus
3377 }
3378 #endif
3379
/* Traced wrapper around ASAN_POISON_MEMORY_REGION(): emits an mdbx_trace()
 * record with the address, the size and the source line of the call site,
 * then invokes the underlying macro. 'addr' and 'size' are expanded twice. */
#define MDBX_ASAN_POISON_MEMORY_REGION(addr, size)                             \
  do {                                                                         \
    mdbx_trace("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr),          \
               (size_t)(size), __LINE__);                                      \
    ASAN_POISON_MEMORY_REGION(addr, size);                                     \
  } while (0)

/* Traced wrapper around ASAN_UNPOISON_MEMORY_REGION(), with the same
 * tracing and double expansion of the arguments as the macro above. */
#define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size)                           \
  do {                                                                         \
    mdbx_trace("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr),        \
               (size_t)(size), __LINE__);                                      \
    ASAN_UNPOISON_MEMORY_REGION(addr, size);                                   \
  } while (0)
3393
/* Pairs a flag bit with the name printed for it. */
typedef struct flagbit {
  int bit;          /* flag value to test for */
  const char *name; /* human-readable name of the flag */
} flagbit;

/* Names of the table flags; the list ends with a zero-bit sentinel entry,
 * which is what the printing loop in process_db() stops on. */
const flagbit dbflags[] = {{MDBX_DUPSORT, "dupsort"},
                           {MDBX_INTEGERKEY, "integerkey"},
                           {MDBX_REVERSEKEY, "reversekey"},
                           {MDBX_DUPFIXED, "dupfixed"},
                           {MDBX_REVERSEDUP, "reversedup"},
                           {MDBX_INTEGERDUP, "integerdup"},
                           {0, nullptr}};
3406
3407 #if defined(_WIN32) || defined(_WIN64)
3408 /*
3409 * POSIX getopt for Windows
3410 *
3411 * AT&T Public License
3412 *
3413 * Code given out at the 1985 UNIFORUM conference in Dallas.
3414 */
3415
3416 /*----------------------------------------------------------------------------*/
/* Microsoft compiler generates a lot of warnings for its own headers... */
3418
3419 #ifdef _MSC_VER
3420 #pragma warning(push, 1)
3421 #pragma warning(disable : 4548) /* expression before comma has no effect; \
3422 expected expression with side - effect */
3423 #pragma warning(disable : 4530) /* C++ exception handler used, but unwind \
3424 * semantics are not enabled. Specify /EHsc */
3425 #pragma warning(disable : 4577) /* 'noexcept' used with no exception handling \
3426 * mode specified; termination on exception is \
3427 * not guaranteed. Specify /EHsc */
3428 #if !defined(_CRT_SECURE_NO_WARNINGS)
3429 #define _CRT_SECURE_NO_WARNINGS
3430 #endif
3431 #endif /* _MSC_VER (warnings) */
3432
3433 #include <stdio.h>
3434 #include <string.h>
3435
3436 #ifdef _MSC_VER
3437 #pragma warning(pop)
3438 #endif
3439 /*----------------------------------------------------------------------------*/
3440
3441 #ifndef NULL
3442 #define NULL 0
3443 #endif
3444
3445 #ifndef EOF
3446 #define EOF (-1)
3447 #endif
3448
int optind = 1;
int optopt;
char *optarg;

/* Minimal getopt() replacement for Windows builds.
 *
 * Scans argv[] for single-letter options described by 'opts', where a
 * letter followed by ':' takes an argument. Each call returns the next
 * option letter, '?' after printing a diagnostic for an unknown option or a
 * missing argument, or EOF when the first non-option word, a lone "-", or
 * "--" is reached ("--" itself is consumed).
 *
 * State shared with the caller: 'optind' is the index of the argv element
 * being examined, 'optopt' is the option letter just examined, and 'optarg'
 * points to the argument of the returned option (NULL for plain flags). */
int getopt(int argc, char *const argv[], const char *opts) {
  /* Offset of the next letter inside argv[optind]; kept between calls so
   * that clustered options ("-abc") are handed out one per call. */
  static int sp = 1;

  if (sp == 1) {
    /* About to start a new argv element: is it an option word at all? */
    if (optind >= argc)
      return EOF;
    const char *const word = argv[optind];
    if (word[0] != '-' || word[1] == '\0')
      return EOF;
    if (strcmp(word, "--") == 0) {
      optind++;
      return EOF;
    }
  }

  const int letter = argv[optind][sp];
  optopt = letter;

  /* ':' is the argument marker in 'opts' and never a valid option letter. */
  const char *const spec = (letter == ':') ? NULL : strchr(opts, letter);
  if (spec == NULL) {
    fprintf(stderr, "%s: %s -- %c\n", argv[0], "illegal option", letter);
    sp += 1;
    if (argv[optind][sp] == '\0') {
      optind++;
      sp = 1;
    }
    return '?';
  }

  if (spec[1] != ':') {
    /* Plain flag: step to the next letter of the cluster or the next word. */
    sp += 1;
    if (argv[optind][sp] == '\0') {
      sp = 1;
      optind++;
    }
    optarg = NULL;
    return letter;
  }

  /* The option takes an argument: either the rest of this word
   * or the whole next word. */
  if (argv[optind][sp + 1] != '\0') {
    optarg = &argv[optind++][sp + 1];
  } else if (++optind >= argc) {
    fprintf(stderr, "%s: %s -- %c\n", argv[0], "option requires an argument",
            letter);
    sp = 1;
    return '?';
  } else {
    optarg = argv[optind++];
  }
  sp = 1;
  return letter;
}
3495
3496 static volatile BOOL user_break;
ConsoleBreakHandlerRoutine(DWORD dwCtrlType)3497 static BOOL WINAPI ConsoleBreakHandlerRoutine(DWORD dwCtrlType) {
3498 (void)dwCtrlType;
3499 user_break = 1;
3500 return true;
3501 }
3502
GetMilliseconds(void)3503 static uint64_t GetMilliseconds(void) {
3504 LARGE_INTEGER Counter, Frequency;
3505 return (QueryPerformanceFrequency(&Frequency) &&
3506 QueryPerformanceCounter(&Counter))
3507 ? Counter.QuadPart * 1000ul / Frequency.QuadPart
3508 : 0;
3509 }
3510
3511 #else /* WINDOWS */
3512
/* Set to non-zero once a signal has requested an interruption. */
static volatile sig_atomic_t user_break;

/* Signal handler: only records the request in 'user_break'; the signal
 * number itself is not examined. */
static void signal_handler(int sig) {
  user_break = 1;
  (void)sig;
}
3518
3519 #endif /* !WINDOWS */
3520
/* Distinct process exit codes, all derived from EXIT_FAILURE.
 * EXIT_FAILURE_MDBX is what va_log() exits with on a fatal log message.
 * NOTE(review): the meaning of the other codes is inferred from their names
 * only - confirm at the call sites. */
#define EXIT_INTERRUPTED (EXIT_FAILURE + 4)
#define EXIT_FAILURE_SYS (EXIT_FAILURE + 3)
#define EXIT_FAILURE_MDBX (EXIT_FAILURE + 2)
#define EXIT_FAILURE_CHECK_MAJOR (EXIT_FAILURE + 1)
#define EXIT_FAILURE_CHECK_MINOR EXIT_FAILURE
3526
/* Per-table statistics accumulated by pgvisitor() during the page walk. */
typedef struct {
  const char *name; /* table name used in messages and for lookups */
  struct {
    /* branch/leaf: pages of that kind; large_volume: pages occupied by
     * large spans; large_count: number of such spans */
    uint64_t branch, large_count, large_volume, leaf;
    /* sub-page counters are incremented once per visit */
    uint64_t subleaf_dupsort, leaf_dupfixed, subleaf_dupfixed;
    /* total: pages newly claimed in the page map; empty: pages with no
     * payload and at most one entry; other: meta, broken and unknown pages */
    uint64_t total, empty, other;
  } pages;
  uint64_t payload_bytes; /* payload + header bytes of well-formed pages */
  uint64_t lost_bytes;    /* bytes unaccounted for on "misused" pages */
} walk_dbi_t;
3537
/* Global state of the page walk. */
struct {
  /* One slot per page: 0 = not seen yet, a positive value = index + 1 of
   * the owning entry in dbi[], -1 = listed in GC (set by handle_freedb()). */
  short *pagemap;
  uint64_t total_payload_bytes; /* sum over all tables, see pgvisitor() */
  uint64_t pgcount;             /* number of pages visited by the walk */
  walk_dbi_t
      dbi[MDBX_MAX_DBI + CORE_DBS + /* account pseudo-entry for meta */ 1];
} walk;

/* Shortcuts for the fixed entries: GC, main table and the meta pseudo-entry
 * that directly follows the core tables. */
#define dbi_free walk.dbi[FREE_DBI]
#define dbi_main walk.dbi[MAIN_DBI]
#define dbi_meta walk.dbi[CORE_DBS]
3549
/* Tool-wide state. */
int envflags = MDBX_RDONLY | MDBX_EXCLUSIVE; /* initial environment flags */
MDBX_env *env;
MDBX_txn *txn;
MDBX_envinfo envinfo;
/* userdb_count: named tables found in the main table;
 * skipped_subdb: tables skipped because of the 'only_subdb' filter */
size_t userdb_count, skipped_subdb;
/* gc_pages / reclaimable_pages are accumulated by handle_freedb();
 * alloc_pages / backed_pages serve as upper bounds for valid page numbers */
uint64_t total_unused_bytes, reclaimable_pages, gc_pages, alloc_pages,
    unused_pages, backed_pages;
unsigned verbose; /* verbosity level; higher values print more detail */
/* ignore_wrong_order: use a permissive comparator and skip order checks;
 * quiet: suppress regular output; dont_traversal: the page walk was not
 * done, so process_db() skips its page-count cross-checks */
bool ignore_wrong_order, quiet, dont_traversal;
const char *only_subdb; /* when set, only the table with this name is processed */
int stuck_meta = -1;    /* NOTE(review): not used in this part of the file -
                           presumably a user-selected meta page; confirm */
3561
/* One entry per distinct problem caption in the current problem list. */
struct problem {
  struct problem *pr_next; /* next entry of the singly-linked list */
  size_t count;            /* how many times this caption was reported */
  const char *caption;     /* caption pointer, compared by identity */
};

/* Head of the current problem list; see problems_push()/problems_pop(). */
struct problem *problems_list;
/* Running problem counters: overall, and split by data trees vs. GC tree. */
unsigned total_problems, data_tree_problems, gc_tree_problems;
3570
print(const char * msg,...)3571 static void MDBX_PRINTF_ARGS(1, 2) print(const char *msg, ...) {
3572 if (!quiet) {
3573 va_list args;
3574
3575 fflush(stderr);
3576 va_start(args, msg);
3577 vfprintf(stdout, msg, args);
3578 va_end(args);
3579 }
3580 }
3581
/* Core log sink.
 *
 * level  severity of the message; MDBX_LOG_ERROR and more severe levels are
 *        counted in 'total_problems' and go to stderr, the rest to stdout
 * msg    printf-style format string; a trailing newline is appended when the
 *        format does not already end with one
 * args   arguments for 'msg'
 *
 * The message is printed only when not in quiet mode and when the verbosity
 * setting admits the level. A fatal message terminates the process with
 * EXIT_FAILURE_MDBX. */
static void va_log(MDBX_log_level_t level, const char *msg, va_list args) {
  static const char *const prefixes[] = {
      "!!!fatal: ", " ! " /* error */, " ~ " /* warning */,
      " " /* notice */, " // " /* verbose */, " //// " /* debug */,
      " ////// " /* trace */
  };

  FILE *out = stdout;
  if (level <= MDBX_LOG_ERROR) {
    total_problems++;
    out = stderr;
  }

  if (!quiet && verbose + 1 >= (unsigned)level) {
    fflush(nullptr);
    /* never index past the prefix table, whatever level is passed in */
    if ((size_t)level < sizeof(prefixes) / sizeof(prefixes[0]))
      fputs(prefixes[level], out);
    vfprintf(out, msg, args);
    /* an empty format has no last character to inspect */
    const size_t length = strlen(msg);
    if (length == 0 || msg[length - 1] != '\n')
      fputc('\n', out);
    fflush(nullptr);
  }

  if (level == MDBX_LOG_FATAL) {
    exit(EXIT_FAILURE_MDBX);
    abort();
  }
}
3609
error(const char * msg,...)3610 static void MDBX_PRINTF_ARGS(1, 2) error(const char *msg, ...) {
3611 va_list args;
3612 va_start(args, msg);
3613 va_log(MDBX_LOG_ERROR, msg, args);
3614 va_end(args);
3615 }
3616
/* Logging callback: forwards messages below MDBX_LOG_EXTRA severity to
 * va_log() and drops the rest. The reporting function name and line number
 * are not used. */
static void logger(MDBX_log_level_t level, const char *function, int line,
                   const char *msg, va_list args) {
  (void)function;
  (void)line;
  if (level >= MDBX_LOG_EXTRA)
    return;
  va_log(level, msg, args);
}
3624
check_user_break(void)3625 static int check_user_break(void) {
3626 switch (user_break) {
3627 case 0:
3628 return MDBX_SUCCESS;
3629 case 1:
3630 print(" - interrupted by signal\n");
3631 fflush(nullptr);
3632 user_break = 2;
3633 }
3634 return MDBX_EINTR;
3635 }
3636
pagemap_cleanup(void)3637 static void pagemap_cleanup(void) {
3638 for (size_t i = CORE_DBS + /* account pseudo-entry for meta */ 1;
3639 i < ARRAY_LENGTH(walk.dbi); ++i) {
3640 if (walk.dbi[i].name) {
3641 mdbx_free((void *)walk.dbi[i].name);
3642 walk.dbi[i].name = nullptr;
3643 }
3644 }
3645
3646 mdbx_free(walk.pagemap);
3647 walk.pagemap = nullptr;
3648 }
3649
pagemap_lookup_dbi(const char * dbi_name,bool silent)3650 static walk_dbi_t *pagemap_lookup_dbi(const char *dbi_name, bool silent) {
3651 static walk_dbi_t *last;
3652
3653 if (dbi_name == MDBX_PGWALK_MAIN)
3654 return &dbi_main;
3655 if (dbi_name == MDBX_PGWALK_GC)
3656 return &dbi_free;
3657 if (dbi_name == MDBX_PGWALK_META)
3658 return &dbi_meta;
3659
3660 if (last && strcmp(last->name, dbi_name) == 0)
3661 return last;
3662
3663 walk_dbi_t *dbi = walk.dbi + CORE_DBS + /* account pseudo-entry for meta */ 1;
3664 for (; dbi < ARRAY_END(walk.dbi) && dbi->name; ++dbi) {
3665 if (strcmp(dbi->name, dbi_name) == 0)
3666 return last = dbi;
3667 }
3668
3669 if (verbose > 0 && !silent) {
3670 print(" - found '%s' area\n", dbi_name);
3671 fflush(nullptr);
3672 }
3673
3674 if (dbi == ARRAY_END(walk.dbi))
3675 return nullptr;
3676
3677 dbi->name = mdbx_strdup(dbi_name);
3678 return last = dbi;
3679 }
3680
3681 static void MDBX_PRINTF_ARGS(4, 5)
problem_add(const char * object,uint64_t entry_number,const char * msg,const char * extra,...)3682 problem_add(const char *object, uint64_t entry_number, const char *msg,
3683 const char *extra, ...) {
3684 total_problems++;
3685
3686 if (!quiet) {
3687 int need_fflush = 0;
3688 struct problem *p;
3689
3690 for (p = problems_list; p; p = p->pr_next)
3691 if (p->caption == msg)
3692 break;
3693
3694 if (!p) {
3695 p = mdbx_calloc(1, sizeof(*p));
3696 p->caption = msg;
3697 p->pr_next = problems_list;
3698 problems_list = p;
3699 need_fflush = 1;
3700 }
3701
3702 p->count++;
3703 if (verbose > 1) {
3704 print(" %s #%" PRIu64 ": %s", object, entry_number, msg);
3705 if (extra) {
3706 va_list args;
3707 printf(" (");
3708 va_start(args, extra);
3709 vfprintf(stdout, extra, args);
3710 va_end(args);
3711 printf(")");
3712 }
3713 printf("\n");
3714 if (need_fflush)
3715 fflush(nullptr);
3716 }
3717 }
3718 }
3719
problems_push(void)3720 static struct problem *problems_push(void) {
3721 struct problem *p = problems_list;
3722 problems_list = nullptr;
3723 return p;
3724 }
3725
problems_pop(struct problem * list)3726 static size_t problems_pop(struct problem *list) {
3727 size_t count = 0;
3728
3729 if (problems_list) {
3730 int i;
3731
3732 print(" - problems: ");
3733 for (i = 0; problems_list; ++i) {
3734 struct problem *p = problems_list->pr_next;
3735 count += problems_list->count;
3736 print("%s%s (%" PRIuPTR ")", i ? ", " : "", problems_list->caption,
3737 problems_list->count);
3738 mdbx_free(problems_list);
3739 problems_list = p;
3740 }
3741 print("\n");
3742 fflush(nullptr);
3743 }
3744
3745 problems_list = list;
3746 return count;
3747 }
3748
/* Page-walk callback: accounts one visited page (or span of pages) in the
 * per-table statistics and in walk.pagemap, and reports structural problems.
 *
 * pgno             number of the first page of the visited span
 * pgnumber         number of pages in the span; when zero, the page-map and
 *                  size accounting are skipped
 *                  (NOTE(review): presumably zero for sub-pages - confirm)
 * ctx              unused
 * deep             depth of the page in the tree; more than 42 is rejected
 * dbi_name_or_tag  table name or one of the MDBX_PGWALK_* tags
 * page_size        expected total size to compare the byte counts against
 * pagetype         kind of the page
 * err              per-page status supplied by the walker
 * nentries         number of entries (keys for branch pages)
 * payload_bytes, header_bytes, unused_bytes
 *                  byte accounting of the page
 *
 * Returns MDBX_CORRUPTED when the tree is too deep, MDBX_ENOMEM when no walk
 * entry is available for the table, MDBX_RESULT_TRUE for an already visited
 * branch page, MDBX_SUCCESS for an already visited page of another kind,
 * and otherwise the result of check_user_break(). */
static int pgvisitor(const uint64_t pgno, const unsigned pgnumber,
                     void *const ctx, const int deep,
                     const char *const dbi_name_or_tag, const size_t page_size,
                     const MDBX_page_type_t pagetype, const MDBX_error_t err,
                     const size_t nentries, const size_t payload_bytes,
                     const size_t header_bytes, const size_t unused_bytes) {
  (void)ctx;
  /* every problem found below is booked to exactly one of the two
   * counters, depending on which tree is being walked */
  const bool is_gc_tree = dbi_name_or_tag == MDBX_PGWALK_GC;
  if (deep > 42) {
    problem_add("deep", deep, "too large", nullptr);
    data_tree_problems += !is_gc_tree;
    gc_tree_problems += is_gc_tree;
    return MDBX_CORRUPTED /* avoid infinite loop/recursion */;
  }

  walk_dbi_t *dbi = pagemap_lookup_dbi(dbi_name_or_tag, false);
  if (!dbi) {
    data_tree_problems += !is_gc_tree;
    gc_tree_problems += is_gc_tree;
    return MDBX_ENOMEM;
  }

  const size_t page_bytes = payload_bytes + header_bytes + unused_bytes;
  walk.pgcount += pgnumber;

  /* classify the page and bump the matching per-table counter */
  const char *pagetype_caption;
  bool branch = false;
  switch (pagetype) {
  default:
    problem_add("page", pgno, "unknown page-type", "type %u, deep %i",
                (unsigned)pagetype, deep);
    pagetype_caption = "unknown";
    dbi->pages.other += pgnumber;
    data_tree_problems += !is_gc_tree;
    gc_tree_problems += is_gc_tree;
    break;
  case MDBX_page_broken:
    pagetype_caption = "broken";
    dbi->pages.other += pgnumber;
    data_tree_problems += !is_gc_tree;
    gc_tree_problems += is_gc_tree;
    break;
  case MDBX_subpage_broken:
    pagetype_caption = "broken-subpage";
    data_tree_problems += !is_gc_tree;
    gc_tree_problems += is_gc_tree;
    break;
  case MDBX_page_meta:
    pagetype_caption = "meta";
    dbi->pages.other += pgnumber;
    break;
  case MDBX_page_large:
    pagetype_caption = "large";
    /* volume counts pages, count counts spans */
    dbi->pages.large_volume += pgnumber;
    dbi->pages.large_count += 1;
    break;
  case MDBX_page_branch:
    pagetype_caption = "branch";
    dbi->pages.branch += pgnumber;
    branch = true;
    break;
  case MDBX_page_leaf:
    pagetype_caption = "leaf";
    dbi->pages.leaf += pgnumber;
    break;
  case MDBX_page_dupfixed_leaf:
    pagetype_caption = "leaf-dupfixed";
    dbi->pages.leaf_dupfixed += pgnumber;
    break;
  case MDBX_subpage_leaf:
    pagetype_caption = "subleaf-dupsort";
    dbi->pages.subleaf_dupsort += 1;
    break;
  case MDBX_subpage_dupfixed_leaf:
    pagetype_caption = "subleaf-dupfixed";
    dbi->pages.subleaf_dupfixed += 1;
    break;
  }

  if (pgnumber) {
    if (verbose > 3 && (!only_subdb || strcmp(only_subdb, dbi->name) == 0)) {
      if (pgnumber == 1)
        print(" %s-page %" PRIu64, pagetype_caption, pgno);
      else
        print(" %s-span %" PRIu64 "[%u]", pagetype_caption, pgno, pgnumber);
      print(" of %s: header %" PRIiPTR ", %s %" PRIiPTR ", payload %" PRIiPTR
            ", unused %" PRIiPTR ", deep %i\n",
            dbi->name, header_bytes,
            (pagetype == MDBX_page_branch) ? "keys" : "entries", nentries,
            payload_bytes, unused_bytes, deep);
    }

    /* claim every page of the span in the page map; a non-zero slot means
     * the page has been visited before */
    bool already_used = false;
    for (unsigned n = 0; n < pgnumber; ++n) {
      uint64_t spanpgno = pgno + n;
      if (spanpgno >= alloc_pages) {
        problem_add("page", spanpgno, "wrong page-no",
                    "%s-page: %" PRIu64 " > %" PRIu64 ", deep %i",
                    pagetype_caption, spanpgno, alloc_pages, deep);
        data_tree_problems += !is_gc_tree;
        gc_tree_problems += is_gc_tree;
      } else if (walk.pagemap[spanpgno]) {
        /* the slot holds the owner's index + 1 */
        walk_dbi_t *coll_dbi = &walk.dbi[walk.pagemap[spanpgno] - 1];
        problem_add("page", spanpgno,
                    (branch && coll_dbi == dbi) ? "loop" : "already used",
                    "%s-page: by %s, deep %i", pagetype_caption, coll_dbi->name,
                    deep);
        already_used = true;
        data_tree_problems += !is_gc_tree;
        gc_tree_problems += is_gc_tree;
      } else {
        walk.pagemap[spanpgno] = (short)(dbi - walk.dbi + 1);
        dbi->pages.total += 1;
      }
    }

    if (already_used)
      return branch ? MDBX_RESULT_TRUE /* avoid infinite loop/recursion */
                    : MDBX_SUCCESS;
  }

  if (MDBX_IS_ERROR(err)) {
    problem_add("page", pgno, "invalid/corrupted", "%s-page", pagetype_caption);
    data_tree_problems += !is_gc_tree;
    gc_tree_problems += is_gc_tree;
  } else {
    /* sanity checks of the byte accounting reported for the page */
    if (unused_bytes > page_size) {
      problem_add("page", pgno, "illegal unused-bytes",
                  "%s-page: %u < %" PRIuPTR " < %u", pagetype_caption, 0,
                  unused_bytes, envinfo.mi_dxb_pagesize);
      data_tree_problems += !is_gc_tree;
      gc_tree_problems += is_gc_tree;
    }

    if (header_bytes < (int)sizeof(long) ||
        (size_t)header_bytes >= envinfo.mi_dxb_pagesize - sizeof(long)) {
      problem_add("page", pgno, "illegal header-length",
                  "%s-page: %" PRIuPTR " < %" PRIuPTR " < %" PRIuPTR,
                  pagetype_caption, sizeof(long), header_bytes,
                  envinfo.mi_dxb_pagesize - sizeof(long));
      data_tree_problems += !is_gc_tree;
      gc_tree_problems += is_gc_tree;
    }
    if (payload_bytes < 1) {
      if (nentries > 1) {
        problem_add("page", pgno, "zero size-of-entry",
                    "%s-page: payload %" PRIuPTR " bytes, %" PRIuPTR " entries",
                    pagetype_caption, payload_bytes, nentries);
        /* if ((size_t)header_bytes + unused_bytes < page_size) {
          // LY: hush a misuse error
          page_bytes = page_size;
        } */
        data_tree_problems += !is_gc_tree;
        gc_tree_problems += is_gc_tree;
      } else {
        problem_add("page", pgno, "empty",
                    "%s-page: payload %" PRIuPTR " bytes, %" PRIuPTR
                    " entries, deep %i",
                    pagetype_caption, payload_bytes, nentries, deep);
        dbi->pages.empty += 1;
        data_tree_problems += !is_gc_tree;
        gc_tree_problems += is_gc_tree;
      }
    }

    if (pgnumber) {
      /* header + payload + unused must add up to the page size exactly;
       * only consistent pages contribute to the payload totals */
      if (page_bytes != page_size) {
        problem_add("page", pgno, "misused",
                    "%s-page: %" PRIuPTR " != %" PRIuPTR " (%" PRIuPTR
                    "h + %" PRIuPTR "p + %" PRIuPTR "u), deep %i",
                    pagetype_caption, page_size, page_bytes, header_bytes,
                    payload_bytes, unused_bytes, deep);
        if (page_size > page_bytes)
          dbi->lost_bytes += page_size - page_bytes;
        data_tree_problems += !is_gc_tree;
        gc_tree_problems += is_gc_tree;
      } else {
        dbi->payload_bytes += payload_bytes + header_bytes;
        walk.total_payload_bytes += payload_bytes + header_bytes;
      }
    }
  }

  return check_user_break();
}
3934
/* Per-record callback used by process_db(): receives the record's number
 * together with its key and data, and returns an MDBX status code. */
typedef int(visitor)(const uint64_t record_number, const MDBX_val *key,
                     const MDBX_val *data);
/* Forward declaration: handle_maindb() calls process_db() for each named
 * table before process_db() itself is defined. */
static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler,
                      bool silent);
3939
/* Record visitor for ordinary tables: the record itself is not inspected,
 * only a pending user interruption is reported. */
static int handle_userdb(const uint64_t record_number, const MDBX_val *key,
                         const MDBX_val *data) {
  (void)data;
  (void)key;
  (void)record_number;

  const int status = check_user_break();
  return status;
}
3947
handle_freedb(const uint64_t record_number,const MDBX_val * key,const MDBX_val * data)3948 static int handle_freedb(const uint64_t record_number, const MDBX_val *key,
3949 const MDBX_val *data) {
3950 char *bad = "";
3951 pgno_t *iptr = data->iov_base;
3952
3953 if (key->iov_len != sizeof(txnid_t))
3954 problem_add("entry", record_number, "wrong txn-id size",
3955 "key-size %" PRIiPTR, key->iov_len);
3956 else {
3957 txnid_t txnid;
3958 memcpy(&txnid, key->iov_base, sizeof(txnid));
3959 if (txnid < 1 || txnid > envinfo.mi_recent_txnid)
3960 problem_add("entry", record_number, "wrong txn-id", "%" PRIaTXN, txnid);
3961 else {
3962 if (data->iov_len < sizeof(pgno_t) || data->iov_len % sizeof(pgno_t))
3963 problem_add("entry", txnid, "wrong idl size", "%" PRIuPTR,
3964 data->iov_len);
3965 size_t number = (data->iov_len >= sizeof(pgno_t)) ? *iptr++ : 0;
3966 if (number < 1 || number > MDBX_PGL_LIMIT)
3967 problem_add("entry", txnid, "wrong idl length", "%" PRIuPTR, number);
3968 else if ((number + 1) * sizeof(pgno_t) > data->iov_len) {
3969 problem_add("entry", txnid, "trimmed idl",
3970 "%" PRIuSIZE " > %" PRIuSIZE " (corruption)",
3971 (number + 1) * sizeof(pgno_t), data->iov_len);
3972 number = data->iov_len / sizeof(pgno_t) - 1;
3973 } else if (data->iov_len - (number + 1) * sizeof(pgno_t) >=
3974 /* LY: allow gap up to one page. it is ok
3975 * and better than shink-and-retry inside mdbx_update_gc() */
3976 envinfo.mi_dxb_pagesize)
3977 problem_add("entry", txnid, "extra idl space",
3978 "%" PRIuSIZE " < %" PRIuSIZE " (minor, not a trouble)",
3979 (number + 1) * sizeof(pgno_t), data->iov_len);
3980
3981 gc_pages += number;
3982 if (envinfo.mi_latter_reader_txnid > txnid)
3983 reclaimable_pages += number;
3984
3985 pgno_t prev = MDBX_PNL_ASCENDING ? NUM_METAS - 1 : txn->mt_next_pgno;
3986 pgno_t span = 1;
3987 for (unsigned i = 0; i < number; ++i) {
3988 if (check_user_break())
3989 return MDBX_EINTR;
3990 const pgno_t pgno = iptr[i];
3991 if (pgno < NUM_METAS)
3992 problem_add("entry", txnid, "wrong idl entry",
3993 "pgno %" PRIaPGNO " < meta-pages %u", pgno, NUM_METAS);
3994 else if (pgno >= backed_pages)
3995 problem_add("entry", txnid, "wrong idl entry",
3996 "pgno %" PRIaPGNO " > backed-pages %" PRIu64, pgno,
3997 backed_pages);
3998 else if (pgno >= alloc_pages)
3999 problem_add("entry", txnid, "wrong idl entry",
4000 "pgno %" PRIaPGNO " > alloc-pages %" PRIu64, pgno,
4001 alloc_pages - 1);
4002 else {
4003 if (MDBX_PNL_DISORDERED(prev, pgno)) {
4004 bad = " [bad sequence]";
4005 problem_add("entry", txnid, "bad sequence",
4006 "%" PRIaPGNO " %c [%u].%" PRIaPGNO, prev,
4007 (prev == pgno) ? '=' : (MDBX_PNL_ASCENDING ? '>' : '<'),
4008 i, pgno);
4009 }
4010 if (walk.pagemap) {
4011 int idx = walk.pagemap[pgno];
4012 if (idx == 0)
4013 walk.pagemap[pgno] = -1;
4014 else if (idx > 0)
4015 problem_add("page", pgno, "already used", "by %s",
4016 walk.dbi[idx - 1].name);
4017 else
4018 problem_add("page", pgno, "already listed in GC", nullptr);
4019 }
4020 }
4021 prev = pgno;
4022 while (i + span < number &&
4023 iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pgno, span)
4024 : pgno_sub(pgno, span)))
4025 ++span;
4026 }
4027 if (verbose > 3 && !only_subdb) {
4028 print(" transaction %" PRIaTXN ", %" PRIuPTR
4029 " pages, maxspan %" PRIaPGNO "%s\n",
4030 txnid, number, span, bad);
4031 if (verbose > 4) {
4032 for (unsigned i = 0; i < number; i += span) {
4033 const pgno_t pgno = iptr[i];
4034 for (span = 1;
4035 i + span < number &&
4036 iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pgno, span)
4037 : pgno_sub(pgno, span));
4038 ++span)
4039 ;
4040 if (span > 1) {
4041 print(" %9" PRIaPGNO "[%" PRIaPGNO "]\n", pgno, span);
4042 } else
4043 print(" %9" PRIaPGNO "\n", pgno);
4044 }
4045 }
4046 }
4047 }
4048 }
4049
4050 return check_user_break();
4051 }
4052
equal_or_greater(const MDBX_val * a,const MDBX_val * b)4053 static int equal_or_greater(const MDBX_val *a, const MDBX_val *b) {
4054 return (a->iov_len == b->iov_len &&
4055 memcmp(a->iov_base, b->iov_base, a->iov_len) == 0)
4056 ? 0
4057 : 1;
4058 }
4059
handle_maindb(const uint64_t record_number,const MDBX_val * key,const MDBX_val * data)4060 static int handle_maindb(const uint64_t record_number, const MDBX_val *key,
4061 const MDBX_val *data) {
4062 char *name;
4063 int rc;
4064 size_t i;
4065
4066 name = key->iov_base;
4067 for (i = 0; i < key->iov_len; ++i) {
4068 if (name[i] < ' ')
4069 return handle_userdb(record_number, key, data);
4070 }
4071
4072 name = mdbx_malloc(key->iov_len + 1);
4073 memcpy(name, key->iov_base, key->iov_len);
4074 name[key->iov_len] = '\0';
4075 userdb_count++;
4076
4077 rc = process_db(~0u, name, handle_userdb, false);
4078 mdbx_free(name);
4079 if (rc != MDBX_INCOMPATIBLE)
4080 return rc;
4081
4082 return handle_userdb(record_number, key, data);
4083 }
4084
db_flags2keymode(unsigned flags)4085 static const char *db_flags2keymode(unsigned flags) {
4086 flags &= (MDBX_REVERSEKEY | MDBX_INTEGERKEY);
4087 switch (flags) {
4088 case 0:
4089 return "usual";
4090 case MDBX_REVERSEKEY:
4091 return "reserve";
4092 case MDBX_INTEGERKEY:
4093 return "ordinal";
4094 case MDBX_REVERSEKEY | MDBX_INTEGERKEY:
4095 return "msgpack";
4096 default:
4097 assert(false);
4098 __unreachable();
4099 }
4100 }
4101
db_flags2valuemode(unsigned flags)4102 static const char *db_flags2valuemode(unsigned flags) {
4103 flags &= (MDBX_DUPSORT | MDBX_REVERSEDUP | MDBX_DUPFIXED | MDBX_INTEGERDUP);
4104 switch (flags) {
4105 case 0:
4106 return "single";
4107 case MDBX_DUPSORT:
4108 return "multi";
4109 case MDBX_REVERSEDUP:
4110 case MDBX_DUPSORT | MDBX_REVERSEDUP:
4111 return "multi-reverse";
4112 case MDBX_DUPFIXED:
4113 case MDBX_DUPSORT | MDBX_DUPFIXED:
4114 return "multi-samelength";
4115 case MDBX_DUPFIXED | MDBX_REVERSEDUP:
4116 case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP:
4117 return "multi-reverse-samelength";
4118 case MDBX_INTEGERDUP:
4119 case MDBX_DUPSORT | MDBX_INTEGERDUP:
4120 case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP:
4121 case MDBX_DUPFIXED | MDBX_INTEGERDUP:
4122 return "multi-ordinal";
4123 case MDBX_INTEGERDUP | MDBX_REVERSEDUP:
4124 case MDBX_DUPSORT | MDBX_INTEGERDUP | MDBX_REVERSEDUP:
4125 return "multi-msgpack";
4126 case MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP:
4127 case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP:
4128 return "reserved";
4129 default:
4130 assert(false);
4131 __unreachable();
4132 }
4133 }
4134
process_db(MDBX_dbi dbi_handle,char * dbi_name,visitor * handler,bool silent)4135 static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler,
4136 bool silent) {
4137 MDBX_cursor *mc;
4138 MDBX_stat ms;
4139 MDBX_val key, data;
4140 MDBX_val prev_key, prev_data;
4141 unsigned flags;
4142 int rc, i;
4143 struct problem *saved_list;
4144 uint64_t problems_count;
4145
4146 uint64_t record_count = 0, dups = 0;
4147 uint64_t key_bytes = 0, data_bytes = 0;
4148
4149 if ((MDBX_TXN_FINISHED | MDBX_TXN_ERROR) & mdbx_txn_flags(txn)) {
4150 print(" ! abort processing '%s' due to a previous error\n",
4151 dbi_name ? dbi_name : "@MAIN");
4152 return MDBX_BAD_TXN;
4153 }
4154
4155 if (dbi_handle == ~0u) {
4156 rc = mdbx_dbi_open_ex(
4157 txn, dbi_name, MDBX_DB_ACCEDE, &dbi_handle,
4158 (dbi_name && ignore_wrong_order) ? equal_or_greater : nullptr,
4159 (dbi_name && ignore_wrong_order) ? equal_or_greater : nullptr);
4160 if (rc) {
4161 if (!dbi_name ||
4162 rc !=
4163 MDBX_INCOMPATIBLE) /* LY: mainDB's record is not a user's DB. */ {
4164 error("mdbx_dbi_open('%s') failed, error %d %s\n",
4165 dbi_name ? dbi_name : "main", rc, mdbx_strerror(rc));
4166 }
4167 return rc;
4168 }
4169 }
4170
4171 if (dbi_handle >= CORE_DBS && dbi_name && only_subdb &&
4172 strcmp(only_subdb, dbi_name) != 0) {
4173 if (verbose) {
4174 print("Skip processing '%s'...\n", dbi_name);
4175 fflush(nullptr);
4176 }
4177 skipped_subdb++;
4178 return MDBX_SUCCESS;
4179 }
4180
4181 if (!silent && verbose) {
4182 print("Processing '%s'...\n", dbi_name ? dbi_name : "@MAIN");
4183 fflush(nullptr);
4184 }
4185
4186 rc = mdbx_dbi_flags(txn, dbi_handle, &flags);
4187 if (rc) {
4188 error("mdbx_dbi_flags() failed, error %d %s\n", rc, mdbx_strerror(rc));
4189 return rc;
4190 }
4191
4192 rc = mdbx_dbi_stat(txn, dbi_handle, &ms, sizeof(ms));
4193 if (rc) {
4194 error("mdbx_dbi_stat() failed, error %d %s\n", rc, mdbx_strerror(rc));
4195 return rc;
4196 }
4197
4198 if (!silent && verbose) {
4199 print(" - key-value kind: %s-key => %s-value", db_flags2keymode(flags),
4200 db_flags2valuemode(flags));
4201 if (verbose > 1) {
4202 print(", flags:");
4203 if (!flags)
4204 print(" none");
4205 else {
4206 for (i = 0; dbflags[i].bit; i++)
4207 if (flags & dbflags[i].bit)
4208 print(" %s", dbflags[i].name);
4209 }
4210 if (verbose > 2)
4211 print(" (0x%02X), dbi-id %d", flags, dbi_handle);
4212 }
4213 print("\n");
4214 if (ms.ms_mod_txnid)
4215 print(" - last modification txn#%" PRIu64 "\n", ms.ms_mod_txnid);
4216 if (verbose > 1) {
4217 print(" - page size %u, entries %" PRIu64 "\n", ms.ms_psize,
4218 ms.ms_entries);
4219 print(" - b-tree depth %u, pages: branch %" PRIu64 ", leaf %" PRIu64
4220 ", overflow %" PRIu64 "\n",
4221 ms.ms_depth, ms.ms_branch_pages, ms.ms_leaf_pages,
4222 ms.ms_overflow_pages);
4223 }
4224 }
4225
4226 walk_dbi_t *dbi = (dbi_handle < CORE_DBS)
4227 ? &walk.dbi[dbi_handle]
4228 : pagemap_lookup_dbi(dbi_name, true);
4229 if (!dbi) {
4230 error("too many DBIs or out of memory\n");
4231 return MDBX_ENOMEM;
4232 }
4233 if (!dont_traversal) {
4234 const uint64_t subtotal_pages =
4235 ms.ms_branch_pages + ms.ms_leaf_pages + ms.ms_overflow_pages;
4236 if (subtotal_pages != dbi->pages.total)
4237 error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n",
4238 "subtotal", subtotal_pages, dbi->pages.total);
4239 if (ms.ms_branch_pages != dbi->pages.branch)
4240 error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", "branch",
4241 ms.ms_branch_pages, dbi->pages.branch);
4242 const uint64_t allleaf_pages = dbi->pages.leaf + dbi->pages.leaf_dupfixed;
4243 if (ms.ms_leaf_pages != allleaf_pages)
4244 error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n",
4245 "all-leaf", ms.ms_leaf_pages, allleaf_pages);
4246 if (ms.ms_overflow_pages != dbi->pages.large_volume)
4247 error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n",
4248 "large/overlow", ms.ms_overflow_pages, dbi->pages.large_volume);
4249 }
4250 rc = mdbx_cursor_open(txn, dbi_handle, &mc);
4251 if (rc) {
4252 error("mdbx_cursor_open() failed, error %d %s\n", rc, mdbx_strerror(rc));
4253 return rc;
4254 }
4255
4256 if (ignore_wrong_order) { /* for debugging with enabled assertions */
4257 mc->mc_flags |= C_SKIPORD;
4258 if (mc->mc_xcursor)
4259 mc->mc_xcursor->mx_cursor.mc_flags |= C_SKIPORD;
4260 }
4261
4262 const size_t maxkeysize = mdbx_env_get_maxkeysize_ex(env, flags);
4263 saved_list = problems_push();
4264 prev_key.iov_base = nullptr;
4265 prev_key.iov_len = 0;
4266 prev_data.iov_base = nullptr;
4267 prev_data.iov_len = 0;
4268 rc = mdbx_cursor_get(mc, &key, &data, MDBX_FIRST);
4269 while (rc == MDBX_SUCCESS) {
4270 rc = check_user_break();
4271 if (rc)
4272 goto bailout;
4273
4274 bool bad_key = false;
4275 if (key.iov_len > maxkeysize) {
4276 problem_add("entry", record_count, "key length exceeds max-key-size",
4277 "%" PRIuPTR " > %" PRIuPTR, key.iov_len, maxkeysize);
4278 bad_key = true;
4279 } else if ((flags & MDBX_INTEGERKEY) && key.iov_len != sizeof(uint64_t) &&
4280 key.iov_len != sizeof(uint32_t)) {
4281 problem_add("entry", record_count, "wrong key length",
4282 "%" PRIuPTR " != 4or8", key.iov_len);
4283 bad_key = true;
4284 }
4285
4286 bool bad_data = false;
4287 if ((flags & MDBX_INTEGERDUP) && data.iov_len != sizeof(uint64_t) &&
4288 data.iov_len != sizeof(uint32_t)) {
4289 problem_add("entry", record_count, "wrong data length",
4290 "%" PRIuPTR " != 4or8", data.iov_len);
4291 bad_data = true;
4292 }
4293
4294 if (prev_key.iov_base) {
4295 if (prev_data.iov_base && !bad_data && (flags & MDBX_DUPFIXED) &&
4296 prev_data.iov_len != data.iov_len) {
4297 problem_add("entry", record_count, "different data length",
4298 "%" PRIuPTR " != %" PRIuPTR, prev_data.iov_len,
4299 data.iov_len);
4300 bad_data = true;
4301 }
4302
4303 if (!bad_key) {
4304 int cmp = mdbx_cmp(txn, dbi_handle, &key, &prev_key);
4305 if (cmp == 0) {
4306 ++dups;
4307 if ((flags & MDBX_DUPSORT) == 0) {
4308 problem_add("entry", record_count, "duplicated entries", nullptr);
4309 if (prev_data.iov_base && data.iov_len == prev_data.iov_len &&
4310 memcmp(data.iov_base, prev_data.iov_base, data.iov_len) == 0) {
4311 problem_add("entry", record_count, "complete duplicate", nullptr);
4312 }
4313 } else if (!bad_data && prev_data.iov_base) {
4314 cmp = mdbx_dcmp(txn, dbi_handle, &data, &prev_data);
4315 if (cmp == 0) {
4316 problem_add("entry", record_count, "complete duplicate", nullptr);
4317 } else if (cmp < 0 && !ignore_wrong_order) {
4318 problem_add("entry", record_count, "wrong order of multi-values",
4319 nullptr);
4320 }
4321 }
4322 } else if (cmp < 0 && !ignore_wrong_order) {
4323 problem_add("entry", record_count, "wrong order of entries", nullptr);
4324 }
4325 }
4326 }
4327
4328 if (handler) {
4329 rc = handler(record_count, &key, &data);
4330 if (MDBX_IS_ERROR(rc))
4331 goto bailout;
4332 }
4333
4334 record_count++;
4335 key_bytes += key.iov_len;
4336 data_bytes += data.iov_len;
4337
4338 if (!bad_key) {
4339 if (verbose && (flags & MDBX_INTEGERKEY) && !prev_key.iov_base)
4340 print(" - fixed key-size %" PRIuPTR "\n", key.iov_len);
4341 prev_key = key;
4342 }
4343 if (!bad_data) {
4344 if (verbose && (flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) &&
4345 !prev_data.iov_base)
4346 print(" - fixed data-size %" PRIuPTR "\n", data.iov_len);
4347 prev_data = data;
4348 }
4349 rc = mdbx_cursor_get(mc, &key, &data, MDBX_NEXT);
4350 }
4351 if (rc != MDBX_NOTFOUND)
4352 error("mdbx_cursor_get() failed, error %d %s\n", rc, mdbx_strerror(rc));
4353 else
4354 rc = 0;
4355
4356 if (record_count != ms.ms_entries)
4357 problem_add("entry", record_count, "different number of entries",
4358 "%" PRIu64 " != %" PRIu64, record_count, ms.ms_entries);
4359 bailout:
4360 problems_count = problems_pop(saved_list);
4361 if (!silent && verbose) {
4362 print(" - summary: %" PRIu64 " records, %" PRIu64 " dups, %" PRIu64
4363 " key's bytes, %" PRIu64 " data's "
4364 "bytes, %" PRIu64 " problems\n",
4365 record_count, dups, key_bytes, data_bytes, problems_count);
4366 fflush(nullptr);
4367 }
4368
4369 mdbx_cursor_close(mc);
4370 return (rc || problems_count) ? MDBX_RESULT_TRUE : MDBX_SUCCESS;
4371 }
4372
usage(char * prog)4373 static void usage(char *prog) {
4374 fprintf(stderr,
4375 "usage: %s [-V] [-v] [-q] [-c] [-0|1|2] [-w] [-d] [-i] [-s subdb] "
4376 "dbpath\n"
4377 " -V\t\tprint version and exit\n"
4378 " -v\t\tmore verbose, could be used multiple times\n"
4379 " -q\t\tbe quiet\n"
4380 " -c\t\tforce cooperative mode (don't try exclusive)\n"
4381 " -w\t\twrite-mode checking\n"
4382 " -d\t\tdisable page-by-page traversal of B-tree\n"
4383 " -i\t\tignore wrong order errors (for custom comparators case)\n"
4384 " -s subdb\tprocess a specific subdatabase only\n"
4385 " -0|1|2\tforce using specific meta-page 0, or 2 for checking\n"
4386 " -t\t\tturn to a specified meta-page on successful check\n"
4387 " -T\t\tturn to a specified meta-page EVEN ON UNSUCCESSFUL CHECK!\n",
4388 prog);
4389 exit(EXIT_INTERRUPTED);
4390 }
4391
/* Ordering predicate over two meta-pages, each described by its txn-id and
 * data signature.
 *
 * - Equal txn-ids: the result is whether B's signature is steady.
 * - wanna_steady set and the steadiness of A and B differs: the result is
 *   whether B's signature is steady.
 * - Otherwise: the result is txn_a < txn_b. */
static __inline bool meta_ot(txnid_t txn_a, uint64_t sign_a, txnid_t txn_b,
                             uint64_t sign_b, const bool wanna_steady) {
  const bool b_is_steady = SIGN_IS_STEADY(sign_b);

  if (txn_a == txn_b)
    return b_is_steady;

  if (wanna_steady) {
    const bool a_is_steady = SIGN_IS_STEADY(sign_a);
    if (a_is_steady != b_is_steady)
      return b_is_steady;
  }

  return txn_a < txn_b;
}
4402
/* Tells whether two meta-pages are indistinguishable: both carry the same
 * non-zero txn-id and their signatures agree on steadiness.
 * A zero txn_a never compares equal. */
static __inline bool meta_eq(txnid_t txn_a, uint64_t sign_a, txnid_t txn_b,
                             uint64_t sign_b) {
  const bool same_nonzero_txn = txn_a != 0 && txn_a == txn_b;
  return same_nonzero_txn &&
         SIGN_IS_STEADY(sign_a) == SIGN_IS_STEADY(sign_b);
}
4413
meta_recent(const bool wanna_steady)4414 static __inline int meta_recent(const bool wanna_steady) {
4415 if (meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign,
4416 envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, wanna_steady))
4417 return meta_ot(envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign,
4418 envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, wanna_steady)
4419 ? 1
4420 : 2;
4421 else
4422 return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign,
4423 envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, wanna_steady)
4424 ? 2
4425 : 0;
4426 }
4427
meta_tail(int head)4428 static __inline int meta_tail(int head) {
4429 switch (head) {
4430 case 0:
4431 return meta_ot(envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign,
4432 envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, true)
4433 ? 1
4434 : 2;
4435 case 1:
4436 return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign,
4437 envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, true)
4438 ? 0
4439 : 2;
4440 case 2:
4441 return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign,
4442 envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, true)
4443 ? 0
4444 : 1;
4445 default:
4446 assert(false);
4447 return -1;
4448 }
4449 }
4450
/* Index of the head meta-page: the result of meta_recent() without the
 * preference for steady signatures. */
static int meta_head(void) {
  const bool wanna_steady = false;
  return meta_recent(wanna_steady);
}
4452
/* Prints a one-line description of a meta-page via print().
 *
 * num      - meta-page index shown in the output and compared against the
 *            head/tail indices and `stuck_meta`.
 * txnid    - txn-id stored in this meta-page.
 * sign     - data signature of this meta-page; selects the
 *            "no-sync/legacy", "weak-..." or "steady" label.
 * bootid_x,
 * bootid_y - two halves of the boot-id stored in this meta-page, compared
 *            against envinfo.mi_bootid.current.
 *
 * The line also carries the head/tail/stay role of the page and either a
 * "forced for checking" mark (when `stuck_meta` selects this page) or a
 * roll-back note (when txnid is ahead of envinfo.mi_recent_txnid and the
 * environment is opened exclusive and not read-only). */
void verbose_meta(int num, txnid_t txnid, uint64_t sign, uint64_t bootid_x,
                  uint64_t bootid_y) {
  const bool have_bootid = (bootid_x | bootid_y) != 0;
  const bool bootid_match = bootid_x == envinfo.mi_bootid.current.x &&
                            bootid_y == envinfo.mi_bootid.current.y;

  print(" - meta-%d: ", num);
  switch (sign) {
  case MDBX_DATASIGN_NONE:
    print("no-sync/legacy");
    break;
  case MDBX_DATASIGN_WEAK: {
    /* A weak page is "dead" unless its boot-id equals the current one;
     * an all-zero matching boot-id means the state cannot be told. */
    const char *weak_state = "dead";
    if (bootid_match)
      weak_state =
          have_bootid ? "intact (same boot-id)" : "unknown (no boot-id)";
    print("weak-%s", weak_state);
    break;
  }
  default:
    print("steady");
    break;
  }
  print(" txn#%" PRIu64, txnid);

  const int head = meta_head();
  if (num == head)
    print(", head");
  else if (num == meta_tail(head))
    print(", tail");
  else
    print(", stay");

  if (stuck_meta >= 0) {
    if (num == stuck_meta)
      print(", forced for checking");
  } else if (txnid > envinfo.mi_recent_txnid &&
             (envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) == MDBX_EXCLUSIVE)
    print(", rolled-back %" PRIu64 " (%" PRIu64 " >>> %" PRIu64 ")",
          txnid - envinfo.mi_recent_txnid, txnid, envinfo.mi_recent_txnid);
  print("\n");
}
4492
get_meta_txnid(const unsigned meta_id)4493 static uint64_t get_meta_txnid(const unsigned meta_id) {
4494 switch (meta_id) {
4495 default:
4496 assert(false);
4497 error("unexpected meta_id %u\n", meta_id);
4498 return 0;
4499 case 0:
4500 return envinfo.mi_meta0_txnid;
4501 case 1:
4502 return envinfo.mi_meta1_txnid;
4503 case 2:
4504 return envinfo.mi_meta2_txnid;
4505 }
4506 }
4507
/* Prints `value` both as an exact integer and as a scaled figure with a
 * binary-unit letter, wrapped into `prefix` and `suffix`, via print().
 *
 * The divisor starts at 1024 and is multiplied by 1024 while the scaled
 * figure still exceeds 1000 and a further unit letter is available. */
static void print_size(const char *prefix, const uint64_t value,
                       const char *suffix) {
  /* Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta */
  static const char unit_letters[] = "KMGTPEZY";

  double divisor = 1024.0;
  size_t unit = 0;
  while (unit_letters[unit + 1] && value / divisor > 1000.0) {
    divisor *= 1024;
    ++unit;
  }

  print("%s%" PRIu64 " (%.2f %cb)%s", prefix, value, value / divisor,
        unit_letters[unit], suffix);
}
4518
main(int argc,char * argv[])4519 int main(int argc, char *argv[]) {
4520 int rc;
4521 char *prog = argv[0];
4522 char *envname;
4523 unsigned problems_maindb = 0, problems_freedb = 0, problems_meta = 0;
4524 bool write_locked = false;
4525 bool turn_meta = false;
4526 bool force_turn_meta = false;
4527
4528 double elapsed;
4529 #if defined(_WIN32) || defined(_WIN64)
4530 uint64_t timestamp_start, timestamp_finish;
4531 timestamp_start = GetMilliseconds();
4532 #else
4533 struct timespec timestamp_start, timestamp_finish;
4534 if (clock_gettime(CLOCK_MONOTONIC, ×tamp_start)) {
4535 rc = errno;
4536 error("clock_gettime() failed, error %d %s\n", rc, mdbx_strerror(rc));
4537 return EXIT_FAILURE_SYS;
4538 }
4539 #endif
4540
4541 dbi_meta.name = "@META";
4542 dbi_free.name = "@GC";
4543 dbi_main.name = "@MAIN";
4544 atexit(pagemap_cleanup);
4545
4546 if (argc < 2)
4547 usage(prog);
4548
4549 for (int i; (i = getopt(argc, argv,
4550 "0"
4551 "1"
4552 "2"
4553 "T"
4554 "V"
4555 "v"
4556 "q"
4557 "n"
4558 "w"
4559 "c"
4560 "t"
4561 "d"
4562 "i"
4563 "s:")) != EOF;) {
4564 switch (i) {
4565 case 'V':
4566 printf("mdbx_chk version %d.%d.%d.%d\n"
4567 " - source: %s %s, commit %s, tree %s\n"
4568 " - anchor: %s\n"
4569 " - build: %s for %s by %s\n"
4570 " - flags: %s\n"
4571 " - options: %s\n",
4572 mdbx_version.major, mdbx_version.minor, mdbx_version.release,
4573 mdbx_version.revision, mdbx_version.git.describe,
4574 mdbx_version.git.datetime, mdbx_version.git.commit,
4575 mdbx_version.git.tree, mdbx_sourcery_anchor, mdbx_build.datetime,
4576 mdbx_build.target, mdbx_build.compiler, mdbx_build.flags,
4577 mdbx_build.options);
4578 return EXIT_SUCCESS;
4579 case 'v':
4580 verbose++;
4581 break;
4582 case '0':
4583 stuck_meta = 0;
4584 break;
4585 case '1':
4586 stuck_meta = 1;
4587 break;
4588 case '2':
4589 stuck_meta = 2;
4590 break;
4591 case 't':
4592 turn_meta = true;
4593 break;
4594 case 'T':
4595 turn_meta = force_turn_meta = true;
4596 quiet = false;
4597 if (verbose < 2)
4598 verbose = 2;
4599 break;
4600 case 'q':
4601 quiet = true;
4602 break;
4603 case 'n':
4604 break;
4605 case 'w':
4606 envflags &= ~MDBX_RDONLY;
4607 #if MDBX_MMAP_INCOHERENT_FILE_WRITE
4608 /* Temporary `workaround` for OpenBSD kernel's flaw.
4609 * See https://github.com/erthink/libmdbx/issues/67 */
4610 envflags |= MDBX_WRITEMAP;
4611 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */
4612 break;
4613 case 'c':
4614 envflags = (envflags & ~MDBX_EXCLUSIVE) | MDBX_ACCEDE;
4615 break;
4616 case 'd':
4617 dont_traversal = true;
4618 break;
4619 case 's':
4620 if (only_subdb && strcmp(only_subdb, optarg))
4621 usage(prog);
4622 only_subdb = optarg;
4623 break;
4624 case 'i':
4625 ignore_wrong_order = true;
4626 break;
4627 default:
4628 usage(prog);
4629 }
4630 }
4631
4632 if (optind != argc - 1)
4633 usage(prog);
4634
4635 rc = MDBX_SUCCESS;
4636 if (stuck_meta >= 0 && (envflags & MDBX_EXCLUSIVE) == 0) {
4637 error("exclusive mode is required to using specific meta-page(%d) for "
4638 "checking.\n",
4639 stuck_meta);
4640 rc = EXIT_INTERRUPTED;
4641 }
4642 if (turn_meta) {
4643 if (stuck_meta < 0) {
4644 error("meta-page must be specified (by -0, -1 or -2 options) to turn to "
4645 "it.\n");
4646 rc = EXIT_INTERRUPTED;
4647 }
4648 if (envflags & MDBX_RDONLY) {
4649 error("write-mode must be enabled to turn to the specified meta-page.\n");
4650 rc = EXIT_INTERRUPTED;
4651 }
4652 if (only_subdb || dont_traversal) {
4653 error("whole database checking with tree-traversal are required to turn "
4654 "to the specified meta-page.\n");
4655 rc = EXIT_INTERRUPTED;
4656 }
4657 }
4658 if (rc)
4659 exit(rc);
4660
4661 #if defined(_WIN32) || defined(_WIN64)
4662 SetConsoleCtrlHandler(ConsoleBreakHandlerRoutine, true);
4663 #else
4664 #ifdef SIGPIPE
4665 signal(SIGPIPE, signal_handler);
4666 #endif
4667 #ifdef SIGHUP
4668 signal(SIGHUP, signal_handler);
4669 #endif
4670 signal(SIGINT, signal_handler);
4671 signal(SIGTERM, signal_handler);
4672 #endif /* !WINDOWS */
4673
4674 envname = argv[optind];
4675 print("mdbx_chk %s (%s, T-%s)\nRunning for %s in 'read-%s' mode...\n",
4676 mdbx_version.git.describe, mdbx_version.git.datetime,
4677 mdbx_version.git.tree, envname,
4678 (envflags & MDBX_RDONLY) ? "only" : "write");
4679 fflush(nullptr);
4680 mdbx_setup_debug((verbose < MDBX_LOG_TRACE - 1)
4681 ? (MDBX_log_level_t)(verbose + 1)
4682 : MDBX_LOG_TRACE,
4683 MDBX_DBG_LEGACY_OVERLAP, logger);
4684
4685 rc = mdbx_env_create(&env);
4686 if (rc) {
4687 error("mdbx_env_create() failed, error %d %s\n", rc, mdbx_strerror(rc));
4688 return rc < 0 ? EXIT_FAILURE_MDBX : EXIT_FAILURE_SYS;
4689 }
4690
4691 rc = mdbx_env_set_maxdbs(env, MDBX_MAX_DBI);
4692 if (rc) {
4693 error("mdbx_env_set_maxdbs() failed, error %d %s\n", rc, mdbx_strerror(rc));
4694 goto bailout;
4695 }
4696
4697 if (stuck_meta >= 0) {
4698 rc = mdbx_env_open_for_recovery(env, envname, stuck_meta,
4699 (envflags & MDBX_RDONLY) ? false : true);
4700 } else {
4701 rc = mdbx_env_open(env, envname, envflags, 0);
4702 if ((envflags & MDBX_EXCLUSIVE) &&
4703 (rc == MDBX_BUSY ||
4704 #if defined(_WIN32) || defined(_WIN64)
4705 rc == ERROR_LOCK_VIOLATION || rc == ERROR_SHARING_VIOLATION
4706 #else
4707 rc == EBUSY || rc == EAGAIN
4708 #endif
4709 )) {
4710 envflags &= ~MDBX_EXCLUSIVE;
4711 rc = mdbx_env_open(env, envname, envflags | MDBX_ACCEDE, 0);
4712 }
4713 }
4714
4715 if (rc) {
4716 error("mdbx_env_open() failed, error %d %s\n", rc, mdbx_strerror(rc));
4717 if (rc == MDBX_WANNA_RECOVERY && (envflags & MDBX_RDONLY))
4718 print("Please run %s in the read-write mode (with '-w' option).\n", prog);
4719 goto bailout;
4720 }
4721 if (verbose)
4722 print(" - %s mode\n",
4723 (envflags & MDBX_EXCLUSIVE) ? "monopolistic" : "cooperative");
4724
4725 if ((envflags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == 0) {
4726 rc = mdbx_txn_lock(env, false);
4727 if (rc != MDBX_SUCCESS) {
4728 error("mdbx_txn_lock() failed, error %d %s\n", rc, mdbx_strerror(rc));
4729 goto bailout;
4730 }
4731 write_locked = true;
4732 }
4733
4734 rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn);
4735 if (rc) {
4736 error("mdbx_txn_begin() failed, error %d %s\n", rc, mdbx_strerror(rc));
4737 goto bailout;
4738 }
4739
4740 rc = mdbx_env_info_ex(env, txn, &envinfo, sizeof(envinfo));
4741 if (rc) {
4742 error("mdbx_env_info_ex() failed, error %d %s\n", rc, mdbx_strerror(rc));
4743 goto bailout;
4744 }
4745 if (verbose) {
4746 print(" - current boot-id ");
4747 if (envinfo.mi_bootid.current.x | envinfo.mi_bootid.current.y)
4748 print("%016" PRIx64 "-%016" PRIx64 "\n", envinfo.mi_bootid.current.x,
4749 envinfo.mi_bootid.current.y);
4750 else
4751 print("unavailable\n");
4752 }
4753
4754 mdbx_filehandle_t dxb_fd;
4755 rc = mdbx_env_get_fd(env, &dxb_fd);
4756 if (rc) {
4757 error("mdbx_env_get_fd() failed, error %d %s\n", rc, mdbx_strerror(rc));
4758 goto bailout;
4759 }
4760
4761 uint64_t dxb_filesize = 0;
4762 #if defined(_WIN32) || defined(_WIN64)
4763 {
4764 BY_HANDLE_FILE_INFORMATION info;
4765 if (!GetFileInformationByHandle(dxb_fd, &info))
4766 rc = GetLastError();
4767 else
4768 dxb_filesize = info.nFileSizeLow | (uint64_t)info.nFileSizeHigh << 32;
4769 }
4770 #else
4771 {
4772 struct stat st;
4773 STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(uint64_t),
4774 "libmdbx requires 64-bit file I/O on 64-bit systems");
4775 if (fstat(dxb_fd, &st))
4776 rc = errno;
4777 else
4778 dxb_filesize = st.st_size;
4779 }
4780 #endif
4781 if (rc) {
4782 error("mdbx_filesize() failed, error %d %s\n", rc, mdbx_strerror(rc));
4783 goto bailout;
4784 }
4785
4786 errno = 0;
4787 const uint64_t dxbfile_pages = dxb_filesize / envinfo.mi_dxb_pagesize;
4788 alloc_pages = txn->mt_next_pgno;
4789 backed_pages = envinfo.mi_geo.current / envinfo.mi_dxb_pagesize;
4790 if (backed_pages > dxbfile_pages) {
4791 print(" ! backed-pages %" PRIu64 " > file-pages %" PRIu64 "\n",
4792 backed_pages, dxbfile_pages);
4793 ++problems_meta;
4794 }
4795 if (dxbfile_pages < NUM_METAS)
4796 print(" ! file-pages %" PRIu64 " < %u\n", dxbfile_pages, NUM_METAS);
4797 if (backed_pages < NUM_METAS)
4798 print(" ! backed-pages %" PRIu64 " < %u\n", backed_pages, NUM_METAS);
4799 if (backed_pages < NUM_METAS || dxbfile_pages < NUM_METAS)
4800 goto bailout;
4801 if (backed_pages > MAX_PAGENO) {
4802 print(" ! backed-pages %" PRIu64 " > max-pages %" PRIaPGNO "\n",
4803 backed_pages, MAX_PAGENO);
4804 ++problems_meta;
4805 backed_pages = MAX_PAGENO;
4806 }
4807
4808 if ((envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) {
4809 if (backed_pages > dxbfile_pages) {
4810 print(" ! backed-pages %" PRIu64 " > file-pages %" PRIu64 "\n",
4811 backed_pages, dxbfile_pages);
4812 ++problems_meta;
4813 backed_pages = dxbfile_pages;
4814 }
4815 if (alloc_pages > backed_pages) {
4816 print(" ! alloc-pages %" PRIu64 " > backed-pages %" PRIu64 "\n",
4817 alloc_pages, backed_pages);
4818 ++problems_meta;
4819 alloc_pages = backed_pages;
4820 }
4821 } else {
4822 /* LY: DB may be shrinked by writer down to the allocated pages. */
4823 if (alloc_pages > backed_pages) {
4824 print(" ! alloc-pages %" PRIu64 " > backed-pages %" PRIu64 "\n",
4825 alloc_pages, backed_pages);
4826 ++problems_meta;
4827 alloc_pages = backed_pages;
4828 }
4829 if (alloc_pages > dxbfile_pages) {
4830 print(" ! alloc-pages %" PRIu64 " > file-pages %" PRIu64 "\n",
4831 alloc_pages, dxbfile_pages);
4832 ++problems_meta;
4833 alloc_pages = dxbfile_pages;
4834 }
4835 if (backed_pages > dxbfile_pages)
4836 backed_pages = dxbfile_pages;
4837 }
4838
4839 if (verbose) {
4840 print(" - pagesize %u (%u system), max keysize %d..%d"
4841 ", max readers %u\n",
4842 envinfo.mi_dxb_pagesize, envinfo.mi_sys_pagesize,
4843 mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT),
4844 mdbx_env_get_maxkeysize_ex(env, 0), envinfo.mi_maxreaders);
4845 print_size(" - mapsize ", envinfo.mi_mapsize, "\n");
4846 if (envinfo.mi_geo.lower == envinfo.mi_geo.upper)
4847 print_size(" - fixed datafile: ", envinfo.mi_geo.current, "");
4848 else {
4849 print_size(" - dynamic datafile: ", envinfo.mi_geo.lower, "");
4850 print_size(" .. ", envinfo.mi_geo.upper, ", ");
4851 print_size("+", envinfo.mi_geo.grow, ", ");
4852 print_size("-", envinfo.mi_geo.shrink, "\n");
4853 print_size(" - current datafile: ", envinfo.mi_geo.current, "");
4854 }
4855 printf(", %" PRIu64 " pages\n",
4856 envinfo.mi_geo.current / envinfo.mi_dxb_pagesize);
4857 #if defined(_WIN32) || defined(_WIN64)
4858 if (envinfo.mi_geo.shrink && envinfo.mi_geo.current != envinfo.mi_geo.upper)
4859 print(
4860 " WARNING: Due Windows system limitations a "
4861 "file couldn't\n be truncated while the database "
4862 "is opened. So, the size\n database file "
4863 "of may by large than the database itself,\n "
4864 "until it will be closed or reopened in read-write mode.\n");
4865 #endif
4866 verbose_meta(0, envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign,
4867 envinfo.mi_bootid.meta0.x, envinfo.mi_bootid.meta0.y);
4868 verbose_meta(1, envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign,
4869 envinfo.mi_bootid.meta1.x, envinfo.mi_bootid.meta1.y);
4870 verbose_meta(2, envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign,
4871 envinfo.mi_bootid.meta2.x, envinfo.mi_bootid.meta2.y);
4872 }
4873
4874 if (stuck_meta >= 0) {
4875 if (verbose) {
4876 print(" - skip checking meta-pages since the %u"
4877 " is selected for verification\n",
4878 stuck_meta);
4879 print(" - transactions: recent %" PRIu64
4880 ", selected for verification %" PRIu64 ", lag %" PRIi64 "\n",
4881 envinfo.mi_recent_txnid, get_meta_txnid(stuck_meta),
4882 envinfo.mi_recent_txnid - get_meta_txnid(stuck_meta));
4883 }
4884 } else {
4885 if (verbose > 1)
4886 print(" - performs check for meta-pages clashes\n");
4887 if (meta_eq(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign,
4888 envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign)) {
4889 print(" ! meta-%d and meta-%d are clashed\n", 0, 1);
4890 ++problems_meta;
4891 }
4892 if (meta_eq(envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign,
4893 envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign)) {
4894 print(" ! meta-%d and meta-%d are clashed\n", 1, 2);
4895 ++problems_meta;
4896 }
4897 if (meta_eq(envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign,
4898 envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign)) {
4899 print(" ! meta-%d and meta-%d are clashed\n", 2, 0);
4900 ++problems_meta;
4901 }
4902
4903 const unsigned steady_meta_id = meta_recent(true);
4904 const uint64_t steady_meta_txnid = get_meta_txnid(steady_meta_id);
4905 const unsigned weak_meta_id = meta_recent(false);
4906 const uint64_t weak_meta_txnid = get_meta_txnid(weak_meta_id);
4907 if (envflags & MDBX_EXCLUSIVE) {
4908 if (verbose > 1)
4909 print(" - performs full check recent-txn-id with meta-pages\n");
4910 if (steady_meta_txnid != envinfo.mi_recent_txnid) {
4911 print(" ! steady meta-%d txn-id mismatch recent-txn-id (%" PRIi64
4912 " != %" PRIi64 ")\n",
4913 steady_meta_id, steady_meta_txnid, envinfo.mi_recent_txnid);
4914 ++problems_meta;
4915 }
4916 } else if (write_locked) {
4917 if (verbose > 1)
4918 print(" - performs lite check recent-txn-id with meta-pages (not a "
4919 "monopolistic mode)\n");
4920 if (weak_meta_txnid != envinfo.mi_recent_txnid) {
4921 print(" ! weak meta-%d txn-id mismatch recent-txn-id (%" PRIi64
4922 " != %" PRIi64 ")\n",
4923 weak_meta_id, weak_meta_txnid, envinfo.mi_recent_txnid);
4924 ++problems_meta;
4925 }
4926 } else if (verbose) {
4927 print(" - skip check recent-txn-id with meta-pages (monopolistic or "
4928 "read-write mode only)\n");
4929 }
4930 total_problems += problems_meta;
4931
4932 if (verbose)
4933 print(" - transactions: recent %" PRIu64 ", latter reader %" PRIu64
4934 ", lag %" PRIi64 "\n",
4935 envinfo.mi_recent_txnid, envinfo.mi_latter_reader_txnid,
4936 envinfo.mi_recent_txnid - envinfo.mi_latter_reader_txnid);
4937 }
4938
4939 if (!dont_traversal) {
4940 struct problem *saved_list;
4941 size_t traversal_problems;
4942 uint64_t empty_pages, lost_bytes;
4943
4944 print("Traversal b-tree by txn#%" PRIaTXN "...\n", txn->mt_txnid);
4945 fflush(nullptr);
4946 walk.pagemap = mdbx_calloc((size_t)backed_pages, sizeof(*walk.pagemap));
4947 if (!walk.pagemap) {
4948 rc = errno ? errno : MDBX_ENOMEM;
4949 error("calloc() failed, error %d %s\n", rc, mdbx_strerror(rc));
4950 goto bailout;
4951 }
4952
4953 saved_list = problems_push();
4954 rc = mdbx_env_pgwalk(txn, pgvisitor, nullptr,
4955 true /* always skip key ordering checking to avoid
4956 MDBX_CORRUPTED when using custom comparators */);
4957 traversal_problems = problems_pop(saved_list);
4958
4959 if (rc) {
4960 if (rc != MDBX_EINTR || !check_user_break())
4961 error("mdbx_env_pgwalk() failed, error %d %s\n", rc, mdbx_strerror(rc));
4962 goto bailout;
4963 }
4964
4965 for (uint64_t n = 0; n < alloc_pages; ++n)
4966 if (!walk.pagemap[n])
4967 unused_pages += 1;
4968
4969 empty_pages = lost_bytes = 0;
4970 for (walk_dbi_t *dbi = &dbi_main; dbi < ARRAY_END(walk.dbi) && dbi->name;
4971 ++dbi) {
4972 empty_pages += dbi->pages.empty;
4973 lost_bytes += dbi->lost_bytes;
4974 }
4975
4976 if (verbose) {
4977 uint64_t total_page_bytes = walk.pgcount * envinfo.mi_dxb_pagesize;
4978 print(" - pages: walked %" PRIu64 ", left/unused %" PRIu64 "\n",
4979 walk.pgcount, unused_pages);
4980 if (verbose > 1) {
4981 for (walk_dbi_t *dbi = walk.dbi; dbi < ARRAY_END(walk.dbi) && dbi->name;
4982 ++dbi) {
4983 print(" %s: subtotal %" PRIu64, dbi->name, dbi->pages.total);
4984 if (dbi->pages.other && dbi->pages.other != dbi->pages.total)
4985 print(", other %" PRIu64, dbi->pages.other);
4986 if (dbi->pages.branch)
4987 print(", branch %" PRIu64, dbi->pages.branch);
4988 if (dbi->pages.large_count)
4989 print(", large %" PRIu64, dbi->pages.large_count);
4990 uint64_t all_leaf = dbi->pages.leaf + dbi->pages.leaf_dupfixed;
4991 if (all_leaf) {
4992 print(", leaf %" PRIu64, all_leaf);
4993 if (verbose > 2 &&
4994 (dbi->pages.subleaf_dupsort | dbi->pages.leaf_dupfixed |
4995 dbi->pages.subleaf_dupfixed))
4996 print(" (usual %" PRIu64 ", sub-dupsort %" PRIu64
4997 ", dupfixed %" PRIu64 ", sub-dupfixed %" PRIu64 ")",
4998 dbi->pages.leaf, dbi->pages.subleaf_dupsort,
4999 dbi->pages.leaf_dupfixed, dbi->pages.subleaf_dupfixed);
5000 }
5001 print("\n");
5002 }
5003 }
5004
5005 if (verbose > 1)
5006 print(" - usage: total %" PRIu64 " bytes, payload %" PRIu64
5007 " (%.1f%%), unused "
5008 "%" PRIu64 " (%.1f%%)\n",
5009 total_page_bytes, walk.total_payload_bytes,
5010 walk.total_payload_bytes * 100.0 / total_page_bytes,
5011 total_page_bytes - walk.total_payload_bytes,
5012 (total_page_bytes - walk.total_payload_bytes) * 100.0 /
5013 total_page_bytes);
5014 if (verbose > 2) {
5015 for (walk_dbi_t *dbi = walk.dbi; dbi < ARRAY_END(walk.dbi) && dbi->name;
5016 ++dbi)
5017 if (dbi->pages.total) {
5018 uint64_t dbi_bytes = dbi->pages.total * envinfo.mi_dxb_pagesize;
5019 print(" %s: subtotal %" PRIu64 " bytes (%.1f%%),"
5020 " payload %" PRIu64 " (%.1f%%), unused %" PRIu64 " (%.1f%%)",
5021 dbi->name, dbi_bytes, dbi_bytes * 100.0 / total_page_bytes,
5022 dbi->payload_bytes, dbi->payload_bytes * 100.0 / dbi_bytes,
5023 dbi_bytes - dbi->payload_bytes,
5024 (dbi_bytes - dbi->payload_bytes) * 100.0 / dbi_bytes);
5025 if (dbi->pages.empty)
5026 print(", %" PRIu64 " empty pages", dbi->pages.empty);
5027 if (dbi->lost_bytes)
5028 print(", %" PRIu64 " bytes lost", dbi->lost_bytes);
5029 print("\n");
5030 } else
5031 print(" %s: empty\n", dbi->name);
5032 }
5033 print(" - summary: average fill %.1f%%",
5034 walk.total_payload_bytes * 100.0 / total_page_bytes);
5035 if (empty_pages)
5036 print(", %" PRIu64 " empty pages", empty_pages);
5037 if (lost_bytes)
5038 print(", %" PRIu64 " bytes lost", lost_bytes);
5039 print(", %" PRIuPTR " problems\n", traversal_problems);
5040 }
5041 } else if (verbose) {
5042 print("Skipping b-tree walk...\n");
5043 fflush(nullptr);
5044 }
5045
5046 if (!verbose)
5047 print("Iterating DBIs...\n");
5048 if (data_tree_problems) {
5049 print("Skip processing %s since tree is corrupted (%u problems)\n", "@MAIN",
5050 data_tree_problems);
5051 problems_maindb = data_tree_problems;
5052 } else
5053 problems_maindb = process_db(~0u, /* MAIN_DBI */ nullptr, nullptr, false);
5054
5055 if (gc_tree_problems) {
5056 print("Skip processing %s since tree is corrupted (%u problems)\n", "@GC",
5057 gc_tree_problems);
5058 problems_freedb = gc_tree_problems;
5059 } else
5060 problems_freedb = process_db(FREE_DBI, "@GC", handle_freedb, false);
5061
5062 if (verbose) {
5063 uint64_t value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize;
5064 double percent = value / 100.0;
5065 print(" - space: %" PRIu64 " total pages", value);
5066 print(", backed %" PRIu64 " (%.1f%%)", backed_pages,
5067 backed_pages / percent);
5068 print(", allocated %" PRIu64 " (%.1f%%)", alloc_pages,
5069 alloc_pages / percent);
5070
5071 if (verbose > 1) {
5072 value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize - alloc_pages;
5073 print(", remained %" PRIu64 " (%.1f%%)", value, value / percent);
5074
5075 value = dont_traversal ? alloc_pages - gc_pages : walk.pgcount;
5076 print(", used %" PRIu64 " (%.1f%%)", value, value / percent);
5077
5078 print(", gc %" PRIu64 " (%.1f%%)", gc_pages, gc_pages / percent);
5079
5080 value = gc_pages - reclaimable_pages;
5081 print(", detained %" PRIu64 " (%.1f%%)", value, value / percent);
5082
5083 print(", reclaimable %" PRIu64 " (%.1f%%)", reclaimable_pages,
5084 reclaimable_pages / percent);
5085 }
5086
5087 value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize - alloc_pages +
5088 reclaimable_pages;
5089 print(", available %" PRIu64 " (%.1f%%)\n", value, value / percent);
5090 }
5091
5092 if (problems_maindb == 0 && problems_freedb == 0) {
5093 if (!dont_traversal &&
5094 (envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) {
5095 if (walk.pgcount != alloc_pages - gc_pages) {
5096 error("used pages mismatch (%" PRIu64 "(walked) != %" PRIu64
5097 "(allocated - GC))\n",
5098 walk.pgcount, alloc_pages - gc_pages);
5099 }
5100 if (unused_pages != gc_pages) {
5101 error("gc pages mismatch (%" PRIu64 "(expected) != %" PRIu64 "(GC))\n",
5102 unused_pages, gc_pages);
5103 }
5104 } else if (verbose) {
5105 print(" - skip check used and gc pages (btree-traversal with "
5106 "monopolistic or read-write mode only)\n");
5107 }
5108
5109 if (!process_db(MAIN_DBI, nullptr, handle_maindb, true)) {
5110 if (!userdb_count && verbose)
5111 print(" - does not contain multiple databases\n");
5112 }
5113 }
5114
5115 if (rc == 0 && total_problems == 1 && problems_meta == 1 && !dont_traversal &&
5116 (envflags & MDBX_RDONLY) == 0 && !only_subdb && stuck_meta < 0 &&
5117 get_meta_txnid(meta_recent(true)) < envinfo.mi_recent_txnid) {
5118 print("Perform sync-to-disk for make steady checkpoint at txn-id #%" PRIi64
5119 "\n",
5120 envinfo.mi_recent_txnid);
5121 fflush(nullptr);
5122 if (write_locked) {
5123 mdbx_txn_unlock(env);
5124 write_locked = false;
5125 }
5126 rc = mdbx_env_sync_ex(env, true, false);
5127 if (rc != MDBX_SUCCESS)
5128 error("mdbx_env_pgwalk() failed, error %d %s\n", rc, mdbx_strerror(rc));
5129 else {
5130 total_problems -= 1;
5131 problems_meta -= 1;
5132 }
5133 }
5134
5135 if (turn_meta && stuck_meta >= 0 && !dont_traversal && !only_subdb &&
5136 (envflags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == MDBX_EXCLUSIVE) {
5137 const bool successful_check = (rc | total_problems | problems_meta) == 0;
5138 if (successful_check || force_turn_meta) {
5139 fflush(nullptr);
5140 print(" = Performing turn to the specified meta-page (%d) due to %s!\n",
5141 stuck_meta,
5142 successful_check ? "successful check" : "the -T option was given");
5143 fflush(nullptr);
5144 rc = mdbx_env_turn_for_recovery(env, stuck_meta);
5145 if (rc != MDBX_SUCCESS)
5146 error("mdbx_env_turn_for_recovery() failed, error %d %s\n", rc,
5147 mdbx_strerror(rc));
5148 } else {
5149 print(" = Skipping turn to the specified meta-page (%d) due to "
5150 "unsuccessful check!\n",
5151 stuck_meta);
5152 }
5153 }
5154
5155 bailout:
5156 if (txn)
5157 mdbx_txn_abort(txn);
5158 if (write_locked) {
5159 mdbx_txn_unlock(env);
5160 write_locked = false;
5161 }
5162 if (env) {
5163 const bool dont_sync = rc != 0 || total_problems;
5164 mdbx_env_close_ex(env, dont_sync);
5165 }
5166 fflush(nullptr);
5167 if (rc) {
5168 if (rc < 0)
5169 return user_break ? EXIT_INTERRUPTED : EXIT_FAILURE_SYS;
5170 return EXIT_FAILURE_MDBX;
5171 }
5172
5173 #if defined(_WIN32) || defined(_WIN64)
5174 timestamp_finish = GetMilliseconds();
5175 elapsed = (timestamp_finish - timestamp_start) * 1e-3;
5176 #else
5177 if (clock_gettime(CLOCK_MONOTONIC, ×tamp_finish)) {
5178 rc = errno;
5179 error("clock_gettime() failed, error %d %s\n", rc, mdbx_strerror(rc));
5180 return EXIT_FAILURE_SYS;
5181 }
5182 elapsed = timestamp_finish.tv_sec - timestamp_start.tv_sec +
5183 (timestamp_finish.tv_nsec - timestamp_start.tv_nsec) * 1e-9;
5184 #endif /* !WINDOWS */
5185
5186 if (total_problems) {
5187 print("Total %u error%s detected, elapsed %.3f seconds.\n", total_problems,
5188 (total_problems > 1) ? "s are" : " is", elapsed);
5189 if (problems_meta || problems_maindb || problems_freedb)
5190 return EXIT_FAILURE_CHECK_MAJOR;
5191 return EXIT_FAILURE_CHECK_MINOR;
5192 }
5193 print("No error is detected, elapsed %.3f seconds\n", elapsed);
5194 return EXIT_SUCCESS;
5195 }
5196