1 /* mdbx_chk.c - memory-mapped database check tool */
2 
3 /*
4  * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>
5  * and other libmdbx authors: please see AUTHORS file.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted only as authorized by the OpenLDAP
10  * Public License.
11  *
12  * A copy of this license is available in the file LICENSE in the
13  * top-level directory of the distribution or, alternatively, at
14  * <http://www.OpenLDAP.org/license.html>. */
15 
16 #ifdef _MSC_VER
17 #if _MSC_VER > 1800
18 #pragma warning(disable : 4464) /* relative include path contains '..' */
19 #endif
20 #pragma warning(disable : 4996) /* The POSIX name is deprecated... */
21 #endif                          /* _MSC_VER (warnings) */
22 
23 #define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */
24 /*
25  * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>
26  * and other libmdbx authors: please see AUTHORS file.
27  * All rights reserved.
28  *
29  * Redistribution and use in source and binary forms, with or without
30  * modification, are permitted only as authorized by the OpenLDAP
31  * Public License.
32  *
33  * A copy of this license is available in the file LICENSE in the
34  * top-level directory of the distribution or, alternatively, at
35  * <http://www.OpenLDAP.org/license.html>. */
36 
37 #define MDBX_BUILD_SOURCERY facaa40d3bb34698b2ba800e2fe225773e3941040aef7dc92580b74ad840e798_v0_11_2_0_gd47eed0
38 #ifdef MDBX_CONFIG_H
39 #include MDBX_CONFIG_H
40 #endif
41 
42 #define LIBMDBX_INTERNALS
43 #ifdef xMDBX_TOOLS
44 #define MDBX_DEPRECATED
45 #endif /* xMDBX_TOOLS */
46 
47 #ifdef xMDBX_ALLOY
48 /* Amalgamated build */
49 #define MDBX_INTERNAL_FUNC static
50 #define MDBX_INTERNAL_VAR static
51 #else
52 /* Non-amalgamated build */
53 #define MDBX_INTERNAL_FUNC
54 #define MDBX_INTERNAL_VAR extern
55 #endif /* xMDBX_ALLOY */
56 
57 /** Disables using GNU/Linux libc extensions.
58  * \ingroup build_option
 * \note This option couldn't be moved to the options.h since the dependent
 * control macros/defines must be prepared before including the options.h */
61 #ifndef MDBX_DISABLE_GNU_SOURCE
62 #define MDBX_DISABLE_GNU_SOURCE 0
63 #endif
64 #if MDBX_DISABLE_GNU_SOURCE
65 #undef _GNU_SOURCE
66 #elif (defined(__linux__) || defined(__gnu_linux__)) && !defined(_GNU_SOURCE)
67 #define _GNU_SOURCE
68 #endif /* MDBX_DISABLE_GNU_SOURCE */
69 
70 /*----------------------------------------------------------------------------*/
71 
72 /* Should be defined before any includes */
73 #ifndef _FILE_OFFSET_BITS
74 #define _FILE_OFFSET_BITS 64
75 #endif
76 
77 #ifdef __APPLE__
78 #define _DARWIN_C_SOURCE
79 #endif
80 
81 #ifdef _MSC_VER
82 #if _MSC_FULL_VER < 190024234
83 /* Actually libmdbx was not tested with compilers older than 19.00.24234 (Visual
84  * Studio 2015 Update 3). But you could remove this #error and try to continue
 * at your own risk. In such a case please don't raise issues related ONLY to
86  * old compilers.
87  *
88  * NOTE:
89  *   Unfortunately, there are several different builds of "Visual Studio" that
90  *   are called "Visual Studio 2015 Update 3".
91  *
92  *   The 190024234 is used here because it is minimal version of Visual Studio
93  *   that was used for build and testing libmdbx in recent years. Soon this
94  *   value will be increased to 19.0.24241.7, since build and testing using
95  *   "Visual Studio 2015" will be performed only at https://ci.appveyor.com.
96  *
97  *   Please ask Microsoft (but not us) for information about version differences
98  *   and how to and where you can obtain the latest "Visual Studio 2015" build
99  *   with all fixes.
100  */
101 #error                                                                         \
102     "At least \"Microsoft C/C++ Compiler\" version 19.00.24234 (Visual Studio 2015 Update 3) is required."
103 #endif
104 #ifndef _CRT_SECURE_NO_WARNINGS
105 #define _CRT_SECURE_NO_WARNINGS
106 #endif /* _CRT_SECURE_NO_WARNINGS */
107 #if _MSC_VER > 1800
108 #pragma warning(disable : 4464) /* relative include path contains '..' */
109 #endif
110 #if _MSC_VER > 1913
111 #pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation...  \
112                                  */
113 #endif
114 #pragma warning(disable : 4710) /* 'xyz': function not inlined */
115 #pragma warning(disable : 4711) /* function 'xyz' selected for automatic       \
116                                    inline expansion */
117 #pragma warning(                                                               \
118     disable : 4201) /* nonstandard extension used : nameless struct / union */
119 #pragma warning(disable : 4702) /* unreachable code */
120 #pragma warning(disable : 4706) /* assignment within conditional expression */
121 #pragma warning(disable : 4127) /* conditional expression is constant */
122 #pragma warning(disable : 4324) /* 'xyz': structure was padded due to          \
123                                    alignment specifier */
124 #pragma warning(disable : 4310) /* cast truncates constant value */
125 #pragma warning(                                                               \
126     disable : 4820) /* bytes padding added after data member for alignment */
127 #pragma warning(disable : 4548) /* expression before comma has no effect;      \
128                                    expected expression with side - effect */
129 #pragma warning(disable : 4366) /* the result of the unary '&' operator may be \
130                                    unaligned */
131 #pragma warning(disable : 4200) /* nonstandard extension used: zero-sized      \
132                                    array in struct/union */
133 #pragma warning(disable : 4204) /* nonstandard extension used: non-constant    \
134                                    aggregate initializer */
135 #pragma warning(                                                               \
136     disable : 4505) /* unreferenced local function has been removed */
137 #endif              /* _MSC_VER (warnings) */
138 
139 #include "mdbx.h"
140 /*
141  * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>
142  * and other libmdbx authors: please see AUTHORS file.
143  * All rights reserved.
144  *
145  * Redistribution and use in source and binary forms, with or without
146  * modification, are permitted only as authorized by the OpenLDAP
147  * Public License.
148  *
149  * A copy of this license is available in the file LICENSE in the
150  * top-level directory of the distribution or, alternatively, at
151  * <http://www.OpenLDAP.org/license.html>.
152  */
153 
154 /* *INDENT-OFF* */
155 /* clang-format off */
156 
157 #ifndef __GNUC_PREREQ
158 #   if defined(__GNUC__) && defined(__GNUC_MINOR__)
159 #       define __GNUC_PREREQ(maj, min) \
160           ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
161 #   else
162 #       define __GNUC_PREREQ(maj, min) (0)
163 #   endif
164 #endif /* __GNUC_PREREQ */
165 
166 #ifndef __CLANG_PREREQ
167 #   ifdef __clang__
168 #       define __CLANG_PREREQ(maj,min) \
169           ((__clang_major__ << 16) + __clang_minor__ >= ((maj) << 16) + (min))
170 #   else
171 #       define __CLANG_PREREQ(maj,min) (0)
172 #   endif
173 #endif /* __CLANG_PREREQ */
174 
175 #ifndef __GLIBC_PREREQ
176 #   if defined(__GLIBC__) && defined(__GLIBC_MINOR__)
177 #       define __GLIBC_PREREQ(maj, min) \
178           ((__GLIBC__ << 16) + __GLIBC_MINOR__ >= ((maj) << 16) + (min))
179 #   else
180 #       define __GLIBC_PREREQ(maj, min) (0)
181 #   endif
182 #endif /* __GLIBC_PREREQ */
183 
184 #ifndef __has_warning
185 #   define __has_warning(x) (0)
186 #endif
187 
188 #ifndef __has_include
189 #   define __has_include(x) (0)
190 #endif
191 
192 #if __has_feature(thread_sanitizer)
193 #   define __SANITIZE_THREAD__ 1
194 #endif
195 
196 #if __has_feature(address_sanitizer)
197 #   define __SANITIZE_ADDRESS__ 1
198 #endif
199 
200 /*----------------------------------------------------------------------------*/
201 
202 #ifndef __extern_C
203 #   ifdef __cplusplus
204 #       define __extern_C extern "C"
205 #   else
206 #       define __extern_C
207 #   endif
208 #endif /* __extern_C */
209 
/* Provide a `nullptr` spelling for plain C and for pre-C++11 compilers.
 * MSVC is excluded from the pre-C++11 case, as in the original condition
 * (it reports an old __cplusplus value by default).
 * The whole alternative is parenthesized so that an already defined `nullptr`
 * macro is never redefined: without the parentheses `&&` binds tighter than
 * `||` and the `!defined(nullptr)` guard was bypassed by the second operand. */
#if !defined(nullptr) &&                                                       \
    (!defined(__cplusplus) || (__cplusplus < 201103L && !defined(_MSC_VER)))
#   define nullptr NULL
#endif
213 
214 /*----------------------------------------------------------------------------*/
215 
216 #ifndef __always_inline
217 #   if defined(__GNUC__) || __has_attribute(__always_inline__)
218 #       define __always_inline __inline __attribute__((__always_inline__))
219 #   elif defined(_MSC_VER)
220 #       define __always_inline __forceinline
221 #   else
222 #       define __always_inline
223 #   endif
224 #endif /* __always_inline */
225 
226 #ifndef __noinline
227 #   if defined(__GNUC__) || __has_attribute(__noinline__)
228 #       define __noinline __attribute__((__noinline__))
229 #   elif defined(_MSC_VER)
230 #       define __noinline __declspec(noinline)
231 #   else
232 #       define __noinline
233 #   endif
234 #endif /* __noinline */
235 
236 #ifndef __must_check_result
237 #   if defined(__GNUC__) || __has_attribute(__warn_unused_result__)
238 #       define __must_check_result __attribute__((__warn_unused_result__))
239 #   else
240 #       define __must_check_result
241 #   endif
242 #endif /* __must_check_result */
243 
244 #if !defined(__noop) && !defined(_MSC_VER)
245 #   define __noop(...) do {} while(0)
246 #endif /* __noop */
247 
248 #ifndef __fallthrough
249 #  if defined(__cplusplus) && (__has_cpp_attribute(fallthrough) &&             \
250      (!defined(__clang__) || __clang__ > 4)) || __cplusplus >= 201703L
251 #    define __fallthrough [[fallthrough]]
252 #  elif __GNUC_PREREQ(8, 0) && defined(__cplusplus) && __cplusplus >= 201103L
253 #    define __fallthrough [[fallthrough]]
254 #  elif __GNUC_PREREQ(7, 0) &&                                                 \
255     (!defined(__LCC__) || (__LCC__ == 124 && __LCC_MINOR__ >= 12) ||           \
256      (__LCC__ == 125 && __LCC_MINOR__ >= 5) || (__LCC__ >= 126))
257 #    define __fallthrough __attribute__((__fallthrough__))
258 #  elif defined(__clang__) && defined(__cplusplus) && __cplusplus >= 201103L &&\
259     __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough")
260 #    define __fallthrough [[clang::fallthrough]]
261 #  else
262 #    define __fallthrough
263 #  endif
264 #endif /* __fallthrough */
265 
266 #ifndef __unreachable
267 #   if __GNUC_PREREQ(4,5) || __has_builtin(__builtin_unreachable)
268 #       define __unreachable() __builtin_unreachable()
269 #   elif defined(_MSC_VER)
270 #       define __unreachable() __assume(0)
271 #   else
272 #       define __unreachable() __noop()
273 #   endif
274 #endif /* __unreachable */
275 
276 #ifndef __prefetch
277 #   if defined(__GNUC__) || defined(__clang__) || __has_builtin(__builtin_prefetch)
278 #       define __prefetch(ptr) __builtin_prefetch(ptr)
279 #   else
280 #       define __prefetch(ptr) __noop(ptr)
281 #   endif
282 #endif /* __prefetch */
283 
284 #ifndef __nothrow
285 #   if defined(__cplusplus)
286 #       if __cplusplus < 201703L
287 #           define __nothrow throw()
288 #       else
289 #           define __nothrow noexcept(true)
290 #       endif /* __cplusplus */
291 #   elif defined(__GNUC__) || __has_attribute(__nothrow__)
292 #       define __nothrow __attribute__((__nothrow__))
293 #   elif defined(_MSC_VER) && defined(__cplusplus)
294 #       define __nothrow __declspec(nothrow)
295 #   else
296 #       define __nothrow
297 #   endif
298 #endif /* __nothrow */
299 
300 #ifndef __hidden
301 #   if defined(__GNUC__) || __has_attribute(__visibility__)
302 #       define __hidden __attribute__((__visibility__("hidden")))
303 #   else
304 #       define __hidden
305 #   endif
306 #endif /* __hidden */
307 
308 #ifndef __optimize
309 #   if defined(__OPTIMIZE__)
310 #       if (defined(__GNUC__) && !defined(__clang__)) || __has_attribute(__optimize__)
311 #           define __optimize(ops) __attribute__((__optimize__(ops)))
312 #       else
313 #           define __optimize(ops)
314 #       endif
315 #   else
316 #       define __optimize(ops)
317 #   endif
318 #endif /* __optimize */
319 
/* __hot marks frequently executed functions.
 * It expands to nothing for non-optimized builds (no __OPTIMIZE__). Otherwise
 * it uses the `hot` attribute when available, or for clang on Linux without
 * that attribute, places the function into a dedicated section instead. */
#ifndef __hot
#   if defined(__OPTIMIZE__)
#       if defined(__e2k__)
#           define __hot __attribute__((__hot__)) __optimize(3)
#       elif defined(__clang__) && !__has_attribute(__hot__) \
        && __has_attribute(__section__) && (defined(__linux__) || defined(__gnu_linux__))
            /* just put frequently used functions in separate section;
             * the attribute name tested above must be `__hot__` (the former
             * `__hot_` never matched, so this fallback was always taken) */
#           define __hot __attribute__((__section__("text.hot"))) __optimize("O3")
#       elif defined(__GNUC__) || __has_attribute(__hot__)
#           define __hot __attribute__((__hot__)) __optimize("O3")
#       else
#           define __hot __optimize("O3")
#       endif
#   else
#       define __hot
#   endif
#endif /* __hot */
337 
338 #ifndef __cold
339 #   if defined(__OPTIMIZE__)
340 #       if defined(__e2k__)
341 #           define __cold __attribute__((__cold__)) __optimize(1)
342 #       elif defined(__clang__) && !__has_attribute(cold) \
343         && __has_attribute(__section__) && (defined(__linux__) || defined(__gnu_linux__))
344             /* just put infrequently used functions in separate section */
345 #           define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os")
346 #       elif defined(__GNUC__) || __has_attribute(cold)
347 #           define __cold __attribute__((__cold__)) __optimize("Os")
348 #       else
349 #           define __cold __optimize("Os")
350 #       endif
351 #   else
352 #       define __cold
353 #   endif
354 #endif /* __cold */
355 
356 #ifndef __flatten
357 #   if defined(__OPTIMIZE__) && (defined(__GNUC__) || __has_attribute(__flatten__))
358 #       define __flatten __attribute__((__flatten__))
359 #   else
360 #       define __flatten
361 #   endif
362 #endif /* __flatten */
363 
364 #ifndef likely
365 #   if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
366 #       define likely(cond) __builtin_expect(!!(cond), 1)
367 #   else
368 #       define likely(x) (!!(x))
369 #   endif
370 #endif /* likely */
371 
372 #ifndef unlikely
373 #   if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
374 #       define unlikely(cond) __builtin_expect(!!(cond), 0)
375 #   else
376 #       define unlikely(x) (!!(x))
377 #   endif
378 #endif /* unlikely */
379 
380 #ifndef __anonymous_struct_extension__
381 #   if defined(__GNUC__)
382 #       define __anonymous_struct_extension__ __extension__
383 #   else
384 #       define __anonymous_struct_extension__
385 #   endif
386 #endif /* __anonymous_struct_extension__ */
387 
388 #ifndef __Wpedantic_format_voidptr
389     MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline  const void*
__Wpedantic_format_voidptr(const void * ptr)390         __Wpedantic_format_voidptr(const void* ptr) {return ptr;}
391 #   define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG)
392 #endif /* __Wpedantic_format_voidptr */
393 
394 /*----------------------------------------------------------------------------*/
395 
396 #if defined(MDBX_USE_VALGRIND)
397 #   include <valgrind/memcheck.h>
398 #   ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE
399         /* LY: available since Valgrind 3.10 */
400 #       define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s)
401 #       define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s)
402 #   endif
403 #elif !defined(RUNNING_ON_VALGRIND)
404 #   define VALGRIND_CREATE_MEMPOOL(h,r,z)
405 #   define VALGRIND_DESTROY_MEMPOOL(h)
406 #   define VALGRIND_MEMPOOL_TRIM(h,a,s)
407 #   define VALGRIND_MEMPOOL_ALLOC(h,a,s)
408 #   define VALGRIND_MEMPOOL_FREE(h,a)
409 #   define VALGRIND_MEMPOOL_CHANGE(h,a,b,s)
410 #   define VALGRIND_MAKE_MEM_NOACCESS(a,s)
411 #   define VALGRIND_MAKE_MEM_DEFINED(a,s)
412 #   define VALGRIND_MAKE_MEM_UNDEFINED(a,s)
413 #   define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s)
414 #   define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s)
415 #   define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a,s) (0)
416 #   define VALGRIND_CHECK_MEM_IS_DEFINED(a,s) (0)
417 #   define RUNNING_ON_VALGRIND (0)
418 #endif /* MDBX_USE_VALGRIND */
419 
420 #ifdef __SANITIZE_ADDRESS__
421 #   include <sanitizer/asan_interface.h>
422 #elif !defined(ASAN_POISON_MEMORY_REGION)
423 #   define ASAN_POISON_MEMORY_REGION(addr, size) \
424         ((void)(addr), (void)(size))
425 #   define ASAN_UNPOISON_MEMORY_REGION(addr, size) \
426         ((void)(addr), (void)(size))
427 #endif /* __SANITIZE_ADDRESS__ */
428 
429 /*----------------------------------------------------------------------------*/
430 
/* ARRAY_LENGTH(array): number of elements of a true array (not a pointer).
 * The C++ flavour relies on a template so that passing a pointer fails to
 * compile; the C flavour is the classic sizeof ratio. The argument is
 * parenthesized before subscripting so that expression arguments such as
 * `*(m + 1)` are handled correctly. */
#ifndef ARRAY_LENGTH
#   ifdef __cplusplus
        template <typename T, size_t N>
        char (&__ArraySizeHelper(T (&array)[N]))[N];
#       define ARRAY_LENGTH(array) (sizeof(::__ArraySizeHelper(array)))
#   else
#       define ARRAY_LENGTH(array) (sizeof(array) / sizeof((array)[0]))
#   endif
#endif /* ARRAY_LENGTH */

/* ARRAY_END(array): pointer one past the last element of a true array. */
#ifndef ARRAY_END
#   define ARRAY_END(array) (&(array)[ARRAY_LENGTH(array)])
#endif /* ARRAY_END */
444 
445 #define CONCAT(a,b) a##b
446 #define XCONCAT(a,b) CONCAT(a,b)
447 
448 #ifndef offsetof
449 #   define offsetof(type, member)  __builtin_offsetof(type, member)
450 #endif /* offsetof */
451 
452 #ifndef container_of
453 #   define container_of(ptr, type, member) \
454         ((type *)((char *)(ptr) - offsetof(type, member)))
455 #endif /* container_of */
456 
/* Packs four byte-sized values into one 32-bit word, `a` becoming the most
 * significant byte and `d` the least significant one. Every operand is cast
 * to uint32_t so the result type is uint32_t regardless of argument types.
 * Arguments are expected to be in the 0..255 range (no masking is done). */
#define MDBX_TETRAD(a, b, c, d)                                                \
  ((uint32_t)(a) << 24 | (uint32_t)(b) << 16 | (uint32_t)(c) << 8 |            \
   (uint32_t)(d))

/* Packs the first four characters of `str` with MDBX_TETRAD(); `str` may be
 * any expression that can be subscripted. */
#define MDBX_STRING_TETRAD(str)                                                \
  MDBX_TETRAD((str)[0], (str)[1], (str)[2], (str)[3])
461 
462 #define FIXME "FIXME: " __FILE__ ", " MDBX_STRINGIFY(__LINE__)
463 
464 #ifndef STATIC_ASSERT_MSG
465 #   if defined(static_assert)
466 #       define STATIC_ASSERT_MSG(expr, msg) static_assert(expr, msg)
467 #   elif defined(_STATIC_ASSERT)
468 #       define STATIC_ASSERT_MSG(expr, msg) _STATIC_ASSERT(expr)
469 #   elif defined(_MSC_VER)
470 #       include <crtdbg.h>
471 #       define STATIC_ASSERT_MSG(expr, msg) _STATIC_ASSERT(expr)
472 #   elif (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) \
473           || __has_feature(c_static_assert)
474 #       define STATIC_ASSERT_MSG(expr, msg) _Static_assert(expr, msg)
475 #   else
476 #       define STATIC_ASSERT_MSG(expr, msg) switch (0) {case 0:case (expr):;}
477 #   endif
478 #endif /* STATIC_ASSERT */
479 
480 #ifndef STATIC_ASSERT
481 #   define STATIC_ASSERT(expr) STATIC_ASSERT_MSG(expr, #expr)
482 #endif
483 
484 /* *INDENT-ON* */
485 /* clang-format on */
486 
487 #if defined(__GNUC__) && !__GNUC_PREREQ(4, 2)
488 /* Actually libmdbx was not tested with compilers older than GCC 4.2.
489  * But you could ignore this warning at your own risk.
 * In such a case please don't raise issues related ONLY to old compilers.
491  */
492 #warning "libmdbx required GCC >= 4.2"
493 #endif
494 
495 #if defined(__clang__) && !__CLANG_PREREQ(3, 8)
496 /* Actually libmdbx was not tested with CLANG older than 3.8.
497  * But you could ignore this warning at your own risk.
 * In such a case please don't raise issues related ONLY to old compilers.
499  */
500 #warning "libmdbx required CLANG >= 3.8"
501 #endif
502 
503 #if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12)
504 /* Actually libmdbx was not tested with something older than glibc 2.12.
505  * But you could ignore this warning at your own risk.
 * In such a case please don't raise issues related ONLY to old systems.
507  */
508 #warning "libmdbx was only tested with GLIBC >= 2.12."
509 #endif
510 
511 #ifdef __SANITIZE_THREAD__
512 #warning                                                                       \
513     "libmdbx don't compatible with ThreadSanitizer, you will get a lot of false-positive issues."
514 #endif /* __SANITIZE_THREAD__ */
515 
516 #if __has_warning("-Wnested-anon-types")
517 #if defined(__clang__)
518 #pragma clang diagnostic ignored "-Wnested-anon-types"
519 #elif defined(__GNUC__)
520 #pragma GCC diagnostic ignored "-Wnested-anon-types"
521 #else
522 #pragma warning disable "nested-anon-types"
523 #endif
524 #endif /* -Wnested-anon-types */
525 
526 #if __has_warning("-Wconstant-logical-operand")
527 #if defined(__clang__)
528 #pragma clang diagnostic ignored "-Wconstant-logical-operand"
529 #elif defined(__GNUC__)
530 #pragma GCC diagnostic ignored "-Wconstant-logical-operand"
531 #else
532 #pragma warning disable "constant-logical-operand"
533 #endif
534 #endif /* -Wconstant-logical-operand */
535 
536 #if defined(__LCC__) && (__LCC__ <= 121)
537 /* bug #2798 */
538 #pragma diag_suppress alignment_reduction_ignored
539 #elif defined(__ICC)
540 #pragma warning(disable : 3453 1366)
541 #elif __has_warning("-Walignment-reduction-ignored")
542 #if defined(__clang__)
543 #pragma clang diagnostic ignored "-Walignment-reduction-ignored"
544 #elif defined(__GNUC__)
545 #pragma GCC diagnostic ignored "-Walignment-reduction-ignored"
546 #else
547 #pragma warning disable "alignment-reduction-ignored"
548 #endif
549 #endif /* -Walignment-reduction-ignored */
550 
551 #ifdef __cplusplus
552 extern "C" {
553 #endif
554 
555 /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */
556 
557 /*
558  * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>
559  * and other libmdbx authors: please see AUTHORS file.
560  * All rights reserved.
561  *
562  * Redistribution and use in source and binary forms, with or without
563  * modification, are permitted only as authorized by the OpenLDAP
564  * Public License.
565  *
566  * A copy of this license is available in the file LICENSE in the
567  * top-level directory of the distribution or, alternatively, at
568  * <http://www.OpenLDAP.org/license.html>.
569  */
570 
571 
572 /*----------------------------------------------------------------------------*/
/* The Microsoft compiler generates a lot of warnings for its own headers... */
574 
575 #ifdef _MSC_VER
576 #pragma warning(push, 1)
577 #pragma warning(disable : 4548) /* expression before comma has no effect;      \
578                                    expected expression with side - effect */
579 #pragma warning(disable : 4530) /* C++ exception handler used, but unwind      \
580                                  * semantics are not enabled. Specify /EHsc */
581 #pragma warning(disable : 4577) /* 'noexcept' used with no exception handling  \
582                                  * mode specified; termination on exception is \
583                                  * not guaranteed. Specify /EHsc */
584 #endif                          /* _MSC_VER (warnings) */
585 
586 #if defined(_WIN32) || defined(_WIN64)
587 #if !defined(_CRT_SECURE_NO_WARNINGS)
588 #define _CRT_SECURE_NO_WARNINGS
589 #endif /* _CRT_SECURE_NO_WARNINGS */
590 #if !defined(_NO_CRT_STDIO_INLINE) && MDBX_BUILD_SHARED_LIBRARY &&             \
591     !defined(xMDBX_TOOLS) && MDBX_WITHOUT_MSVC_CRT
592 #define _NO_CRT_STDIO_INLINE
593 #endif
594 #elif !defined(_POSIX_C_SOURCE)
595 #define _POSIX_C_SOURCE 200809L
596 #endif /* Windows */
597 
598 /*----------------------------------------------------------------------------*/
599 /* C99 includes */
600 #include <inttypes.h>
601 #include <stddef.h>
602 #include <stdint.h>
603 #include <stdlib.h>
604 
605 #include <assert.h>
606 #include <fcntl.h>
607 #include <limits.h>
608 #include <stdio.h>
609 #include <string.h>
610 #include <time.h>
611 
612 /* C11 stdalign.h */
613 #if __has_include(<stdalign.h>)
614 #include <stdalign.h>
615 #elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
616 #define alignas(N) _Alignas(N)
617 #elif defined(_MSC_VER)
618 #define alignas(N) __declspec(align(N))
619 #elif __has_attribute(__aligned__) || defined(__GNUC__)
620 #define alignas(N) __attribute__((__aligned__(N)))
621 #else
622 #error "FIXME: Required _alignas() or equivalent."
623 #endif
624 
625 /*----------------------------------------------------------------------------*/
626 /* Systems includes */
627 
628 #ifdef __APPLE__
629 #include <TargetConditionals.h>
630 #endif /* Apple OSX & iOS */
631 
632 #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) ||     \
633     defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) ||         \
634     defined(__APPLE__) || defined(__MACH__)
635 #include <sys/cdefs.h>
636 #include <sys/mount.h>
637 #include <sys/sysctl.h>
638 #include <sys/types.h>
639 #if defined(__FreeBSD__) || defined(__DragonFly__)
640 #include <vm/vm_param.h>
641 #elif defined(__OpenBSD__) || defined(__NetBSD__)
642 #include <uvm/uvm_param.h>
643 #else
644 #define SYSCTL_LEGACY_NONCONST_MIB
645 #endif
646 #ifndef __MACH__
647 #include <sys/vmmeter.h>
648 #endif
649 #else
650 #include <malloc.h>
651 #if !(defined(__sun) || defined(__SVR4) || defined(__svr4__) ||                \
652       defined(_WIN32) || defined(_WIN64))
653 #include <mntent.h>
654 #endif /* !Solaris */
655 #endif /* !xBSD */
656 
657 #if defined(__FreeBSD__) || __has_include(<malloc_np.h>)
658 #include <malloc_np.h>
659 #endif
660 
661 #if defined(__APPLE__) || defined(__MACH__) || __has_include(<malloc/malloc.h>)
662 #include <malloc/malloc.h>
663 #endif /* MacOS */
664 
665 #if defined(__MACH__)
666 #include <mach/host_info.h>
667 #include <mach/mach_host.h>
668 #include <mach/mach_port.h>
669 #include <uuid/uuid.h>
670 #endif
671 
672 #if defined(__linux__) || defined(__gnu_linux__)
673 #include <sched.h>
674 #include <sys/sendfile.h>
675 #include <sys/statfs.h>
676 #endif /* Linux */
677 
678 #ifndef _XOPEN_SOURCE
679 #define _XOPEN_SOURCE 0
680 #endif
681 
682 #ifndef _XOPEN_SOURCE_EXTENDED
683 #define _XOPEN_SOURCE_EXTENDED 0
684 #else
685 #include <utmpx.h>
686 #endif /* _XOPEN_SOURCE_EXTENDED */
687 
688 #if defined(__sun) || defined(__SVR4) || defined(__svr4__)
689 #include <kstat.h>
690 #include <sys/mnttab.h>
691 /* On Solaris, it's easier to add a missing prototype rather than find a
692  * combination of #defines that break nothing. */
693 __extern_C key_t ftok(const char *, int);
694 #endif /* SunOS/Solaris */
695 
696 #if defined(_WIN32) || defined(_WIN64)
697 #ifndef _WIN32_WINNT
698 #define _WIN32_WINNT 0x0601 /* Windows 7 */
699 #elif _WIN32_WINNT < 0x0500
700 #error At least 'Windows 2000' API is required for libmdbx.
701 #endif /* _WIN32_WINNT */
702 #if (defined(__MINGW32__) || defined(__MINGW64__)) &&                          \
703     !defined(__USE_MINGW_ANSI_STDIO)
704 #define __USE_MINGW_ANSI_STDIO 1
705 #endif /* MinGW */
706 #ifndef WIN32_LEAN_AND_MEAN
707 #define WIN32_LEAN_AND_MEAN
708 #endif /* WIN32_LEAN_AND_MEAN */
709 #include <excpt.h>
710 #include <tlhelp32.h>
711 #include <windows.h>
712 #include <winnt.h>
713 #include <winternl.h>
714 #define HAVE_SYS_STAT_H
715 #define HAVE_SYS_TYPES_H
716 typedef HANDLE mdbx_thread_t;
717 typedef unsigned mdbx_thread_key_t;
718 #define MAP_FAILED NULL
719 #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? ((uint64_t)(v) >> 32) : 0))
720 #define THREAD_CALL WINAPI
721 #define THREAD_RESULT DWORD
722 typedef struct {
723   HANDLE mutex;
724   HANDLE event[2];
725 } mdbx_condpair_t;
726 typedef CRITICAL_SECTION mdbx_fastmutex_t;
727 
728 #if !defined(_MSC_VER) && !defined(__try)
729 /* *INDENT-OFF* */
730 /* clang-format off */
731 #define __try
732 #define __except(COND) if(false)
733 /* *INDENT-ON* */
734 /* clang-format on */
735 #endif /* stub for MSVC's __try/__except */
736 
737 #if MDBX_WITHOUT_MSVC_CRT
738 
739 #ifndef mdbx_malloc
mdbx_malloc(size_t bytes)740 static inline void *mdbx_malloc(size_t bytes) {
741   return HeapAlloc(GetProcessHeap(), 0, bytes);
742 }
743 #endif /* mdbx_malloc */
744 
745 #ifndef mdbx_calloc
mdbx_calloc(size_t nelem,size_t size)746 static inline void *mdbx_calloc(size_t nelem, size_t size) {
747   return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, nelem * size);
748 }
749 #endif /* mdbx_calloc */
750 
751 #ifndef mdbx_realloc
mdbx_realloc(void * ptr,size_t bytes)752 static inline void *mdbx_realloc(void *ptr, size_t bytes) {
753   return ptr ? HeapReAlloc(GetProcessHeap(), 0, ptr, bytes)
754              : HeapAlloc(GetProcessHeap(), 0, bytes);
755 }
756 #endif /* mdbx_realloc */
757 
758 #ifndef mdbx_free
mdbx_free(void * ptr)759 static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); }
760 #endif /* mdbx_free */
761 
762 #else /* MDBX_WITHOUT_MSVC_CRT */
763 
764 #define mdbx_malloc malloc
765 #define mdbx_calloc calloc
766 #define mdbx_realloc realloc
767 #define mdbx_free free
768 #define mdbx_strdup _strdup
769 
770 #endif /* MDBX_WITHOUT_MSVC_CRT */
771 
772 #ifndef snprintf
773 #define snprintf _snprintf /* ntdll */
774 #endif
775 
776 #ifndef vsnprintf
777 #define vsnprintf _vsnprintf /* ntdll */
778 #endif
779 
780 #else /*----------------------------------------------------------------------*/
781 
782 #include <unistd.h>
783 #if !defined(_POSIX_MAPPED_FILES) || _POSIX_MAPPED_FILES < 1
784 #error "libmdbx requires the _POSIX_MAPPED_FILES feature"
785 #endif /* _POSIX_MAPPED_FILES */
786 
787 #include <pthread.h>
788 #include <semaphore.h>
789 #include <signal.h>
790 #include <sys/file.h>
791 #include <sys/ipc.h>
792 #include <sys/mman.h>
793 #include <sys/param.h>
794 #include <sys/stat.h>
795 #include <sys/statvfs.h>
796 #include <sys/uio.h>
797 typedef pthread_t mdbx_thread_t;
798 typedef pthread_key_t mdbx_thread_key_t;
799 #define INVALID_HANDLE_VALUE (-1)
800 #define THREAD_CALL
801 #define THREAD_RESULT void *
802 typedef struct {
803   pthread_mutex_t mutex;
804   pthread_cond_t cond[2];
805 } mdbx_condpair_t;
806 typedef pthread_mutex_t mdbx_fastmutex_t;
807 #define mdbx_malloc malloc
808 #define mdbx_calloc calloc
809 #define mdbx_realloc realloc
810 #define mdbx_free free
811 #define mdbx_strdup strdup
812 #endif /* Platform */
813 
814 #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
815 /* malloc_usable_size() already provided */
816 #elif defined(__APPLE__)
817 #define malloc_usable_size(ptr) malloc_size(ptr)
818 #elif defined(_MSC_VER) && !MDBX_WITHOUT_MSVC_CRT
819 #define malloc_usable_size(ptr) _msize(ptr)
820 #endif /* malloc_usable_size */
821 
822 #ifdef __ANDROID_API__
823 #include <android/log.h>
824 #if __ANDROID_API__ >= 21
825 #include <sys/sendfile.h>
826 #endif
827 #endif /* Android */
828 
829 /* *INDENT-OFF* */
830 /* clang-format off */
831 #if defined(HAVE_SYS_STAT_H) || __has_include(<sys/stat.h>)
832 #include <sys/stat.h>
833 #endif
834 #if defined(HAVE_SYS_TYPES_H) || __has_include(<sys/types.h>)
835 #include <sys/types.h>
836 #endif
837 #if defined(HAVE_SYS_FILE_H) || __has_include(<sys/file.h>)
838 #include <sys/file.h>
839 #endif
840 /* *INDENT-ON* */
841 /* clang-format on */
842 
843 #ifndef SSIZE_MAX
844 #define SSIZE_MAX INTPTR_MAX
845 #endif
846 
847 #if !defined(MADV_DODUMP) && defined(MADV_CORE)
848 #define MADV_DODUMP MADV_CORE
849 #endif /* MADV_CORE -> MADV_DODUMP */
850 
851 #if !defined(MADV_DONTDUMP) && defined(MADV_NOCORE)
852 #define MADV_DONTDUMP MADV_NOCORE
853 #endif /* MADV_NOCORE -> MADV_DONTDUMP */
854 
/* Detect any x86 / x86-64 target by the macros various compilers predefine,
 * then provide two normalized macros for the code that follows:
 *   __ia32__  - defined for both 32-bit x86 and x86-64;
 *   __amd64__ - defined for x86-64 only.
 * All operands are joined with logical `||` (one of them used a bitwise `|`
 * by mistake) and the duplicated `__INTEL__` test was dropped. */
#if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \
    defined(i486) || defined(__i486) || defined(__i486__) ||                   \
    defined(i586) || defined(__i586) || defined(__i586__) || defined(i686) ||  \
    defined(__i686) || defined(__i686__) || defined(_M_IX86) ||                \
    defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) ||            \
    defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) ||          \
    defined(__amd64__) || defined(__amd64) || defined(_M_X64) ||               \
    defined(_M_AMD64) || defined(__IA32__)
#ifndef __ia32__
/* LY: define neutral __ia32__ for x86 and x86-64 */
#define __ia32__ 1
#endif /* __ia32__ */
#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) ||        \
                            defined(__amd64) || defined(_M_X64))
/* LY: define trusty __amd64__ for all AMD64/x86-64 arch */
#define __amd64__ 1
#endif /* __amd64__ */
#endif /* all x86 */
873 
/* Compile-time sanity check of the integer model. The build is rejected if:
 *  - (-6 & 5) is non-zero, i.e. signed integers are not two's complement;
 *  - CHAR_BIT is not 8;
 *  - unsigned int is narrower than 32 bits;
 *  - ULONG_MAX is not a multiple of 0xFFFF, i.e. unsigned long is not a whole
 *    number of 16-bit units wide. */
#if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
#error                                                                         \
    "Sanity checking failed: Two's complement, reasonably sized integer types"
#endif

/* MDBX_WORDBITS is 64 when either uintptr_t or unsigned long exceeds the
 * 32-bit range, otherwise 32. */
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
#define MDBX_WORDBITS 64
#else
#define MDBX_WORDBITS 32
#endif /* MDBX_WORDBITS */
884 
885 /*----------------------------------------------------------------------------*/
886 /* Compiler's includes for builtins/intrinsics */
887 
888 #if defined(_MSC_VER) || defined(__INTEL_COMPILER)
889 #include <intrin.h>
890 #elif __GNUC_PREREQ(4, 4) || defined(__clang__)
891 #if defined(__ia32__) || defined(__e2k__)
892 #include <x86intrin.h>
893 #endif /* __ia32__ */
894 #if defined(__ia32__)
895 #include <cpuid.h>
896 #endif /* __ia32__ */
897 #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
898 #include <mbarrier.h>
899 #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) &&       \
900     (defined(HP_IA64) || defined(__ia64))
901 #include <machine/sys/inline.h>
902 #elif defined(__IBMC__) && defined(__powerpc)
903 #include <atomic.h>
904 #elif defined(_AIX)
905 #include <builtins.h>
906 #include <sys/atomic_op.h>
907 #elif (defined(__osf__) && defined(__DECC)) || defined(__alpha)
908 #include <c_asm.h>
909 #include <machine/builtins.h>
910 #elif defined(__MWERKS__)
911 /* CodeWarrior - troubles ? */
912 #pragma gcc_extensions
913 #elif defined(__SNC__)
914 /* Sony PS3 - troubles ? */
915 #elif defined(__hppa__) || defined(__hppa)
916 #include <machine/inline.h>
917 #else
918 #error Unsupported C compiler, please use GNU C 4.4 or newer
919 #endif /* Compiler */
920 
921 /*----------------------------------------------------------------------------*/
922 /* Byteorder */
923 
/* Provide GCC-style __BYTE_ORDER__, __ORDER_LITTLE_ENDIAN__ and
 * __ORDER_BIG_ENDIAN__ when the compiler does not predefine all three.
 * Three steps, in order:
 *  1. include whichever system endianness header is available;
 *  2. map the __BYTE_ORDER / _BYTE_ORDER style macros from that header;
 *  3. otherwise guess from architecture/OS macros, or fail the build. */
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) ||           \
    !defined(__ORDER_BIG_ENDIAN__)

/* Step 1: pick a system header; the first matching branch wins. */
/* *INDENT-OFF* */
/* clang-format off */
#if defined(__GLIBC__) || defined(__GNU_LIBRARY__) || defined(__ANDROID_API__) ||  \
    defined(HAVE_ENDIAN_H) || __has_include(<endian.h>)
#include <endian.h>
#elif defined(__APPLE__) || defined(__MACH__) || defined(__OpenBSD__) ||       \
    defined(HAVE_MACHINE_ENDIAN_H) || __has_include(<machine/endian.h>)
#include <machine/endian.h>
#elif defined(HAVE_SYS_ISA_DEFS_H) || __has_include(<sys/isa_defs.h>)
#include <sys/isa_defs.h>
#elif (defined(HAVE_SYS_TYPES_H) && defined(HAVE_SYS_ENDIAN_H)) ||             \
    (__has_include(<sys/types.h>) && __has_include(<sys/endian.h>))
#include <sys/endian.h>
#include <sys/types.h>
#elif defined(__bsdi__) || defined(__DragonFly__) || defined(__FreeBSD__) ||   \
    defined(__NetBSD__) ||                              \
    defined(HAVE_SYS_PARAM_H) || __has_include(<sys/param.h>)
#include <sys/param.h>
#endif /* OS */
/* *INDENT-ON* */
/* clang-format on */

/* Step 2: reuse the double-underscore or single-underscore macro family
 * if the included header supplied a complete set. */
#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN)
#define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN
#define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN
#define __BYTE_ORDER__ __BYTE_ORDER
#elif defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)
#define __ORDER_LITTLE_ENDIAN__ _LITTLE_ENDIAN
#define __ORDER_BIG_ENDIAN__ _BIG_ENDIAN
#define __BYTE_ORDER__ _BYTE_ORDER
#else
/* Step 3: no usable header macros; define the order constants here and
 * infer the byte order from target macros. */
#define __ORDER_LITTLE_ENDIAN__ 1234
#define __ORDER_BIG_ENDIAN__ 4321

/* Little-endian targets. The list includes __ia32__, which the x86 detection
 * earlier in this file defines for every x86/x86-64 build. */
#if defined(__LITTLE_ENDIAN__) ||                                              \
    (defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN)) ||                      \
    defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) ||    \
    defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) ||            \
    defined(_M_ARM) || defined(_M_ARM64) || defined(__e2k__) ||                \
    defined(__elbrus_4c__) || defined(__elbrus_8c__) || defined(__bfin__) ||   \
    defined(__BFIN__) || defined(__ia64__) || defined(_IA64) ||                \
    defined(__IA64__) || defined(__ia64) || defined(_M_IA64) ||                \
    defined(__itanium__) || defined(__ia32__) || defined(__CYGWIN__) ||        \
    defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) ||              \
    defined(__WINDOWS__)
#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__

/* Big-endian targets. */
#elif defined(__BIG_ENDIAN__) ||                                               \
    (defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN)) ||                      \
    defined(__ARMEB__) || defined(__THUMBEB__) || defined(__AARCH64EB__) ||    \
    defined(__MIPSEB__) || defined(_MIPSEB) || defined(__MIPSEB) ||            \
    defined(__m68k__) || defined(M68000) || defined(__hppa__) ||               \
    defined(__hppa) || defined(__HPPA__) || defined(__sparc__) ||              \
    defined(__sparc) || defined(__370__) || defined(__THW_370__) ||            \
    defined(__s390__) || defined(__s390x__) || defined(__SYSC_ZARCH__)
#define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__

#else
/* Unknown target: refuse to guess. */
#error __BYTE_ORDER__ should be defined.
#endif /* Arch */

#endif
#endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */
990 
991 /* Get the size of a memory page for the system.
992  * This is the basic size that the platform's memory manager uses, and is
993  * fundamental to the use of memory-mapped files. */
994 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t
mdbx_syspagesize(void)995 mdbx_syspagesize(void) {
996 #if defined(_WIN32) || defined(_WIN64)
997   SYSTEM_INFO si;
998   GetSystemInfo(&si);
999   return si.dwPageSize;
1000 #else
1001   return sysconf(_SC_PAGE_SIZE);
1002 #endif
1003 }
1004 
/* Descriptor of one memory mapping together with the file behind it. */
typedef struct mdbx_mmap_param {
  /* Three differently-typed views of the same mapping base pointer. */
  union {
    void *address;              /* untyped base address */
    uint8_t *dxb;               /* byte-addressable view */
    struct MDBX_lockinfo *lck;  /* view as the lock-info structure */
  };
  mdbx_filehandle_t fd; /* handle of the mapped file */
  size_t limit;   /* mapping length, but NOT a size of file nor DB */
  size_t current; /* mapped region size, i.e. the size of file and DB */
  uint64_t filesize; /* in-process cache of a file size */
#if defined(_WIN32) || defined(_WIN64)
  HANDLE section; /* memory-mapped section handle */
#endif
} mdbx_mmap_t;
1019 
/* 128-bit value accessible either as two 64-bit halves (x, y) or as four
 * 32-bit words (a, b, c, d); both views overlay the same storage. */
typedef union bin128 {
  __anonymous_struct_extension__ struct { uint64_t x, y; };
  __anonymous_struct_extension__ struct { uint32_t a, b, c, d; };
} bin128_t;
1024 
#if defined(_WIN32) || defined(_WIN64)
/* Slim reader/writer lock storage for Windows: either a pair of volatile
 * counters or the native RTL_SRWLOCK, sharing the same memory.
 * NOTE(review): presumably the counters serve a fallback implementation used
 * when the native SRW API is unavailable - confirm against the code that
 * assigns the mdbx_srwlock_* function pointers. */
typedef union MDBX_srwlock {
  struct {
    long volatile readerCount;
    long volatile writerCount;
  };
  RTL_SRWLOCK native;
} MDBX_srwlock;
#endif /* Windows */
1034 
1035 #ifndef __cplusplus
1036 
/* Jitter helpers, declared here and defined elsewhere.
 * NOTE(review): judging by the names they inject timing jitter for testing,
 * with `tiny` selecting a smaller delay - confirm against the definitions. */
MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny);
MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny);
1039 
1040 /*----------------------------------------------------------------------------*/
1041 /* Atomics */
1042 
/* Select the atomics backend and define MDBX_HAVE_C11ATOMICS when the
 * standard <stdatomic.h> is usable.
 * NOTE(review): this section sits inside `#ifndef __cplusplus`, so the first
 * branch (which requires __cplusplus) can never be taken here. */
#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && (__has_include(<cstdatomic>) || __has_extension(cxx_atomic))
#include <cstdatomic>
#define MDBX_HAVE_C11ATOMICS
/* C11 atomics: requires a C11 compiler (or the c_atomic extension), no
 * __STDC_NO_ATOMICS__, and for GCC/Clang a recent enough version. */
#elif !defined(__cplusplus) &&                                                 \
    (__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) &&              \
    !defined(__STDC_NO_ATOMICS__) &&                                           \
    (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) ||                            \
     !(defined(__GNUC__) || defined(__clang__)))
#include <stdatomic.h>
#define MDBX_HAVE_C11ATOMICS
/* Older GCC/Clang: nothing to include or define here. */
#elif defined(__GNUC__) || defined(__clang__)
/* MSVC: silence conversion warnings and request the Interlocked intrinsics. */
#elif defined(_MSC_VER)
#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */
#pragma warning(disable : 4133) /* 'function': incompatible types - from       \
                                   'size_t' to 'LONGLONG' */
#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to     \
                                   'std::size_t', possible loss of data */
#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to     \
                                   'long', possible loss of data */
#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange)
#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64)
#elif defined(__APPLE__)
#include <libkern/OSAtomic.h>
#else
#error FIXME atomic-ops
#endif
1069 
1070 /*----------------------------------------------------------------------------*/
1071 /* Memory/Compiler barriers, cache coherence */
1072 
1073 #if __has_include(<sys/cachectl.h>)
1074 #include <sys/cachectl.h>
1075 #elif defined(__mips) || defined(__mips__) || defined(__mips64) ||             \
1076     defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) ||            \
1077     defined(__MWERKS__) || defined(__sgi)
1078 /* MIPS should have explicit cache control */
1079 #include <sys/cachectl.h>
1080 #endif
1081 
/* Emit a compiler-level memory barrier using whichever primitive the current
 * toolchain offers. The branches are tried in order, so GCC/Clang and MSVC
 * take precedence over the Intel-specific form. Unknown compilers fail the
 * build. */
MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) {
#if defined(__clang__) || defined(__GNUC__)
  /* empty asm statement with a "memory" clobber */
  __asm__ __volatile__("" ::: "memory");
#elif defined(_MSC_VER)
  _ReadWriteBarrier();
#elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */
  __memory_barrier();
#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
  __compiler_barrier();
#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) &&       \
    (defined(HP_IA64) || defined(__ia64))
  _Asm_sched_fence(/* LY: no-arg meaning 'all except ALU', e.g. 0x3D3D */);
#elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) ||             \
    defined(__ppc64__) || defined(__powerpc64__)
  __fence();
#else
#error "Could not guess the kind of compiler, please report to us."
#endif
}
1101 
/* Issue a full memory fence using the best primitive available.
 * Preference order: C11 atomic_thread_fence (when MDBX_HAVE_C11ATOMICS is
 * defined), then the __ATOMIC_SEQ_CST builtins, then __sync_synchronize(),
 * then platform- or vendor-specific intrinsics. Unknown compilers fail the
 * build. */
MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) {
#ifdef MDBX_HAVE_C11ATOMICS
  atomic_thread_fence(memory_order_seq_cst);
#elif defined(__ATOMIC_SEQ_CST)
  /* Clang and GCC spell the sequentially-consistent fence builtin
   * differently. */
#ifdef __clang__
  __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
#else
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
#endif
#elif defined(__clang__) || defined(__GNUC__)
  __sync_synchronize();
#elif defined(_WIN32) || defined(_WIN64)
  MemoryBarrier();
#elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */
#if defined(__ia32__)
  _mm_mfence();
#else
  __mf();
#endif
#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
  __machine_rw_barrier();
#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) &&       \
    (defined(HP_IA64) || defined(__ia64))
  _Asm_mf();
#elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) ||             \
    defined(__ppc64__) || defined(__powerpc64__)
  __lwsync();
#else
#error "Could not guess the kind of compiler, please report to us."
#endif
}
1133 
1134 /*----------------------------------------------------------------------------*/
1135 /* libc compatibility stuff */
1136 
1137 #if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) &&                           \
1138     (defined(_GNU_SOURCE) || defined(_BSD_SOURCE))
1139 #define mdbx_asprintf asprintf
1140 #define mdbx_vasprintf vasprintf
1141 #else
1142 MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC
1143     MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...);
1144 MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap);
1145 #endif
1146 
1147 /*----------------------------------------------------------------------------*/
1148 /* OS abstraction layer stuff */
1149 
/* Max bytes to write in one call:
 * 16 MiB (0x01000000) on Windows, 1 GiB minus 64 KiB (0x3fff0000) elsewhere. */
#if defined(_WIN32) || defined(_WIN64)
#define MAX_WRITE UINT32_C(0x01000000)
#else
#define MAX_WRITE UINT32_C(0x3fff0000)
#endif
1156 
#if defined(__linux__) || defined(__gnu_linux__)
/* Linux-only runtime facts, defined elsewhere.
 * NOTE(review): the encoding of mdbx_linux_kernel_version is not visible
 * here - check where it is assigned before comparing against it. */
MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version;
MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1; /* Windows Subsystem 1 for Linux */
#endif /* Linux */

/* Declared only when mdbx_strdup is not already a macro alias (for example
 * for strdup on platforms that map it directly). */
#ifndef mdbx_strdup
LIBMDBX_API char *mdbx_strdup(const char *str);
#endif
1165 
mdbx_get_errno(void)1166 MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) {
1167 #if defined(_WIN32) || defined(_WIN64)
1168   DWORD rc = GetLastError();
1169 #else
1170   int rc = errno;
1171 #endif
1172   return rc;
1173 }
1174 
/* Aligned allocation helpers; declared only when not already supplied as
 * macros. mdbx_memalign_alloc() hands the block back through *result and
 * returns an int status. */
#ifndef mdbx_memalign_alloc
MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes,
                                           void **result);
#endif
#ifndef mdbx_memalign_free
MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr);
#endif

/* Operations on mdbx_condpair_t.
 * NOTE(review): the meaning of the `part` selector is not visible here -
 * presumably it picks one of the pair's two conditions; confirm against the
 * implementation. */
MDBX_INTERNAL_FUNC int mdbx_condpair_init(mdbx_condpair_t *condpair);
MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair);
MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair);
MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair,
                                            bool part);
MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, bool part);
MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair);

/* Lifecycle and acquire/release operations on mdbx_fastmutex_t. */
MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex);
MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex);

/* Positional (offset-taking) and sequential I/O on an mdbx_filehandle_t.
 * All return int - presumably an error code with zero meaning success;
 * confirm against the implementation. */
MDBX_INTERNAL_FUNC int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
                                    int iovcnt, uint64_t offset,
                                    size_t expected_written);
MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count,
                                  uint64_t offset);
MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf,
                                   size_t count, uint64_t offset);
MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf,
                                  size_t count);

/* Thread creation and joining. The start routine's result type and calling
 * convention come from the THREAD_RESULT / THREAD_CALL macros. */
MDBX_INTERNAL_FUNC int
mdbx_thread_create(mdbx_thread_t *thread,
                   THREAD_RESULT(THREAD_CALL *start_routine)(void *),
                   void *arg);
MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread);
1211 
/* Bit flags (distinct powers of two) that can be OR-ed together and passed
 * to mdbx_fsync() / mdbx_msync().
 * NOTE(review): the per-flag meanings below are read off the names only -
 * confirm against the implementation. */
enum mdbx_syncmode_bits {
  MDBX_SYNC_NONE = 0, /* no flags set */
  MDBX_SYNC_DATA = 1, /* presumably: flush file data */
  MDBX_SYNC_SIZE = 2, /* presumably: flush file size/metadata */
  MDBX_SYNC_IODQ = 4  /* exact meaning not visible here - TODO confirm */
};

/* Basic file operations on an mdbx_filehandle_t; mdbx_filesize() reports the
 * size through *length. */
MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd,
                                  const enum mdbx_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length);
MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos);
MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length);
1224 
/* Tells mdbx_openfile() why a file is being opened. The values are
 * consecutive identifiers, not bit flags.
 * NOTE(review): how each purpose maps to open flags is decided in the
 * implementation, which is not visible here. */
enum mdbx_openfile_purpose {
  MDBX_OPEN_DXB_READ = 0,
  MDBX_OPEN_DXB_LAZY = 1,
  MDBX_OPEN_DXB_DSYNC = 2,
  MDBX_OPEN_LCK = 3,
  MDBX_OPEN_COPY = 4,
  MDBX_OPEN_DELETE = 5
};

/* File and directory management. mdbx_openfile() returns the opened handle
 * through *fd. */
MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose,
                                     const MDBX_env *env, const char *pathname,
                                     mdbx_filehandle_t *fd,
                                     mdbx_mode_t unix_mode_bits);
MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd);
MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname);
MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname);
MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd);
MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait);
1243 
/* Bit values for the `options` argument of mdbx_mmap(). */
#define MMAP_OPTION_TRUNCATE 1
#define MMAP_OPTION_SEMAPHORE 2
/* Create / destroy a mapping described by an mdbx_mmap_t. */
MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map,
                                 const size_t must, const size_t limit,
                                 const unsigned options);
MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map);
/* Flag bits for mdbx_mresize().
 * NOTE(review): presumably they are passed in its `flags` argument and
 * permit relocating or temporarily unmapping the region - confirm. */
#define MDBX_MRESIZE_MAY_MOVE 0x00000100
#define MDBX_MRESIZE_MAY_UNMAP 0x00000200
MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map,
                                    size_t size, size_t limit);
#if defined(_WIN32) || defined(_WIN64)
/* Array of thread handles with a capacity (`limit`) and a fill level
 * (`count`); 31 slots are stored inline. It is filled by
 * mdbx_suspend_threads_before_remap() and consumed by
 * mdbx_resume_threads_after_remap(). */
typedef struct {
  unsigned limit, count;
  HANDLE handles[31];
} mdbx_handle_array_t;
MDBX_INTERNAL_FUNC int
mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array);
MDBX_INTERNAL_FUNC int
mdbx_resume_threads_after_remap(mdbx_handle_array_t *array);
#endif /* Windows */
/* Flush part of a mapping, using the same mode bits as mdbx_fsync(). */
MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset,
                                  size_t length,
                                  enum mdbx_syncmode_bits mode_bits);
/* NOTE(review): presumably checks whether the filesystem behind `handle` /
 * `pathname` is read-only, given a preceding error `err` - confirm. */
MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle,
                                            const char *pathname, int err);
1269 
mdbx_getpid(void)1270 MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) {
1271   STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t));
1272 #if defined(_WIN32) || defined(_WIN64)
1273   return GetCurrentProcessId();
1274 #else
1275   return getpid();
1276 #endif
1277 }
1278 
mdbx_thread_self(void)1279 MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) {
1280   mdbx_tid_t thunk;
1281   STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk));
1282 #if defined(_WIN32) || defined(_WIN64)
1283   thunk = GetCurrentThreadId();
1284 #else
1285   thunk = pthread_self();
1286 #endif
1287   return (uintptr_t)thunk;
1288 }
1289 
/* Monotonic time source plus conversions between its 64-bit units and a
 * 32-bit "16dot16" seconds value.
 * NOTE(review): "16dot16" presumably means 16.16 fixed point, and the unit of
 * the monotime value is platform-defined - confirm in the implementation. */
MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void);
MDBX_INTERNAL_FUNC uint64_t
mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16);
MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime);

/* Returns a 128-bit boot identifier.
 * NOTE(review): how it is obtained is not visible here. */
MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void);
1296 /*----------------------------------------------------------------------------*/
1297 /* lck stuff */
1298 
/// \brief Initialization of synchronization primitives linked with MDBX_env
///   instance both in LCK-file and within the current process.
/// \param
///   global_uniqueness_flag = true - denotes that there are no other processes
///     working with DB and LCK-file. Thus the function MUST initialize
///     shared synchronization objects in memory-mapped LCK-file.
///   global_uniqueness_flag = false - denotes that at least one process is
///     already working with DB and LCK-file, including the case when DB
///     has already been opened in the current process. Thus the function
///     MUST NOT initialize shared synchronization objects in memory-mapped
///     LCK-file that are already in use.
/// \note The flag is declared as int rather than bool.
/// \note NOTE(review): inprocess_neighbor is not described here; presumably
///   it has the same meaning as for mdbx_lck_destroy() - confirm.
/// \return Error code or zero on success.
MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env,
                                     MDBX_env *inprocess_neighbor,
                                     int global_uniqueness_flag);
1314 
/// \brief Disconnects from shared interprocess objects and destructs
///   synchronization objects linked with MDBX_env instance
///   within the current process.
/// \param
///   inprocess_neighbor = NULL - if the current process does not have other
///     instances of MDBX_env linked with the DB being closed.
///     Thus the function MUST check for other processes working with DB or
///     LCK-file, and keep or destroy shared synchronization objects in
///     memory-mapped LCK-file depending on the result.
///   inprocess_neighbor = not-NULL - pointer to another instance of MDBX_env
///     (any one of them, if there are several) working with DB or LCK-file
///     within the current process. Thus the function MUST NOT try to acquire
///     exclusive lock and/or try to destruct shared synchronization objects
///     linked with DB or LCK-file. Moreover, the implementation MUST ensure
///     correct work of other instances of MDBX_env within the current
///     process, e.g. restore POSIX-fcntl locks after the closing of file
///     descriptors.
/// \return Error code (MDBX_PANIC) or zero on success.
MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env,
                                        MDBX_env *inprocess_neighbor);
1334 
/// \brief Connects to shared interprocess locking objects and tries to acquire
///   the maximum lock level (shared if exclusive is not available).
///   Depending on the implementation and/or platform (Windows) this function
///   may acquire the non-OS super-level lock (e.g. for shared synchronization
///   objects initialization), which will be downgraded to OS-exclusive or
///   shared via an explicit call of mdbx_lck_downgrade().
/// \return
///   MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus
///     the current process is the first and only after the last use of DB.
///   MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus
///     DB has already been opened and now is used by other processes.
///   Otherwise (not 0 and not -1) - error code.
MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env);
1348 
/// \brief Downgrades the level of the initially acquired lock to the
///   operational level selected by env->me_flags. The reasons for such a
///   downgrade:
///    - unblocking of other processes that are waiting for access, i.e.
///      if (env->me_flags & MDBX_EXCLUSIVE) != 0, then other processes
///      should be made aware that access is unavailable rather than
///      wait for it.
///    - freeing locks that interfere with file operations (especially on
///      Windows).
///   (env->me_flags & MDBX_EXCLUSIVE) == 0 - downgrade to shared lock.
///   (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive
///   operational lock.
/// \return Error code or zero on success
MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env);
1361 
/// \brief Locks LCK-file or/and table of readers for (de)registering.
///   Pair every successful call with mdbx_rdt_unlock().
/// \param env  Environment whose reader table is being locked.
/// \return Error code or zero on success
MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env);

/// \brief Unlocks LCK-file or/and table of readers after (de)registering.
/// \param env  Environment previously locked with mdbx_rdt_lock().
///   Returns nothing, so a failure cannot be reported to the caller.
MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env);
1368 
/// \brief Acquires lock for DB change (on writing transaction start).
///   Reading transactions will not be blocked.
///   Declared as LIBMDBX_API because it is used in mdbx_chk.
/// \param env        Environment to lock for writing.
/// \param dont_wait  NOTE(review): presumably requests a non-blocking
///   attempt that fails instead of waiting when the lock is busy - confirm
///   against the implementation.
/// \return Error code or zero on success
LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait);
1374 
/// \brief Releases lock once DB changes are made (after writing transaction
///   has finished).
///   Declared as LIBMDBX_API because it is used in mdbx_chk.
/// \param env  Environment previously locked with mdbx_txn_lock().
LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env);
1379 
/// \brief Sets alive-flag of reader presence (indicative lock) for PID of
///   the current process. The function does no more than needed for
///   the correct working of mdbx_rpid_check() in other processes.
/// \param env  Environment the current process is reading from.
/// \return Error code or zero on success
MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env);

/// \brief Resets alive-flag of reader presence (indicative lock)
///   for PID of the current process. The function does no more than needed
///   for the correct working of mdbx_rpid_check() in other processes.
/// \param env  Environment the current process was reading from.
/// \return Error code or zero on success
MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env);

/// \brief Checks for reading process status with the given pid with help of
///   alive-flag of presence (indicative lock) or using another way.
/// \param env  Environment whose reader is being checked.
/// \param pid  Process identifier of the reader to check.
/// \return
///   MDBX_RESULT_TRUE (-1) - if the reader process with the given PID is alive
///     and working with DB (indicative lock is present).
///   MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent
///     or not working with DB (indicative lock is not present).
///   Otherwise (not 0 and not -1) - error code.
MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid);
1401 
1402 #if defined(_WIN32) || defined(_WIN64)
1403 
/* Signature shared by all SRW-lock entry points, and the five function
 * pointers through which the lock operations are called.
 * NOTE(review): the pointers are defined and assigned elsewhere; presumably
 * they are bound at startup to either the native API or a fallback -
 * confirm. */
typedef void(WINAPI *MDBX_srwlock_function)(MDBX_srwlock *);
MDBX_INTERNAL_VAR MDBX_srwlock_function mdbx_srwlock_Init,
    mdbx_srwlock_AcquireShared, mdbx_srwlock_ReleaseShared,
    mdbx_srwlock_AcquireExclusive, mdbx_srwlock_ReleaseExclusive;
1408 
/* Local copies of Vista-era Win32 declarations, supplied only when the
 * targeted _WIN32_WINNT is older than 0x0600 and the SDK headers therefore
 * omit them. The layouts must match the official SDK definitions. */
#if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */
typedef enum _FILE_INFO_BY_HANDLE_CLASS {
  FileBasicInfo,
  FileStandardInfo,
  FileNameInfo,
  FileRenameInfo,
  FileDispositionInfo,
  FileAllocationInfo,
  FileEndOfFileInfo,
  FileStreamInfo,
  FileCompressionInfo,
  FileAttributeTagInfo,
  FileIdBothDirectoryInfo,
  FileIdBothDirectoryRestartInfo,
  FileIoPriorityHintInfo,
  FileRemoteProtocolInfo,
  MaximumFileInfoByHandleClass
} FILE_INFO_BY_HANDLE_CLASS,
    *PFILE_INFO_BY_HANDLE_CLASS;

/* Payload for the FileEndOfFileInfo class. */
typedef struct _FILE_END_OF_FILE_INFO {
  LARGE_INTEGER EndOfFile;
} FILE_END_OF_FILE_INFO, *PFILE_END_OF_FILE_INFO;

/* Bits of FILE_REMOTE_PROTOCOL_INFO.Flags. */
#define REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK 0x00000001
#define REMOTE_PROTOCOL_INFO_FLAG_OFFLINE 0x00000002

/* Payload for the FileRemoteProtocolInfo class. */
typedef struct _FILE_REMOTE_PROTOCOL_INFO {
  USHORT StructureVersion;
  USHORT StructureSize;
  DWORD Protocol;
  USHORT ProtocolMajorVersion;
  USHORT ProtocolMinorVersion;
  USHORT ProtocolRevision;
  USHORT Reserved;
  DWORD Flags;
  struct {
    DWORD Reserved[8];
  } GenericReserved;
  struct {
    DWORD Reserved[16];
  } ProtocolSpecificReserved;
} FILE_REMOTE_PROTOCOL_INFO, *PFILE_REMOTE_PROTOCOL_INFO;

#endif /* _WIN32_WINNT < 0x0600 (prior to Windows Vista) */
1454 
/* Function-pointer types mirroring Win32 APIs, each paired with a variable
 * that holds the entry point.
 * NOTE(review): the variables are assigned elsewhere; presumably they are
 * resolved at runtime and may be NULL on systems lacking the API - check
 * before calling. */

/* GetFileInformationByHandleEx() */
typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)(
    _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass,
    _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize);
MDBX_INTERNAL_VAR MDBX_GetFileInformationByHandleEx
    mdbx_GetFileInformationByHandleEx;

/* GetVolumeInformationByHandleW() */
typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)(
    _In_ HANDLE hFile, _Out_opt_ LPWSTR lpVolumeNameBuffer,
    _In_ DWORD nVolumeNameSize, _Out_opt_ LPDWORD lpVolumeSerialNumber,
    _Out_opt_ LPDWORD lpMaximumComponentLength,
    _Out_opt_ LPDWORD lpFileSystemFlags,
    _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize);
MDBX_INTERNAL_VAR MDBX_GetVolumeInformationByHandleW
    mdbx_GetVolumeInformationByHandleW;

/* GetFinalPathNameByHandleW() */
typedef DWORD(WINAPI *MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile,
                                                      _Out_ LPWSTR lpszFilePath,
                                                      _In_ DWORD cchFilePath,
                                                      _In_ DWORD dwFlags);
MDBX_INTERNAL_VAR MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW;
1475 
1476 typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)(
1477     _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass,
1478     _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize);
1479 MDBX_INTERNAL_VAR MDBX_SetFileInformationByHandle
1480     mdbx_SetFileInformationByHandle;
1481 
/* Native-API NtFsControlFile(). ApcRoutine is declared as a plain PVOID in
 * place of PIO_APC_ROUTINE. */
typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)(
    IN HANDLE FileHandle, IN OUT HANDLE Event,
    IN OUT PVOID /* PIO_APC_ROUTINE */ ApcRoutine, IN OUT PVOID ApcContext,
    OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode,
    IN OUT PVOID InputBuffer, IN ULONG InputBufferLength,
    OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength);
MDBX_INTERNAL_VAR MDBX_NtFsControlFile mdbx_NtFsControlFile;

/* GetTickCount64(), with the result typed as uint64_t. */
typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void);
MDBX_INTERNAL_VAR MDBX_GetTickCount64 mdbx_GetTickCount64;

/* WIN32_MEMORY_RANGE_ENTRY is supplied locally when the SDK targets a
 * Windows version older than 8. */
#if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8
typedef struct _WIN32_MEMORY_RANGE_ENTRY {
  PVOID VirtualAddress;
  SIZE_T NumberOfBytes;
} WIN32_MEMORY_RANGE_ENTRY, *PWIN32_MEMORY_RANGE_ENTRY;
#endif /* Windows 8.x */

/* PrefetchVirtualMemory() */
typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)(
    HANDLE hProcess, ULONG_PTR NumberOfEntries,
    PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags);
MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;

/* Native-API section inheritance disposition. */
typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT;

/* Native-API NtExtendSection(). The pointer is also consulted by
 * mdbx_RunningUnderWine(), which treats a null value as "running under
 * Wine". */
typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle,
                                              IN PLARGE_INTEGER NewSectionSize);
MDBX_INTERNAL_VAR MDBX_NtExtendSection mdbx_NtExtendSection;
1510 
mdbx_RunningUnderWine(void)1511 static __inline bool mdbx_RunningUnderWine(void) {
1512   return !mdbx_NtExtendSection;
1513 }
1514 
/* Function-pointer type mirroring Win32 RegGetValueA(), and the variable
 * holding its entry point (assigned elsewhere; NOTE(review): presumably it
 * may be NULL where the API is unavailable - check before calling). */
typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey,
                                           LPCSTR lpValue, DWORD dwFlags,
                                           LPDWORD pdwType, PVOID pvData,
                                           LPDWORD pcbData);
MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA;
1520 
1521 #endif /* Windows */
1522 
1523 #endif /* !__cplusplus */
1524 
1525 /*----------------------------------------------------------------------------*/
1526 
/* printf length modifiers for pointer-sized and size_t values.
 * On MSVC 2015 and newer the standard PRI*PTR macros are replaced with the
 * "I" modifier and PRI*SIZE macros are defined with the "z" modifier.
 * Everywhere else PRI*SIZE simply aliases the corresponding PRI*PTR macro. */
#if defined(_MSC_VER) && _MSC_VER >= 1900
/* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros
 * for internal format-args checker. */
#undef PRIuPTR
#undef PRIiPTR
#undef PRIdPTR
#undef PRIxPTR
#define PRIuPTR "Iu"
#define PRIiPTR "Ii"
#define PRIdPTR "Id"
#define PRIxPTR "Ix"
#define PRIuSIZE "zu"
#define PRIiSIZE "zi"
#define PRIdSIZE "zd"
#define PRIxSIZE "zx"
#endif /* fix PRI*PTR for _MSC_VER */

#ifndef PRIuSIZE
#define PRIuSIZE PRIuPTR
#define PRIiSIZE PRIiPTR
#define PRIdSIZE PRIdPTR
#define PRIxSIZE PRIxPTR
#endif /* PRI*SIZE fallback when not defined by the MSVC branch above */
1550 
/* Restore the MSVC warning state; the matching push is outside this part of
 * the file. */
#ifdef _MSC_VER
#pragma warning(pop)
#endif

/* Symbol whose name embeds MDBX_BUILD_SOURCERY (the source fingerprint
 * defined at the top of this file). Tools only declare it as an external
 * constant.
 * NOTE(review): presumably referencing it makes linking fail when a tool and
 * the library were built from different sources - confirm. */
#define mdbx_sourcery_anchor XCONCAT(mdbx_sourcery_, MDBX_BUILD_SOURCERY)
#if defined(xMDBX_TOOLS)
extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif
1559 
1560 /*******************************************************************************
1561  *******************************************************************************
1562  *******************************************************************************
1563  *
1564  *
1565  *         ####   #####    #####     #     ####   #    #   ####
1566  *        #    #  #    #     #       #    #    #  ##   #  #
1567  *        #    #  #    #     #       #    #    #  # #  #   ####
1568  *        #    #  #####      #       #    #    #  #  # #       #
1569  *        #    #  #          #       #    #    #  #   ##  #    #
1570  *         ####   #          #       #     ####   #    #   ####
1571  *
1572  *
1573  */
1574 
1575 /** \defgroup build_option Build options
1576  * The libmdbx build options.
1577  @{ */
1578 
/** Use fcntl(F_FULLFSYNC), accepting a 5-10 times slowdown */
#define MDBX_OSX_WANNA_DURABILITY 0
/** Use fsync(), accepting a chance of data loss on power failure */
#define MDBX_OSX_WANNA_SPEED 1

#ifndef MDBX_OSX_SPEED_INSTEADOF_DURABILITY
/** Chooses either \ref MDBX_OSX_WANNA_DURABILITY or \ref MDBX_OSX_WANNA_SPEED
 * for OSX & iOS. Durability is the default. */
#define MDBX_OSX_SPEED_INSTEADOF_DURABILITY MDBX_OSX_WANNA_DURABILITY
#endif /* MDBX_OSX_SPEED_INSTEADOF_DURABILITY */
1589 
/** Controls checking the PID to detect reuse of a DB environment after
 * the fork(). When the option is not predefined its value is chosen
 * automatically and the MDBX_ENV_CHECKPID_CONFIG string gets
 * the "AUTO=" prefix. */
#ifndef MDBX_ENV_CHECKPID
#if defined(MADV_DONTFORK) || defined(_WIN32) || defined(_WIN64)
/* PID check could be omitted:
 *  - on Linux when madvise(MADV_DONTFORK) is available, i.e. after the fork()
 *    mapped pages will not be available for the child process.
 *  - on Windows where fork() is not available. */
#define MDBX_ENV_CHECKPID 0
#else
#define MDBX_ENV_CHECKPID 1
#endif
#define MDBX_ENV_CHECKPID_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_ENV_CHECKPID)
#else
#define MDBX_ENV_CHECKPID_CONFIG MDBX_STRINGIFY(MDBX_ENV_CHECKPID)
#endif /* MDBX_ENV_CHECKPID */
1605 
/** Controls checking the transaction owner thread to detect misuse of
 * transactions from other threads. Enabled (1) by default, in which case
 * the MDBX_TXN_CHECKOWNER_CONFIG string gets the "AUTO=" prefix. */
#ifndef MDBX_TXN_CHECKOWNER
#define MDBX_TXN_CHECKOWNER 1
#define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER)
#else
#define MDBX_TXN_CHECKOWNER_CONFIG MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER)
#endif /* MDBX_TXN_CHECKOWNER */
1614 
/** Does the system have a battery-backed Real-Time Clock or just a fake one.
 * By default the RTC is not trusted on Linux, NetBSD and OpenBSD,
 * and trusted elsewhere. */
#ifndef MDBX_TRUST_RTC
#if defined(__linux__) || defined(__gnu_linux__) || defined(__NetBSD__) ||     \
    defined(__OpenBSD__)
#define MDBX_TRUST_RTC 0 /* a lot of embedded systems have a fake RTC */
#else
#define MDBX_TRUST_RTC 1
#endif
#define MDBX_TRUST_RTC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TRUST_RTC)
#else
#define MDBX_TRUST_RTC_CONFIG MDBX_STRINGIFY(MDBX_TRUST_RTC)
#endif /* MDBX_TRUST_RTC */
1627 
/** Controls online database auto-compactification during write-transactions.
 * Must be 0 or 1, enabled (1) by default. */
#ifndef MDBX_ENABLE_REFUND
#define MDBX_ENABLE_REFUND 1
#elif !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1)
#error MDBX_ENABLE_REFUND must be defined as 0 or 1
#endif /* MDBX_ENABLE_REFUND */

/** Controls gathering statistics for page operations.
 * Must be 0 or 1, enabled (1) by default. */
#ifndef MDBX_ENABLE_PGOP_STAT
#define MDBX_ENABLE_PGOP_STAT 1
#elif !(MDBX_ENABLE_PGOP_STAT == 0 || MDBX_ENABLE_PGOP_STAT == 1)
#error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1
#endif /* MDBX_ENABLE_PGOP_STAT */

/** Controls use of POSIX madvise() hints and friends.
 * Must be 0 or 1, enabled (1) by default. */
#ifndef MDBX_ENABLE_MADVISE
#define MDBX_ENABLE_MADVISE 1
#elif !(MDBX_ENABLE_MADVISE == 0 || MDBX_ENABLE_MADVISE == 1)
#error MDBX_ENABLE_MADVISE must be defined as 0 or 1
#endif /* MDBX_ENABLE_MADVISE */

/** Disables some checks to reduce the overhead, which also lowers the
 * probability of detecting database corruption to values closer to LMDB.
 * Must be 0 or 1, the checks are kept (0) by default. */
#ifndef MDBX_DISABLE_PAGECHECKS
#define MDBX_DISABLE_PAGECHECKS 0
#elif !(MDBX_DISABLE_PAGECHECKS == 0 || MDBX_DISABLE_PAGECHECKS == 1)
#error MDBX_DISABLE_PAGECHECKS must be defined as 0 or 1
#endif /* MDBX_DISABLE_PAGECHECKS */
1656 
/* Must be 0 or 1, enabled (1) by default.
 * NOTE(review): judging by the name, this presumably preallocates the extra
 * space needed for radix-sorting of page number lists (PNL) - confirm
 * against the code that uses it. */
#ifndef MDBX_PNL_PREALLOC_FOR_RADIXSORT
#define MDBX_PNL_PREALLOC_FOR_RADIXSORT 1
#elif !(MDBX_PNL_PREALLOC_FOR_RADIXSORT == 0 ||                                \
        MDBX_PNL_PREALLOC_FOR_RADIXSORT == 1)
#error MDBX_PNL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1
#endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */

/* Must be 0 or 1, enabled (1) by default.
 * NOTE(review): judging by the name, this is presumably the same kind of
 * preallocation for the dirty page list (DPL) - confirm against the code
 * that uses it. */
#ifndef MDBX_DPL_PREALLOC_FOR_RADIXSORT
#define MDBX_DPL_PREALLOC_FOR_RADIXSORT 1
#elif !(MDBX_DPL_PREALLOC_FOR_RADIXSORT == 0 ||                                \
        MDBX_DPL_PREALLOC_FOR_RADIXSORT == 1)
#error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1
#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
1670 
/* Basically, this build-option is a placeholder for a TODO. Presumably it
 * should be replaced with MDBX_ENABLE_WRITEMAP_SPILLING having three
 * variants (only two of them are described here):
 *  0/OFF = Don't track dirty pages at all and don't spill them.
 *          This should be the default on Linux and maybe on other systems
 *          (not sure: Darwin/OSX, FreeBSD, Windows 10) where the kernel
 *          provides proper LRU tracking and async writing on-demand.
 *  1/ON  = Lite tracking of dirty pages but with LRU labels and explicit
 *          spilling with msync(MS_ASYNC).
 * Must be 0 or 1; by default it is 1 on Linux and 0 elsewhere. */
#ifndef MDBX_FAKE_SPILL_WRITEMAP
#if defined(__linux__) || defined(__gnu_linux__)
#define MDBX_FAKE_SPILL_WRITEMAP 1 /* msync(MS_ASYNC) is no-op on Linux */
#else
#define MDBX_FAKE_SPILL_WRITEMAP 0
#endif
#elif !(MDBX_FAKE_SPILL_WRITEMAP == 0 || MDBX_FAKE_SPILL_WRITEMAP == 1)
#error MDBX_FAKE_SPILL_WRITEMAP must be defined as 0 or 1
#endif /* MDBX_FAKE_SPILL_WRITEMAP */
1688 
/** Controls the sort order of internal page number lists.
 * This is a mostly experimental/advanced option, not intended for regular
 * MDBX users.
 * \warning The database format depends on this option, and libmdbx builds
 * with different option values are incompatible. */
#ifndef MDBX_PNL_ASCENDING
#define MDBX_PNL_ASCENDING 0
#elif !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1)
#error MDBX_PNL_ASCENDING must be defined as 0 or 1
#endif /* MDBX_PNL_ASCENDING */
1698 
/** Avoids the dependency on the MSVC CRT by using ntdll.dll instead.
 * Must be 0 or 1, enabled (1) by default. */
#ifndef MDBX_WITHOUT_MSVC_CRT
#define MDBX_WITHOUT_MSVC_CRT 1
#elif !(MDBX_WITHOUT_MSVC_CRT == 0 || MDBX_WITHOUT_MSVC_CRT == 1)
#error MDBX_WITHOUT_MSVC_CRT must be defined as 0 or 1
#endif /* MDBX_WITHOUT_MSVC_CRT */
1705 
/** Size in bytes of the buffer used while copying an environment/database
 * file. 1 MiB by default; a custom value must be within 64 KiB..1 GiB
 * and be a multiple of 64 KiB. */
#ifndef MDBX_ENVCOPY_WRITEBUF
#define MDBX_ENVCOPY_WRITEBUF 1048576u
#elif MDBX_ENVCOPY_WRITEBUF < 65536u || MDBX_ENVCOPY_WRITEBUF > 1073741824u || \
    MDBX_ENVCOPY_WRITEBUF % 65536u
#error MDBX_ENVCOPY_WRITEBUF must be defined in range 65536..1073741824 and be multiple of 65536
#endif /* MDBX_ENVCOPY_WRITEBUF */
1713 
/** Forces assertion checking.
 * Must be 0 or 1, disabled (0) by default. */
#ifndef MDBX_FORCE_ASSERTIONS
#define MDBX_FORCE_ASSERTIONS 0
#elif !(MDBX_FORCE_ASSERTIONS == 0 || MDBX_FORCE_ASSERTIONS == 1)
#error MDBX_FORCE_ASSERTIONS must be defined as 0 or 1
#endif /* MDBX_FORCE_ASSERTIONS */
1720 
/** Presumed malloc size overhead for each allocation,
 * used to adjust allocations to be more aligned.
 * Defaults to the size of two pointers; a custom value must be
 * within 0..64 and be a multiple of 4. */
#ifndef MDBX_ASSUME_MALLOC_OVERHEAD
#ifdef __SIZEOF_POINTER__
#define MDBX_ASSUME_MALLOC_OVERHEAD (__SIZEOF_POINTER__ * 2u)
#else
#define MDBX_ASSUME_MALLOC_OVERHEAD (sizeof(void *) * 2u)
#endif
#elif MDBX_ASSUME_MALLOC_OVERHEAD < 0 || MDBX_ASSUME_MALLOC_OVERHEAD > 64 ||   \
    MDBX_ASSUME_MALLOC_OVERHEAD % 4
#error MDBX_ASSUME_MALLOC_OVERHEAD must be defined in range 0..64 and be multiple of 4
#endif /* MDBX_ASSUME_MALLOC_OVERHEAD */
1733 
/** If MDBX_DEBUG is undefined, derive it from NDEBUG:
 * 0 when NDEBUG is defined, otherwise 1. */
#ifndef MDBX_DEBUG
#ifdef NDEBUG
#define MDBX_DEBUG 0
#else
#define MDBX_DEBUG 1
#endif
#endif /* MDBX_DEBUG */

/** If defined then enables integration with Valgrind,
 * a memory analyzing tool.
 * The conditional below is intentionally empty: no default is set. */
#ifndef MDBX_USE_VALGRIND
#endif /* MDBX_USE_VALGRIND */

/** If defined then enables use of C11 atomics,
 *  otherwise their availability is detected automatically.
 * The conditional below is intentionally empty: no default is set. */
#ifndef MDBX_HAVE_C11ATOMICS
#endif /* MDBX_HAVE_C11ATOMICS */
1752 
1753 //------------------------------------------------------------------------------
1754 
/** Win32 File Locking API for \ref MDBX_LOCKING */
#define MDBX_LOCKING_WIN32FILES -1

/** SystemV IPC semaphores for \ref MDBX_LOCKING */
#define MDBX_LOCKING_SYSV 5

/** POSIX-1 Shared anonymous semaphores for \ref MDBX_LOCKING */
#define MDBX_LOCKING_POSIX1988 1988

/** POSIX-2001 Shared Mutexes for \ref MDBX_LOCKING */
#define MDBX_LOCKING_POSIX2001 2001

/** POSIX-2008 Robust Mutexes for \ref MDBX_LOCKING */
#define MDBX_LOCKING_POSIX2008 2008

/** BeOS Benaphores, aka Futexes for \ref MDBX_LOCKING.
 * NOTE(review): the mdbx_ipclock_t selection further down in this file has
 * no branch for this value and would hit its #error - confirm whether
 * this choice is actually supported. */
#define MDBX_LOCKING_BENAPHORE 1995
1772 
/** Advanced: Chooses the locking implementation (autodetection by default).
 * On Windows the choice is always \ref MDBX_LOCKING_WIN32FILES and
 * MDBX_LOCKING_CONFIG is not defined by this section. Elsewhere, unless
 * MDBX_LOCKING is predefined, the choice is made from the POSIX feature-test
 * macros and the MDBX_LOCKING_CONFIG string gets the "AUTO=" prefix. */
#if defined(_WIN32) || defined(_WIN64)
#define MDBX_LOCKING MDBX_LOCKING_WIN32FILES
#else
#ifndef MDBX_LOCKING
#if defined(_POSIX_THREAD_PROCESS_SHARED) &&                                   \
    _POSIX_THREAD_PROCESS_SHARED >= 200112L && !defined(__FreeBSD__)

/* Some platforms define the EOWNERDEAD error code even though they
 * don't support Robust Mutexes. If in doubt, compile with
 * -DMDBX_LOCKING=2001. */
#if defined(EOWNERDEAD) && _POSIX_THREAD_PROCESS_SHARED >= 200809L &&          \
    ((defined(_POSIX_THREAD_ROBUST_PRIO_INHERIT) &&                            \
      _POSIX_THREAD_ROBUST_PRIO_INHERIT > 0) ||                                \
     (defined(_POSIX_THREAD_ROBUST_PRIO_PROTECT) &&                            \
      _POSIX_THREAD_ROBUST_PRIO_PROTECT > 0) ||                                \
     defined(PTHREAD_MUTEX_ROBUST) || defined(PTHREAD_MUTEX_ROBUST_NP)) &&     \
    (!defined(__GLIBC__) ||                                                    \
     __GLIBC_PREREQ(2, 10) /* troubles with Robust mutexes before 2.10 */)
#define MDBX_LOCKING MDBX_LOCKING_POSIX2008
#else
#define MDBX_LOCKING MDBX_LOCKING_POSIX2001
#endif
#elif defined(__sun) || defined(__SVR4) || defined(__svr4__)
#define MDBX_LOCKING MDBX_LOCKING_POSIX1988
#else
#define MDBX_LOCKING MDBX_LOCKING_SYSV
#endif
#define MDBX_LOCKING_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_LOCKING)
#else
#define MDBX_LOCKING_CONFIG MDBX_STRINGIFY(MDBX_LOCKING)
#endif /* MDBX_LOCKING */
#endif /* !Windows */
1805 
/** Advanced: Using POSIX OFD-locks (autodetection by default).
 * Auto-enabled when F_OFD_SETLK, F_OFD_SETLKW and F_OFD_GETLK are all
 * defined, unless MDBX_SAFE4QEMU is defined or the target is Solaris. */
#ifndef MDBX_USE_OFDLOCKS
#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) &&   \
    !defined(MDBX_SAFE4QEMU) &&                                                \
    !defined(__sun) /* OFD-lock are broken on Solaris */
#define MDBX_USE_OFDLOCKS 1
#else
#define MDBX_USE_OFDLOCKS 0
#endif
#define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_USE_OFDLOCKS)
#else
#define MDBX_USE_OFDLOCKS_CONFIG MDBX_STRINGIFY(MDBX_USE_OFDLOCKS)
#endif /* MDBX_USE_OFDLOCKS */
1819 
/** Advanced: Using sendfile() syscall (autodetection by default).
 * Auto-enabled on non-Android Linux and on Android with API level >= 21. */
#ifndef MDBX_USE_SENDFILE
#if ((defined(__linux__) || defined(__gnu_linux__)) &&                         \
     !defined(__ANDROID_API__)) ||                                             \
    (defined(__ANDROID_API__) && __ANDROID_API__ >= 21)
#define MDBX_USE_SENDFILE 1
#else
#define MDBX_USE_SENDFILE 0
#endif
#endif /* MDBX_USE_SENDFILE */

/** Advanced: Using copy_file_range() syscall (autodetection by default).
 * Auto-enabled with glibc >= 2.27 when _GNU_SOURCE is defined. */
#ifndef MDBX_USE_COPYFILERANGE
#if __GLIBC_PREREQ(2, 27) && defined(_GNU_SOURCE)
#define MDBX_USE_COPYFILERANGE 1
#else
#define MDBX_USE_COPYFILERANGE 0
#endif
#endif /* MDBX_USE_COPYFILERANGE */

/** Advanced: Using sync_file_range() syscall (autodetection by default).
 * Auto-enabled on non-Android Linux when SYNC_FILE_RANGE_WRITE is defined,
 * and on Android with API level >= 26. */
#ifndef MDBX_USE_SYNCFILERANGE
#if ((defined(__linux__) || defined(__gnu_linux__)) &&                         \
     defined(SYNC_FILE_RANGE_WRITE) && !defined(__ANDROID_API__)) ||           \
    (defined(__ANDROID_API__) && __ANDROID_API__ >= 26)
#define MDBX_USE_SYNCFILERANGE 1
#else
#define MDBX_USE_SYNCFILERANGE 0
#endif
#endif /* MDBX_USE_SYNCFILERANGE */
1850 
1851 //------------------------------------------------------------------------------
1852 
/* Defaults to 0 for ia32, e2k and hppa targets (and for Doxygen), to 1
 * elsewhere. When set, mdbx_memory_fence() without C11 atomics issues
 * mdbx_memory_barrier() for any write ordering above mo_Relaxed. */
#ifndef MDBX_CPU_WRITEBACK_INCOHERENT
#if defined(__ia32__) || defined(__e2k__) || defined(__hppa) ||                \
    defined(__hppa__) || defined(DOXYGEN)
#define MDBX_CPU_WRITEBACK_INCOHERENT 0
#else
#define MDBX_CPU_WRITEBACK_INCOHERENT 1
#endif
#endif /* MDBX_CPU_WRITEBACK_INCOHERENT */

/* Defaults to 1 on OpenBSD and to 0 elsewhere.
 * NOTE(review): judging by the name, this presumably marks systems where
 * file writes are not coherent with memory mappings - confirm. */
#ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE
#ifdef __OpenBSD__
#define MDBX_MMAP_INCOHERENT_FILE_WRITE 1
#else
#define MDBX_MMAP_INCOHERENT_FILE_WRITE 0
#endif
#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */

/* Defaults to 1 for the MIPS-related targets listed below, to 0 elsewhere. */
#ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE
#if defined(__mips) || defined(__mips__) || defined(__mips64) ||               \
    defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) ||            \
    defined(__MWERKS__) || defined(__sgi)
/* MIPS has cache coherency issues. */
#define MDBX_MMAP_INCOHERENT_CPU_CACHE 1
#else
/* LY: assume no relevant mmap/dcache issues. */
#define MDBX_MMAP_INCOHERENT_CPU_CACHE 0
#endif
#endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */
1881 
/* Defaults to 1 when MDBX_WORDBITS >= 64 (or for Doxygen), otherwise 0.
 * The _CONFIG string gets the "AUTO=" prefix when the value is chosen here.
 * NOTE(review): presumably tells whether plain 64-bit loads/stores are
 * atomic on the target - confirm. */
#ifndef MDBX_64BIT_ATOMIC
#if MDBX_WORDBITS >= 64 || defined(DOXYGEN)
#define MDBX_64BIT_ATOMIC 1
#else
#define MDBX_64BIT_ATOMIC 0
#endif
#define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_ATOMIC)
#else
#define MDBX_64BIT_ATOMIC_CONFIG MDBX_STRINGIFY(MDBX_64BIT_ATOMIC)
#endif /* MDBX_64BIT_ATOMIC */

/* Autodetection order: the first defined one of ATOMIC_LLONG_LOCK_FREE,
 * __GCC_ATOMIC_LLONG_LOCK_FREE and __CLANG_ATOMIC_LLONG_LOCK_FREE decides
 * (1 only when its value is > 1); then MSVC, Apple and Doxygen get 1;
 * otherwise the value follows MDBX_64BIT_ATOMIC. */
#ifndef MDBX_64BIT_CAS
#if defined(ATOMIC_LLONG_LOCK_FREE)
#if ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
#if __GCC_ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(__CLANG_ATOMIC_LLONG_LOCK_FREE)
#if __CLANG_ATOMIC_LLONG_LOCK_FREE > 1
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS 0
#endif
#elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN)
#define MDBX_64BIT_CAS 1
#else
#define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC
#endif
#define MDBX_64BIT_CAS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_CAS)
#else
#define MDBX_64BIT_CAS_CONFIG MDBX_STRINGIFY(MDBX_64BIT_CAS)
#endif /* MDBX_64BIT_CAS */
1921 
/* Defaults: 1 for MSVC; 0 for clang >= 5.0 and GCC >= 5.0; otherwise 1 for
 * ia32 or ARM with __ARM_FEATURE_UNALIGNED unless __ALIGNED__ is defined;
 * 0 in all other cases.
 * NOTE(review): presumably tells whether direct unaligned memory access may
 * be used - confirm against the unaligned_peek/poke helpers. */
#ifndef MDBX_UNALIGNED_OK
#ifdef _MSC_VER
#define MDBX_UNALIGNED_OK 1 /* avoid MSVC misoptimization */
#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0)
#define MDBX_UNALIGNED_OK 0 /* expecting optimization is well done */
#elif (defined(__ia32__) || defined(__ARM_FEATURE_UNALIGNED)) &&               \
    !defined(__ALIGNED__)
#define MDBX_UNALIGNED_OK 1
#else
#define MDBX_UNALIGNED_OK 0
#endif
#endif /* MDBX_UNALIGNED_OK */
1934 
/* Defaults to SYSTEM_CACHE_ALIGNMENT_SIZE when that is defined,
 * otherwise to 128 for IA-64 targets and to 64 elsewhere. */
#ifndef MDBX_CACHELINE_SIZE
#if defined(SYSTEM_CACHE_ALIGNMENT_SIZE)
#define MDBX_CACHELINE_SIZE SYSTEM_CACHE_ALIGNMENT_SIZE
#elif defined(__ia64__) || defined(__ia64) || defined(_M_IA64)
#define MDBX_CACHELINE_SIZE 128
#else
#define MDBX_CACHELINE_SIZE 64
#endif
#endif /* MDBX_CACHELINE_SIZE */
1944 
1945 /** @} end of build options */
1946 /*******************************************************************************
1947  *******************************************************************************
1948  ******************************************************************************/
1949 
1950 #ifdef DOXYGEN
/* !!! Actually these are fake definitions       !!!
 * !!! for documentation generation by Doxygen !!! */
1953 
1954 /** Controls enabling of debugging features.
1955  *
1956  *  - `MDBX_DEBUG = 0` (by default) Disables any debugging features at all,
1957  *                     including logging and assertion controls.
1958  *                     Logging level and corresponding debug flags changing
1959  *                     by \ref mdbx_setup_debug() will not have effect.
1960  *  - `MDBX_DEBUG > 0` Enables code for the debugging features (logging,
1961  *                     assertions checking and internal audit).
1962  *                     Simultaneously sets the default logging level
1963  *                     to the `MDBX_DEBUG` value.
1964  *                     Also enables \ref MDBX_DBG_AUDIT if `MDBX_DEBUG >= 2`.
1965  *
1966  * \ingroup build_option */
1967 #define MDBX_DEBUG 0...7
1968 
1969 /** Disables using of GNU libc extensions. */
1970 #define MDBX_DISABLE_GNU_SOURCE 0 or 1
1971 
1972 #endif /* DOXYGEN */
1973 
1974 /* Undefine the NDEBUG if debugging is enforced by MDBX_DEBUG */
1975 #if MDBX_DEBUG
1976 #undef NDEBUG
1977 #endif
1978 
1979 /*----------------------------------------------------------------------------*/
1980 /* Atomics */
1981 
/* Ordering levels accepted by the atomic helpers. With C11 atomics they are
 * translated by mo_c11_store() and mo_c11_load(). */
enum MDBX_memory_order {
  /* memory_order_relaxed for both stores and loads */
  mo_Relaxed,
  /* memory_order_release for stores, memory_order_acquire for loads */
  mo_AcquireRelease,
  /* memory_order_seq_cst for both stores and loads */
  mo_SequentialConsistency
};
1987 
/* 32-bit atomic cell: `weak` gives plain volatile access, while `c11a`
 * (only with MDBX_HAVE_C11ATOMICS) gives access through C11 atomics.
 * atomic_store32()/atomic_load32() statically assert its size is 4 bytes. */
typedef union {
  volatile uint32_t weak;
#ifdef MDBX_HAVE_C11ATOMICS
  volatile _Atomic uint32_t c11a;
#endif /* MDBX_HAVE_C11ATOMICS */
} MDBX_atomic_uint32_t;
1994 
/* 64-bit atomic cell: `weak` gives plain volatile access; `c11a` is present
 * with C11 atomics when MDBX_64BIT_CAS or MDBX_64BIT_ATOMIC is set.
 * Unless C11 atomics are used with both of these options set, the value is
 * also exposed as two 32-bit atomic halves, laid out per the byte order so
 * that `low` overlays the least significant half. */
typedef union {
  volatile uint64_t weak;
#if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC)
  volatile _Atomic uint64_t c11a;
#endif
#if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC
  __anonymous_struct_extension__ struct {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    MDBX_atomic_uint32_t low, high;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    MDBX_atomic_uint32_t high, low;
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
  };
#endif
} MDBX_atomic_uint64_t;
2012 
2013 #ifdef MDBX_HAVE_C11ATOMICS
2014 
2015 /* Crutches for C11 atomic compiler's bugs */
/* Crutches for C11 atomic compiler's bugs.
 *
 * MDBX_c11a_ro(type, ptr) yields the pointer passed to atomic loads and
 * MDBX_c11a_rw(type, ptr) the pointer passed to atomic stores:
 *  - LCC before 1.27 on e2k: use the plain volatile `weak` member;
 *  - clang before 8: the read-only pointer is cast to a non-const
 *    `volatile _Atomic(type) *` (presumably because such clang rejects
 *    atomic loads through a pointer to const);
 *  - otherwise: use the `c11a` member directly.
 *
 * The clang version is tested via __clang_major__: the __clang__ macro
 * itself always expands to 1, so comparing it with 8 would match every
 * clang release. */
#if defined(__e2k__) && defined(__LCC__) && __LCC__ < /* FIXME */ 127
#define MDBX_c11a_ro(type, ptr) (&(ptr)->weak)
#define MDBX_c11a_rw(type, ptr) (&(ptr)->weak)
#elif defined(__clang__) && defined(__clang_major__) && __clang_major__ < 8
#define MDBX_c11a_ro(type, ptr) ((volatile _Atomic(type) *)&(ptr)->c11a)
#define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a)
#else
#define MDBX_c11a_ro(type, ptr) (&(ptr)->c11a)
#define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a)
#endif /* Crutches for C11 atomic compiler's bugs */
2026 
mo_c11_store(enum MDBX_memory_order fence)2027 static __always_inline memory_order mo_c11_store(enum MDBX_memory_order fence) {
2028   switch (fence) {
2029   default:
2030     assert(false);
2031     __unreachable();
2032   case mo_Relaxed:
2033     return memory_order_relaxed;
2034   case mo_AcquireRelease:
2035     return memory_order_release;
2036   case mo_SequentialConsistency:
2037     return memory_order_seq_cst;
2038   }
2039 }
2040 
mo_c11_load(enum MDBX_memory_order fence)2041 static __always_inline memory_order mo_c11_load(enum MDBX_memory_order fence) {
2042   switch (fence) {
2043   default:
2044     assert(false);
2045     __unreachable();
2046   case mo_Relaxed:
2047     return memory_order_relaxed;
2048   case mo_AcquireRelease:
2049     return memory_order_acquire;
2050   case mo_SequentialConsistency:
2051     return memory_order_seq_cst;
2052   }
2053 }
2054 #endif /* MDBX_HAVE_C11ATOMICS */
2055 
2056 #ifndef __cplusplus
2057 
/* Issues a memory fence of the requested strength.
 *   order - the desired ordering level;
 *   write - true for the store side, false for the load side.
 * With C11 atomics this is atomic_thread_fence() with the order translated
 * by mo_c11_store() or mo_c11_load(). Otherwise a compiler barrier is
 * always issued, and mdbx_memory_barrier() is added for writes whose order
 * is above the threshold: mo_Relaxed when MDBX_CPU_WRITEBACK_INCOHERENT
 * is set, else mo_AcquireRelease. */
MDBX_MAYBE_UNUSED static __always_inline void
mdbx_memory_fence(enum MDBX_memory_order order, bool write) {
#ifdef MDBX_HAVE_C11ATOMICS
  atomic_thread_fence(write ? mo_c11_store(order) : mo_c11_load(order));
#else  /* MDBX_HAVE_C11ATOMICS */
  mdbx_compiler_barrier();
  if (write &&
      order > (MDBX_CPU_WRITEBACK_INCOHERENT ? mo_Relaxed : mo_AcquireRelease))
    mdbx_memory_barrier();
#endif /* MDBX_HAVE_C11ATOMICS */
}
2069 
/* Stores a 32-bit value into the atomic cell with the given ordering.
 *   p     - the destination cell;
 *   value - the value to store;
 *   order - the desired ordering level.
 * Returns the stored value.
 * With C11 atomics this is atomic_store_explicit(). Otherwise the value is
 * written via the volatile `weak` member, preceded by a compiler barrier
 * (unless the order is mo_Relaxed) and followed by a write-side
 * mdbx_memory_fence(). */
MDBX_MAYBE_UNUSED static __always_inline uint32_t
atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value,
               enum MDBX_memory_order order) {
  STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4);
#ifdef MDBX_HAVE_C11ATOMICS
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
  atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order));
#else  /* MDBX_HAVE_C11ATOMICS */
  if (order != mo_Relaxed)
    mdbx_compiler_barrier();
  p->weak = value;
  mdbx_memory_fence(order, true);
#endif /* MDBX_HAVE_C11ATOMICS */
  return value;
}
2085 
/* Loads a 32-bit value from the atomic cell with the given ordering.
 *   p     - the source cell;
 *   order - the desired ordering level.
 * Returns the loaded value.
 * With C11 atomics this is atomic_load_explicit(). Otherwise a read-side
 * mdbx_memory_fence() is issued first, then the value is read via the
 * volatile `weak` member, followed by a compiler barrier (unless the order
 * is mo_Relaxed). */
MDBX_MAYBE_UNUSED static __always_inline uint32_t
atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) {
  STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4);
#ifdef MDBX_HAVE_C11ATOMICS
  assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p)));
  return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order));
#else  /* MDBX_HAVE_C11ATOMICS */
  mdbx_memory_fence(order, false);
  const uint32_t value = p->weak;
  if (order != mo_Relaxed)
    mdbx_compiler_barrier();
  return value;
#endif /* MDBX_HAVE_C11ATOMICS */
}
2100 
2101 #endif /* !__cplusplus */
2102 
2103 /*----------------------------------------------------------------------------*/
2104 /* Basic constants and types */
2105 
/* A stamp that identifies a file as an MDBX file.
 * There's nothing special about this value other than that it is easily
 * recognizable, and it will reflect any byte order mismatches.
 * NOTE(review): the value occupies 56 bits; presumably the remaining 8 bits
 * of the 64-bit stamp carry the format version - confirm. */
#define MDBX_MAGIC UINT64_C(/* 56-bit prime */ 0x59659DBDEF4C11)

/* FROZEN: The version number for a database's datafile format. */
#define MDBX_DATA_VERSION 3
/* The version number for a database's lockfile format. */
#define MDBX_LOCK_VERSION 4

/* Handle for the DB used to track free pages. */
#define FREE_DBI 0
/* Handle for the default DB. */
#define MAIN_DBI 1
/* Number of DBs in a meta page (free and main) - also hardcoded elsewhere */
#define CORE_DBS 2

/* Number of meta pages - also hardcoded elsewhere */
#define NUM_METAS 3
2125 
/* A page number in the database.
 *
 * MDBX uses 32 bits for page numbers. Since MAX_PAGENO is 0x7FFFffff,
 * the database size is limited to 2^31 pages, i.e. 2^43 bytes in case
 * of 4K pages. The first NUM_METAS page numbers are occupied by meta pages,
 * so MIN_PAGENO is the first non-meta page number. */
typedef uint32_t pgno_t;
typedef MDBX_atomic_uint32_t atomic_pgno_t;
#define PRIaPGNO PRIu32
#define MAX_PAGENO UINT32_C(0x7FFFffff)
#define MIN_PAGENO NUM_METAS
2135 
/* Values at or above this threshold are outside the range of valid
 * transaction IDs: MAX_TXNID is the threshold minus one. */
#define SAFE64_INVALID_THRESHOLD UINT64_C(0xffffFFFF00000000)

/* A transaction ID.
 * NOTE(review): PRIaTXN is a signed format (PRIi64) although txnid_t is
 * unsigned, so values above INT64_MAX (e.g. INVALID_TXNID) are printed
 * as negative numbers - presumably intentional, confirm. */
typedef uint64_t txnid_t;
typedef MDBX_atomic_uint64_t atomic_txnid_t;
#define PRIaTXN PRIi64
#define MIN_TXNID UINT64_C(1)
#define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1)
#define INITIAL_TXNID (MIN_TXNID + NUM_METAS - 1)
#define INVALID_TXNID UINT64_MAX
/* LY: for testing non-atomic 64-bit txnid on 32-bit arches.
 * #define xMDBX_TXNID_STEP (UINT32_MAX / 3) */
/* The default step is 1 when MDBX_64BIT_CAS is set, otherwise 2. */
#ifndef xMDBX_TXNID_STEP
#if MDBX_64BIT_CAS
#define xMDBX_TXNID_STEP 1u
#else
#define xMDBX_TXNID_STEP 2u
#endif
#endif /* xMDBX_TXNID_STEP */
2155 
/* Used for offsets within a single page.
 * Since memory pages are typically 4 or 8KB in size, 12-13 bits,
 * 16 bits are plenty. */
typedef uint16_t indx_t;

/* One mebibyte (2^20 bytes) as a size_t. */
#define MEGABYTE ((size_t)1 << 20)
2162 
2163 /*----------------------------------------------------------------------------*/
2164 /* Core structures for database and shared memory (i.e. format definition) */
2165 #pragma pack(push, 4)
2166 
2167 /* Information about a single database in the environment. */
/* Information about a single database in the environment.
 * This is a part of the format definition (declared under pack(4)),
 * so neither the order nor the size of the fields may be changed. */
typedef struct MDBX_db {
  uint16_t md_flags;        /* see mdbx_dbi_open */
  uint16_t md_depth;        /* depth of this tree */
  uint32_t md_xsize;        /* key-size for MDBX_DUPFIXED (LEAF2 pages) */
  pgno_t md_root;           /* the root page of this tree */
  pgno_t md_branch_pages;   /* number of internal pages */
  pgno_t md_leaf_pages;     /* number of leaf pages */
  pgno_t md_overflow_pages; /* number of overflow pages */
  uint64_t md_seq;          /* table sequence counter */
  uint64_t md_entries;      /* number of data items */
  uint64_t md_mod_txnid;    /* txnid of last committed modification */
} MDBX_db;
2180 
2181 /* database size-related parameters */
/* Database size-related parameters (the datafile geometry).
 * This is a part of the format definition (declared under pack(4)),
 * so neither the order nor the size of the fields may be changed. */
typedef struct MDBX_geo {
  uint16_t grow_pv;   /* datafile growth step as a 16-bit packed (exponential
                           quantized) value */
  uint16_t shrink_pv; /* datafile shrink threshold as a 16-bit packed
                           (exponential quantized) value */
  pgno_t lower;       /* minimal size of datafile in pages */
  pgno_t upper;       /* maximal size of datafile in pages */
  pgno_t now;         /* current size of datafile in pages */
  pgno_t next;        /* first unused page in the datafile,
                         but actually the file may be shorter. */
} MDBX_geo;
2193 
/* Meta page content.
 * A meta page is the start point for accessing a database snapshot.
 * The first NUM_METAS pages (numbers 0..2) are meta pages.
 * NOTE(review): the previous wording here ("Pages 0-1 are meta pages.
 * Transaction N writes meta page (N % 2)") does not match NUM_METAS == 3;
 * the actual rule for choosing the meta page to write is not visible
 * here - confirm and document it. */
typedef struct MDBX_meta {
  /* Stamp identifying this as an MDBX file.
   * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */
  uint32_t mm_magic_and_version[2];

  /* txnid that committed this page, the first of a two-phase-update pair */
  uint32_t mm_txnid_a[2];

  uint16_t mm_extra_flags;  /* extra DB flags, zero (nothing) for now */
  uint8_t mm_validator_id;  /* ID of checksum and page validation method,
                             * zero (nothing) for now */
  uint8_t mm_extra_pagehdr; /* extra bytes in the page header,
                             * zero (nothing) for now */

  MDBX_geo mm_geo; /* database size-related parameters */

  MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */
                            /* The size of pages used in this DB is kept
                             * in the md_xsize field of the free-space DB */
#define mm_psize mm_dbs[FREE_DBI].md_xsize
  MDBX_canary mm_canary;

  /* Data-sync signature: the values MDBX_DATASIGN_NONE and MDBX_DATASIGN_WEAK
   * are special, any greater value makes the meta page "steady". */
#define MDBX_DATASIGN_NONE 0u
#define MDBX_DATASIGN_WEAK 1u
#define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK)
#define META_IS_STEADY(meta)                                                   \
  SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign))
  uint32_t mm_datasync_sign[2];

  /* txnid that committed this page, the second of a two-phase-update pair */
  uint32_t mm_txnid_b[2];

  /* Number of non-meta pages which were put in GC after COW. May be 0 in case
   * DB was previously handled by libmdbx without corresponding feature.
   * This value in couple with mr_snapshot_pages_retired allows fast estimation
   * of "how much reader is restraining GC recycling". */
  uint32_t mm_pages_retired[2];

  /* The analogue of /proc/sys/kernel/random/boot_id or similar, used to
   * determine whether the system was rebooted after the last use of the
   * database files. If there was no reboot, then there is no need to
   * rollback to the last steady sync point. Zeros mean that no relevant
   * information is available from the system. */
  bin128_t mm_bootid;

} MDBX_meta;
2242 
2243 #pragma pack(1)
2244 
/* Common header for all page types. The page type depends on mp_flags.
 *
 * P_BRANCH and P_LEAF pages have unsorted 'MDBX_node's at the end, with
 * sorted mp_ptrs[] entries referring to them. Exception: P_LEAF2 pages
 * omit mp_ptrs and pack sorted MDBX_DUPFIXED values after the page header.
 *
 * P_OVERFLOW records occupy one or more contiguous pages where only the
 * first has a page header. They hold the real data of F_BIGDATA nodes.
 *
 * P_SUBP sub-pages are small leaf "pages" with duplicate data.
 * A node with flag F_DUPDATA but not F_SUBDATA contains a sub-page.
 * (Duplicate data can also go in sub-databases, which use normal pages.)
 *
 * P_META pages contain MDBX_meta, the start point of an MDBX snapshot.
 *
 * Each non-meta page within the used part of the datafile is reachable
 * exactly once in the snapshot: either used by a database or listed in
 * a GC record.
 * NOTE(review): this comment used to refer to MDBX_meta.mm_last_pg, which
 * MDBX_meta does not have; presumably the bound is MDBX_meta.mm_geo.next
 * (the first unused page) - confirm. */
typedef struct MDBX_page {
  union {
    /* Page status relative to a transaction, derived by comparing mp_txnid
     * with the transaction's mt_txnid or mt_front. */
#define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid)
#define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid)
#define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid)
#define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front)
#define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front)
    uint64_t mp_txnid;
    struct MDBX_page *mp_next; /* for in-memory list of freed pages */
  };
  uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */
#define P_BRANCH 0x01      /* branch page */
#define P_LEAF 0x02        /* leaf page */
#define P_OVERFLOW 0x04    /* overflow page */
#define P_META 0x08        /* meta page */
#define P_BAD 0x10         /* explicit flag for invalid/bad page */
#define P_LEAF2 0x20       /* for MDBX_DUPFIXED records */
#define P_SUBP 0x40        /* for MDBX_DUPSORT sub-pages */
#define P_SPILLED 0x2000   /* spilled in parent txn */
#define P_LOOSE 0x4000     /* page was dirtied then freed, can be reused */
#define P_FROZEN 0x8000    /* used for retire page with known status */
  /* Every bit except P_BRANCH, P_LEAF, P_LEAF2, P_OVERFLOW and P_SPILLED */
#define P_ILL_BITS (~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED))
  uint16_t mp_flags;
  union {
    uint32_t mp_pages; /* number of overflow pages */
    __anonymous_struct_extension__ struct {
      indx_t mp_lower; /* lower bound of free space */
      indx_t mp_upper; /* upper bound of free space */
    };
  };
  pgno_t mp_pgno; /* page number */

  /* The flexible array member is declared only for C99 and newer, or for
   * MSVC when not compiling as C++. */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) ||              \
    (!defined(__cplusplus) && defined(_MSC_VER))
  indx_t mp_ptrs[] /* dynamic size */;
#endif /* C99 */
} MDBX_page;
2299 
/* Size of the page header, excluding dynamic data at the end.
 * It is the offset of mp_ptrs, so it requires the configuration in which
 * MDBX_page declares that member. */
#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs))
2302 
2303 #pragma pack(pop)
2304 
#if MDBX_ENABLE_PGOP_STAT
/* Statistics of page operations accumulated over all (running, completed
 * and aborted) transactions. Available only when MDBX_ENABLE_PGOP_STAT
 * is enabled. */
typedef struct {
  MDBX_atomic_uint64_t newly;   /* Quantity of new pages added */
  MDBX_atomic_uint64_t cow;     /* Quantity of pages copied for update */
  MDBX_atomic_uint64_t clone;   /* Quantity of parent's dirty pages clones
                                   for nested transactions */
  MDBX_atomic_uint64_t split;   /* Page splits */
  MDBX_atomic_uint64_t merge;   /* Page merges */
  MDBX_atomic_uint64_t spill;   /* Quantity of spilled dirty pages */
  MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */
  MDBX_atomic_uint64_t
      wops; /* Number of explicit write operations (not pages) to a disk */
} MDBX_pgop_stat_t;
#endif /* MDBX_ENABLE_PGOP_STAT */
2321 
/* Inter-process lock primitive selection.
 * For each supported MDBX_LOCKING mode this defines mdbx_ipclock_t (the type
 * of the mti_wlock / mti_rlock members of MDBX_lockinfo) and MDBX_CLOCK_SIGN,
 * a per-mode constant that is mixed into MDBX_LOCK_FORMAT.
 * An unrecognized MDBX_LOCKING value is rejected at compile time. */
2322 #if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES
2323 #define MDBX_CLOCK_SIGN UINT32_C(0xF10C)
2324 typedef void mdbx_ipclock_t; /* void: no lock object type in this mode */
2325 #elif MDBX_LOCKING == MDBX_LOCKING_SYSV
2326 
2327 #define MDBX_CLOCK_SIGN UINT32_C(0xF18D)
2328 typedef mdbx_pid_t mdbx_ipclock_t;
2329 #ifndef EOWNERDEAD
/* Fallback used only when the platform headers do not define EOWNERDEAD. */
2330 #define EOWNERDEAD MDBX_RESULT_TRUE
2331 #endif
2332 
2333 #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                                \
2334     MDBX_LOCKING == MDBX_LOCKING_POSIX2008
2335 #define MDBX_CLOCK_SIGN UINT32_C(0x8017)
2336 typedef pthread_mutex_t mdbx_ipclock_t;
2337 #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988
2338 #define MDBX_CLOCK_SIGN UINT32_C(0xFC29)
2339 typedef sem_t mdbx_ipclock_t;
2340 #else
2341 #error "FIXME"
2342 #endif /* MDBX_LOCKING */
2343 
/* Helpers operating on an mdbx_ipclock_t object. They are declared only for
 * MDBX_LOCKING modes numerically greater than MDBX_LOCKING_SYSV and are hidden
 * from C++ translation units. Both take a pointer to the lock object and
 * return an int.
 * NOTE(review): their exact semantics and return-code convention are defined
 * by the implementation, which is not visible here. */
2344 #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus)
2345 MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc);
2346 MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc);
2347 #endif /* MDBX_LOCKING */
2348 
2349 /* Reader Lock Table
2350  *
2351  * Readers don't acquire any locks for their data access. Instead, they
2352  * simply record their transaction ID in the reader table. The reader
2353  * mutex is needed just to find an empty slot in the reader table. The
2354  * slot's address is saved in thread-specific data so that subsequent
2355  * read transactions started by the same thread need no further locking to
2356  * proceed.
2357  *
2358  * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data.
2359  * No reader table is used if the database is on a read-only filesystem.
2360  *
2361  * Since the database uses multi-version concurrency control, readers don't
2362  * actually need any locking. This table is used to keep track of which
2363  * readers are using data from which old transactions, so that we'll know
2364  * when a particular old transaction is no longer in use. Old transactions
2365  * that have discarded any data pages can then have those pages reclaimed
2366  * for use by a later write transaction.
2367  *
2368  * The lock table is constructed such that reader slots are aligned with the
2369  * processor's cache line size. Any slot is only ever used by one thread.
2370  * This alignment guarantees that there will be no contention or cache
2371  * thrashing as threads update their own slot info, and also eliminates
2372  * any need for locking when accessing a slot.
2373  *
2374  * A writer thread will scan every slot in the table to determine the oldest
2375  * outstanding reader transaction. Any freed pages older than this will be
2376  * reclaimed by the writer. The writer doesn't use any locks when scanning
2377  * this table. This means that there's no guarantee that the writer will
2378  * see the most up-to-date reader info, but that's not required for correct
2379  * operation - all we need is to know the upper bound on the oldest reader,
2380  * we don't care at all about the newest reader. So the only consequence of
2381  * reading stale information here is that old pages might hang around a
2382  * while longer before being reclaimed. That's actually good anyway, because
2383  * the longer we delay reclaiming old pages, the more likely it is that a
2384  * string of contiguous pages can be found after coalescing old pages from
2385  * many old transactions together. */
2386 
2387 /* The actual reader record, with cacheline padding.
 * One element of the MDBX_lockinfo.mti_readers[] table.
 * sizeof(MDBX_reader) and offsetof(MDBX_reader, mr_snapshot_pages_used) are
 * folded into MDBX_LOCK_FORMAT, so any layout change here also changes the
 * lock-file format signature.
 * NOTE(review): no explicit padding member is declared in this definition;
 * confirm where the cacheline padding mentioned above is meant to come
 * from. */
2388 typedef struct MDBX_reader {
2389   /* Current Transaction ID when this transaction began, or (txnid_t)-1.
2390    * Multiple readers that start at the same time will probably have the
2391    * same ID here. Again, it's not important to exclude them from
2392    * anything; all we need to know is which version of the DB they
2393    * started from so we can avoid overwriting any data used in that
2394    * particular version. */
2395   MDBX_atomic_uint64_t /* txnid_t */ mr_txnid;
2396 
2397   /* The information we store in a single slot of the reader table.
2398    * In addition to a transaction ID, we also record the process and
2399    * thread ID that owns a slot, so that we can detect stale information,
2400    * e.g. threads or processes that went away without cleaning up.
2401    *
2402    * NOTE: We currently don't check for stale records.
2403    * We simply re-init the table when we know that we're the only process
2404    * opening the lock file. */
2405 
2406   /* The thread ID of the thread owning this txn. */
2407   MDBX_atomic_uint64_t mr_tid;
2408 
2409   /* The process ID of the process owning this reader txn. */
2410   MDBX_atomic_uint32_t mr_pid;
2411 
2412   /* The number of pages used in the reader's MVCC snapshot,
2413    * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */
2414   atomic_pgno_t mr_snapshot_pages_used;
2415   /* Number of retired pages at the time this reader starts transaction. So,
2416    * at any time the difference mm_pages_retired - mr_snapshot_pages_retired
2417    * will give the number of pages which this reader is restraining from
   * reuse. */
2418   MDBX_atomic_uint64_t mr_snapshot_pages_retired;
2419 } MDBX_reader;
2420 
2421 /* The header for the reader table (a memory-mapped lock file).
 * The byte offsets of mti_oldest_reader, mti_numreaders and mti_readers are
 * folded into MDBX_LOCK_FORMAT, so moving, adding or resizing members changes
 * the lock-file format signature. */
2422 typedef struct MDBX_lockinfo {
2423   /* Stamp identifying this as an MDBX file.
2424    * It must be set to MDBX_MAGIC together with MDBX_LOCK_VERSION
   * (see MDBX_LOCK_MAGIC). */
2425   uint64_t mti_magic_and_version;
2426 
2427   /* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */
2428   uint32_t mti_os_and_format;
2429 
2430   /* Flags which environment was opened. */
2431   MDBX_atomic_uint32_t mti_envmode;
2432 
2433   /* Threshold of un-synced-with-disk pages for auto-sync feature,
2434    * zero means no-threshold, i.e. auto-sync is disabled. */
2435   atomic_pgno_t mti_autosync_threshold;
2436 
2437   /* Low 32-bit of txnid with which meta-pages was synced,
2438    * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */
2439   MDBX_atomic_uint32_t mti_meta_sync_txnid;
2440 
2441   /* Period for timed auto-sync feature, i.e. at the every steady checkpoint
2442    * the mti_unsynced_timeout sets to the current_time + mti_autosync_period.
2443    * The time value is represented in a suitable system-dependent form, for
2444    * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC).
2445    * Zero means timed auto-sync is disabled. */
2446   MDBX_atomic_uint64_t mti_autosync_period;
2447 
2448   /* Marker to distinguish uniqueness of DB/CLK. */
2449   MDBX_atomic_uint64_t mti_bait_uniqueness;
2450 
  /* Each alignas() below attaches to the next member that survives
   * preprocessing: when a conditionally compiled member is absent, the
   * specifier is followed directly by the next declaration in the struct. */
2451   alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
2452 
2453 #if MDBX_ENABLE_PGOP_STAT
2454       /* Statistics of costly ops of all (running, completed and aborted)
2455        * transactions */
2456       MDBX_pgop_stat_t mti_pgop_stat;
2457 #endif /* MDBX_ENABLE_PGOP_STAT*/
2458 
2459   alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
2460 
2461   /* Write transaction lock. */
2462 #if MDBX_LOCKING > 0
2463       mdbx_ipclock_t mti_wlock;
2464 #endif /* MDBX_LOCKING > 0 */
2465 
  /* NOTE(review): not documented here; judging by the name and type,
   * presumably the txnid of the oldest reader - confirm. Its offset is part
   * of MDBX_LOCK_FORMAT. */
2466   atomic_txnid_t mti_oldest_reader;
2467 
2468   /* Timestamp of the last steady sync. Value is represented in a suitable
2469    * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or
2470    * clock_gettime(CLOCK_MONOTONIC). */
2471   MDBX_atomic_uint64_t mti_sync_timestamp;
2472 
2473   /* Number un-synced-with-disk pages for auto-sync feature. */
2474   atomic_pgno_t mti_unsynced_pages;
2475 
2476   /* Number of page which was discarded last time by madvise(MADV_FREE). */
2477   atomic_pgno_t mti_discarded_tail;
2478 
2479   /* Timestamp of the last readers check. */
2480   MDBX_atomic_uint64_t mti_reader_check_timestamp;
2481 
2482   /* Shared anchor for tracking readahead edge and enabled/disabled status. */
2483   pgno_t mti_readahead_anchor;
2484 
2485   alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
2486 
2487   /* Readers registration lock. */
2488 #if MDBX_LOCKING > 0
2489       mdbx_ipclock_t mti_rlock;
2490 #endif /* MDBX_LOCKING > 0 */
2491 
2492   /* The number of slots that have been used in the reader table.
2493    * This always records the maximum count, it is not decremented
2494    * when readers release their slots. */
2495   MDBX_atomic_uint32_t mti_numreaders;
  /* NOTE(review): not documented here; presumably signals that the reader
   * table needs to be re-scanned - confirm against the users of this flag. */
2496   MDBX_atomic_uint32_t mti_readers_refresh_flag;
2497 
  /* The reader slots themselves, as a cacheline-aligned flexible array member.
   * Declared only for C99 (or later) compilers and for MSVC in C mode. */
2498 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) ||              \
2499     (!defined(__cplusplus) && defined(_MSC_VER))
2500   alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
2501       MDBX_reader mti_readers[] /* dynamic size */;
2502 #endif /* C99 */
2503 } MDBX_lockinfo;
2504 
2505 /* Lockfile format signature: version, features and field layout.
 * Built from MDBX_CLOCK_SIGN (which depends on the MDBX_LOCKING mode),
 * sizeof(MDBX_reader) and selected member offsets of MDBX_reader and
 * MDBX_lockinfo, each scaled by its own multiplier and summed. A change of
 * locking mode or of the struct layouts therefore yields a different value. */
2506 #define MDBX_LOCK_FORMAT                                                       \
2507   (MDBX_CLOCK_SIGN * 27733 + (unsigned)sizeof(MDBX_reader) * 13 +              \
2508    (unsigned)offsetof(MDBX_reader, mr_snapshot_pages_used) * 251 +             \
2509    (unsigned)offsetof(MDBX_lockinfo, mti_oldest_reader) * 83 +                 \
2510    (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 +                    \
2511    (unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29)
2512 
/* Data-file stamp: MDBX_MAGIC shifted left by 8 bits, plus 64 when
 * MDBX_PNL_ASCENDING is 1, plus MDBX_DATA_VERSION. */
2513 #define MDBX_DATA_MAGIC                                                        \
2514   ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION)
2515 
/* Same composition as MDBX_DATA_MAGIC, but with the version fixed at 2.
 * NOTE(review): presumably the stamp of an older on-disk format that is still
 * accepted - confirm against the code that checks it. */
2516 #define MDBX_DATA_MAGIC_LEGACY_COMPAT                                          \
2517   ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + 2)
2518 
/* MDBX_MAGIC shifted left by 8 bits plus 255; unlike the stamps above it does
 * not include the MDBX_PNL_ASCENDING term. */
2519 #define MDBX_DATA_MAGIC_LEGACY_DEVEL ((MDBX_MAGIC << 8) + 255)
2520 
/* Lock-file stamp: MDBX_MAGIC shifted left by 8 bits plus MDBX_LOCK_VERSION
 * (the value expected in MDBX_lockinfo.mti_magic_and_version). */
2521 #define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
2522 
2523 /* The maximum size of a database page.
2524  *
2525  * It is 64K, but value-PAGEHDRSZ must fit in MDBX_page.mp_upper.
2526  *
2527  * MDBX will use database pages < OS pages if needed.
2528  * That causes more I/O in write transactions: The OS must
2529  * know (read) the whole page before writing a partial page.
2530  *
2531  * Note that we don't currently support Huge pages. On Linux,
2532  * regular data files cannot use Huge pages, and in general
2533  * Huge pages aren't actually pageable. We rely on the OS
2534  * demand-pager to read our data and page it out when memory
2535  * pressure from other processes is high. So until OSs have
2536  * actual paging support for Huge pages, they're not viable.
 *
 * MAX_PAGESIZE and MIN_PAGESIZE are short internal aliases for
 * MDBX_MAX_PAGESIZE and MDBX_MIN_PAGESIZE, which are defined elsewhere. */
2537 #define MAX_PAGESIZE MDBX_MAX_PAGESIZE
2538 #define MIN_PAGESIZE MDBX_MIN_PAGESIZE
2539 
/* Bounds of the memory-map size, in bytes.
 *  - MIN_MAPSIZE: MIN_PAGENO pages of the minimal page size.
 *  - MAX_MAPSIZE32: fixed cap selected when MDBX_WORDBITS < 64; a smaller
 *    value is used on Windows than elsewhere.
 *  - MAX_MAPSIZE64: MAX_PAGENO pages of the maximal page size, with the
 *    product computed in 64 bits.
 *  - MAX_MAPSIZE: whichever of the two applies to the current word size.
 *  - MDBX_PGL_LIMIT: MAX_PAGENO on 64-bit builds, otherwise the number of
 *    minimal-size pages that fit in MAX_MAPSIZE32. */
2540 #define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO)
2541 #if defined(_WIN32) || defined(_WIN64)
2542 #define MAX_MAPSIZE32 UINT32_C(0x38000000)
2543 #else
2544 #define MAX_MAPSIZE32 UINT32_C(0x7f000000)
2545 #endif
2546 #define MAX_MAPSIZE64 (MAX_PAGENO * (uint64_t)MAX_PAGESIZE)
2547 
2548 #if MDBX_WORDBITS >= 64
2549 #define MAX_MAPSIZE MAX_MAPSIZE64
2550 #define MDBX_PGL_LIMIT ((size_t)MAX_PAGENO)
2551 #else
2552 #define MAX_MAPSIZE MAX_MAPSIZE32
2553 #define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE)
2554 #endif /* MDBX_WORDBITS */
2555 
/* NOTE(review): presumably the upper bound on the number of reader slots
 * (cf. MDBX_env.me_maxreaders) - confirm against its users. */
2556 #define MDBX_READERS_LIMIT 32767
/* NOTE(review): presumably the list length from which a radix sort is
 * preferred by the sorting code - confirm against its users. */
2557 #define MDBX_RADIXSORT_THRESHOLD 333
2558 
2559 /*----------------------------------------------------------------------------*/
2560 
2561 /* A PNL is a Page Number List, a sorted array of IDs.
2562  * The first element of the array is a counter for how many actual page-numbers
2563  * are in the list. By default PNLs are sorted in descending order, this allows
2564  * cutting off a page with the lowest pgno (at the tail) by just truncating the
2565  * list. The sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build
 * option. */
2566 typedef pgno_t *MDBX_PNL;
2567 
/* Ordering predicates for two adjacent PNL items, following the configured
 * sort direction: ORDERED means `first` strictly precedes `last`; DISORDERED
 * is its exact negation, so equal items count as disordered. */
2568 #if MDBX_PNL_ASCENDING
2569 #define MDBX_PNL_ORDERED(first, last) ((first) < (last))
2570 #define MDBX_PNL_DISORDERED(first, last) ((first) >= (last))
2571 #else
2572 #define MDBX_PNL_ORDERED(first, last) ((first) > (last))
2573 #define MDBX_PNL_DISORDERED(first, last) ((first) <= (last))
2574 #endif
2575 
2576 /* List of txnid, only for MDBX_txn.tw.lifo_reclaimed.
 * Represented as a plain pointer to txnid_t elements. */
2577 typedef txnid_t *MDBX_TXL;
2578 
2579 /* A Dirty-Page list item is a pgno/pointer pair.
 * The trailing union lets the per-item attributes be accessed either as one
 * `extra` word or as two bit-fields: `multi` (1 bit) and `lru` (31 bits).
 * NOTE(review): the meaning of `multi` and `lru` is not documented here;
 * `lru` presumably relates to MDBX_txn.tw.dirtylru - confirm. */
2580 typedef struct MDBX_dp {
2581   MDBX_page *ptr;
2582   pgno_t pgno;
2583   union {
2584     unsigned extra;
2585     __anonymous_struct_extension__ struct {
2586       unsigned multi : 1;
2587       unsigned lru : 31;
2588     };
2589   };
2590 } MDBX_dp;
2591 
2592 /* A DPL (dirty-page list) is a sorted array of MDBX_dp items.
 * The items[] flexible array member is declared only for C99 (or later)
 * compilers and for MSVC in C mode.
 * NOTE(review): `sorted` and `length` are not documented here; presumably the
 * count of leading items known to be in order and the count of items in use -
 * confirm against the list routines. */
2593 typedef struct MDBX_dpl {
2594   unsigned sorted;
2595   unsigned length;
2596   unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
2597 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) ||              \
2598     (!defined(__cplusplus) && defined(_MSC_VER))
2599   MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
2600 #endif
2601 } MDBX_dpl;
2602 
2603 /* PNL sizes, counted in pgno_t elements.
 * MDBX_PNL_INITIAL is one granule minus 2 elements and minus the assumed
 * malloc overhead (MDBX_ASSUME_MALLOC_OVERHEAD bytes converted to elements).
 * NOTE(review): the 2 subtracted elements presumably account for the
 * bookkeeping slots addressed by MDBX_PNL_ALLOCLEN and MDBX_PNL_SIZE -
 * confirm against the allocator. */
2604 #define MDBX_PNL_GRANULATE 1024
2605 #define MDBX_PNL_INITIAL                                                       \
2606   (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
2607 
/* TXL sizes, counted in txnid_t elements; the initial size follows the same
 * formula as for PNL. MDBX_TXL_MAX is derived from 2^17 elements minus the same
 * allowances. */
2608 #define MDBX_TXL_GRANULATE 32
2609 #define MDBX_TXL_INITIAL                                                       \
2610   (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
2611 #define MDBX_TXL_MAX                                                           \
2612   ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
2613 
/* PNL accessors. `pl` points at the counter element of the list:
 *   pl[-1]        allocated length (MDBX_PNL_ALLOCLEN)
 *   pl[0]         number of items in the list (MDBX_PNL_SIZE)
 *   pl[1..size]   the items; FIRST is pl[1], LAST is pl[size]
 * BEGIN/END yield pointers to the first item and to one past the last item.
 * Every macro evaluates `pl` at least once, some more than once, so the
 * argument must be free of side effects. */
2614 #define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1])
2615 #define MDBX_PNL_SIZE(pl) ((pl)[0])
2616 #define MDBX_PNL_FIRST(pl) ((pl)[1])
2617 #define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_SIZE(pl)])
2618 #define MDBX_PNL_BEGIN(pl) (&(pl)[1])
2619 #define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1])
2620 
/* Item with the lowest / highest page number, which sits at the head or at
 * the tail depending on the configured sort direction. */
2621 #if MDBX_PNL_ASCENDING
2622 #define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl)
2623 #define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl)
2624 #else
2625 #define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl)
2626 #define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl)
2627 #endif
2628 
/* Size in bytes of the counter element plus all items (the pl[-1] slot is not
 * included), and a test for an empty list. */
2629 #define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t))
2630 #define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0)
2631 
2632 /*----------------------------------------------------------------------------*/
2633 /* Internal structures */
2634 
2635 /* Auxiliary DB info.
2636  * The information here is mostly static/read-only. There is
2637  * only a single copy of this record in the environment.
 * It holds the database name, the key and data comparators, and the permitted
 * key/value length ranges. */
2638 typedef struct MDBX_dbx {
2639   MDBX_val md_name;                /* name of the database */
2640   MDBX_cmp_func *md_cmp;           /* function for comparing keys */
2641   MDBX_cmp_func *md_dcmp;          /* function for comparing data items */
2642   size_t md_klen_min, md_klen_max; /* min/max key length for the database */
2643   size_t md_vlen_min,
2644       md_vlen_max; /* min/max value/data length for the database */
2645 } MDBX_dbx;
2646 
2647 /* A database transaction.
2648  * Every operation requires a transaction handle.
 *
 * The #define lines interleaved with the members are ordinary preprocessor
 * macros (not scoped to the struct); they are simply placed next to the
 * fields they relate to. */
2649 struct MDBX_txn {
2650 #define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31)
2651   uint32_t mt_signature;
2652 
2653   /* Transaction Flags */
2654   /* mdbx_txn_begin() flags */
2655 #define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE)
2656 #define MDBX_TXN_RW_BEGIN_FLAGS                                                \
2657   (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY)
2658   /* Additional flag for mdbx_sync_locked() */
2659 #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000)
2660 
2661   /* internal txn flags */
2662 #define MDBX_TXN_FINISHED 0x01  /* txn is finished or never began */
2663 #define MDBX_TXN_ERROR 0x02     /* txn is unusable after an error */
2664 #define MDBX_TXN_DIRTY 0x04     /* must write, even if dirty list is empty */
2665 #define MDBX_TXN_SPILLS 0x08    /* txn or a parent has spilled pages */
2666 #define MDBX_TXN_HAS_CHILD 0x10 /* txn has an MDBX_txn.mt_child */
2667   /* most operations on the txn are currently illegal */
2668 #define MDBX_TXN_BLOCKED                                                       \
2669   (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_HAS_CHILD)
2670 
  /* Union of all internal txn flags listed above. */
2671 #define TXN_FLAGS                                                              \
2672   (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS |     \
2673    MDBX_TXN_HAS_CHILD)
2674 
  /* Compile-time check: the internal flags must not overlap the begin-flags,
   * and none of these flag sets may collide with MDBX_SHRINK_ALLOWED. */
2675 #if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) ||       \
2676     ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) &         \
2677      MDBX_SHRINK_ALLOWED)
2678 #error "Oops, some flags overlapped or wrong"
2679 #endif
2680   uint32_t mt_flags;
2681 
2682   MDBX_txn *mt_parent; /* parent of a nested txn */
2683   /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */
2684   MDBX_txn *mt_child;
2685   MDBX_geo mt_geo;
2686   /* next unallocated page */
2687 #define mt_next_pgno mt_geo.next
2688   /* corresponding to the current size of datafile */
2689 #define mt_end_pgno mt_geo.now
2690 
2691   /* The ID of this transaction. IDs are integers incrementing from 1.
2692    * Only committed write transactions increment the ID. If a transaction
2693    * aborts, the ID may be re-used by the next writer. */
2694   txnid_t mt_txnid;
  /* Compared with MDBX_page.mp_txnid by the IS_VALID() and IS_MODIFIABLE()
   * macros: a page is valid when its mp_txnid is <= mt_front and modifiable
   * when it equals mt_front. */
2695   txnid_t mt_front;
2696 
2697   MDBX_env *mt_env; /* the DB environment */
2698   /* Array of records for each DB known in the environment. */
2699   MDBX_dbx *mt_dbxs;
2700   /* Array of MDBX_db records for each known DB */
2701   MDBX_db *mt_dbs;
2702   /* Array of sequence numbers for each DB handle */
2703   unsigned *mt_dbiseqs;
2704 
2705   /* Transaction DBI Flags */
2706 #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */
2707 #define DBI_STALE MDBX_DBI_STALE /* Named-DB record is older than txnID */
2708 #define DBI_FRESH MDBX_DBI_FRESH /* Named-DB handle opened in this txn */
2709 #define DBI_CREAT MDBX_DBI_CREAT /* Named-DB handle created in this txn */
2710 #define DBI_VALID 0x10           /* DB handle is valid, see also DB_VALID */
2711 #define DBI_USRVALID 0x20        /* As DB_VALID, but not set for FREE_DBI */
2712 #define DBI_AUDITED 0x40         /* Internal flag for accounting during audit */
2713   /* Array of flags for each DB */
2714   uint8_t *mt_dbistate;
2715   /* Number of DB records in use, or 0 when the txn is finished.
2716    * This number only ever increments until the txn finishes; we
2717    * don't decrement it when individual DB handles are closed. */
2718   MDBX_dbi mt_numdbs;
2719   size_t mt_owner; /* thread ID that owns this transaction */
2720   MDBX_canary mt_canary;
2721   void *mt_userctx; /* User-settable context */
2722 
  /* Mode-specific state: `to` is used by read transactions, `tw` by write
   * transactions. The two share storage, so only the variant matching the
   * transaction kind is meaningful. */
2723   union {
2724     struct {
2725       /* For read txns: This thread/txn's reader table slot, or NULL. */
2726       MDBX_reader *reader;
2727     } to;
2728     struct {
2729       /* In write txns, array of cursors for each DB */
2730       MDBX_cursor **cursors;
2731       pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
2732       txnid_t last_reclaimed;   /* ID of last used record */
2733 #if MDBX_ENABLE_REFUND
2734       pgno_t loose_refund_wl /* FIXME: describe */;
2735 #endif /* MDBX_ENABLE_REFUND */
2736       /* dirtylist room: Dirty array size - dirty pages visible to this txn.
2737        * Includes ancestor txns' dirty pages not hidden by other txns'
2738        * dirty/spilled pages. Thus commit(nested txn) has room to merge
2739        * dirtylist into mt_parent after freeing hidden mt_parent pages. */
2740       unsigned dirtyroom;
2741       /* a sequence to spilling dirty page with LRU policy */
2742       unsigned dirtylru;
2743       /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
2744       MDBX_dpl *dirtylist;
2745       /* The list of reclaimed txns from GC */
2746       MDBX_TXL lifo_reclaimed;
2747       /* The list of pages that became unused during this transaction. */
2748       MDBX_PNL retired_pages;
2749       /* The list of loose pages that became unused and may be reused
2750        * in this transaction, linked through `mp_next`. */
2751       MDBX_page *loose_pages;
2752       /* Number of loose pages (tw.loose_pages) */
2753       unsigned loose_count;
2754       /* The sorted list of dirty pages we temporarily wrote to disk
2755        * because the dirty list was full. page numbers in here are
2756        * shifted left by 1, deleted slots have the LSB set. */
2757       MDBX_PNL spill_pages;
      /* NOTE(review): not documented here; presumably a cached position
       * within spill_pages related to removed slots - confirm. */
2758       unsigned spill_least_removed;
2759     } tw;
2760   };
2761 };
2762 
/* Capacity of the per-cursor page stack (MDBX_cursor.mc_pg[] / mc_ki[]):
 * 32 entries when MDBX_WORDBITS >= 64, otherwise 24. */
2763 #if MDBX_WORDBITS >= 64
2764 #define CURSOR_STACK 32
2765 #else
2766 #define CURSOR_STACK 24
2767 #endif
2768 
/* Forward declaration: MDBX_cursor refers to MDBX_xcursor by pointer, and
 * MDBX_xcursor in turn embeds an MDBX_cursor. */
2769 struct MDBX_xcursor;
2770 
2771 /* Cursors are used for all DB operations.
2772  * A cursor holds a path of (page pointer, key index) from the DB
2773  * root to a position in the DB, plus other state. MDBX_DUPSORT
2774  * cursors include an xcursor to the current data item. Write txns
2775  * track their cursors and keep them up to date when data moves.
2776  * Exception: An xcursor's pointer to a P_SUBP page can be stale.
2777  * (A node with F_DUPDATA but no F_SUBDATA contains a subpage). */
2778 struct MDBX_cursor {
  /* Values stored in mc_signature.
   * NOTE(review): judging by the names, they distinguish a live cursor, one
   * ready to be closed, and one waiting for the end of its transaction -
   * confirm against the cursor lifecycle code. */
2779 #define MDBX_MC_LIVE UINT32_C(0xFE05D5B1)
2780 #define MDBX_MC_READY4CLOSE UINT32_C(0x2817A047)
2781 #define MDBX_MC_WAIT4EOT UINT32_C(0x90E297A7)
2782   uint32_t mc_signature;
2783   /* The database handle this cursor operates on */
2784   MDBX_dbi mc_dbi;
2785   /* Next cursor on this DB in this txn */
2786   MDBX_cursor *mc_next;
2787   /* Backup of the original cursor if this cursor is a shadow */
2788   MDBX_cursor *mc_backup;
2789   /* Context used for databases with MDBX_DUPSORT, otherwise NULL */
2790   struct MDBX_xcursor *mc_xcursor;
2791   /* The transaction that owns this cursor */
2792   MDBX_txn *mc_txn;
2793   /* The database record for this cursor */
2794   MDBX_db *mc_db;
2795   /* The database auxiliary record for this cursor */
2796   MDBX_dbx *mc_dbx;
2797   /* The mt_dbistate for this database */
2798   uint8_t *mc_dbistate;
2799   unsigned mc_snum; /* number of pushed pages */
2800   unsigned mc_top;  /* index of top page, normally mc_snum-1 */
2801 
2802   /* Cursor state flags. */
2803 #define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */
2804 #define C_EOF 0x02         /* No more data */
2805 #define C_SUB 0x04         /* Cursor is a sub-cursor */
2806 #define C_DEL 0x08         /* last op was a cursor_del */
2807 #define C_UNTRACK 0x10     /* Un-track cursor when closing */
2808 #define C_RECLAIMING 0x20  /* GC lookup is prohibited */
2809 #define C_GCFREEZE 0x40    /* reclaimed_pglist must not be updated */
2810 
2811   /* Cursor checking flags. */
2812 #define C_COPYING 0x100  /* skip key-value length check (copying simplify) */
2813 #define C_UPDATING 0x200 /* update/rebalance pending */
2814 #define C_RETIRING 0x400 /* refs to child pages may be invalid */
2815 #define C_SKIPORD 0x800  /* don't check keys ordering */
2816 
  /* Bitwise OR of the C_* state and checking flags defined above. */
2817   unsigned mc_flags;              /* see mdbx_cursor */
2818   MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */
2819   indx_t mc_ki[CURSOR_STACK];     /* stack of page indices */
2820 };
2821 
2822 /* Context for sorted-dup records.
2823  * We could have gone to a fully recursive design, with arbitrarily
2824  * deep nesting of sub-databases. But for now we only handle these
2825  * levels - main DB, optional sub-DB, sorted-duplicate DB.
 *
 * The structure bundles a complete sub-cursor with its own MDBX_db and
 * MDBX_dbx records, all stored by value. */
2826 typedef struct MDBX_xcursor {
2827   /* A sub-cursor for traversing the Dup DB */
2828   MDBX_cursor mx_cursor;
2829   /* The database record for this Dup DB */
2830   MDBX_db mx_db;
2831   /* The auxiliary DB record for this Dup DB */
2832   MDBX_dbx mx_dbx;
2833 } MDBX_xcursor;
2834 
/* A cursor together with its sorted-dup context and a user pointer, held in
 * one object. `outer` is the first member, so a pointer to the couple and a
 * pointer to its outer cursor refer to the same address. */
2835 typedef struct MDBX_cursor_couple {
2836   MDBX_cursor outer;
2837   void *mc_userctx; /* User-settable context */
2838   MDBX_xcursor inner;
2839 } MDBX_cursor_couple;
2840 
2841 /* The database environment.
 * The definition is laid out in sections: a mostly static part, a mostly
 * volatile part, debugging fields, and a trailing stub used in lck-less mode.
 * The #define lines interleaved with the members are ordinary preprocessor
 * macros placed next to the fields they relate to. */
2842 struct MDBX_env {
2843   /* ----------------------------------------------------- mostly static part */
2844 #define MDBX_ME_SIGNATURE UINT32_C(0x9A899641)
2845   MDBX_atomic_uint32_t me_signature;
2846   /* Failed to update the meta page. Probably an I/O error. */
2847 #define MDBX_FATAL_ERROR UINT32_C(0x80000000)
2848   /* Some fields are initialized. */
2849 #define MDBX_ENV_ACTIVE UINT32_C(0x20000000)
2850   /* me_txkey is set */
2851 #define MDBX_ENV_TXKEY UINT32_C(0x10000000)
2852   /* Legacy MDBX_MAPASYNC (prior v0.9) */
2853 #define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000)
  /* The flag bits above that are reserved for internal use. */
2854 #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY)
2855   uint32_t me_flags;
2856   mdbx_mmap_t me_dxb_mmap; /* The main data file */
2857 #define me_map me_dxb_mmap.dxb
2858 #define me_lazy_fd me_dxb_mmap.fd
2859   mdbx_filehandle_t me_dsync_fd;
2860   mdbx_mmap_t me_lck_mmap; /* The lock file */
2861 #define me_lfd me_lck_mmap.fd
2862   struct MDBX_lockinfo *me_lck;
2863 
2864   unsigned me_psize;        /* DB page size, initialized from me_os_psize */
2865   unsigned me_leaf_nodemax; /* max size of a leaf-node */
2866   uint8_t me_psize2log;     /* log2 of DB page size */
2867   int8_t me_stuck_meta; /* recovery-only: target meta page or less than zero */
2868   uint16_t me_merge_threshold,
2869       me_merge_threshold_gc;  /* pages emptier than this are candidates for
2870                                  merging */
2871   unsigned me_os_psize;       /* OS page size, from mdbx_syspagesize() */
2872   unsigned me_maxreaders;     /* size of the reader table */
2873   MDBX_dbi me_maxdbs;         /* size of the DB table */
2874   uint32_t me_pid;            /* process ID of this env */
2875   mdbx_thread_key_t me_txkey; /* thread-key for readers */
2876   char *me_pathname;          /* path to the DB files */
2877   void *me_pbuf;              /* scratch area for DUPSORT put() */
2878   MDBX_txn *me_txn0;          /* preallocated write transaction */
2879 
2880   MDBX_dbx *me_dbxs;    /* array of static DB info */
2881   uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */
2882   unsigned *me_dbiseqs; /* array of dbi sequence numbers */
2883   unsigned
2884       me_maxgc_ov1page;    /* Number of pgno_t fit in a single overflow page */
2885   uint32_t me_live_reader; /* have liveness lock in reader table */
2886   void *me_userctx;        /* User-settable context */
2887   MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */
2888 
  /* Tunable options of the environment.
   * NOTE(review): the individual fields are not documented here; by their
   * names, dp_* concern the dirty-page list, rp_* the reclaimed-page list and
   * spill_* the spilling thresholds - confirm against the option setters. */
2889   struct {
2890     unsigned dp_reserve_limit;
2891     unsigned rp_augment_limit;
2892     unsigned dp_limit;
2893     unsigned dp_initial;
2894     uint8_t dp_loose_limit;
2895     uint8_t spill_max_denominator;
2896     uint8_t spill_min_denominator;
2897     uint8_t spill_parent4child_denominator;
2898     unsigned merge_threshold_16dot16_percent;
2899     union {
2900       unsigned all;
2901       /* tracks options with non-auto values but tuned by user */
2902       struct {
2903         unsigned dp_limit : 1;
2904       } non_auto;
2905     } flags;
2906   } me_options;
2907 
2908   /* struct me_dbgeo used for accepting db-geo params from user for the new
2909    * database creation, i.e. when mdbx_env_set_geometry() was called before
2910    * mdbx_env_open(). */
2911   struct {
2912     size_t lower;  /* minimal size of datafile */
2913     size_t upper;  /* maximal size of datafile */
2914     size_t now;    /* current size of datafile */
2915     size_t grow;   /* step to grow datafile */
2916     size_t shrink; /* threshold to shrink datafile */
2917   } me_dbgeo;
2918 
  /* Present only with SysV locking; the key and the semaphore id share
   * storage. */
2919 #if MDBX_LOCKING == MDBX_LOCKING_SYSV
2920   union {
2921     key_t key;
2922     int semid;
2923   } me_sysv_ipc;
2924 #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */
2925 
  /* NOTE(review): link to another environment; presumably chains the
   * environments tracked by the lock-file code - confirm. */
2926   MDBX_env *me_lcklist_next;
2927 
2928   /* --------------------------------------------------- mostly volatile part */
2929 
2930   MDBX_txn *me_txn; /* current write transaction */
2931   mdbx_fastmutex_t me_dbi_lock;
2932   MDBX_dbi me_numdbs; /* number of DBs opened */
2933 
2934   MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */
2935   unsigned me_dp_reserve_len;
2936   /* PNL of pages that became unused in a write txn */
2937   MDBX_PNL me_retired_pages;
2938 
  /* me_remap_guard has a different type on Windows than elsewhere. */
2939 #if defined(_WIN32) || defined(_WIN64)
2940   MDBX_srwlock me_remap_guard;
2941   /* Workaround for LockFileEx and WriteFile multithread bug */
2942   CRITICAL_SECTION me_windowsbug_lock;
2943 #else
2944   mdbx_fastmutex_t me_remap_guard;
2945 #endif
2946 
2947   /* -------------------------------------------------------------- debugging */
2948 
2949 #if MDBX_DEBUG
2950   MDBX_assert_func *me_assert_func; /*  Callback for assertion failures */
2951 #endif
2952 #ifdef MDBX_USE_VALGRIND
2953   int me_valgrind_handle;
2954 #endif
2955 #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
2956   pgno_t me_poison_edge;
2957 #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */
2958 
  /* xMDBX_DEBUG_SPILLING defaults to 0 when the build does not set it; the
   * two extra counters exist only when it equals 2. */
2959 #ifndef xMDBX_DEBUG_SPILLING
2960 #define xMDBX_DEBUG_SPILLING 0
2961 #endif
2962 #if xMDBX_DEBUG_SPILLING == 2
2963   unsigned debug_dirtied_est, debug_dirtied_act;
2964 #endif /* xMDBX_DEBUG_SPILLING */
2965 
2966   /* ------------------------------------------------- stub for lck-less mode */
  /* Array of 64-bit atomics whose element count is sizeof(MDBX_lockinfo) plus
   * (MDBX_CACHELINE_SIZE - 1) bytes, divided by the element size (rounding
   * down). */
2967   MDBX_atomic_uint64_t
2968       x_lckless_stub[(sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) /
2969                      sizeof(MDBX_atomic_uint64_t)];
2970 };
2971 
2972 #ifndef __cplusplus
2973 /*----------------------------------------------------------------------------*/
2974 /* Debug and Logging stuff */
2975 
/* Debug flags derived from the build-time MDBX_DEBUG level: MDBX_DBG_ASSERT is
 * included when MDBX_DEBUG > 0 and MDBX_DBG_AUDIT when MDBX_DEBUG > 1
 * (presumably the initial value of mdbx_runtime_flags).
 * The whole expansion is parenthesized so that the macro behaves as a single
 * operand wherever it is used, e.g. next to `*`, `~` or a cast. */
#define MDBX_RUNTIME_FLAGS_INIT                                                \
  (((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT)
2978 
/* Global debug/logging state, defined elsewhere.
 *  - mdbx_runtime_flags: bit set tested against MDBX_DBG_* flags (e.g.
 *    MDBX_DBG_JITTER, MDBX_DBG_ASSERT, MDBX_DBG_AUDIT).
 *  - mdbx_loglevel: threshold compared by mdbx_log_enabled().
 *  - mdbx_debug_logger: pointer to an MDBX_debug_func.
 *    NOTE(review): presumably the user-installed log sink - confirm. */
2979 extern uint8_t mdbx_runtime_flags;
2980 extern uint8_t mdbx_loglevel;
2981 extern MDBX_debug_func *mdbx_debug_logger;
2982 
mdbx_jitter4testing(bool tiny)2983 MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) {
2984 #if MDBX_DEBUG
2985   if (MDBX_DBG_JITTER & mdbx_runtime_flags)
2986     mdbx_osal_jitter(tiny);
2987 #else
2988   (void)tiny;
2989 #endif
2990 }
2991 
/* printf-style logging entry point.
 * `level` is the message severity, `function` and `line` identify the call
 * site (the logging macros below pass __func__/__LINE__, or NULL/0), and `fmt`
 * with the following arguments forms the message. MDBX_PRINTF_ARGS(4, 5)
 * marks parameter 4 as the format string and parameter 5 as the start of the
 * variadic arguments.
 * NOTE(review): the attribute appears both before the name and after the
 * parameter list; presumably one of them is redundant - confirm. */
2992 MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5)
2993     mdbx_debug_log(int level, const char *function, int line, const char *fmt,
2994                    ...) MDBX_PRINTF_ARGS(4, 5);
/* Same as mdbx_debug_log(), but takes the arguments as a va_list. */
2995 MDBX_INTERNAL_FUNC void mdbx_debug_log_va(int level, const char *function,
2996                                           int line, const char *fmt,
2997                                           va_list args);
2998 
/* True when a message of severity `msg` should be emitted, i.e. when `msg`
 * does not exceed the global mdbx_loglevel. The comparison is wrapped in
 * unlikely() (presumably a branch-prediction hint defined elsewhere).
 * The argument is parenthesized so that an expression argument is compared as
 * a whole instead of being split by operator precedence. */
#define mdbx_log_enabled(msg) unlikely((msg) <= mdbx_loglevel)
3000 
/* mdbx_assert_enabled() / mdbx_audit_enabled(): whether internal assertions
 * and audits are active.
 *  - MDBX_DEBUG builds: decided at run time by the MDBX_DBG_ASSERT and
 *    MDBX_DBG_AUDIT bits of mdbx_runtime_flags.
 *  - other builds: audit is the constant 0; assertions are the constant 1
 *    unless NDEBUG is defined and MDBX_FORCE_ASSERTIONS is zero. */
3001 #if MDBX_DEBUG
3002 
3003 #define mdbx_assert_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_ASSERT)
3004 
3005 #define mdbx_audit_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_AUDIT)
3006 
3007 #else /* MDBX_DEBUG */
3008 
3009 #define mdbx_audit_enabled() (0)
3010 
3011 #if !defined(NDEBUG) || MDBX_FORCE_ASSERTIONS
3012 #define mdbx_assert_enabled() (1)
3013 #else
3014 #define mdbx_assert_enabled() (0)
3015 #endif /* NDEBUG */
3016 
3017 #endif /* MDBX_DEBUG */
3018 
/* Assertion-failure reporter.
 * In non-debug Android builds (MDBX_DEBUG is zero and __ANDROID_API__ is
 * defined) this is a macro forwarding to __android_log_assert() with the
 * "mdbx" tag and a "func:line" message; the `env` argument is not used by that
 * macro. In every other build it is an ordinary function declared here and
 * defined elsewhere.
 * NOTE(review): the Android format string uses %u for `line`, whereas the
 * function prototype takes `int line` - confirm callers pass a compatible
 * value. */
3019 #if !MDBX_DEBUG && defined(__ANDROID_API__)
3020 #define mdbx_assert_fail(env, msg, func, line)                                 \
3021   __android_log_assert(msg, "mdbx", "%s:%u", func, line)
3022 #else
3023 void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func,
3024                       int line);
3025 #endif
3026 
/* Level-specific logging macros. Each one forwards to mdbx_debug_log() when
 * mdbx_log_enabled() accepts its level. The EXTRA, TRACE, DEBUG and VERBOSE
 * variants are additionally gated by the MDBX_DEBUG build option, while
 * NOTICE, WARN and ERROR depend on the runtime log level only.
 * All variants except the two *_extra ones append "\n" to the format.
 * Note: each macro expects at least one argument after fmt, since
 * __VA_ARGS__ follows a comma. */

/* EXTRA level, tagged with the calling function and line. */
#define mdbx_debug_extra(fmt, ...)                                             \
  do {                                                                         \
    if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_EXTRA))                        \
      mdbx_debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__);    \
  } while (0)

/* EXTRA level, passing NULL/0 instead of the function name and line. */
#define mdbx_debug_extra_print(fmt, ...)                                       \
  do {                                                                         \
    if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_EXTRA))                        \
      mdbx_debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__);               \
  } while (0)

#define mdbx_trace(fmt, ...)                                                   \
  do {                                                                         \
    if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_TRACE))                        \
      mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n",             \
                     __VA_ARGS__);                                             \
  } while (0)

#define mdbx_debug(fmt, ...)                                                   \
  do {                                                                         \
    if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_DEBUG))                        \
      mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n",             \
                     __VA_ARGS__);                                             \
  } while (0)

#define mdbx_verbose(fmt, ...)                                                 \
  do {                                                                         \
    if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_VERBOSE))                      \
      mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n",           \
                     __VA_ARGS__);                                             \
  } while (0)

#define mdbx_notice(fmt, ...)                                                  \
  do {                                                                         \
    if (mdbx_log_enabled(MDBX_LOG_NOTICE))                                     \
      mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n",            \
                     __VA_ARGS__);                                             \
  } while (0)

#define mdbx_warning(fmt, ...)                                                 \
  do {                                                                         \
    if (mdbx_log_enabled(MDBX_LOG_WARN))                                       \
      mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n",              \
                     __VA_ARGS__);                                             \
  } while (0)

#define mdbx_error(fmt, ...)                                                   \
  do {                                                                         \
    if (mdbx_log_enabled(MDBX_LOG_ERROR))                                      \
      mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n",             \
                     __VA_ARGS__);                                             \
  } while (0)
3080 
/* Logs a message at MDBX_LOG_FATAL level unconditionally, i.e. without
 * consulting mdbx_log_enabled(). A "\n" is appended to the format.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the macro acts as a single statement and can be safely used inside
 * an unbraced if/else. */
#define mdbx_fatal(fmt, ...)                                                   \
  do {                                                                         \
    mdbx_debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); \
  } while (0)
3083 
/* Always-on check: calls mdbx_assert_fail() with the given message and the
 * current function/line when expr evaluates to false. */
#define mdbx_ensure_msg(env, expr, msg)                                        \
  do {                                                                         \
    if (unlikely(!(expr)))                                                     \
      mdbx_assert_fail(env, msg, __func__, __LINE__);                          \
  } while (0)

/* Same as mdbx_ensure_msg(), using the stringified expression as message. */
#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr)

/* assert(3) variant in environment context:
 * expr is checked only when mdbx_assert_enabled() is true. */
#define mdbx_assert(env, expr)                                                 \
  do {                                                                         \
    if (mdbx_assert_enabled())                                                 \
      mdbx_ensure(env, expr);                                                  \
  } while (0)

/* assert(3) variant in cursor context */
#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr)

/* assert(3) variant in transaction context */
#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr)

/* Outside of the tools the standard assert() is redirected to mdbx_assert()
 * with a NULL environment. */
#ifndef xMDBX_TOOLS /* Avoid using internal mdbx_assert() */
#undef assert
#define assert(expr) mdbx_assert(NULL, expr)
#endif
3109 
3110 /*----------------------------------------------------------------------------*/
3111 /* Cache coherence and mmap invalidation */
3112 
/* Expands to mdbx_memory_barrier() when MDBX_CPU_WRITEBACK_INCOHERENT is
 * non-zero, and to mdbx_compiler_barrier() otherwise. */
#if MDBX_CPU_WRITEBACK_INCOHERENT
#define mdbx_flush_incoherent_cpu_writeback() mdbx_memory_barrier()
#else
#define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier()
#endif /* MDBX_CPU_WRITEBACK_INCOHERENT */
3118 
/* Synchronizes a memory-mapped range on platforms configured as incoherent.
 *
 * With MDBX_MMAP_INCOHERENT_FILE_WRITE the [addr, addr + nbytes) range is
 * widened to pagesize boundaries and passed to
 * msync(MS_SYNC | MS_INVALIDATE); a failure is only checked by mdbx_assert().
 * With MDBX_MMAP_INCOHERENT_CPU_CACHE the range is passed to cacheflush(),
 * which requires DCACHE to be defined (otherwise compilation fails).
 * When neither option is enabled the function does nothing.
 *
 * NOTE(review): the `-pagesize & ...` masking presumably relies on pagesize
 * being a power of two -- confirm against callers. */
MDBX_MAYBE_UNUSED static __inline void
mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) {
#if MDBX_MMAP_INCOHERENT_FILE_WRITE
  /* round the start down and the end up to a multiple of pagesize */
  char *const begin = (char *)(-pagesize & (intptr_t)addr);
  char *const end =
      (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1));
  int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? errno : 0;
  mdbx_assert(nullptr, err == 0);
  (void)err;
#else
  (void)pagesize;
#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */

#if MDBX_MMAP_INCOHERENT_CPU_CACHE
#ifdef DCACHE
  /* MIPS has cache coherency issues.
   * Note: for any nbytes >= on-chip cache size, the entire cache is flushed. */
  cacheflush(addr, nbytes, DCACHE);
#else
#error "Oops, cacheflush() not available"
#endif /* DCACHE */
#endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */

#if !MDBX_MMAP_INCOHERENT_FILE_WRITE && !MDBX_MMAP_INCOHERENT_CPU_CACHE
  /* nothing to do: silence unused-parameter warnings */
  (void)addr;
  (void)nbytes;
#endif
}
3147 
3148 /*----------------------------------------------------------------------------*/
3149 /* Internal prototypes */
3150 
/* Prototypes of internal routines implemented outside of this section.
 * NOTE(review): judging by the names these deal with dead-reader cleanup and
 * with per-thread reader bookkeeping ("rthc") -- see their definitions for
 * the actual contracts. */
MDBX_INTERNAL_FUNC int mdbx_cleanup_dead_readers(MDBX_env *env, int rlocked,
                                                 int *dead);
MDBX_INTERNAL_FUNC int mdbx_rthc_alloc(mdbx_thread_key_t *key,
                                       MDBX_reader *begin, MDBX_reader *end);
MDBX_INTERNAL_FUNC void mdbx_rthc_remove(const mdbx_thread_key_t key);

MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void);
MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void);
MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr);
3160 
3161 #endif /* !__cplusplus */
3162 
/* True for any result code other than MDBX_RESULT_TRUE and
 * MDBX_RESULT_FALSE. */
#define MDBX_IS_ERROR(rc)                                                      \
  ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE)

/* Internal error codes, not exposed outside libmdbx */
#define MDBX_NO_ROOT (MDBX_LAST_ADDED_ERRCODE + 10)

/* Debugging output value of a cursor DBI: Negative in a sub-cursor. */
#define DDBI(mc)                                                               \
  (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)

/* Key size which fits in a DKBUF (debug key buffer).
 * DKBUF declares a local buffer named _kbuf made of two halves of
 * (DKBUF_MAX * 2 + 1) bytes each: DKEY() dumps into the first half and
 * DVAL() into the second one, both through mdbx_dump_val(). */
#define DKBUF_MAX 511
#define DKBUF char _kbuf[DKBUF_MAX * 4 + 2]
#define DKEY(x) mdbx_dump_val(x, _kbuf, DKBUF_MAX * 2 + 1)
#define DVAL(x) mdbx_dump_val(x, _kbuf + DKBUF_MAX * 2 + 1, DKBUF_MAX * 2 + 1)

/* Debug-only flavors: with MDBX_DEBUG they map onto the macros above,
 * otherwise no buffer is declared and the dumps are replaced by "-". */
#if MDBX_DEBUG
#define DKBUF_DEBUG DKBUF
#define DKEY_DEBUG(x) DKEY(x)
#define DVAL_DEBUG(x) DVAL(x)
#else
#define DKBUF_DEBUG ((void)(0))
#define DKEY_DEBUG(x) ("-")
#define DVAL_DEBUG(x) ("-")
#endif
3188 
/* An invalid page number (all bits of pgno_t set).
 * Mainly used to denote an empty tree. */
#define P_INVALID (~(pgno_t)0)

/* Test if all of the flags f are set in a flag word w. */
#define F_ISSET(w, f) (((w) & (f)) == (f))

/* Round n up to an even number. */
#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */

/* Default size of memory map.
 * This is certainly too small for any actual applications. Apps should
 * always set the size explicitly using mdbx_env_set_geometry(). */
#define DEFAULT_MAPSIZE MEGABYTE

/* Number of slots in the reader table.
 * This value was chosen somewhat arbitrarily. The 61 is a prime number,
 * and such readers plus a couple mutexes fit into single 4KB page.
 * Applications should set the table size using mdbx_env_set_maxreaders(). */
#define DEFAULT_READERS 61

/* Page-type predicates over the mp_flags field of a page.
 * The LEAF2 and OVERFLOW tests are additionally hinted as unlikely(). */

/* Test if a page is a leaf page */
#define IS_LEAF(p) (((p)->mp_flags & P_LEAF) != 0)
/* Test if a page is a LEAF2 page */
#define IS_LEAF2(p) unlikely(((p)->mp_flags & P_LEAF2) != 0)
/* Test if a page is a branch page */
#define IS_BRANCH(p) (((p)->mp_flags & P_BRANCH) != 0)
/* Test if a page is an overflow page */
#define IS_OVERFLOW(p) unlikely(((p)->mp_flags & P_OVERFLOW) != 0)
/* Test if a page is a sub page */
#define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0)

/* Extracts the page-type bits (branch/leaf/leaf2/overflow) of mp_flags. */
#define PAGETYPE(p) ((p)->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW))
3222 
/* Header for a single key/data pair within a page.
 * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2.
 * We guarantee 2-byte alignment for 'MDBX_node's.
 *
 * Leaf node flags describe node contents.  F_BIGDATA says the node's
 * data part is the page number of an overflow page with actual data.
 * F_DUPDATA and F_SUBDATA can be combined giving duplicate data in
 * a sub-database, and named databases (just F_SUBDATA).
 *
 * The order of the fields depends on the byte order of the target:
 * the same members are declared in reversed order for big-endian. */
typedef struct MDBX_node {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  /* the same 32-bit slot, named either as a data size or as a page number */
  union {
    uint32_t mn_dsize;
    uint32_t mn_pgno32;
  };
  uint8_t mn_flags; /* see mdbx_node flags */
  uint8_t mn_extra;
  uint16_t mn_ksize; /* key size */
#else
  uint16_t mn_ksize; /* key size */
  uint8_t mn_extra;
  uint8_t mn_flags; /* see mdbx_node flags */
  union {
    uint32_t mn_pgno32;
    uint32_t mn_dsize;
  };
#endif /* __BYTE_ORDER__ */

  /* mdbx_node Flags */
#define F_BIGDATA 0x01 /* data put on overflow page */
#define F_SUBDATA 0x02 /* data is a sub-database */
#define F_DUPDATA 0x04 /* data has duplicates */

  /* valid flags for mdbx_node_add() */
#define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDBX_RESERVE | MDBX_APPEND)

  /* The flexible array member is declared only for C99 compilers
   * and for MSVC in C mode. */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) ||              \
    (!defined(__cplusplus) && defined(_MSC_VER))
  uint8_t mn_data[] /* key and data are appended here */;
#endif /* C99 */
} MDBX_node;
3263 
/* The set of per-database flags listed as persistent. */
#define DB_PERSISTENT_FLAGS                                                    \
  (MDBX_REVERSEKEY | MDBX_DUPSORT | MDBX_INTEGERKEY | MDBX_DUPFIXED |          \
   MDBX_INTEGERDUP | MDBX_REVERSEDUP)

/* mdbx_dbi_open() flags */
#define DB_USABLE_FLAGS (DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE)

#define DB_VALID 0x8000 /* DB handle is valid, for me_dbflags */
#define DB_INTERNAL_FLAGS DB_VALID

/* Compile-time sanity checks: the internal flag must not collide with the
 * usable ones, and every persistent flag must be usable. */
#if DB_INTERNAL_FLAGS & DB_USABLE_FLAGS
#error "Oops, some flags overlapped or wrong"
#endif
#if DB_PERSISTENT_FLAGS & ~DB_USABLE_FLAGS
#error "Oops, some flags overlapped or wrong"
#endif

/* max number of pages to commit in one writev() call;
 * lowered to IOV_MAX when that limit is defined and smaller */
#define MDBX_COMMIT_PAGES 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */
#undef MDBX_COMMIT_PAGES
#define MDBX_COMMIT_PAGES IOV_MAX
#endif

/*
 *                /
 *                | -1, a < b
 * CMP2INT(a,b) = <  0, a == b
 *                |  1, a > b
 *                \
 *
 * Note: both forms evaluate their arguments more than once.
 */
#ifndef __e2k__
/* LY: fast enough on most systems */
#define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b))
#else
/* LY: more parallelizable on VLIW Elbrus */
#define CMP2INT(a, b) (((a) > (b)) - ((b) > (a)))
#endif

/* Do not spill pages to disk if txn is getting full, may fail instead */
#define MDBX_NOSPILL 0x8000
3305 
3306 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t
pgno_add(pgno_t base,pgno_t augend)3307 pgno_add(pgno_t base, pgno_t augend) {
3308   assert(base <= MAX_PAGENO);
3309   return (augend < MAX_PAGENO - base) ? base + augend : MAX_PAGENO;
3310 }
3311 
3312 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t
pgno_sub(pgno_t base,pgno_t subtrahend)3313 pgno_sub(pgno_t base, pgno_t subtrahend) {
3314   assert(base >= MIN_PAGENO);
3315   return (subtrahend < base - MIN_PAGENO) ? base - subtrahend : MIN_PAGENO;
3316 }
3317 
3318 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool
is_powerof2(size_t x)3319 is_powerof2(size_t x) {
3320   return (x & (x - 1)) == 0;
3321 }
3322 
3323 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t
floor_powerof2(size_t value,size_t granularity)3324 floor_powerof2(size_t value, size_t granularity) {
3325   assert(is_powerof2(granularity));
3326   return value & ~(granularity - 1);
3327 }
3328 
3329 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t
ceil_powerof2(size_t value,size_t granularity)3330 ceil_powerof2(size_t value, size_t granularity) {
3331   return floor_powerof2(value + granularity - 1, granularity);
3332 }
3333 
3334 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned
log2n_powerof2(size_t value)3335 log2n_powerof2(size_t value) {
3336   assert(value > 0 && value < INT32_MAX && is_powerof2(value));
3337   assert((value & -(int32_t)value) == value);
3338 #if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
3339   return __builtin_ctzl(value);
3340 #elif defined(_MSC_VER)
3341   unsigned long index;
3342   _BitScanForward(&index, (unsigned long)value);
3343   return index;
3344 #else
3345   static const uint8_t debruijn_ctz32[32] = {
3346       0,  1,  28, 2,  29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4,  8,
3347       31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6,  11, 5,  10, 9};
3348   return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27];
3349 #endif
3350 }
3351 
/* Only a subset of the mdbx_env flags can be changed
 * at runtime. Changing other flags requires closing the
 * environment and re-opening it with the new flags. */
#define ENV_CHANGEABLE_FLAGS                                                   \
  (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC |             \
   MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE)
/* Flags that stay fixed for the lifetime of an opened environment. */
#define ENV_CHANGELESS_FLAGS                                                   \
  (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \
   MDBX_LIFORECLAIM | MDBX_EXCLUSIVE)
/* Union of the changeable and the changeless environment flags. */
#define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS)
3362 
/* Container for compile-time assertions only; the function has no runtime
 * effect. It is compiled for C, and for C++ only when
 * CONSTEXPR_ENUM_FLAGS_OPERATIONS is non-zero. */
#if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS
MDBX_MAYBE_UNUSED static void static_checks(void) {
  /* MDBX_MAX_DBI together with CORE_DBS must add up to exactly INT16_MAX */
  STATIC_ASSERT_MSG(INT16_MAX - CORE_DBS == MDBX_MAX_DBI,
                    "Oops, MDBX_MAX_DBI or CORE_DBS?");
  /* DB flags and ENV flags may share only MDBX_DB_ACCEDE and MDBX_CREATE */
  STATIC_ASSERT_MSG((unsigned)(MDBX_DB_ACCEDE | MDBX_CREATE) ==
                        ((DB_USABLE_FLAGS | DB_INTERNAL_FLAGS) &
                         (ENV_USABLE_FLAGS | ENV_INTERNAL_FLAGS)),
                    "Oops, some flags overlapped or wrong");
  /* internal ENV flags must not collide with the usable ones */
  STATIC_ASSERT_MSG((ENV_INTERNAL_FLAGS & ENV_USABLE_FLAGS) == 0,
                    "Oops, some flags overlapped or wrong");
}
#endif /* Disabled for MSVC 19.0 (VisualStudio 2015) */
3375 
3376 #ifdef __cplusplus
3377 }
3378 #endif
3379 
/* Wrappers around ASAN_POISON_MEMORY_REGION()/ASAN_UNPOISON_MEMORY_REGION()
 * which emit an mdbx_trace() record with the address, size and source line
 * before invoking the underlying macro.
 * Note: addr and size are evaluated twice (once for the trace record). */
#define MDBX_ASAN_POISON_MEMORY_REGION(addr, size)                             \
  do {                                                                         \
    mdbx_trace("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr),          \
               (size_t)(size), __LINE__);                                      \
    ASAN_POISON_MEMORY_REGION(addr, size);                                     \
  } while (0)

#define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size)                           \
  do {                                                                         \
    mdbx_trace("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr),        \
               (size_t)(size), __LINE__);                                      \
    ASAN_UNPOISON_MEMORY_REGION(addr, size);                                   \
  } while (0)
3393 
/* Pairs a flag bit with its printable name.
 * The field order matters: tables are filled by positional initializers. */
typedef struct flagbit {
  int bit;          /* flag value */
  const char *name; /* printable name of the flag */
} flagbit;
3398 
/* Printable names of the per-database flags.
 * The table is terminated by a {0, nullptr} entry. */
const flagbit dbflags[] = {{MDBX_DUPSORT, "dupsort"},
                           {MDBX_INTEGERKEY, "integerkey"},
                           {MDBX_REVERSEKEY, "reversekey"},
                           {MDBX_DUPFIXED, "dupfixed"},
                           {MDBX_REVERSEDUP, "reversedup"},
                           {MDBX_INTEGERDUP, "integerdup"},
                           {0, nullptr}};
3406 
3407 #if defined(_WIN32) || defined(_WIN64)
3408 /*
3409  * POSIX getopt for Windows
3410  *
3411  * AT&T Public License
3412  *
3413  * Code given out at the 1985 UNIFORUM conference in Dallas.
3414  */
3415 
3416 /*----------------------------------------------------------------------------*/
3417 /* Microsoft compiler generates a lot of warning for self includes... */
3418 
3419 #ifdef _MSC_VER
3420 #pragma warning(push, 1)
3421 #pragma warning(disable : 4548) /* expression before comma has no effect;      \
3422                                    expected expression with side - effect */
3423 #pragma warning(disable : 4530) /* C++ exception handler used, but unwind      \
3424                                  * semantics are not enabled. Specify /EHsc */
3425 #pragma warning(disable : 4577) /* 'noexcept' used with no exception handling  \
3426                                  * mode specified; termination on exception is \
3427                                  * not guaranteed. Specify /EHsc */
3428 #if !defined(_CRT_SECURE_NO_WARNINGS)
3429 #define _CRT_SECURE_NO_WARNINGS
3430 #endif
3431 #endif /* _MSC_VER (warnings) */
3432 
3433 #include <stdio.h>
3434 #include <string.h>
3435 
3436 #ifdef _MSC_VER
3437 #pragma warning(pop)
3438 #endif
3439 /*----------------------------------------------------------------------------*/
3440 
/* Fallback definitions for the getopt() implementation below, used only
 * when the included headers did not provide these names. */
#ifndef NULL
#define NULL 0
#endif

#ifndef EOF
#define EOF (-1)
#endif
3448 
int optind = 1; /* index of the next argv[] element to be examined */
int optopt;     /* option character examined by the latest getopt() call */
char *optarg;   /* argument of the latest option, NULL for a plain flag */

/* Minimal getopt(3) replacement for Windows.
 *
 * Scans argv[] for single-character options described by opts, where
 * a character followed by ':' takes an argument (either glued to the option
 * or given as the next argv[] element).
 *
 * Returns the option character, '?' for an unknown option or a missing
 * argument (a diagnostic is printed to stderr), or EOF when the option list
 * is exhausted: at the end of argv[], at the first non-option word, at a lone
 * "-", or right after a "--" word (which is skipped).
 * The scanning position inside a word with clustered options is kept in
 * a function-local static variable, so the function is not reentrant. */
int getopt(int argc, char *const argv[], const char *opts) {
  static int pos = 1; /* offset of the next option char within argv[optind] */

  if (pos == 1) {
    /* about to start a new word: check whether it holds options at all */
    if (optind >= argc)
      return EOF;
    const char *const word = argv[optind];
    if (word[0] != '-' || word[1] == '\0')
      return EOF;
    if (strcmp(word, "--") == 0) {
      optind += 1;
      return EOF;
    }
  }

  const int ch = argv[optind][pos];
  optopt = ch;

  /* ':' is the argument marker within opts and never a valid option */
  const char *const spec = (ch == ':') ? NULL : strchr(opts, ch);
  if (spec == NULL) {
    fprintf(stderr, "%s: %s -- %c\n", argv[0], "illegal option", ch);
    pos += 1;
    if (argv[optind][pos] == '\0') {
      optind += 1;
      pos = 1;
    }
    return '?';
  }

  if (spec[1] != ':') {
    /* plain flag: step to the next char of the cluster or to the next word */
    pos += 1;
    if (argv[optind][pos] == '\0') {
      pos = 1;
      optind += 1;
    }
    optarg = NULL;
    return ch;
  }

  if (argv[optind][pos + 1] != '\0') {
    /* the argument is glued to the option: -ovalue */
    optarg = &argv[optind][pos + 1];
    optind += 1;
  } else {
    /* the argument is expected in the next word: -o value */
    optind += 1;
    if (optind >= argc) {
      fprintf(stderr, "%s: %s -- %c\n", argv[0], "option requires an argument",
              ch);
      pos = 1;
      return '?';
    }
    optarg = argv[optind];
    optind += 1;
  }
  pos = 1;
  return ch;
}
3495 
3496 static volatile BOOL user_break;
ConsoleBreakHandlerRoutine(DWORD dwCtrlType)3497 static BOOL WINAPI ConsoleBreakHandlerRoutine(DWORD dwCtrlType) {
3498   (void)dwCtrlType;
3499   user_break = 1;
3500   return true;
3501 }
3502 
GetMilliseconds(void)3503 static uint64_t GetMilliseconds(void) {
3504   LARGE_INTEGER Counter, Frequency;
3505   return (QueryPerformanceFrequency(&Frequency) &&
3506           QueryPerformanceCounter(&Counter))
3507              ? Counter.QuadPart * 1000ul / Frequency.QuadPart
3508              : 0;
3509 }
3510 
3511 #else /* WINDOWS */
3512 
/* Set to non-zero once an interrupting signal has been received;
 * polled by check_user_break(). */
static volatile sig_atomic_t user_break;

/* Signal handler: only records the break request in user_break,
 * the signal number itself is ignored. */
static void signal_handler(int sig) {
  user_break = 1;
  (void)sig;
}
3518 
3519 #endif /* !WINDOWS */
3520 
/* Exit codes of the tool, laid out as consecutive values starting from
 * EXIT_FAILURE.
 * NOTE(review): the meanings are inferred from the names (minor/major check
 * failure, libmdbx failure, system failure, user interruption) -- confirm
 * against the places where they are returned. */
#define EXIT_INTERRUPTED (EXIT_FAILURE + 4)
#define EXIT_FAILURE_SYS (EXIT_FAILURE + 3)
#define EXIT_FAILURE_MDBX (EXIT_FAILURE + 2)
#define EXIT_FAILURE_CHECK_MAJOR (EXIT_FAILURE + 1)
#define EXIT_FAILURE_CHECK_MINOR EXIT_FAILURE
3526 
/* Per-database statistics gathered by the page visitor (pgvisitor). */
typedef struct {
  /* Database name; for user databases it is a heap copy made by
   * pagemap_lookup_dbi() and released by pagemap_cleanup(). */
  const char *name;
  /* Page counters, updated by pgvisitor() according to the page type. */
  struct {
    uint64_t branch, large_count, large_volume, leaf;
    uint64_t subleaf_dupsort, leaf_dupfixed, subleaf_dupfixed;
    uint64_t total, empty, other;
  } pages;
  uint64_t payload_bytes;
  uint64_t lost_bytes;
} walk_dbi_t;
3537 
/* State of the page-tree traversal. */
struct {
  /* Indexed by page number: 0 while the page has not been seen, otherwise
   * the index of the owning walk.dbi[] entry plus one (set by pgvisitor). */
  short *pagemap;
  uint64_t total_payload_bytes;
  uint64_t pgcount; /* pages reported to pgvisitor() so far */
  walk_dbi_t
      dbi[MDBX_MAX_DBI + CORE_DBS + /* account pseudo-entry for meta */ 1];
} walk;

/* Shortcuts for the fixed entries of walk.dbi[]; the user databases
 * occupy the slots after the meta pseudo-entry. */
#define dbi_free walk.dbi[FREE_DBI]
#define dbi_main walk.dbi[MAIN_DBI]
#define dbi_meta walk.dbi[CORE_DBS]
3549 
/* Global state of the tool. */
int envflags = MDBX_RDONLY | MDBX_EXCLUSIVE; /* default environment flags */
MDBX_env *env;
MDBX_txn *txn;
MDBX_envinfo envinfo;
size_t userdb_count, skipped_subdb;
/* alloc_pages is the upper bound pgvisitor() checks page numbers against */
uint64_t total_unused_bytes, reclaimable_pages, gc_pages, alloc_pages,
    unused_pages, backed_pages;
unsigned verbose; /* verbosity level; higher values print more details */
/* quiet suppresses the output of print(), va_log() and problem_add() */
bool ignore_wrong_order, quiet, dont_traversal;
const char *only_subdb; /* when set, limits verbose page output to this DB */
int stuck_meta = -1;

/* Node of the list which groups reported problems by caption. */
struct problem {
  struct problem *pr_next; /* next node, or nullptr at the end of the list */
  size_t count;            /* how many times the problem has been reported */
  const char *caption;     /* message pointer; compared by identity */
};

/* Head of the current problem list, see problems_push()/problems_pop(). */
struct problem *problems_list;
/* total_problems is incremented by problem_add() and by va_log() for the
 * levels up to MDBX_LOG_ERROR; the other two are per-tree counters. */
unsigned total_problems, data_tree_problems, gc_tree_problems;
3570 
print(const char * msg,...)3571 static void MDBX_PRINTF_ARGS(1, 2) print(const char *msg, ...) {
3572   if (!quiet) {
3573     va_list args;
3574 
3575     fflush(stderr);
3576     va_start(args, msg);
3577     vfprintf(stdout, msg, args);
3578     va_end(args);
3579   }
3580 }
3581 
/* Core of the tool's logging.
 *
 * Messages of level MDBX_LOG_ERROR and below go to stderr and are counted
 * in total_problems; the others go to stdout. A message is printed only
 * when not in quiet mode and when its level does not exceed verbose + 1.
 * The output is prefixed according to the level and is always terminated
 * by a newline. A MDBX_LOG_FATAL message terminates the process with
 * EXIT_FAILURE_MDBX.
 *
 * level - message level, used as an index into the prefix table;
 * msg   - printf-style format string;
 * args  - arguments for the format string. */
static void va_log(MDBX_log_level_t level, const char *msg, va_list args) {
  static const char *const prefixes[] = {
      "!!!fatal: ",       " ! " /* error */,      " ~ " /* warning */,
      "   " /* notice */, "   // " /* verbose */, "   //// " /* debug */,
      "   ////// " /* trace */
  };

  FILE *out = stdout;
  if (level <= MDBX_LOG_ERROR) {
    total_problems++;
    out = stderr;
  }

  if (!quiet && verbose + 1 >= (unsigned)level) {
    fflush(nullptr);
    fputs(prefixes[level], out);
    vfprintf(out, msg, args);
    /* Terminate the line unless the message already did so; the length is
     * checked first to avoid reading before the start of an empty string. */
    const size_t len = strlen(msg);
    if (len == 0 || msg[len - 1] != '\n')
      fputc('\n', out);
    fflush(nullptr);
  }

  if (level == MDBX_LOG_FATAL) {
    exit(EXIT_FAILURE_MDBX);
    abort();
  }
}
3609 
error(const char * msg,...)3610 static void MDBX_PRINTF_ARGS(1, 2) error(const char *msg, ...) {
3611   va_list args;
3612   va_start(args, msg);
3613   va_log(MDBX_LOG_ERROR, msg, args);
3614   va_end(args);
3615 }
3616 
/* Logging callback: forwards messages below MDBX_LOG_EXTRA level to
 * va_log() and drops the rest. The function name and the line number
 * are not used. */
static void logger(MDBX_log_level_t level, const char *function, int line,
                   const char *msg, va_list args) {
  (void)function;
  (void)line;
  if (level >= MDBX_LOG_EXTRA)
    return;
  va_log(level, msg, args);
}
3624 
check_user_break(void)3625 static int check_user_break(void) {
3626   switch (user_break) {
3627   case 0:
3628     return MDBX_SUCCESS;
3629   case 1:
3630     print(" - interrupted by signal\n");
3631     fflush(nullptr);
3632     user_break = 2;
3633   }
3634   return MDBX_EINTR;
3635 }
3636 
pagemap_cleanup(void)3637 static void pagemap_cleanup(void) {
3638   for (size_t i = CORE_DBS + /* account pseudo-entry for meta */ 1;
3639        i < ARRAY_LENGTH(walk.dbi); ++i) {
3640     if (walk.dbi[i].name) {
3641       mdbx_free((void *)walk.dbi[i].name);
3642       walk.dbi[i].name = nullptr;
3643     }
3644   }
3645 
3646   mdbx_free(walk.pagemap);
3647   walk.pagemap = nullptr;
3648 }
3649 
pagemap_lookup_dbi(const char * dbi_name,bool silent)3650 static walk_dbi_t *pagemap_lookup_dbi(const char *dbi_name, bool silent) {
3651   static walk_dbi_t *last;
3652 
3653   if (dbi_name == MDBX_PGWALK_MAIN)
3654     return &dbi_main;
3655   if (dbi_name == MDBX_PGWALK_GC)
3656     return &dbi_free;
3657   if (dbi_name == MDBX_PGWALK_META)
3658     return &dbi_meta;
3659 
3660   if (last && strcmp(last->name, dbi_name) == 0)
3661     return last;
3662 
3663   walk_dbi_t *dbi = walk.dbi + CORE_DBS + /* account pseudo-entry for meta */ 1;
3664   for (; dbi < ARRAY_END(walk.dbi) && dbi->name; ++dbi) {
3665     if (strcmp(dbi->name, dbi_name) == 0)
3666       return last = dbi;
3667   }
3668 
3669   if (verbose > 0 && !silent) {
3670     print(" - found '%s' area\n", dbi_name);
3671     fflush(nullptr);
3672   }
3673 
3674   if (dbi == ARRAY_END(walk.dbi))
3675     return nullptr;
3676 
3677   dbi->name = mdbx_strdup(dbi_name);
3678   return last = dbi;
3679 }
3680 
3681 static void MDBX_PRINTF_ARGS(4, 5)
problem_add(const char * object,uint64_t entry_number,const char * msg,const char * extra,...)3682     problem_add(const char *object, uint64_t entry_number, const char *msg,
3683                 const char *extra, ...) {
3684   total_problems++;
3685 
3686   if (!quiet) {
3687     int need_fflush = 0;
3688     struct problem *p;
3689 
3690     for (p = problems_list; p; p = p->pr_next)
3691       if (p->caption == msg)
3692         break;
3693 
3694     if (!p) {
3695       p = mdbx_calloc(1, sizeof(*p));
3696       p->caption = msg;
3697       p->pr_next = problems_list;
3698       problems_list = p;
3699       need_fflush = 1;
3700     }
3701 
3702     p->count++;
3703     if (verbose > 1) {
3704       print("     %s #%" PRIu64 ": %s", object, entry_number, msg);
3705       if (extra) {
3706         va_list args;
3707         printf(" (");
3708         va_start(args, extra);
3709         vfprintf(stdout, extra, args);
3710         va_end(args);
3711         printf(")");
3712       }
3713       printf("\n");
3714       if (need_fflush)
3715         fflush(nullptr);
3716     }
3717   }
3718 }
3719 
problems_push(void)3720 static struct problem *problems_push(void) {
3721   struct problem *p = problems_list;
3722   problems_list = nullptr;
3723   return p;
3724 }
3725 
problems_pop(struct problem * list)3726 static size_t problems_pop(struct problem *list) {
3727   size_t count = 0;
3728 
3729   if (problems_list) {
3730     int i;
3731 
3732     print(" - problems: ");
3733     for (i = 0; problems_list; ++i) {
3734       struct problem *p = problems_list->pr_next;
3735       count += problems_list->count;
3736       print("%s%s (%" PRIuPTR ")", i ? ", " : "", problems_list->caption,
3737             problems_list->count);
3738       mdbx_free(problems_list);
3739       problems_list = p;
3740     }
3741     print("\n");
3742     fflush(nullptr);
3743   }
3744 
3745   problems_list = list;
3746   return count;
3747 }
3748 
pgvisitor(const uint64_t pgno,const unsigned pgnumber,void * const ctx,const int deep,const char * const dbi_name_or_tag,const size_t page_size,const MDBX_page_type_t pagetype,const MDBX_error_t err,const size_t nentries,const size_t payload_bytes,const size_t header_bytes,const size_t unused_bytes)3749 static int pgvisitor(const uint64_t pgno, const unsigned pgnumber,
3750                      void *const ctx, const int deep,
3751                      const char *const dbi_name_or_tag, const size_t page_size,
3752                      const MDBX_page_type_t pagetype, const MDBX_error_t err,
3753                      const size_t nentries, const size_t payload_bytes,
3754                      const size_t header_bytes, const size_t unused_bytes) {
3755   (void)ctx;
3756   const bool is_gc_tree = dbi_name_or_tag == MDBX_PGWALK_GC;
3757   if (deep > 42) {
3758     problem_add("deep", deep, "too large", nullptr);
3759     data_tree_problems += !is_gc_tree;
3760     gc_tree_problems += is_gc_tree;
3761     return MDBX_CORRUPTED /* avoid infinite loop/recursion */;
3762   }
3763 
3764   walk_dbi_t *dbi = pagemap_lookup_dbi(dbi_name_or_tag, false);
3765   if (!dbi) {
3766     data_tree_problems += !is_gc_tree;
3767     gc_tree_problems += is_gc_tree;
3768     return MDBX_ENOMEM;
3769   }
3770 
3771   const size_t page_bytes = payload_bytes + header_bytes + unused_bytes;
3772   walk.pgcount += pgnumber;
3773 
3774   const char *pagetype_caption;
3775   bool branch = false;
3776   switch (pagetype) {
3777   default:
3778     problem_add("page", pgno, "unknown page-type", "type %u, deep %i",
3779                 (unsigned)pagetype, deep);
3780     pagetype_caption = "unknown";
3781     dbi->pages.other += pgnumber;
3782     data_tree_problems += !is_gc_tree;
3783     gc_tree_problems += is_gc_tree;
3784     break;
3785   case MDBX_page_broken:
3786     pagetype_caption = "broken";
3787     dbi->pages.other += pgnumber;
3788     data_tree_problems += !is_gc_tree;
3789     gc_tree_problems += is_gc_tree;
3790     break;
3791   case MDBX_subpage_broken:
3792     pagetype_caption = "broken-subpage";
3793     data_tree_problems += !is_gc_tree;
3794     gc_tree_problems += is_gc_tree;
3795     break;
3796   case MDBX_page_meta:
3797     pagetype_caption = "meta";
3798     dbi->pages.other += pgnumber;
3799     break;
3800   case MDBX_page_large:
3801     pagetype_caption = "large";
3802     dbi->pages.large_volume += pgnumber;
3803     dbi->pages.large_count += 1;
3804     break;
3805   case MDBX_page_branch:
3806     pagetype_caption = "branch";
3807     dbi->pages.branch += pgnumber;
3808     branch = true;
3809     break;
3810   case MDBX_page_leaf:
3811     pagetype_caption = "leaf";
3812     dbi->pages.leaf += pgnumber;
3813     break;
3814   case MDBX_page_dupfixed_leaf:
3815     pagetype_caption = "leaf-dupfixed";
3816     dbi->pages.leaf_dupfixed += pgnumber;
3817     break;
3818   case MDBX_subpage_leaf:
3819     pagetype_caption = "subleaf-dupsort";
3820     dbi->pages.subleaf_dupsort += 1;
3821     break;
3822   case MDBX_subpage_dupfixed_leaf:
3823     pagetype_caption = "subleaf-dupfixed";
3824     dbi->pages.subleaf_dupfixed += 1;
3825     break;
3826   }
3827 
3828   if (pgnumber) {
3829     if (verbose > 3 && (!only_subdb || strcmp(only_subdb, dbi->name) == 0)) {
3830       if (pgnumber == 1)
3831         print("     %s-page %" PRIu64, pagetype_caption, pgno);
3832       else
3833         print("     %s-span %" PRIu64 "[%u]", pagetype_caption, pgno, pgnumber);
3834       print(" of %s: header %" PRIiPTR ", %s %" PRIiPTR ", payload %" PRIiPTR
3835             ", unused %" PRIiPTR ", deep %i\n",
3836             dbi->name, header_bytes,
3837             (pagetype == MDBX_page_branch) ? "keys" : "entries", nentries,
3838             payload_bytes, unused_bytes, deep);
3839     }
3840 
3841     bool already_used = false;
3842     for (unsigned n = 0; n < pgnumber; ++n) {
3843       uint64_t spanpgno = pgno + n;
3844       if (spanpgno >= alloc_pages) {
3845         problem_add("page", spanpgno, "wrong page-no",
3846                     "%s-page: %" PRIu64 " > %" PRIu64 ", deep %i",
3847                     pagetype_caption, spanpgno, alloc_pages, deep);
3848         data_tree_problems += !is_gc_tree;
3849         gc_tree_problems += is_gc_tree;
3850       } else if (walk.pagemap[spanpgno]) {
3851         walk_dbi_t *coll_dbi = &walk.dbi[walk.pagemap[spanpgno] - 1];
3852         problem_add("page", spanpgno,
3853                     (branch && coll_dbi == dbi) ? "loop" : "already used",
3854                     "%s-page: by %s, deep %i", pagetype_caption, coll_dbi->name,
3855                     deep);
3856         already_used = true;
3857         data_tree_problems += !is_gc_tree;
3858         gc_tree_problems += is_gc_tree;
3859       } else {
3860         walk.pagemap[spanpgno] = (short)(dbi - walk.dbi + 1);
3861         dbi->pages.total += 1;
3862       }
3863     }
3864 
3865     if (already_used)
3866       return branch ? MDBX_RESULT_TRUE /* avoid infinite loop/recursion */
3867                     : MDBX_SUCCESS;
3868   }
3869 
3870   if (MDBX_IS_ERROR(err)) {
3871     problem_add("page", pgno, "invalid/corrupted", "%s-page", pagetype_caption);
3872     data_tree_problems += !is_gc_tree;
3873     gc_tree_problems += is_gc_tree;
3874   } else {
3875     if (unused_bytes > page_size) {
3876       problem_add("page", pgno, "illegal unused-bytes",
3877                   "%s-page: %u < %" PRIuPTR " < %u", pagetype_caption, 0,
3878                   unused_bytes, envinfo.mi_dxb_pagesize);
3879       data_tree_problems += !is_gc_tree;
3880       gc_tree_problems += is_gc_tree;
3881     }
3882 
3883     if (header_bytes < (int)sizeof(long) ||
3884         (size_t)header_bytes >= envinfo.mi_dxb_pagesize - sizeof(long)) {
3885       problem_add("page", pgno, "illegal header-length",
3886                   "%s-page: %" PRIuPTR " < %" PRIuPTR " < %" PRIuPTR,
3887                   pagetype_caption, sizeof(long), header_bytes,
3888                   envinfo.mi_dxb_pagesize - sizeof(long));
3889       data_tree_problems += !is_gc_tree;
3890       gc_tree_problems += is_gc_tree;
3891     }
3892     if (payload_bytes < 1) {
3893       if (nentries > 1) {
3894         problem_add("page", pgno, "zero size-of-entry",
3895                     "%s-page: payload %" PRIuPTR " bytes, %" PRIuPTR " entries",
3896                     pagetype_caption, payload_bytes, nentries);
3897         /* if ((size_t)header_bytes + unused_bytes < page_size) {
3898           // LY: hush a misuse error
3899           page_bytes = page_size;
3900         } */
3901         data_tree_problems += !is_gc_tree;
3902         gc_tree_problems += is_gc_tree;
3903       } else {
3904         problem_add("page", pgno, "empty",
3905                     "%s-page: payload %" PRIuPTR " bytes, %" PRIuPTR
3906                     " entries, deep %i",
3907                     pagetype_caption, payload_bytes, nentries, deep);
3908         dbi->pages.empty += 1;
3909         data_tree_problems += !is_gc_tree;
3910         gc_tree_problems += is_gc_tree;
3911       }
3912     }
3913 
3914     if (pgnumber) {
3915       if (page_bytes != page_size) {
3916         problem_add("page", pgno, "misused",
3917                     "%s-page: %" PRIuPTR " != %" PRIuPTR " (%" PRIuPTR
3918                     "h + %" PRIuPTR "p + %" PRIuPTR "u), deep %i",
3919                     pagetype_caption, page_size, page_bytes, header_bytes,
3920                     payload_bytes, unused_bytes, deep);
3921         if (page_size > page_bytes)
3922           dbi->lost_bytes += page_size - page_bytes;
3923         data_tree_problems += !is_gc_tree;
3924         gc_tree_problems += is_gc_tree;
3925       } else {
3926         dbi->payload_bytes += payload_bytes + header_bytes;
3927         walk.total_payload_bytes += payload_bytes + header_bytes;
3928       }
3929     }
3930   }
3931 
3932   return check_user_break();
3933 }
3934 
/* Per-record callback invoked by process_db() for every key/data pair it
 * iterates. `record_number` is the zero-based ordinal of the record within
 * that traversal. A result for which MDBX_IS_ERROR() holds aborts the
 * traversal. */
typedef int(visitor)(const uint64_t record_number, const MDBX_val *key,
                     const MDBX_val *data);
/* Forward declaration: handle_maindb() calls process_db() ahead of its
 * definition. */
static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler,
                      bool silent);
3939 
/* Visitor for records of ordinary (user) databases: the record itself is not
 * inspected, the function only polls for a pending user break and returns
 * whatever check_user_break() reports. */
static int handle_userdb(const uint64_t record_number, const MDBX_val *key,
                         const MDBX_val *data) {
  /* The arguments exist only to satisfy the `visitor` signature. */
  (void)record_number, (void)key, (void)data;
  const int break_status = check_user_break();
  return break_status;
}
3947 
handle_freedb(const uint64_t record_number,const MDBX_val * key,const MDBX_val * data)3948 static int handle_freedb(const uint64_t record_number, const MDBX_val *key,
3949                          const MDBX_val *data) {
3950   char *bad = "";
3951   pgno_t *iptr = data->iov_base;
3952 
3953   if (key->iov_len != sizeof(txnid_t))
3954     problem_add("entry", record_number, "wrong txn-id size",
3955                 "key-size %" PRIiPTR, key->iov_len);
3956   else {
3957     txnid_t txnid;
3958     memcpy(&txnid, key->iov_base, sizeof(txnid));
3959     if (txnid < 1 || txnid > envinfo.mi_recent_txnid)
3960       problem_add("entry", record_number, "wrong txn-id", "%" PRIaTXN, txnid);
3961     else {
3962       if (data->iov_len < sizeof(pgno_t) || data->iov_len % sizeof(pgno_t))
3963         problem_add("entry", txnid, "wrong idl size", "%" PRIuPTR,
3964                     data->iov_len);
3965       size_t number = (data->iov_len >= sizeof(pgno_t)) ? *iptr++ : 0;
3966       if (number < 1 || number > MDBX_PGL_LIMIT)
3967         problem_add("entry", txnid, "wrong idl length", "%" PRIuPTR, number);
3968       else if ((number + 1) * sizeof(pgno_t) > data->iov_len) {
3969         problem_add("entry", txnid, "trimmed idl",
3970                     "%" PRIuSIZE " > %" PRIuSIZE " (corruption)",
3971                     (number + 1) * sizeof(pgno_t), data->iov_len);
3972         number = data->iov_len / sizeof(pgno_t) - 1;
3973       } else if (data->iov_len - (number + 1) * sizeof(pgno_t) >=
3974                  /* LY: allow gap up to one page. it is ok
3975                   * and better than shink-and-retry inside mdbx_update_gc() */
3976                  envinfo.mi_dxb_pagesize)
3977         problem_add("entry", txnid, "extra idl space",
3978                     "%" PRIuSIZE " < %" PRIuSIZE " (minor, not a trouble)",
3979                     (number + 1) * sizeof(pgno_t), data->iov_len);
3980 
3981       gc_pages += number;
3982       if (envinfo.mi_latter_reader_txnid > txnid)
3983         reclaimable_pages += number;
3984 
3985       pgno_t prev = MDBX_PNL_ASCENDING ? NUM_METAS - 1 : txn->mt_next_pgno;
3986       pgno_t span = 1;
3987       for (unsigned i = 0; i < number; ++i) {
3988         if (check_user_break())
3989           return MDBX_EINTR;
3990         const pgno_t pgno = iptr[i];
3991         if (pgno < NUM_METAS)
3992           problem_add("entry", txnid, "wrong idl entry",
3993                       "pgno %" PRIaPGNO " < meta-pages %u", pgno, NUM_METAS);
3994         else if (pgno >= backed_pages)
3995           problem_add("entry", txnid, "wrong idl entry",
3996                       "pgno %" PRIaPGNO " > backed-pages %" PRIu64, pgno,
3997                       backed_pages);
3998         else if (pgno >= alloc_pages)
3999           problem_add("entry", txnid, "wrong idl entry",
4000                       "pgno %" PRIaPGNO " > alloc-pages %" PRIu64, pgno,
4001                       alloc_pages - 1);
4002         else {
4003           if (MDBX_PNL_DISORDERED(prev, pgno)) {
4004             bad = " [bad sequence]";
4005             problem_add("entry", txnid, "bad sequence",
4006                         "%" PRIaPGNO " %c [%u].%" PRIaPGNO, prev,
4007                         (prev == pgno) ? '=' : (MDBX_PNL_ASCENDING ? '>' : '<'),
4008                         i, pgno);
4009           }
4010           if (walk.pagemap) {
4011             int idx = walk.pagemap[pgno];
4012             if (idx == 0)
4013               walk.pagemap[pgno] = -1;
4014             else if (idx > 0)
4015               problem_add("page", pgno, "already used", "by %s",
4016                           walk.dbi[idx - 1].name);
4017             else
4018               problem_add("page", pgno, "already listed in GC", nullptr);
4019           }
4020         }
4021         prev = pgno;
4022         while (i + span < number &&
4023                iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pgno, span)
4024                                                      : pgno_sub(pgno, span)))
4025           ++span;
4026       }
4027       if (verbose > 3 && !only_subdb) {
4028         print("     transaction %" PRIaTXN ", %" PRIuPTR
4029               " pages, maxspan %" PRIaPGNO "%s\n",
4030               txnid, number, span, bad);
4031         if (verbose > 4) {
4032           for (unsigned i = 0; i < number; i += span) {
4033             const pgno_t pgno = iptr[i];
4034             for (span = 1;
4035                  i + span < number &&
4036                  iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pgno, span)
4037                                                        : pgno_sub(pgno, span));
4038                  ++span)
4039               ;
4040             if (span > 1) {
4041               print("    %9" PRIaPGNO "[%" PRIaPGNO "]\n", pgno, span);
4042             } else
4043               print("    %9" PRIaPGNO "\n", pgno);
4044           }
4045         }
4046       }
4047     }
4048   }
4049 
4050   return check_user_break();
4051 }
4052 
equal_or_greater(const MDBX_val * a,const MDBX_val * b)4053 static int equal_or_greater(const MDBX_val *a, const MDBX_val *b) {
4054   return (a->iov_len == b->iov_len &&
4055           memcmp(a->iov_base, b->iov_base, a->iov_len) == 0)
4056              ? 0
4057              : 1;
4058 }
4059 
handle_maindb(const uint64_t record_number,const MDBX_val * key,const MDBX_val * data)4060 static int handle_maindb(const uint64_t record_number, const MDBX_val *key,
4061                          const MDBX_val *data) {
4062   char *name;
4063   int rc;
4064   size_t i;
4065 
4066   name = key->iov_base;
4067   for (i = 0; i < key->iov_len; ++i) {
4068     if (name[i] < ' ')
4069       return handle_userdb(record_number, key, data);
4070   }
4071 
4072   name = mdbx_malloc(key->iov_len + 1);
4073   memcpy(name, key->iov_base, key->iov_len);
4074   name[key->iov_len] = '\0';
4075   userdb_count++;
4076 
4077   rc = process_db(~0u, name, handle_userdb, false);
4078   mdbx_free(name);
4079   if (rc != MDBX_INCOMPATIBLE)
4080     return rc;
4081 
4082   return handle_userdb(record_number, key, data);
4083 }
4084 
db_flags2keymode(unsigned flags)4085 static const char *db_flags2keymode(unsigned flags) {
4086   flags &= (MDBX_REVERSEKEY | MDBX_INTEGERKEY);
4087   switch (flags) {
4088   case 0:
4089     return "usual";
4090   case MDBX_REVERSEKEY:
4091     return "reserve";
4092   case MDBX_INTEGERKEY:
4093     return "ordinal";
4094   case MDBX_REVERSEKEY | MDBX_INTEGERKEY:
4095     return "msgpack";
4096   default:
4097     assert(false);
4098     __unreachable();
4099   }
4100 }
4101 
db_flags2valuemode(unsigned flags)4102 static const char *db_flags2valuemode(unsigned flags) {
4103   flags &= (MDBX_DUPSORT | MDBX_REVERSEDUP | MDBX_DUPFIXED | MDBX_INTEGERDUP);
4104   switch (flags) {
4105   case 0:
4106     return "single";
4107   case MDBX_DUPSORT:
4108     return "multi";
4109   case MDBX_REVERSEDUP:
4110   case MDBX_DUPSORT | MDBX_REVERSEDUP:
4111     return "multi-reverse";
4112   case MDBX_DUPFIXED:
4113   case MDBX_DUPSORT | MDBX_DUPFIXED:
4114     return "multi-samelength";
4115   case MDBX_DUPFIXED | MDBX_REVERSEDUP:
4116   case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP:
4117     return "multi-reverse-samelength";
4118   case MDBX_INTEGERDUP:
4119   case MDBX_DUPSORT | MDBX_INTEGERDUP:
4120   case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP:
4121   case MDBX_DUPFIXED | MDBX_INTEGERDUP:
4122     return "multi-ordinal";
4123   case MDBX_INTEGERDUP | MDBX_REVERSEDUP:
4124   case MDBX_DUPSORT | MDBX_INTEGERDUP | MDBX_REVERSEDUP:
4125     return "multi-msgpack";
4126   case MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP:
4127   case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP:
4128     return "reserved";
4129   default:
4130     assert(false);
4131     __unreachable();
4132   }
4133 }
4134 
/* Checks one (sub)database: prints its properties, cross-checks the page
 * statistics, then walks all records with a cursor validating key/data
 * lengths, ordering and duplicates, and calls `handler` for each record.
 *
 * dbi_handle  an already known DBI handle, or ~0u to open the database by
 *             name via mdbx_dbi_open_ex().
 * dbi_name    name of the sub-database; nullptr is shown as "@MAIN".
 * handler     optional per-record visitor (may be nullptr).
 * silent      suppresses the informational output that is otherwise printed
 *             when `verbose` is non-zero.
 *
 * Returns the raw error code for failures detected before the record
 * traversal starts (MDBX_BAD_TXN, MDBX_ENOMEM or the failing call's result),
 * MDBX_SUCCESS when the database is skipped or no problem was found, and
 * MDBX_RESULT_TRUE when the traversal ended with an error code or with at
 * least one recorded problem. */
static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler,
                      bool silent) {
  MDBX_cursor *mc;
  MDBX_stat ms;
  MDBX_val key, data;
  MDBX_val prev_key, prev_data;
  unsigned flags;
  int rc, i;
  struct problem *saved_list;
  uint64_t problems_count;

  uint64_t record_count = 0, dups = 0;
  uint64_t key_bytes = 0, data_bytes = 0;

  /* Refuse to continue within a finished or failed transaction. */
  if ((MDBX_TXN_FINISHED | MDBX_TXN_ERROR) & mdbx_txn_flags(txn)) {
    print(" ! abort processing '%s' due to a previous error\n",
          dbi_name ? dbi_name : "@MAIN");
    return MDBX_BAD_TXN;
  }

  if (dbi_handle == ~0u) {
    /* No handle given: open by name. With ignore_wrong_order a named
     * database gets equal_or_greater() as both key and data comparator. */
    rc = mdbx_dbi_open_ex(
        txn, dbi_name, MDBX_DB_ACCEDE, &dbi_handle,
        (dbi_name && ignore_wrong_order) ? equal_or_greater : nullptr,
        (dbi_name && ignore_wrong_order) ? equal_or_greater : nullptr);
    if (rc) {
      if (!dbi_name ||
          rc !=
              MDBX_INCOMPATIBLE) /* LY: mainDB's record is not a user's DB. */ {
        error("mdbx_dbi_open('%s') failed, error %d %s\n",
              dbi_name ? dbi_name : "main", rc, mdbx_strerror(rc));
      }
      return rc;
    }
  }

  /* Honor the "only this sub-database" filter for non-core databases. */
  if (dbi_handle >= CORE_DBS && dbi_name && only_subdb &&
      strcmp(only_subdb, dbi_name) != 0) {
    if (verbose) {
      print("Skip processing '%s'...\n", dbi_name);
      fflush(nullptr);
    }
    skipped_subdb++;
    return MDBX_SUCCESS;
  }

  if (!silent && verbose) {
    print("Processing '%s'...\n", dbi_name ? dbi_name : "@MAIN");
    fflush(nullptr);
  }

  rc = mdbx_dbi_flags(txn, dbi_handle, &flags);
  if (rc) {
    error("mdbx_dbi_flags() failed, error %d %s\n", rc, mdbx_strerror(rc));
    return rc;
  }

  rc = mdbx_dbi_stat(txn, dbi_handle, &ms, sizeof(ms));
  if (rc) {
    error("mdbx_dbi_stat() failed, error %d %s\n", rc, mdbx_strerror(rc));
    return rc;
  }

  /* Informational dump of the database kind, flags and statistics; the
   * amount of detail grows with the verbosity level. */
  if (!silent && verbose) {
    print(" - key-value kind: %s-key => %s-value", db_flags2keymode(flags),
          db_flags2valuemode(flags));
    if (verbose > 1) {
      print(", flags:");
      if (!flags)
        print(" none");
      else {
        for (i = 0; dbflags[i].bit; i++)
          if (flags & dbflags[i].bit)
            print(" %s", dbflags[i].name);
      }
      if (verbose > 2)
        print(" (0x%02X), dbi-id %d", flags, dbi_handle);
    }
    print("\n");
    if (ms.ms_mod_txnid)
      print(" - last modification txn#%" PRIu64 "\n", ms.ms_mod_txnid);
    if (verbose > 1) {
      print(" - page size %u, entries %" PRIu64 "\n", ms.ms_psize,
            ms.ms_entries);
      print(" - b-tree depth %u, pages: branch %" PRIu64 ", leaf %" PRIu64
            ", overflow %" PRIu64 "\n",
            ms.ms_depth, ms.ms_branch_pages, ms.ms_leaf_pages,
            ms.ms_overflow_pages);
    }
  }

  /* Core databases have fixed slots in walk.dbi[]; any other database is
   * looked up (or registered) by name. */
  walk_dbi_t *dbi = (dbi_handle < CORE_DBS)
                        ? &walk.dbi[dbi_handle]
                        : pagemap_lookup_dbi(dbi_name, true);
  if (!dbi) {
    error("too many DBIs or out of memory\n");
    return MDBX_ENOMEM;
  }
  if (!dont_traversal) {
    /* Cross-check the page counters reported by mdbx_dbi_stat() against the
     * counters accumulated in dbi->pages (presumably filled by the earlier
     * page-by-page tree traversal). Mismatches are only reported. */
    const uint64_t subtotal_pages =
        ms.ms_branch_pages + ms.ms_leaf_pages + ms.ms_overflow_pages;
    if (subtotal_pages != dbi->pages.total)
      error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n",
            "subtotal", subtotal_pages, dbi->pages.total);
    if (ms.ms_branch_pages != dbi->pages.branch)
      error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", "branch",
            ms.ms_branch_pages, dbi->pages.branch);
    const uint64_t allleaf_pages = dbi->pages.leaf + dbi->pages.leaf_dupfixed;
    if (ms.ms_leaf_pages != allleaf_pages)
      error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n",
            "all-leaf", ms.ms_leaf_pages, allleaf_pages);
    if (ms.ms_overflow_pages != dbi->pages.large_volume)
      error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n",
            "large/overlow", ms.ms_overflow_pages, dbi->pages.large_volume);
  }
  rc = mdbx_cursor_open(txn, dbi_handle, &mc);
  if (rc) {
    error("mdbx_cursor_open() failed, error %d %s\n", rc, mdbx_strerror(rc));
    return rc;
  }

  if (ignore_wrong_order) { /* for debugging with enabled assertions */
    mc->mc_flags |= C_SKIPORD;
    if (mc->mc_xcursor)
      mc->mc_xcursor->mx_cursor.mc_flags |= C_SKIPORD;
  }

  const size_t maxkeysize = mdbx_env_get_maxkeysize_ex(env, flags);
  /* Problems found from here on are collected in a fresh list which is
   * popped (and counted) at `bailout`. */
  saved_list = problems_push();
  /* prev_key/prev_data remember the last well-formed key and data; a null
   * iov_base means "nothing remembered yet". */
  prev_key.iov_base = nullptr;
  prev_key.iov_len = 0;
  prev_data.iov_base = nullptr;
  prev_data.iov_len = 0;
  rc = mdbx_cursor_get(mc, &key, &data, MDBX_FIRST);
  while (rc == MDBX_SUCCESS) {
    rc = check_user_break();
    if (rc)
      goto bailout;

    /* Key length checks. */
    bool bad_key = false;
    if (key.iov_len > maxkeysize) {
      problem_add("entry", record_count, "key length exceeds max-key-size",
                  "%" PRIuPTR " > %" PRIuPTR, key.iov_len, maxkeysize);
      bad_key = true;
    } else if ((flags & MDBX_INTEGERKEY) && key.iov_len != sizeof(uint64_t) &&
               key.iov_len != sizeof(uint32_t)) {
      problem_add("entry", record_count, "wrong key length",
                  "%" PRIuPTR " != 4or8", key.iov_len);
      bad_key = true;
    }

    /* Data length checks. */
    bool bad_data = false;
    if ((flags & MDBX_INTEGERDUP) && data.iov_len != sizeof(uint64_t) &&
        data.iov_len != sizeof(uint32_t)) {
      problem_add("entry", record_count, "wrong data length",
                  "%" PRIuPTR " != 4or8", data.iov_len);
      bad_data = true;
    }

    /* Comparisons against the previous record are possible only once a
     * previous key has been remembered. */
    if (prev_key.iov_base) {
      if (prev_data.iov_base && !bad_data && (flags & MDBX_DUPFIXED) &&
          prev_data.iov_len != data.iov_len) {
        problem_add("entry", record_count, "different data length",
                    "%" PRIuPTR " != %" PRIuPTR, prev_data.iov_len,
                    data.iov_len);
        bad_data = true;
      }

      if (!bad_key) {
        int cmp = mdbx_cmp(txn, dbi_handle, &key, &prev_key);
        if (cmp == 0) {
          /* Same key as before: legal only for MDBX_DUPSORT databases, and
           * even there the values must differ and be in order. */
          ++dups;
          if ((flags & MDBX_DUPSORT) == 0) {
            problem_add("entry", record_count, "duplicated entries", nullptr);
            if (prev_data.iov_base && data.iov_len == prev_data.iov_len &&
                memcmp(data.iov_base, prev_data.iov_base, data.iov_len) == 0) {
              problem_add("entry", record_count, "complete duplicate", nullptr);
            }
          } else if (!bad_data && prev_data.iov_base) {
            cmp = mdbx_dcmp(txn, dbi_handle, &data, &prev_data);
            if (cmp == 0) {
              problem_add("entry", record_count, "complete duplicate", nullptr);
            } else if (cmp < 0 && !ignore_wrong_order) {
              problem_add("entry", record_count, "wrong order of multi-values",
                          nullptr);
            }
          }
        } else if (cmp < 0 && !ignore_wrong_order) {
          problem_add("entry", record_count, "wrong order of entries", nullptr);
        }
      }
    }

    if (handler) {
      /* Only results matching MDBX_IS_ERROR() stop the traversal; any other
       * value is overwritten by the next mdbx_cursor_get() below. */
      rc = handler(record_count, &key, &data);
      if (MDBX_IS_ERROR(rc))
        goto bailout;
    }

    record_count++;
    key_bytes += key.iov_len;
    data_bytes += data.iov_len;

    /* Remember only well-formed keys/data for the next comparison; the
     * fixed sizes are announced once, on the first well-formed record. */
    if (!bad_key) {
      if (verbose && (flags & MDBX_INTEGERKEY) && !prev_key.iov_base)
        print(" - fixed key-size %" PRIuPTR "\n", key.iov_len);
      prev_key = key;
    }
    if (!bad_data) {
      if (verbose && (flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) &&
          !prev_data.iov_base)
        print(" - fixed data-size %" PRIuPTR "\n", data.iov_len);
      prev_data = data;
    }
    rc = mdbx_cursor_get(mc, &key, &data, MDBX_NEXT);
  }
  /* MDBX_NOTFOUND is the normal end of the iteration. */
  if (rc != MDBX_NOTFOUND)
    error("mdbx_cursor_get() failed, error %d %s\n", rc, mdbx_strerror(rc));
  else
    rc = 0;

  /* Note: this check is skipped when the loop is left via `goto bailout`. */
  if (record_count != ms.ms_entries)
    problem_add("entry", record_count, "different number of entries",
                "%" PRIu64 " != %" PRIu64, record_count, ms.ms_entries);
bailout:
  problems_count = problems_pop(saved_list);
  if (!silent && verbose) {
    print(" - summary: %" PRIu64 " records, %" PRIu64 " dups, %" PRIu64
          " key's bytes, %" PRIu64 " data's "
          "bytes, %" PRIu64 " problems\n",
          record_count, dups, key_bytes, data_bytes, problems_count);
    fflush(nullptr);
  }

  mdbx_cursor_close(mc);
  /* Any leftover error code or recorded problem collapses to
   * MDBX_RESULT_TRUE. */
  return (rc || problems_count) ? MDBX_RESULT_TRUE : MDBX_SUCCESS;
}
4372 
usage(char * prog)4373 static void usage(char *prog) {
4374   fprintf(stderr,
4375           "usage: %s [-V] [-v] [-q] [-c] [-0|1|2] [-w] [-d] [-i] [-s subdb] "
4376           "dbpath\n"
4377           "  -V\t\tprint version and exit\n"
4378           "  -v\t\tmore verbose, could be used multiple times\n"
4379           "  -q\t\tbe quiet\n"
4380           "  -c\t\tforce cooperative mode (don't try exclusive)\n"
4381           "  -w\t\twrite-mode checking\n"
4382           "  -d\t\tdisable page-by-page traversal of B-tree\n"
4383           "  -i\t\tignore wrong order errors (for custom comparators case)\n"
4384           "  -s subdb\tprocess a specific subdatabase only\n"
4385           "  -0|1|2\tforce using specific meta-page 0, or 2 for checking\n"
4386           "  -t\t\tturn to a specified meta-page on successful check\n"
4387           "  -T\t\tturn to a specified meta-page EVEN ON UNSUCCESSFUL CHECK!\n",
4388           prog);
4389   exit(EXIT_INTERRUPTED);
4390 }
4391 
/* Ordering predicate over two meta-pages described by their txn-id and
 * data-sign (the name presumably stands for "older than").
 *
 * When both txn-ids are equal, or when `wanna_steady` is set and exactly one
 * of the signs is steady, the answer is whether B is steady; otherwise it is
 * whether A's txn-id is less than B's. */
static __inline bool meta_ot(txnid_t txn_a, uint64_t sign_a, txnid_t txn_b,
                             uint64_t sign_b, const bool wanna_steady) {
  const bool same_txn = (txn_a == txn_b);
  const bool steadiness_differs =
      SIGN_IS_STEADY(sign_a) != SIGN_IS_STEADY(sign_b);

  if (same_txn || (wanna_steady && steadiness_differs))
    return SIGN_IS_STEADY(sign_b);

  return txn_a < txn_b;
}
4402 
/* Equivalence predicate over two meta-pages: true only when both carry the
 * same non-zero txn-id and their signs agree on being steady or not. */
static __inline bool meta_eq(txnid_t txn_a, uint64_t sign_a, txnid_t txn_b,
                             uint64_t sign_b) {
  const bool same_valid_txn = txn_a && txn_a == txn_b;
  if (same_valid_txn && SIGN_IS_STEADY(sign_a) == SIGN_IS_STEADY(sign_b))
    return true;

  return false;
}
4413 
meta_recent(const bool wanna_steady)4414 static __inline int meta_recent(const bool wanna_steady) {
4415   if (meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign,
4416               envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, wanna_steady))
4417     return meta_ot(envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign,
4418                    envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, wanna_steady)
4419                ? 1
4420                : 2;
4421   else
4422     return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign,
4423                    envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, wanna_steady)
4424                ? 2
4425                : 0;
4426 }
4427 
meta_tail(int head)4428 static __inline int meta_tail(int head) {
4429   switch (head) {
4430   case 0:
4431     return meta_ot(envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign,
4432                    envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, true)
4433                ? 1
4434                : 2;
4435   case 1:
4436     return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign,
4437                    envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, true)
4438                ? 0
4439                : 2;
4440   case 2:
4441     return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign,
4442                    envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, true)
4443                ? 0
4444                : 1;
4445   default:
4446     assert(false);
4447     return -1;
4448   }
4449 }
4450 
/* Index of the head meta-page: meta_recent() without preferring steady
 * meta-pages. */
static int meta_head(void) {
  const bool wanna_steady = false;
  return meta_recent(wanna_steady);
}
4452 
/* Prints a one-line description of a meta-page.
 *
 * num       index of the meta-page (compared with the head/tail indices and
 *           with `stuck_meta`).
 * txnid     txn-id stored in the meta-page.
 * sign      data-sign of the meta-page: MDBX_DATASIGN_NONE,
 *           MDBX_DATASIGN_WEAK, or anything else which is shown as "steady".
 * bootid_x, bootid_y
 *           boot-id stored in the meta-page, compared with
 *           envinfo.mi_bootid.current to qualify a weak meta-page. */
void verbose_meta(int num, txnid_t txnid, uint64_t sign, uint64_t bootid_x,
                  uint64_t bootid_y) {
  const bool have_bootid = (bootid_x | bootid_y) != 0;
  const bool bootid_match = bootid_x == envinfo.mi_bootid.current.x &&
                            bootid_y == envinfo.mi_bootid.current.y;

  print(" - meta-%d: ", num);
  switch (sign) {
  case MDBX_DATASIGN_NONE:
    print("no-sync/legacy");
    break;
  case MDBX_DATASIGN_WEAK:
    /* the "unknown" caption used to lack its closing parenthesis */
    print("weak-%s", bootid_match ? (have_bootid ? "intact (same boot-id)"
                                                 : "unknown (no boot-id)")
                                  : "dead");
    break;
  default:
    print("steady");
    break;
  }
  print(" txn#%" PRIu64, txnid);

  /* Role of this meta-page relative to the others. */
  const int head = meta_head();
  if (num == head)
    print(", head");
  else if (num == meta_tail(head))
    print(", tail");
  else
    print(", stay");

  if (stuck_meta >= 0) {
    if (num == stuck_meta)
      print(", forced for checking");
  } else if (txnid > envinfo.mi_recent_txnid &&
             (envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) == MDBX_EXCLUSIVE)
    /* The meta-page is ahead of the recent txn-id while the environment is
     * opened exclusively and writable: report how far it differs. */
    print(", rolled-back %" PRIu64 " (%" PRIu64 " >>> %" PRIu64 ")",
          txnid - envinfo.mi_recent_txnid, txnid, envinfo.mi_recent_txnid);
  print("\n");
}
4492 
get_meta_txnid(const unsigned meta_id)4493 static uint64_t get_meta_txnid(const unsigned meta_id) {
4494   switch (meta_id) {
4495   default:
4496     assert(false);
4497     error("unexpected meta_id %u\n", meta_id);
4498     return 0;
4499   case 0:
4500     return envinfo.mi_meta0_txnid;
4501   case 1:
4502     return envinfo.mi_meta1_txnid;
4503   case 2:
4504     return envinfo.mi_meta2_txnid;
4505   }
4506 }
4507 
/* Prints `value` both as an exact number and scaled by a power of 1024 with
 * a unit letter, framed by `prefix` and `suffix`. The scale starts at 1024
 * ('K') and grows while the scaled value is still above 1000 and a larger
 * unit letter is available. */
static void print_size(const char *prefix, const uint64_t value,
                       const char *suffix) {
  /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! */
  const char units[] = "KMGTPEZY";
  double divisor = 1024.0;
  size_t unit = 0;

  while (units[unit + 1] && value / divisor > 1000.0) {
    divisor *= 1024;
    ++unit;
  }

  print("%s%" PRIu64 " (%.2f %cb)%s", prefix, value, value / divisor,
        units[unit], suffix);
}
4518 
main(int argc,char * argv[])4519 int main(int argc, char *argv[]) {
4520   int rc;
4521   char *prog = argv[0];
4522   char *envname;
4523   unsigned problems_maindb = 0, problems_freedb = 0, problems_meta = 0;
4524   bool write_locked = false;
4525   bool turn_meta = false;
4526   bool force_turn_meta = false;
4527 
4528   double elapsed;
4529 #if defined(_WIN32) || defined(_WIN64)
4530   uint64_t timestamp_start, timestamp_finish;
4531   timestamp_start = GetMilliseconds();
4532 #else
4533   struct timespec timestamp_start, timestamp_finish;
4534   if (clock_gettime(CLOCK_MONOTONIC, &timestamp_start)) {
4535     rc = errno;
4536     error("clock_gettime() failed, error %d %s\n", rc, mdbx_strerror(rc));
4537     return EXIT_FAILURE_SYS;
4538   }
4539 #endif
4540 
4541   dbi_meta.name = "@META";
4542   dbi_free.name = "@GC";
4543   dbi_main.name = "@MAIN";
4544   atexit(pagemap_cleanup);
4545 
4546   if (argc < 2)
4547     usage(prog);
4548 
4549   for (int i; (i = getopt(argc, argv,
4550                           "0"
4551                           "1"
4552                           "2"
4553                           "T"
4554                           "V"
4555                           "v"
4556                           "q"
4557                           "n"
4558                           "w"
4559                           "c"
4560                           "t"
4561                           "d"
4562                           "i"
4563                           "s:")) != EOF;) {
4564     switch (i) {
4565     case 'V':
4566       printf("mdbx_chk version %d.%d.%d.%d\n"
4567              " - source: %s %s, commit %s, tree %s\n"
4568              " - anchor: %s\n"
4569              " - build: %s for %s by %s\n"
4570              " - flags: %s\n"
4571              " - options: %s\n",
4572              mdbx_version.major, mdbx_version.minor, mdbx_version.release,
4573              mdbx_version.revision, mdbx_version.git.describe,
4574              mdbx_version.git.datetime, mdbx_version.git.commit,
4575              mdbx_version.git.tree, mdbx_sourcery_anchor, mdbx_build.datetime,
4576              mdbx_build.target, mdbx_build.compiler, mdbx_build.flags,
4577              mdbx_build.options);
4578       return EXIT_SUCCESS;
4579     case 'v':
4580       verbose++;
4581       break;
4582     case '0':
4583       stuck_meta = 0;
4584       break;
4585     case '1':
4586       stuck_meta = 1;
4587       break;
4588     case '2':
4589       stuck_meta = 2;
4590       break;
4591     case 't':
4592       turn_meta = true;
4593       break;
4594     case 'T':
4595       turn_meta = force_turn_meta = true;
4596       quiet = false;
4597       if (verbose < 2)
4598         verbose = 2;
4599       break;
4600     case 'q':
4601       quiet = true;
4602       break;
4603     case 'n':
4604       break;
4605     case 'w':
4606       envflags &= ~MDBX_RDONLY;
4607 #if MDBX_MMAP_INCOHERENT_FILE_WRITE
4608       /* Temporary `workaround` for OpenBSD kernel's flaw.
4609        * See https://github.com/erthink/libmdbx/issues/67 */
4610       envflags |= MDBX_WRITEMAP;
4611 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */
4612       break;
4613     case 'c':
4614       envflags = (envflags & ~MDBX_EXCLUSIVE) | MDBX_ACCEDE;
4615       break;
4616     case 'd':
4617       dont_traversal = true;
4618       break;
4619     case 's':
4620       if (only_subdb && strcmp(only_subdb, optarg))
4621         usage(prog);
4622       only_subdb = optarg;
4623       break;
4624     case 'i':
4625       ignore_wrong_order = true;
4626       break;
4627     default:
4628       usage(prog);
4629     }
4630   }
4631 
4632   if (optind != argc - 1)
4633     usage(prog);
4634 
4635   rc = MDBX_SUCCESS;
4636   if (stuck_meta >= 0 && (envflags & MDBX_EXCLUSIVE) == 0) {
4637     error("exclusive mode is required to using specific meta-page(%d) for "
4638           "checking.\n",
4639           stuck_meta);
4640     rc = EXIT_INTERRUPTED;
4641   }
4642   if (turn_meta) {
4643     if (stuck_meta < 0) {
4644       error("meta-page must be specified (by -0, -1 or -2 options) to turn to "
4645             "it.\n");
4646       rc = EXIT_INTERRUPTED;
4647     }
4648     if (envflags & MDBX_RDONLY) {
4649       error("write-mode must be enabled to turn to the specified meta-page.\n");
4650       rc = EXIT_INTERRUPTED;
4651     }
4652     if (only_subdb || dont_traversal) {
4653       error("whole database checking with tree-traversal are required to turn "
4654             "to the specified meta-page.\n");
4655       rc = EXIT_INTERRUPTED;
4656     }
4657   }
4658   if (rc)
4659     exit(rc);
4660 
4661 #if defined(_WIN32) || defined(_WIN64)
4662   SetConsoleCtrlHandler(ConsoleBreakHandlerRoutine, true);
4663 #else
4664 #ifdef SIGPIPE
4665   signal(SIGPIPE, signal_handler);
4666 #endif
4667 #ifdef SIGHUP
4668   signal(SIGHUP, signal_handler);
4669 #endif
4670   signal(SIGINT, signal_handler);
4671   signal(SIGTERM, signal_handler);
4672 #endif /* !WINDOWS */
4673 
4674   envname = argv[optind];
4675   print("mdbx_chk %s (%s, T-%s)\nRunning for %s in 'read-%s' mode...\n",
4676         mdbx_version.git.describe, mdbx_version.git.datetime,
4677         mdbx_version.git.tree, envname,
4678         (envflags & MDBX_RDONLY) ? "only" : "write");
4679   fflush(nullptr);
4680   mdbx_setup_debug((verbose < MDBX_LOG_TRACE - 1)
4681                        ? (MDBX_log_level_t)(verbose + 1)
4682                        : MDBX_LOG_TRACE,
4683                    MDBX_DBG_LEGACY_OVERLAP, logger);
4684 
4685   rc = mdbx_env_create(&env);
4686   if (rc) {
4687     error("mdbx_env_create() failed, error %d %s\n", rc, mdbx_strerror(rc));
4688     return rc < 0 ? EXIT_FAILURE_MDBX : EXIT_FAILURE_SYS;
4689   }
4690 
4691   rc = mdbx_env_set_maxdbs(env, MDBX_MAX_DBI);
4692   if (rc) {
4693     error("mdbx_env_set_maxdbs() failed, error %d %s\n", rc, mdbx_strerror(rc));
4694     goto bailout;
4695   }
4696 
4697   if (stuck_meta >= 0) {
4698     rc = mdbx_env_open_for_recovery(env, envname, stuck_meta,
4699                                     (envflags & MDBX_RDONLY) ? false : true);
4700   } else {
4701     rc = mdbx_env_open(env, envname, envflags, 0);
4702     if ((envflags & MDBX_EXCLUSIVE) &&
4703         (rc == MDBX_BUSY ||
4704 #if defined(_WIN32) || defined(_WIN64)
4705          rc == ERROR_LOCK_VIOLATION || rc == ERROR_SHARING_VIOLATION
4706 #else
4707          rc == EBUSY || rc == EAGAIN
4708 #endif
4709          )) {
4710       envflags &= ~MDBX_EXCLUSIVE;
4711       rc = mdbx_env_open(env, envname, envflags | MDBX_ACCEDE, 0);
4712     }
4713   }
4714 
4715   if (rc) {
4716     error("mdbx_env_open() failed, error %d %s\n", rc, mdbx_strerror(rc));
4717     if (rc == MDBX_WANNA_RECOVERY && (envflags & MDBX_RDONLY))
4718       print("Please run %s in the read-write mode (with '-w' option).\n", prog);
4719     goto bailout;
4720   }
4721   if (verbose)
4722     print(" - %s mode\n",
4723           (envflags & MDBX_EXCLUSIVE) ? "monopolistic" : "cooperative");
4724 
4725   if ((envflags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == 0) {
4726     rc = mdbx_txn_lock(env, false);
4727     if (rc != MDBX_SUCCESS) {
4728       error("mdbx_txn_lock() failed, error %d %s\n", rc, mdbx_strerror(rc));
4729       goto bailout;
4730     }
4731     write_locked = true;
4732   }
4733 
4734   rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn);
4735   if (rc) {
4736     error("mdbx_txn_begin() failed, error %d %s\n", rc, mdbx_strerror(rc));
4737     goto bailout;
4738   }
4739 
4740   rc = mdbx_env_info_ex(env, txn, &envinfo, sizeof(envinfo));
4741   if (rc) {
4742     error("mdbx_env_info_ex() failed, error %d %s\n", rc, mdbx_strerror(rc));
4743     goto bailout;
4744   }
4745   if (verbose) {
4746     print(" - current boot-id ");
4747     if (envinfo.mi_bootid.current.x | envinfo.mi_bootid.current.y)
4748       print("%016" PRIx64 "-%016" PRIx64 "\n", envinfo.mi_bootid.current.x,
4749             envinfo.mi_bootid.current.y);
4750     else
4751       print("unavailable\n");
4752   }
4753 
4754   mdbx_filehandle_t dxb_fd;
4755   rc = mdbx_env_get_fd(env, &dxb_fd);
4756   if (rc) {
4757     error("mdbx_env_get_fd() failed, error %d %s\n", rc, mdbx_strerror(rc));
4758     goto bailout;
4759   }
4760 
4761   uint64_t dxb_filesize = 0;
4762 #if defined(_WIN32) || defined(_WIN64)
4763   {
4764     BY_HANDLE_FILE_INFORMATION info;
4765     if (!GetFileInformationByHandle(dxb_fd, &info))
4766       rc = GetLastError();
4767     else
4768       dxb_filesize = info.nFileSizeLow | (uint64_t)info.nFileSizeHigh << 32;
4769   }
4770 #else
4771   {
4772     struct stat st;
4773     STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(uint64_t),
4774                       "libmdbx requires 64-bit file I/O on 64-bit systems");
4775     if (fstat(dxb_fd, &st))
4776       rc = errno;
4777     else
4778       dxb_filesize = st.st_size;
4779   }
4780 #endif
4781   if (rc) {
4782     error("mdbx_filesize() failed, error %d %s\n", rc, mdbx_strerror(rc));
4783     goto bailout;
4784   }
4785 
4786   errno = 0;
4787   const uint64_t dxbfile_pages = dxb_filesize / envinfo.mi_dxb_pagesize;
4788   alloc_pages = txn->mt_next_pgno;
4789   backed_pages = envinfo.mi_geo.current / envinfo.mi_dxb_pagesize;
4790   if (backed_pages > dxbfile_pages) {
4791     print(" ! backed-pages %" PRIu64 " > file-pages %" PRIu64 "\n",
4792           backed_pages, dxbfile_pages);
4793     ++problems_meta;
4794   }
4795   if (dxbfile_pages < NUM_METAS)
4796     print(" ! file-pages %" PRIu64 " < %u\n", dxbfile_pages, NUM_METAS);
4797   if (backed_pages < NUM_METAS)
4798     print(" ! backed-pages %" PRIu64 " < %u\n", backed_pages, NUM_METAS);
4799   if (backed_pages < NUM_METAS || dxbfile_pages < NUM_METAS)
4800     goto bailout;
4801   if (backed_pages > MAX_PAGENO) {
4802     print(" ! backed-pages %" PRIu64 " > max-pages %" PRIaPGNO "\n",
4803           backed_pages, MAX_PAGENO);
4804     ++problems_meta;
4805     backed_pages = MAX_PAGENO;
4806   }
4807 
4808   if ((envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) {
4809     if (backed_pages > dxbfile_pages) {
4810       print(" ! backed-pages %" PRIu64 " > file-pages %" PRIu64 "\n",
4811             backed_pages, dxbfile_pages);
4812       ++problems_meta;
4813       backed_pages = dxbfile_pages;
4814     }
4815     if (alloc_pages > backed_pages) {
4816       print(" ! alloc-pages %" PRIu64 " > backed-pages %" PRIu64 "\n",
4817             alloc_pages, backed_pages);
4818       ++problems_meta;
4819       alloc_pages = backed_pages;
4820     }
4821   } else {
    /* LY: DB may be shrunk by a writer down to the allocated pages. */
4823     if (alloc_pages > backed_pages) {
4824       print(" ! alloc-pages %" PRIu64 " > backed-pages %" PRIu64 "\n",
4825             alloc_pages, backed_pages);
4826       ++problems_meta;
4827       alloc_pages = backed_pages;
4828     }
4829     if (alloc_pages > dxbfile_pages) {
4830       print(" ! alloc-pages %" PRIu64 " > file-pages %" PRIu64 "\n",
4831             alloc_pages, dxbfile_pages);
4832       ++problems_meta;
4833       alloc_pages = dxbfile_pages;
4834     }
4835     if (backed_pages > dxbfile_pages)
4836       backed_pages = dxbfile_pages;
4837   }
4838 
4839   if (verbose) {
4840     print(" - pagesize %u (%u system), max keysize %d..%d"
4841           ", max readers %u\n",
4842           envinfo.mi_dxb_pagesize, envinfo.mi_sys_pagesize,
4843           mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT),
4844           mdbx_env_get_maxkeysize_ex(env, 0), envinfo.mi_maxreaders);
4845     print_size(" - mapsize ", envinfo.mi_mapsize, "\n");
4846     if (envinfo.mi_geo.lower == envinfo.mi_geo.upper)
4847       print_size(" - fixed datafile: ", envinfo.mi_geo.current, "");
4848     else {
4849       print_size(" - dynamic datafile: ", envinfo.mi_geo.lower, "");
4850       print_size(" .. ", envinfo.mi_geo.upper, ", ");
4851       print_size("+", envinfo.mi_geo.grow, ", ");
4852       print_size("-", envinfo.mi_geo.shrink, "\n");
4853       print_size(" - current datafile: ", envinfo.mi_geo.current, "");
4854     }
4855     printf(", %" PRIu64 " pages\n",
4856            envinfo.mi_geo.current / envinfo.mi_dxb_pagesize);
4857 #if defined(_WIN32) || defined(_WIN64)
4858     if (envinfo.mi_geo.shrink && envinfo.mi_geo.current != envinfo.mi_geo.upper)
4859       print(
4860           "                     WARNING: Due Windows system limitations a "
4861           "file couldn't\n                     be truncated while the database "
4862           "is opened. So, the size\n                     database file "
4863           "of may by large than the database itself,\n                     "
4864           "until it will be closed or reopened in read-write mode.\n");
4865 #endif
4866     verbose_meta(0, envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign,
4867                  envinfo.mi_bootid.meta0.x, envinfo.mi_bootid.meta0.y);
4868     verbose_meta(1, envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign,
4869                  envinfo.mi_bootid.meta1.x, envinfo.mi_bootid.meta1.y);
4870     verbose_meta(2, envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign,
4871                  envinfo.mi_bootid.meta2.x, envinfo.mi_bootid.meta2.y);
4872   }
4873 
4874   if (stuck_meta >= 0) {
4875     if (verbose) {
4876       print(" - skip checking meta-pages since the %u"
4877             " is selected for verification\n",
4878             stuck_meta);
4879       print(" - transactions: recent %" PRIu64
4880             ", selected for verification %" PRIu64 ", lag %" PRIi64 "\n",
4881             envinfo.mi_recent_txnid, get_meta_txnid(stuck_meta),
4882             envinfo.mi_recent_txnid - get_meta_txnid(stuck_meta));
4883     }
4884   } else {
4885     if (verbose > 1)
4886       print(" - performs check for meta-pages clashes\n");
4887     if (meta_eq(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign,
4888                 envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign)) {
4889       print(" ! meta-%d and meta-%d are clashed\n", 0, 1);
4890       ++problems_meta;
4891     }
4892     if (meta_eq(envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign,
4893                 envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign)) {
4894       print(" ! meta-%d and meta-%d are clashed\n", 1, 2);
4895       ++problems_meta;
4896     }
4897     if (meta_eq(envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign,
4898                 envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign)) {
4899       print(" ! meta-%d and meta-%d are clashed\n", 2, 0);
4900       ++problems_meta;
4901     }
4902 
4903     const unsigned steady_meta_id = meta_recent(true);
4904     const uint64_t steady_meta_txnid = get_meta_txnid(steady_meta_id);
4905     const unsigned weak_meta_id = meta_recent(false);
4906     const uint64_t weak_meta_txnid = get_meta_txnid(weak_meta_id);
4907     if (envflags & MDBX_EXCLUSIVE) {
4908       if (verbose > 1)
4909         print(" - performs full check recent-txn-id with meta-pages\n");
4910       if (steady_meta_txnid != envinfo.mi_recent_txnid) {
4911         print(" ! steady meta-%d txn-id mismatch recent-txn-id (%" PRIi64
4912               " != %" PRIi64 ")\n",
4913               steady_meta_id, steady_meta_txnid, envinfo.mi_recent_txnid);
4914         ++problems_meta;
4915       }
4916     } else if (write_locked) {
4917       if (verbose > 1)
4918         print(" - performs lite check recent-txn-id with meta-pages (not a "
4919               "monopolistic mode)\n");
4920       if (weak_meta_txnid != envinfo.mi_recent_txnid) {
4921         print(" ! weak meta-%d txn-id mismatch recent-txn-id (%" PRIi64
4922               " != %" PRIi64 ")\n",
4923               weak_meta_id, weak_meta_txnid, envinfo.mi_recent_txnid);
4924         ++problems_meta;
4925       }
4926     } else if (verbose) {
4927       print(" - skip check recent-txn-id with meta-pages (monopolistic or "
4928             "read-write mode only)\n");
4929     }
4930     total_problems += problems_meta;
4931 
4932     if (verbose)
4933       print(" - transactions: recent %" PRIu64 ", latter reader %" PRIu64
4934             ", lag %" PRIi64 "\n",
4935             envinfo.mi_recent_txnid, envinfo.mi_latter_reader_txnid,
4936             envinfo.mi_recent_txnid - envinfo.mi_latter_reader_txnid);
4937   }
4938 
4939   if (!dont_traversal) {
4940     struct problem *saved_list;
4941     size_t traversal_problems;
4942     uint64_t empty_pages, lost_bytes;
4943 
4944     print("Traversal b-tree by txn#%" PRIaTXN "...\n", txn->mt_txnid);
4945     fflush(nullptr);
4946     walk.pagemap = mdbx_calloc((size_t)backed_pages, sizeof(*walk.pagemap));
4947     if (!walk.pagemap) {
4948       rc = errno ? errno : MDBX_ENOMEM;
4949       error("calloc() failed, error %d %s\n", rc, mdbx_strerror(rc));
4950       goto bailout;
4951     }
4952 
4953     saved_list = problems_push();
4954     rc = mdbx_env_pgwalk(txn, pgvisitor, nullptr,
4955                          true /* always skip key ordering checking to avoid
4956                                MDBX_CORRUPTED when using custom comparators */);
4957     traversal_problems = problems_pop(saved_list);
4958 
4959     if (rc) {
4960       if (rc != MDBX_EINTR || !check_user_break())
4961         error("mdbx_env_pgwalk() failed, error %d %s\n", rc, mdbx_strerror(rc));
4962       goto bailout;
4963     }
4964 
4965     for (uint64_t n = 0; n < alloc_pages; ++n)
4966       if (!walk.pagemap[n])
4967         unused_pages += 1;
4968 
4969     empty_pages = lost_bytes = 0;
4970     for (walk_dbi_t *dbi = &dbi_main; dbi < ARRAY_END(walk.dbi) && dbi->name;
4971          ++dbi) {
4972       empty_pages += dbi->pages.empty;
4973       lost_bytes += dbi->lost_bytes;
4974     }
4975 
4976     if (verbose) {
4977       uint64_t total_page_bytes = walk.pgcount * envinfo.mi_dxb_pagesize;
4978       print(" - pages: walked %" PRIu64 ", left/unused %" PRIu64 "\n",
4979             walk.pgcount, unused_pages);
4980       if (verbose > 1) {
4981         for (walk_dbi_t *dbi = walk.dbi; dbi < ARRAY_END(walk.dbi) && dbi->name;
4982              ++dbi) {
4983           print("     %s: subtotal %" PRIu64, dbi->name, dbi->pages.total);
4984           if (dbi->pages.other && dbi->pages.other != dbi->pages.total)
4985             print(", other %" PRIu64, dbi->pages.other);
4986           if (dbi->pages.branch)
4987             print(", branch %" PRIu64, dbi->pages.branch);
4988           if (dbi->pages.large_count)
4989             print(", large %" PRIu64, dbi->pages.large_count);
4990           uint64_t all_leaf = dbi->pages.leaf + dbi->pages.leaf_dupfixed;
4991           if (all_leaf) {
4992             print(", leaf %" PRIu64, all_leaf);
4993             if (verbose > 2 &&
4994                 (dbi->pages.subleaf_dupsort | dbi->pages.leaf_dupfixed |
4995                  dbi->pages.subleaf_dupfixed))
4996               print(" (usual %" PRIu64 ", sub-dupsort %" PRIu64
4997                     ", dupfixed %" PRIu64 ", sub-dupfixed %" PRIu64 ")",
4998                     dbi->pages.leaf, dbi->pages.subleaf_dupsort,
4999                     dbi->pages.leaf_dupfixed, dbi->pages.subleaf_dupfixed);
5000           }
5001           print("\n");
5002         }
5003       }
5004 
5005       if (verbose > 1)
5006         print(" - usage: total %" PRIu64 " bytes, payload %" PRIu64
5007               " (%.1f%%), unused "
5008               "%" PRIu64 " (%.1f%%)\n",
5009               total_page_bytes, walk.total_payload_bytes,
5010               walk.total_payload_bytes * 100.0 / total_page_bytes,
5011               total_page_bytes - walk.total_payload_bytes,
5012               (total_page_bytes - walk.total_payload_bytes) * 100.0 /
5013                   total_page_bytes);
5014       if (verbose > 2) {
5015         for (walk_dbi_t *dbi = walk.dbi; dbi < ARRAY_END(walk.dbi) && dbi->name;
5016              ++dbi)
5017           if (dbi->pages.total) {
5018             uint64_t dbi_bytes = dbi->pages.total * envinfo.mi_dxb_pagesize;
5019             print("     %s: subtotal %" PRIu64 " bytes (%.1f%%),"
5020                   " payload %" PRIu64 " (%.1f%%), unused %" PRIu64 " (%.1f%%)",
5021                   dbi->name, dbi_bytes, dbi_bytes * 100.0 / total_page_bytes,
5022                   dbi->payload_bytes, dbi->payload_bytes * 100.0 / dbi_bytes,
5023                   dbi_bytes - dbi->payload_bytes,
5024                   (dbi_bytes - dbi->payload_bytes) * 100.0 / dbi_bytes);
5025             if (dbi->pages.empty)
5026               print(", %" PRIu64 " empty pages", dbi->pages.empty);
5027             if (dbi->lost_bytes)
5028               print(", %" PRIu64 " bytes lost", dbi->lost_bytes);
5029             print("\n");
5030           } else
5031             print("     %s: empty\n", dbi->name);
5032       }
5033       print(" - summary: average fill %.1f%%",
5034             walk.total_payload_bytes * 100.0 / total_page_bytes);
5035       if (empty_pages)
5036         print(", %" PRIu64 " empty pages", empty_pages);
5037       if (lost_bytes)
5038         print(", %" PRIu64 " bytes lost", lost_bytes);
5039       print(", %" PRIuPTR " problems\n", traversal_problems);
5040     }
5041   } else if (verbose) {
5042     print("Skipping b-tree walk...\n");
5043     fflush(nullptr);
5044   }
5045 
5046   if (!verbose)
5047     print("Iterating DBIs...\n");
5048   if (data_tree_problems) {
5049     print("Skip processing %s since tree is corrupted (%u problems)\n", "@MAIN",
5050           data_tree_problems);
5051     problems_maindb = data_tree_problems;
5052   } else
5053     problems_maindb = process_db(~0u, /* MAIN_DBI */ nullptr, nullptr, false);
5054 
5055   if (gc_tree_problems) {
5056     print("Skip processing %s since tree is corrupted (%u problems)\n", "@GC",
5057           gc_tree_problems);
5058     problems_freedb = gc_tree_problems;
5059   } else
5060     problems_freedb = process_db(FREE_DBI, "@GC", handle_freedb, false);
5061 
5062   if (verbose) {
5063     uint64_t value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize;
5064     double percent = value / 100.0;
5065     print(" - space: %" PRIu64 " total pages", value);
5066     print(", backed %" PRIu64 " (%.1f%%)", backed_pages,
5067           backed_pages / percent);
5068     print(", allocated %" PRIu64 " (%.1f%%)", alloc_pages,
5069           alloc_pages / percent);
5070 
5071     if (verbose > 1) {
5072       value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize - alloc_pages;
5073       print(", remained %" PRIu64 " (%.1f%%)", value, value / percent);
5074 
5075       value = dont_traversal ? alloc_pages - gc_pages : walk.pgcount;
5076       print(", used %" PRIu64 " (%.1f%%)", value, value / percent);
5077 
5078       print(", gc %" PRIu64 " (%.1f%%)", gc_pages, gc_pages / percent);
5079 
5080       value = gc_pages - reclaimable_pages;
5081       print(", detained %" PRIu64 " (%.1f%%)", value, value / percent);
5082 
5083       print(", reclaimable %" PRIu64 " (%.1f%%)", reclaimable_pages,
5084             reclaimable_pages / percent);
5085     }
5086 
5087     value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize - alloc_pages +
5088             reclaimable_pages;
5089     print(", available %" PRIu64 " (%.1f%%)\n", value, value / percent);
5090   }
5091 
5092   if (problems_maindb == 0 && problems_freedb == 0) {
5093     if (!dont_traversal &&
5094         (envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) {
5095       if (walk.pgcount != alloc_pages - gc_pages) {
5096         error("used pages mismatch (%" PRIu64 "(walked) != %" PRIu64
5097               "(allocated - GC))\n",
5098               walk.pgcount, alloc_pages - gc_pages);
5099       }
5100       if (unused_pages != gc_pages) {
5101         error("gc pages mismatch (%" PRIu64 "(expected) != %" PRIu64 "(GC))\n",
5102               unused_pages, gc_pages);
5103       }
5104     } else if (verbose) {
5105       print(" - skip check used and gc pages (btree-traversal with "
5106             "monopolistic or read-write mode only)\n");
5107     }
5108 
5109     if (!process_db(MAIN_DBI, nullptr, handle_maindb, true)) {
5110       if (!userdb_count && verbose)
5111         print(" - does not contain multiple databases\n");
5112     }
5113   }
5114 
5115   if (rc == 0 && total_problems == 1 && problems_meta == 1 && !dont_traversal &&
5116       (envflags & MDBX_RDONLY) == 0 && !only_subdb && stuck_meta < 0 &&
5117       get_meta_txnid(meta_recent(true)) < envinfo.mi_recent_txnid) {
5118     print("Perform sync-to-disk for make steady checkpoint at txn-id #%" PRIi64
5119           "\n",
5120           envinfo.mi_recent_txnid);
5121     fflush(nullptr);
5122     if (write_locked) {
5123       mdbx_txn_unlock(env);
5124       write_locked = false;
5125     }
5126     rc = mdbx_env_sync_ex(env, true, false);
5127     if (rc != MDBX_SUCCESS)
5128       error("mdbx_env_pgwalk() failed, error %d %s\n", rc, mdbx_strerror(rc));
5129     else {
5130       total_problems -= 1;
5131       problems_meta -= 1;
5132     }
5133   }
5134 
5135   if (turn_meta && stuck_meta >= 0 && !dont_traversal && !only_subdb &&
5136       (envflags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == MDBX_EXCLUSIVE) {
5137     const bool successful_check = (rc | total_problems | problems_meta) == 0;
5138     if (successful_check || force_turn_meta) {
5139       fflush(nullptr);
5140       print(" = Performing turn to the specified meta-page (%d) due to %s!\n",
5141             stuck_meta,
5142             successful_check ? "successful check" : "the -T option was given");
5143       fflush(nullptr);
5144       rc = mdbx_env_turn_for_recovery(env, stuck_meta);
5145       if (rc != MDBX_SUCCESS)
5146         error("mdbx_env_turn_for_recovery() failed, error %d %s\n", rc,
5147               mdbx_strerror(rc));
5148     } else {
5149       print(" = Skipping turn to the specified meta-page (%d) due to "
5150             "unsuccessful check!\n",
5151             stuck_meta);
5152     }
5153   }
5154 
5155 bailout:
5156   if (txn)
5157     mdbx_txn_abort(txn);
5158   if (write_locked) {
5159     mdbx_txn_unlock(env);
5160     write_locked = false;
5161   }
5162   if (env) {
5163     const bool dont_sync = rc != 0 || total_problems;
5164     mdbx_env_close_ex(env, dont_sync);
5165   }
5166   fflush(nullptr);
5167   if (rc) {
5168     if (rc < 0)
5169       return user_break ? EXIT_INTERRUPTED : EXIT_FAILURE_SYS;
5170     return EXIT_FAILURE_MDBX;
5171   }
5172 
5173 #if defined(_WIN32) || defined(_WIN64)
5174   timestamp_finish = GetMilliseconds();
5175   elapsed = (timestamp_finish - timestamp_start) * 1e-3;
5176 #else
5177   if (clock_gettime(CLOCK_MONOTONIC, &timestamp_finish)) {
5178     rc = errno;
5179     error("clock_gettime() failed, error %d %s\n", rc, mdbx_strerror(rc));
5180     return EXIT_FAILURE_SYS;
5181   }
5182   elapsed = timestamp_finish.tv_sec - timestamp_start.tv_sec +
5183             (timestamp_finish.tv_nsec - timestamp_start.tv_nsec) * 1e-9;
5184 #endif /* !WINDOWS */
5185 
5186   if (total_problems) {
5187     print("Total %u error%s detected, elapsed %.3f seconds.\n", total_problems,
5188           (total_problems > 1) ? "s are" : " is", elapsed);
5189     if (problems_meta || problems_maindb || problems_freedb)
5190       return EXIT_FAILURE_CHECK_MAJOR;
5191     return EXIT_FAILURE_CHECK_MINOR;
5192   }
5193   print("No error is detected, elapsed %.3f seconds\n", elapsed);
5194   return EXIT_SUCCESS;
5195 }
5196