1 /*
2  * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>
3  * and other libmdbx authors: please see AUTHORS file.
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted only as authorized by the OpenLDAP
8  * Public License.
9  *
10  * A copy of this license is available in the file LICENSE in the
11  * top-level directory of the distribution or, alternatively, at
12  * <http://www.OpenLDAP.org/license.html>. */
13 
14 #define xMDBX_ALLOY 1
15 #define MDBX_BUILD_SOURCERY facaa40d3bb34698b2ba800e2fe225773e3941040aef7dc92580b74ad840e798_v0_11_2_0_gd47eed0
16 #ifdef MDBX_CONFIG_H
17 #include MDBX_CONFIG_H
18 #endif
19 
20 #define LIBMDBX_INTERNALS
21 #ifdef xMDBX_TOOLS
22 #define MDBX_DEPRECATED
23 #endif /* xMDBX_TOOLS */
24 
25 #ifdef xMDBX_ALLOY
26 /* Amalgamated build */
27 #define MDBX_INTERNAL_FUNC static
28 #define MDBX_INTERNAL_VAR static
29 #else
30 /* Non-amalgamated build */
31 #define MDBX_INTERNAL_FUNC
32 #define MDBX_INTERNAL_VAR extern
33 #endif /* xMDBX_ALLOY */
34 
35 /** Disables using GNU/Linux libc extensions.
36  * \ingroup build_option
 * \note This option couldn't be moved to the options.h since dependent
 * control macros/definitions must be prepared before including options.h */
39 #ifndef MDBX_DISABLE_GNU_SOURCE
40 #define MDBX_DISABLE_GNU_SOURCE 0
41 #endif
42 #if MDBX_DISABLE_GNU_SOURCE
43 #undef _GNU_SOURCE
44 #elif (defined(__linux__) || defined(__gnu_linux__)) && !defined(_GNU_SOURCE)
45 #define _GNU_SOURCE
46 #endif /* MDBX_DISABLE_GNU_SOURCE */
47 
48 /*----------------------------------------------------------------------------*/
49 
50 /* Should be defined before any includes */
51 #ifndef _FILE_OFFSET_BITS
52 #define _FILE_OFFSET_BITS 64
53 #endif
54 
55 #ifdef __APPLE__
56 #define _DARWIN_C_SOURCE
57 #endif
58 
59 #ifdef _MSC_VER
60 #if _MSC_FULL_VER < 190024234
61 /* Actually libmdbx was not tested with compilers older than 19.00.24234 (Visual
62  * Studio 2015 Update 3). But you could remove this #error and try to continue
 * at your own risk. In that case, please don't raise issues related ONLY to
 * old compilers.
65  *
66  * NOTE:
67  *   Unfortunately, there are several different builds of "Visual Studio" that
68  *   are called "Visual Studio 2015 Update 3".
69  *
70  *   The 190024234 is used here because it is minimal version of Visual Studio
71  *   that was used for build and testing libmdbx in recent years. Soon this
72  *   value will be increased to 19.0.24241.7, since build and testing using
73  *   "Visual Studio 2015" will be performed only at https://ci.appveyor.com.
74  *
75  *   Please ask Microsoft (but not us) for information about version differences
76  *   and how to and where you can obtain the latest "Visual Studio 2015" build
77  *   with all fixes.
78  */
79 #error                                                                         \
80     "At least \"Microsoft C/C++ Compiler\" version 19.00.24234 (Visual Studio 2015 Update 3) is required."
81 #endif
82 #ifndef _CRT_SECURE_NO_WARNINGS
83 #define _CRT_SECURE_NO_WARNINGS
84 #endif /* _CRT_SECURE_NO_WARNINGS */
85 #if _MSC_VER > 1800
86 #pragma warning(disable : 4464) /* relative include path contains '..' */
87 #endif
88 #if _MSC_VER > 1913
89 #pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation...  \
90                                  */
91 #endif
92 #pragma warning(disable : 4710) /* 'xyz': function not inlined */
93 #pragma warning(disable : 4711) /* function 'xyz' selected for automatic       \
94                                    inline expansion */
95 #pragma warning(                                                               \
96     disable : 4201) /* nonstandard extension used : nameless struct / union */
97 #pragma warning(disable : 4702) /* unreachable code */
98 #pragma warning(disable : 4706) /* assignment within conditional expression */
99 #pragma warning(disable : 4127) /* conditional expression is constant */
100 #pragma warning(disable : 4324) /* 'xyz': structure was padded due to          \
101                                    alignment specifier */
102 #pragma warning(disable : 4310) /* cast truncates constant value */
103 #pragma warning(                                                               \
104     disable : 4820) /* bytes padding added after data member for alignment */
105 #pragma warning(disable : 4548) /* expression before comma has no effect;      \
106                                    expected expression with side - effect */
107 #pragma warning(disable : 4366) /* the result of the unary '&' operator may be \
108                                    unaligned */
109 #pragma warning(disable : 4200) /* nonstandard extension used: zero-sized      \
110                                    array in struct/union */
111 #pragma warning(disable : 4204) /* nonstandard extension used: non-constant    \
112                                    aggregate initializer */
113 #pragma warning(                                                               \
114     disable : 4505) /* unreferenced local function has been removed */
115 #endif              /* _MSC_VER (warnings) */
116 
117 #include "mdbx.h"
118 /*
119  * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>
120  * and other libmdbx authors: please see AUTHORS file.
121  * All rights reserved.
122  *
123  * Redistribution and use in source and binary forms, with or without
124  * modification, are permitted only as authorized by the OpenLDAP
125  * Public License.
126  *
127  * A copy of this license is available in the file LICENSE in the
128  * top-level directory of the distribution or, alternatively, at
129  * <http://www.OpenLDAP.org/license.html>.
130  */
131 
132 /* *INDENT-OFF* */
133 /* clang-format off */
134 
135 #ifndef __GNUC_PREREQ
136 #   if defined(__GNUC__) && defined(__GNUC_MINOR__)
137 #       define __GNUC_PREREQ(maj, min) \
138           ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
139 #   else
140 #       define __GNUC_PREREQ(maj, min) (0)
141 #   endif
142 #endif /* __GNUC_PREREQ */
143 
144 #ifndef __CLANG_PREREQ
145 #   ifdef __clang__
146 #       define __CLANG_PREREQ(maj,min) \
147           ((__clang_major__ << 16) + __clang_minor__ >= ((maj) << 16) + (min))
148 #   else
149 #       define __CLANG_PREREQ(maj,min) (0)
150 #   endif
151 #endif /* __CLANG_PREREQ */
152 
153 #ifndef __GLIBC_PREREQ
154 #   if defined(__GLIBC__) && defined(__GLIBC_MINOR__)
155 #       define __GLIBC_PREREQ(maj, min) \
156           ((__GLIBC__ << 16) + __GLIBC_MINOR__ >= ((maj) << 16) + (min))
157 #   else
158 #       define __GLIBC_PREREQ(maj, min) (0)
159 #   endif
160 #endif /* __GLIBC_PREREQ */
161 
162 #ifndef __has_warning
163 #   define __has_warning(x) (0)
164 #endif
165 
166 #ifndef __has_include
167 #   define __has_include(x) (0)
168 #endif
169 
170 #if __has_feature(thread_sanitizer)
171 #   define __SANITIZE_THREAD__ 1
172 #endif
173 
174 #if __has_feature(address_sanitizer)
175 #   define __SANITIZE_ADDRESS__ 1
176 #endif
177 
178 /*----------------------------------------------------------------------------*/
179 
180 #ifndef __extern_C
181 #   ifdef __cplusplus
182 #       define __extern_C extern "C"
183 #   else
184 #       define __extern_C
185 #   endif
186 #endif /* __extern_C */
187 
188 #if !defined(nullptr) && !defined(__cplusplus) || (__cplusplus < 201103L && !defined(_MSC_VER))
189 #   define nullptr NULL
190 #endif
191 
192 /*----------------------------------------------------------------------------*/
193 
194 #ifndef __always_inline
195 #   if defined(__GNUC__) || __has_attribute(__always_inline__)
196 #       define __always_inline __inline __attribute__((__always_inline__))
197 #   elif defined(_MSC_VER)
198 #       define __always_inline __forceinline
199 #   else
200 #       define __always_inline
201 #   endif
202 #endif /* __always_inline */
203 
204 #ifndef __noinline
205 #   if defined(__GNUC__) || __has_attribute(__noinline__)
206 #       define __noinline __attribute__((__noinline__))
207 #   elif defined(_MSC_VER)
208 #       define __noinline __declspec(noinline)
209 #   else
210 #       define __noinline
211 #   endif
212 #endif /* __noinline */
213 
214 #ifndef __must_check_result
215 #   if defined(__GNUC__) || __has_attribute(__warn_unused_result__)
216 #       define __must_check_result __attribute__((__warn_unused_result__))
217 #   else
218 #       define __must_check_result
219 #   endif
220 #endif /* __must_check_result */
221 
222 #if !defined(__noop) && !defined(_MSC_VER)
223 #   define __noop(...) do {} while(0)
224 #endif /* __noop */
225 
226 #ifndef __fallthrough
227 #  if defined(__cplusplus) && (__has_cpp_attribute(fallthrough) &&             \
228      (!defined(__clang__) || __clang__ > 4)) || __cplusplus >= 201703L
229 #    define __fallthrough [[fallthrough]]
230 #  elif __GNUC_PREREQ(8, 0) && defined(__cplusplus) && __cplusplus >= 201103L
231 #    define __fallthrough [[fallthrough]]
232 #  elif __GNUC_PREREQ(7, 0) &&                                                 \
233     (!defined(__LCC__) || (__LCC__ == 124 && __LCC_MINOR__ >= 12) ||           \
234      (__LCC__ == 125 && __LCC_MINOR__ >= 5) || (__LCC__ >= 126))
235 #    define __fallthrough __attribute__((__fallthrough__))
236 #  elif defined(__clang__) && defined(__cplusplus) && __cplusplus >= 201103L &&\
237     __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough")
238 #    define __fallthrough [[clang::fallthrough]]
239 #  else
240 #    define __fallthrough
241 #  endif
242 #endif /* __fallthrough */
243 
244 #ifndef __unreachable
245 #   if __GNUC_PREREQ(4,5) || __has_builtin(__builtin_unreachable)
246 #       define __unreachable() __builtin_unreachable()
247 #   elif defined(_MSC_VER)
248 #       define __unreachable() __assume(0)
249 #   else
250 #       define __unreachable() __noop()
251 #   endif
252 #endif /* __unreachable */
253 
254 #ifndef __prefetch
255 #   if defined(__GNUC__) || defined(__clang__) || __has_builtin(__builtin_prefetch)
256 #       define __prefetch(ptr) __builtin_prefetch(ptr)
257 #   else
258 #       define __prefetch(ptr) __noop(ptr)
259 #   endif
260 #endif /* __prefetch */
261 
262 #ifndef __nothrow
263 #   if defined(__cplusplus)
264 #       if __cplusplus < 201703L
265 #           define __nothrow throw()
266 #       else
267 #           define __nothrow noexcept(true)
268 #       endif /* __cplusplus */
269 #   elif defined(__GNUC__) || __has_attribute(__nothrow__)
270 #       define __nothrow __attribute__((__nothrow__))
271 #   elif defined(_MSC_VER) && defined(__cplusplus)
272 #       define __nothrow __declspec(nothrow)
273 #   else
274 #       define __nothrow
275 #   endif
276 #endif /* __nothrow */
277 
278 #ifndef __hidden
279 #   if defined(__GNUC__) || __has_attribute(__visibility__)
280 #       define __hidden __attribute__((__visibility__("hidden")))
281 #   else
282 #       define __hidden
283 #   endif
284 #endif /* __hidden */
285 
286 #ifndef __optimize
287 #   if defined(__OPTIMIZE__)
288 #       if (defined(__GNUC__) && !defined(__clang__)) || __has_attribute(__optimize__)
289 #           define __optimize(ops) __attribute__((__optimize__(ops)))
290 #       else
291 #           define __optimize(ops)
292 #       endif
293 #   else
294 #       define __optimize(ops)
295 #   endif
296 #endif /* __optimize */
297 
#ifndef __hot
/* Marks frequently-called functions for the optimizer: the `hot` attribute
 * and/or a raised optimization level, or placement in a dedicated "hot"
 * text section when the attribute itself is unavailable. */
#   if defined(__OPTIMIZE__)
#       if defined(__e2k__)
#           define __hot __attribute__((__hot__)) __optimize(3)
#       elif defined(__clang__) && !__has_attribute(__hot__) \
        && __has_attribute(__section__) && (defined(__linux__) || defined(__gnu_linux__))
            /* just put frequently used functions in separate section */
#           define __hot __attribute__((__section__("text.hot"))) __optimize("O3")
#       elif defined(__GNUC__) || __has_attribute(__hot__)
#           define __hot __attribute__((__hot__)) __optimize("O3")
#       else
#           define __hot __optimize("O3")
#       endif
#   else
#       define __hot
#   endif
#endif /* __hot */
315 
316 #ifndef __cold
317 #   if defined(__OPTIMIZE__)
318 #       if defined(__e2k__)
319 #           define __cold __attribute__((__cold__)) __optimize(1)
320 #       elif defined(__clang__) && !__has_attribute(cold) \
321         && __has_attribute(__section__) && (defined(__linux__) || defined(__gnu_linux__))
322             /* just put infrequently used functions in separate section */
323 #           define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os")
324 #       elif defined(__GNUC__) || __has_attribute(cold)
325 #           define __cold __attribute__((__cold__)) __optimize("Os")
326 #       else
327 #           define __cold __optimize("Os")
328 #       endif
329 #   else
330 #       define __cold
331 #   endif
332 #endif /* __cold */
333 
334 #ifndef __flatten
335 #   if defined(__OPTIMIZE__) && (defined(__GNUC__) || __has_attribute(__flatten__))
336 #       define __flatten __attribute__((__flatten__))
337 #   else
338 #       define __flatten
339 #   endif
340 #endif /* __flatten */
341 
342 #ifndef likely
343 #   if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
344 #       define likely(cond) __builtin_expect(!!(cond), 1)
345 #   else
346 #       define likely(x) (!!(x))
347 #   endif
348 #endif /* likely */
349 
350 #ifndef unlikely
351 #   if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
352 #       define unlikely(cond) __builtin_expect(!!(cond), 0)
353 #   else
354 #       define unlikely(x) (!!(x))
355 #   endif
356 #endif /* unlikely */
357 
358 #ifndef __anonymous_struct_extension__
359 #   if defined(__GNUC__)
360 #       define __anonymous_struct_extension__ __extension__
361 #   else
362 #       define __anonymous_struct_extension__
363 #   endif
364 #endif /* __anonymous_struct_extension__ */
365 
#ifndef __Wpedantic_format_voidptr
    /* Identity pass-through that up-casts any object pointer to `const void*`.
     * Wrapping pointer arguments of printf-like calls with this silences
     * pedantic -Wformat diagnostics about passing a non-void pointer for the
     * "%p" conversion. The self-referencing #define below lets call sites use
     * the name as a function-like macro that expands to this helper. */
    MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline  const void*
        __Wpedantic_format_voidptr(const void* ptr) {return ptr;}
#   define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG)
#endif /* __Wpedantic_format_voidptr */
371 
372 /*----------------------------------------------------------------------------*/
373 
374 #if defined(MDBX_USE_VALGRIND)
375 #   include <valgrind/memcheck.h>
376 #   ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE
377         /* LY: available since Valgrind 3.10 */
378 #       define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s)
379 #       define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s)
380 #   endif
381 #elif !defined(RUNNING_ON_VALGRIND)
382 #   define VALGRIND_CREATE_MEMPOOL(h,r,z)
383 #   define VALGRIND_DESTROY_MEMPOOL(h)
384 #   define VALGRIND_MEMPOOL_TRIM(h,a,s)
385 #   define VALGRIND_MEMPOOL_ALLOC(h,a,s)
386 #   define VALGRIND_MEMPOOL_FREE(h,a)
387 #   define VALGRIND_MEMPOOL_CHANGE(h,a,b,s)
388 #   define VALGRIND_MAKE_MEM_NOACCESS(a,s)
389 #   define VALGRIND_MAKE_MEM_DEFINED(a,s)
390 #   define VALGRIND_MAKE_MEM_UNDEFINED(a,s)
391 #   define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s)
392 #   define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s)
393 #   define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a,s) (0)
394 #   define VALGRIND_CHECK_MEM_IS_DEFINED(a,s) (0)
395 #   define RUNNING_ON_VALGRIND (0)
396 #endif /* MDBX_USE_VALGRIND */
397 
398 #ifdef __SANITIZE_ADDRESS__
399 #   include <sanitizer/asan_interface.h>
400 #elif !defined(ASAN_POISON_MEMORY_REGION)
401 #   define ASAN_POISON_MEMORY_REGION(addr, size) \
402         ((void)(addr), (void)(size))
403 #   define ASAN_UNPOISON_MEMORY_REGION(addr, size) \
404         ((void)(addr), (void)(size))
405 #endif /* __SANITIZE_ADDRESS__ */
406 
407 /*----------------------------------------------------------------------------*/
408 
409 #ifndef ARRAY_LENGTH
410 #   ifdef __cplusplus
411         template <typename T, size_t N>
412         char (&__ArraySizeHelper(T (&array)[N]))[N];
413 #       define ARRAY_LENGTH(array) (sizeof(::__ArraySizeHelper(array)))
414 #   else
415 #       define ARRAY_LENGTH(array) (sizeof(array) / sizeof(array[0]))
416 #   endif
417 #endif /* ARRAY_LENGTH */
418 
419 #ifndef ARRAY_END
420 #   define ARRAY_END(array) (&array[ARRAY_LENGTH(array)])
421 #endif /* ARRAY_END */
422 
423 #define CONCAT(a,b) a##b
424 #define XCONCAT(a,b) CONCAT(a,b)
425 
426 #ifndef offsetof
427 #   define offsetof(type, member)  __builtin_offsetof(type, member)
428 #endif /* offsetof */
429 
430 #ifndef container_of
431 #   define container_of(ptr, type, member) \
432         ((type *)((char *)(ptr) - offsetof(type, member)))
433 #endif /* container_of */
434 
435 #define MDBX_TETRAD(a, b, c, d)                                                \
436   ((uint32_t)(a) << 24 | (uint32_t)(b) << 16 | (uint32_t)(c) << 8 | (d))
437 
438 #define MDBX_STRING_TETRAD(str) MDBX_TETRAD(str[0], str[1], str[2], str[3])
439 
440 #define FIXME "FIXME: " __FILE__ ", " MDBX_STRINGIFY(__LINE__)
441 
442 #ifndef STATIC_ASSERT_MSG
443 #   if defined(static_assert)
444 #       define STATIC_ASSERT_MSG(expr, msg) static_assert(expr, msg)
445 #   elif defined(_STATIC_ASSERT)
446 #       define STATIC_ASSERT_MSG(expr, msg) _STATIC_ASSERT(expr)
447 #   elif defined(_MSC_VER)
448 #       include <crtdbg.h>
449 #       define STATIC_ASSERT_MSG(expr, msg) _STATIC_ASSERT(expr)
450 #   elif (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) \
451           || __has_feature(c_static_assert)
452 #       define STATIC_ASSERT_MSG(expr, msg) _Static_assert(expr, msg)
453 #   else
454 #       define STATIC_ASSERT_MSG(expr, msg) switch (0) {case 0:case (expr):;}
455 #   endif
456 #endif /* STATIC_ASSERT */
457 
458 #ifndef STATIC_ASSERT
459 #   define STATIC_ASSERT(expr) STATIC_ASSERT_MSG(expr, #expr)
460 #endif
461 
462 /* *INDENT-ON* */
463 /* clang-format on */
464 
465 #if defined(__GNUC__) && !__GNUC_PREREQ(4, 2)
466 /* Actually libmdbx was not tested with compilers older than GCC 4.2.
467  * But you could ignore this warning at your own risk.
 * In that case, please don't raise issues related ONLY to old compilers.
469  */
470 #warning "libmdbx required GCC >= 4.2"
471 #endif
472 
473 #if defined(__clang__) && !__CLANG_PREREQ(3, 8)
474 /* Actually libmdbx was not tested with CLANG older than 3.8.
475  * But you could ignore this warning at your own risk.
 * In that case, please don't raise issues related ONLY to old compilers.
477  */
478 #warning "libmdbx required CLANG >= 3.8"
479 #endif
480 
481 #if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12)
482 /* Actually libmdbx was not tested with something older than glibc 2.12.
483  * But you could ignore this warning at your own risk.
 * In that case, please don't raise issues related ONLY to old systems.
485  */
486 #warning "libmdbx was only tested with GLIBC >= 2.12."
487 #endif
488 
489 #ifdef __SANITIZE_THREAD__
490 #warning                                                                       \
491     "libmdbx don't compatible with ThreadSanitizer, you will get a lot of false-positive issues."
492 #endif /* __SANITIZE_THREAD__ */
493 
494 #if __has_warning("-Wnested-anon-types")
495 #if defined(__clang__)
496 #pragma clang diagnostic ignored "-Wnested-anon-types"
497 #elif defined(__GNUC__)
498 #pragma GCC diagnostic ignored "-Wnested-anon-types"
499 #else
500 #pragma warning disable "nested-anon-types"
501 #endif
502 #endif /* -Wnested-anon-types */
503 
504 #if __has_warning("-Wconstant-logical-operand")
505 #if defined(__clang__)
506 #pragma clang diagnostic ignored "-Wconstant-logical-operand"
507 #elif defined(__GNUC__)
508 #pragma GCC diagnostic ignored "-Wconstant-logical-operand"
509 #else
510 #pragma warning disable "constant-logical-operand"
511 #endif
512 #endif /* -Wconstant-logical-operand */
513 
514 #if defined(__LCC__) && (__LCC__ <= 121)
515 /* bug #2798 */
516 #pragma diag_suppress alignment_reduction_ignored
517 #elif defined(__ICC)
518 #pragma warning(disable : 3453 1366)
519 #elif __has_warning("-Walignment-reduction-ignored")
520 #if defined(__clang__)
521 #pragma clang diagnostic ignored "-Walignment-reduction-ignored"
522 #elif defined(__GNUC__)
523 #pragma GCC diagnostic ignored "-Walignment-reduction-ignored"
524 #else
525 #pragma warning disable "alignment-reduction-ignored"
526 #endif
527 #endif /* -Walignment-reduction-ignored */
528 
529 #ifdef __cplusplus
530 extern "C" {
531 #endif
532 
533 /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */
534 
535 /*
536  * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>
537  * and other libmdbx authors: please see AUTHORS file.
538  * All rights reserved.
539  *
540  * Redistribution and use in source and binary forms, with or without
541  * modification, are permitted only as authorized by the OpenLDAP
542  * Public License.
543  *
544  * A copy of this license is available in the file LICENSE in the
545  * top-level directory of the distribution or, alternatively, at
546  * <http://www.OpenLDAP.org/license.html>.
547  */
548 
549 
550 /*----------------------------------------------------------------------------*/
551 /* Microsoft compiler generates a lot of warning for self includes... */
552 
553 #ifdef _MSC_VER
554 #pragma warning(push, 1)
555 #pragma warning(disable : 4548) /* expression before comma has no effect;      \
556                                    expected expression with side - effect */
557 #pragma warning(disable : 4530) /* C++ exception handler used, but unwind      \
558                                  * semantics are not enabled. Specify /EHsc */
559 #pragma warning(disable : 4577) /* 'noexcept' used with no exception handling  \
560                                  * mode specified; termination on exception is \
561                                  * not guaranteed. Specify /EHsc */
562 #endif                          /* _MSC_VER (warnings) */
563 
564 #if defined(_WIN32) || defined(_WIN64)
565 #if !defined(_CRT_SECURE_NO_WARNINGS)
566 #define _CRT_SECURE_NO_WARNINGS
567 #endif /* _CRT_SECURE_NO_WARNINGS */
568 #if !defined(_NO_CRT_STDIO_INLINE) && MDBX_BUILD_SHARED_LIBRARY &&             \
569     !defined(xMDBX_TOOLS) && MDBX_WITHOUT_MSVC_CRT
570 #define _NO_CRT_STDIO_INLINE
571 #endif
572 #elif !defined(_POSIX_C_SOURCE)
573 #define _POSIX_C_SOURCE 200809L
574 #endif /* Windows */
575 
576 /*----------------------------------------------------------------------------*/
577 /* C99 includes */
578 #include <inttypes.h>
579 #include <stddef.h>
580 #include <stdint.h>
581 #include <stdlib.h>
582 
583 #include <assert.h>
584 #include <fcntl.h>
585 #include <limits.h>
586 #include <stdio.h>
587 #include <string.h>
588 #include <time.h>
589 
590 /* C11 stdalign.h */
591 #if __has_include(<stdalign.h>)
592 #include <stdalign.h>
593 #elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
594 #define alignas(N) _Alignas(N)
595 #elif defined(_MSC_VER)
596 #define alignas(N) __declspec(align(N))
597 #elif __has_attribute(__aligned__) || defined(__GNUC__)
598 #define alignas(N) __attribute__((__aligned__(N)))
599 #else
600 #error "FIXME: Required _alignas() or equivalent."
601 #endif
602 
603 /*----------------------------------------------------------------------------*/
604 /* Systems includes */
605 
606 #ifdef __APPLE__
607 #include <TargetConditionals.h>
608 #endif /* Apple OSX & iOS */
609 
610 #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) ||     \
611     defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) ||         \
612     defined(__APPLE__) || defined(__MACH__)
613 #include <sys/cdefs.h>
614 #include <sys/mount.h>
615 #include <sys/sysctl.h>
616 #include <sys/types.h>
617 #if defined(__FreeBSD__) || defined(__DragonFly__)
618 #include <vm/vm_param.h>
619 #elif defined(__OpenBSD__) || defined(__NetBSD__)
620 #include <uvm/uvm_param.h>
621 #else
622 #define SYSCTL_LEGACY_NONCONST_MIB
623 #endif
624 #ifndef __MACH__
625 #include <sys/vmmeter.h>
626 #endif
627 #else
628 #include <malloc.h>
629 #if !(defined(__sun) || defined(__SVR4) || defined(__svr4__) ||                \
630       defined(_WIN32) || defined(_WIN64))
631 #include <mntent.h>
632 #endif /* !Solaris */
633 #endif /* !xBSD */
634 
635 #if defined(__FreeBSD__) || __has_include(<malloc_np.h>)
636 #include <malloc_np.h>
637 #endif
638 
639 #if defined(__APPLE__) || defined(__MACH__) || __has_include(<malloc/malloc.h>)
640 #include <malloc/malloc.h>
641 #endif /* MacOS */
642 
643 #if defined(__MACH__)
644 #include <mach/host_info.h>
645 #include <mach/mach_host.h>
646 #include <mach/mach_port.h>
647 #include <uuid/uuid.h>
648 #endif
649 
650 #if defined(__linux__) || defined(__gnu_linux__)
651 #include <sched.h>
652 #include <sys/sendfile.h>
653 #include <sys/statfs.h>
654 #endif /* Linux */
655 
656 #ifndef _XOPEN_SOURCE
657 #define _XOPEN_SOURCE 0
658 #endif
659 
660 #ifndef _XOPEN_SOURCE_EXTENDED
661 #define _XOPEN_SOURCE_EXTENDED 0
662 #else
663 #include <utmpx.h>
664 #endif /* _XOPEN_SOURCE_EXTENDED */
665 
666 #if defined(__sun) || defined(__SVR4) || defined(__svr4__)
667 #include <kstat.h>
668 #include <sys/mnttab.h>
669 /* On Solaris, it's easier to add a missing prototype rather than find a
670  * combination of #defines that break nothing. */
671 __extern_C key_t ftok(const char *, int);
672 #endif /* SunOS/Solaris */
673 
674 #if defined(_WIN32) || defined(_WIN64)
675 #ifndef _WIN32_WINNT
676 #define _WIN32_WINNT 0x0601 /* Windows 7 */
677 #elif _WIN32_WINNT < 0x0500
678 #error At least 'Windows 2000' API is required for libmdbx.
679 #endif /* _WIN32_WINNT */
680 #if (defined(__MINGW32__) || defined(__MINGW64__)) &&                          \
681     !defined(__USE_MINGW_ANSI_STDIO)
682 #define __USE_MINGW_ANSI_STDIO 1
683 #endif /* MinGW */
684 #ifndef WIN32_LEAN_AND_MEAN
685 #define WIN32_LEAN_AND_MEAN
686 #endif /* WIN32_LEAN_AND_MEAN */
687 #include <excpt.h>
688 #include <tlhelp32.h>
689 #include <windows.h>
690 #include <winnt.h>
691 #include <winternl.h>
692 #define HAVE_SYS_STAT_H
693 #define HAVE_SYS_TYPES_H
694 typedef HANDLE mdbx_thread_t;
695 typedef unsigned mdbx_thread_key_t;
696 #define MAP_FAILED NULL
697 #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? ((uint64_t)(v) >> 32) : 0))
698 #define THREAD_CALL WINAPI
699 #define THREAD_RESULT DWORD
700 typedef struct {
701   HANDLE mutex;
702   HANDLE event[2];
703 } mdbx_condpair_t;
704 typedef CRITICAL_SECTION mdbx_fastmutex_t;
705 
706 #if !defined(_MSC_VER) && !defined(__try)
707 /* *INDENT-OFF* */
708 /* clang-format off */
709 #define __try
710 #define __except(COND) if(false)
711 /* *INDENT-ON* */
712 /* clang-format on */
713 #endif /* stub for MSVC's __try/__except */
714 
715 #if MDBX_WITHOUT_MSVC_CRT
716 
#ifndef mdbx_malloc
/* CRT-free malloc() replacement for MDBX_WITHOUT_MSVC_CRT builds:
 * allocates uninitialized memory from the default process heap.
 * Returns NULL on failure (note: unlike CRT malloc, errno is not set). */
static inline void *mdbx_malloc(size_t bytes) {
  return HeapAlloc(GetProcessHeap(), 0, bytes);
}
#endif /* mdbx_malloc */
722 
723 #ifndef mdbx_calloc
mdbx_calloc(size_t nelem,size_t size)724 static inline void *mdbx_calloc(size_t nelem, size_t size) {
725   return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, nelem * size);
726 }
727 #endif /* mdbx_calloc */
728 
#ifndef mdbx_realloc
/* CRT-free realloc() replacement for MDBX_WITHOUT_MSVC_CRT builds.
 * A NULL pointer degenerates to a plain allocation, mirroring CRT
 * realloc() semantics; otherwise the existing block is resized in the
 * default process heap. Returns NULL on failure. */
static inline void *mdbx_realloc(void *ptr, size_t bytes) {
  if (ptr == NULL)
    return HeapAlloc(GetProcessHeap(), 0, bytes);
  return HeapReAlloc(GetProcessHeap(), 0, ptr, bytes);
}
#endif /* mdbx_realloc */
735 
#ifndef mdbx_free
/* CRT-free free() replacement for MDBX_WITHOUT_MSVC_CRT builds:
 * releases a block previously obtained from mdbx_malloc/calloc/realloc. */
static inline void mdbx_free(void *ptr) {
  /* free(NULL) must be a no-op, but HeapFree() is not documented as
   * accepting a NULL pointer — filter it out explicitly. */
  if (ptr)
    HeapFree(GetProcessHeap(), 0, ptr);
}
#endif /* mdbx_free */
739 
740 #else /* MDBX_WITHOUT_MSVC_CRT */
741 
742 #define mdbx_malloc malloc
743 #define mdbx_calloc calloc
744 #define mdbx_realloc realloc
745 #define mdbx_free free
746 #define mdbx_strdup _strdup
747 
748 #endif /* MDBX_WITHOUT_MSVC_CRT */
749 
750 #ifndef snprintf
751 #define snprintf _snprintf /* ntdll */
752 #endif
753 
754 #ifndef vsnprintf
755 #define vsnprintf _vsnprintf /* ntdll */
756 #endif
757 
758 #else /*----------------------------------------------------------------------*/
759 
760 #include <unistd.h>
761 #if !defined(_POSIX_MAPPED_FILES) || _POSIX_MAPPED_FILES < 1
762 #error "libmdbx requires the _POSIX_MAPPED_FILES feature"
763 #endif /* _POSIX_MAPPED_FILES */
764 
765 #include <pthread.h>
766 #include <semaphore.h>
767 #include <signal.h>
768 #include <sys/file.h>
769 #include <sys/ipc.h>
770 #include <sys/mman.h>
771 #include <sys/param.h>
772 #include <sys/stat.h>
773 #include <sys/statvfs.h>
774 #include <sys/uio.h>
775 typedef pthread_t mdbx_thread_t;
776 typedef pthread_key_t mdbx_thread_key_t;
777 #define INVALID_HANDLE_VALUE (-1)
778 #define THREAD_CALL
779 #define THREAD_RESULT void *
780 typedef struct {
781   pthread_mutex_t mutex;
782   pthread_cond_t cond[2];
783 } mdbx_condpair_t;
784 typedef pthread_mutex_t mdbx_fastmutex_t;
785 #define mdbx_malloc malloc
786 #define mdbx_calloc calloc
787 #define mdbx_realloc realloc
788 #define mdbx_free free
789 #define mdbx_strdup strdup
790 #endif /* Platform */
791 
792 #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
793 /* malloc_usable_size() already provided */
794 #elif defined(__APPLE__)
795 #define malloc_usable_size(ptr) malloc_size(ptr)
796 #elif defined(_MSC_VER) && !MDBX_WITHOUT_MSVC_CRT
797 #define malloc_usable_size(ptr) _msize(ptr)
798 #endif /* malloc_usable_size */
799 
800 #ifdef __ANDROID_API__
801 #include <android/log.h>
802 #if __ANDROID_API__ >= 21
803 #include <sys/sendfile.h>
804 #endif
805 #endif /* Android */
806 
807 /* *INDENT-OFF* */
808 /* clang-format off */
809 #if defined(HAVE_SYS_STAT_H) || __has_include(<sys/stat.h>)
810 #include <sys/stat.h>
811 #endif
812 #if defined(HAVE_SYS_TYPES_H) || __has_include(<sys/types.h>)
813 #include <sys/types.h>
814 #endif
815 #if defined(HAVE_SYS_FILE_H) || __has_include(<sys/file.h>)
816 #include <sys/file.h>
817 #endif
818 /* *INDENT-ON* */
819 /* clang-format on */
820 
821 #ifndef SSIZE_MAX
822 #define SSIZE_MAX INTPTR_MAX
823 #endif
824 
825 #if !defined(MADV_DODUMP) && defined(MADV_CORE)
826 #define MADV_DODUMP MADV_CORE
827 #endif /* MADV_CORE -> MADV_DODUMP */
828 
829 #if !defined(MADV_DONTDUMP) && defined(MADV_NOCORE)
830 #define MADV_DONTDUMP MADV_NOCORE
831 #endif /* MADV_NOCORE -> MADV_DONTDUMP */
832 
833 #if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \
834     defined(i486) || defined(__i486) || defined(__i486__) ||                   \
835     defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) ||   \
836     defined(__i686) || defined(__i686__) || defined(_M_IX86) ||                \
837     defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) ||            \
838     defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) ||          \
839     defined(__amd64__) || defined(__amd64) || defined(_M_X64) ||               \
840     defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__)
841 #ifndef __ia32__
842 /* LY: define neutral __ia32__ for x86 and x86-64 */
843 #define __ia32__ 1
844 #endif /* __ia32__ */
845 #if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) ||        \
846                             defined(__amd64) || defined(_M_X64))
847 /* LY: define trusty __amd64__ for all AMD64/x86-64 arch */
848 #define __amd64__ 1
849 #endif /* __amd64__ */
850 #endif /* all x86 */
851 
852 #if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
853 #error                                                                         \
854     "Sanity checking failed: Two's complement, reasonably sized integer types"
855 #endif
856 
857 #if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
858 #define MDBX_WORDBITS 64
859 #else
860 #define MDBX_WORDBITS 32
861 #endif /* MDBX_WORDBITS */
862 
863 /*----------------------------------------------------------------------------*/
864 /* Compiler's includes for builtins/intrinsics */
865 
866 #if defined(_MSC_VER) || defined(__INTEL_COMPILER)
867 #include <intrin.h>
868 #elif __GNUC_PREREQ(4, 4) || defined(__clang__)
869 #if defined(__ia32__) || defined(__e2k__)
870 #include <x86intrin.h>
871 #endif /* __ia32__ */
872 #if defined(__ia32__)
873 #include <cpuid.h>
874 #endif /* __ia32__ */
875 #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
876 #include <mbarrier.h>
877 #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) &&       \
878     (defined(HP_IA64) || defined(__ia64))
879 #include <machine/sys/inline.h>
880 #elif defined(__IBMC__) && defined(__powerpc)
881 #include <atomic.h>
882 #elif defined(_AIX)
883 #include <builtins.h>
884 #include <sys/atomic_op.h>
885 #elif (defined(__osf__) && defined(__DECC)) || defined(__alpha)
886 #include <c_asm.h>
887 #include <machine/builtins.h>
888 #elif defined(__MWERKS__)
889 /* CodeWarrior - troubles ? */
890 #pragma gcc_extensions
891 #elif defined(__SNC__)
892 /* Sony PS3 - troubles ? */
893 #elif defined(__hppa__) || defined(__hppa)
894 #include <machine/inline.h>
895 #else
896 #error Unsupported C compiler, please use GNU C 4.4 or newer
897 #endif /* Compiler */
898 
899 /*----------------------------------------------------------------------------*/
900 /* Byteorder */
901 
902 #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) ||           \
903     !defined(__ORDER_BIG_ENDIAN__)
904 
905 /* *INDENT-OFF* */
906 /* clang-format off */
907 #if defined(__GLIBC__) || defined(__GNU_LIBRARY__) || defined(__ANDROID_API__) ||  \
908     defined(HAVE_ENDIAN_H) || __has_include(<endian.h>)
909 #include <endian.h>
910 #elif defined(__APPLE__) || defined(__MACH__) || defined(__OpenBSD__) ||       \
911     defined(HAVE_MACHINE_ENDIAN_H) || __has_include(<machine/endian.h>)
912 #include <machine/endian.h>
913 #elif defined(HAVE_SYS_ISA_DEFS_H) || __has_include(<sys/isa_defs.h>)
914 #include <sys/isa_defs.h>
915 #elif (defined(HAVE_SYS_TYPES_H) && defined(HAVE_SYS_ENDIAN_H)) ||             \
916     (__has_include(<sys/types.h>) && __has_include(<sys/endian.h>))
917 #include <sys/endian.h>
918 #include <sys/types.h>
919 #elif defined(__bsdi__) || defined(__DragonFly__) || defined(__FreeBSD__) ||   \
920     defined(__NetBSD__) ||                              \
921     defined(HAVE_SYS_PARAM_H) || __has_include(<sys/param.h>)
922 #include <sys/param.h>
923 #endif /* OS */
924 /* *INDENT-ON* */
925 /* clang-format on */
926 
927 #if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN)
928 #define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN
929 #define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN
930 #define __BYTE_ORDER__ __BYTE_ORDER
931 #elif defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)
932 #define __ORDER_LITTLE_ENDIAN__ _LITTLE_ENDIAN
933 #define __ORDER_BIG_ENDIAN__ _BIG_ENDIAN
934 #define __BYTE_ORDER__ _BYTE_ORDER
935 #else
936 #define __ORDER_LITTLE_ENDIAN__ 1234
937 #define __ORDER_BIG_ENDIAN__ 4321
938 
939 #if defined(__LITTLE_ENDIAN__) ||                                              \
940     (defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN)) ||                      \
941     defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) ||    \
942     defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) ||            \
943     defined(_M_ARM) || defined(_M_ARM64) || defined(__e2k__) ||                \
944     defined(__elbrus_4c__) || defined(__elbrus_8c__) || defined(__bfin__) ||   \
945     defined(__BFIN__) || defined(__ia64__) || defined(_IA64) ||                \
946     defined(__IA64__) || defined(__ia64) || defined(_M_IA64) ||                \
947     defined(__itanium__) || defined(__ia32__) || defined(__CYGWIN__) ||        \
948     defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) ||              \
949     defined(__WINDOWS__)
950 #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
951 
952 #elif defined(__BIG_ENDIAN__) ||                                               \
953     (defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN)) ||                      \
954     defined(__ARMEB__) || defined(__THUMBEB__) || defined(__AARCH64EB__) ||    \
955     defined(__MIPSEB__) || defined(_MIPSEB) || defined(__MIPSEB) ||            \
956     defined(__m68k__) || defined(M68000) || defined(__hppa__) ||               \
957     defined(__hppa) || defined(__HPPA__) || defined(__sparc__) ||              \
958     defined(__sparc) || defined(__370__) || defined(__THW_370__) ||            \
959     defined(__s390__) || defined(__s390x__) || defined(__SYSC_ZARCH__)
960 #define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__
961 
962 #else
963 #error __BYTE_ORDER__ should be defined.
964 #endif /* Arch */
965 
966 #endif
967 #endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */
968 
969 /* Get the size of a memory page for the system.
970  * This is the basic size that the platform's memory manager uses, and is
971  * fundamental to the use of memory-mapped files. */
972 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t
mdbx_syspagesize(void)973 mdbx_syspagesize(void) {
974 #if defined(_WIN32) || defined(_WIN64)
975   SYSTEM_INFO si;
976   GetSystemInfo(&si);
977   return si.dwPageSize;
978 #else
979   return sysconf(_SC_PAGE_SIZE);
980 #endif
981 }
982 
/* Descriptor of a memory-mapped file region: either the DXB data file
 * or the LCK lock file, plus the underlying OS handles and cached sizes. */
typedef struct mdbx_mmap_param {
  union {
    void *address;             /* base address of the mapping */
    uint8_t *dxb;              /* same address, viewed as data-file bytes */
    struct MDBX_lockinfo *lck; /* same address, viewed as lock-file info */
  };
  mdbx_filehandle_t fd; /* OS handle of the mapped file */
  size_t limit;   /* mapping length, but NOT a size of file nor DB */
  size_t current; /* mapped region size, i.e. the size of file and DB */
  uint64_t filesize /* in-process cache of a file size */;
#if defined(_WIN32) || defined(_WIN64)
  HANDLE section; /* memory-mapped section handle */
#endif
} mdbx_mmap_t;
997 
/* A 128-bit binary value accessible either as two 64-bit or four 32-bit
 * lanes (used e.g. for the boot-id returned by mdbx_osal_bootid()). */
typedef union bin128 {
  __anonymous_struct_extension__ struct { uint64_t x, y; };
  __anonymous_struct_extension__ struct { uint32_t a, b, c, d; };
} bin128_t;
1002 
1003 #if defined(_WIN32) || defined(_WIN64)
1004 typedef union MDBX_srwlock {
1005   struct {
1006     long volatile readerCount;
1007     long volatile writerCount;
1008   };
1009   RTL_SRWLOCK native;
1010 } MDBX_srwlock;
1011 #endif /* Windows */
1012 
#ifndef __cplusplus

MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny);
MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny);

/*----------------------------------------------------------------------------*/
/* Atomics */

/* NOTE(review): the first branch below tests defined(__cplusplus), but this
 * whole region is wrapped in the `#ifndef __cplusplus` above, so the
 * <cstdatomic> branch is unreachable here — harmless, likely a leftover. */
#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && (__has_include(<cstdatomic>) || __has_extension(cxx_atomic))
#include <cstdatomic>
#define MDBX_HAVE_C11ATOMICS
#elif !defined(__cplusplus) &&                                                 \
    (__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) &&              \
    !defined(__STDC_NO_ATOMICS__) &&                                           \
    (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) ||                            \
     !(defined(__GNUC__) || defined(__clang__)))
#include <stdatomic.h>
#define MDBX_HAVE_C11ATOMICS
#elif defined(__GNUC__) || defined(__clang__)
/* intentionally empty: GCC/clang __atomic/__sync builtins need no header */
#elif defined(_MSC_VER)
#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */
#pragma warning(disable : 4133) /* 'function': incompatible types - from       \
                                   'size_t' to 'LONGLONG' */
#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to     \
                                   'std::size_t', possible loss of data */
#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to     \
                                   'long', possible loss of data */
#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange)
#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64)
#elif defined(__APPLE__)
#include <libkern/OSAtomic.h>
#else
#error FIXME atomic-ops
#endif
1047 
1048 /*----------------------------------------------------------------------------*/
1049 /* Memory/Compiler barriers, cache coherence */
1050 
1051 #if __has_include(<sys/cachectl.h>)
1052 #include <sys/cachectl.h>
1053 #elif defined(__mips) || defined(__mips__) || defined(__mips64) ||             \
1054     defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) ||            \
1055     defined(__MWERKS__) || defined(__sgi)
1056 /* MIPS should have explicit cache control */
1057 #include <sys/cachectl.h>
1058 #endif
1059 
mdbx_compiler_barrier(void)1060 MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) {
1061 #if defined(__clang__) || defined(__GNUC__)
1062   __asm__ __volatile__("" ::: "memory");
1063 #elif defined(_MSC_VER)
1064   _ReadWriteBarrier();
1065 #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */
1066   __memory_barrier();
1067 #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
1068   __compiler_barrier();
1069 #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) &&       \
1070     (defined(HP_IA64) || defined(__ia64))
1071   _Asm_sched_fence(/* LY: no-arg meaning 'all expect ALU', e.g. 0x3D3D */);
1072 #elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) ||             \
1073     defined(__ppc64__) || defined(__powerpc64__)
1074   __fence();
1075 #else
1076 #error "Could not guess the kind of compiler, please report to us."
1077 #endif
1078 }
1079 
mdbx_memory_barrier(void)1080 MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) {
1081 #ifdef MDBX_HAVE_C11ATOMICS
1082   atomic_thread_fence(memory_order_seq_cst);
1083 #elif defined(__ATOMIC_SEQ_CST)
1084 #ifdef __clang__
1085   __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
1086 #else
1087   __atomic_thread_fence(__ATOMIC_SEQ_CST);
1088 #endif
1089 #elif defined(__clang__) || defined(__GNUC__)
1090   __sync_synchronize();
1091 #elif defined(_WIN32) || defined(_WIN64)
1092   MemoryBarrier();
1093 #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */
1094 #if defined(__ia32__)
1095   _mm_mfence();
1096 #else
1097   __mf();
1098 #endif
1099 #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
1100   __machine_rw_barrier();
1101 #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) &&       \
1102     (defined(HP_IA64) || defined(__ia64))
1103   _Asm_mf();
1104 #elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) ||             \
1105     defined(__ppc64__) || defined(__powerpc64__)
1106   __lwsync();
1107 #else
1108 #error "Could not guess the kind of compiler, please report to us."
1109 #endif
1110 }
1111 
1112 /*----------------------------------------------------------------------------*/
1113 /* libc compatibility stuff */
1114 
1115 #if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) &&                           \
1116     (defined(_GNU_SOURCE) || defined(_BSD_SOURCE))
1117 #define mdbx_asprintf asprintf
1118 #define mdbx_vasprintf vasprintf
1119 #else
1120 MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC
1121     MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...);
1122 MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap);
1123 #endif
1124 
1125 /*----------------------------------------------------------------------------*/
1126 /* OS abstraction layer stuff */
1127 
1128 /* max bytes to write in one call */
1129 #if defined(_WIN32) || defined(_WIN64)
1130 #define MAX_WRITE UINT32_C(0x01000000)
1131 #else
1132 #define MAX_WRITE UINT32_C(0x3fff0000)
1133 #endif
1134 
1135 #if defined(__linux__) || defined(__gnu_linux__)
1136 MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version;
1137 MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */;
1138 #endif /* Linux */
1139 
1140 #ifndef mdbx_strdup
1141 LIBMDBX_API char *mdbx_strdup(const char *str);
1142 #endif
1143 
mdbx_get_errno(void)1144 MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) {
1145 #if defined(_WIN32) || defined(_WIN64)
1146   DWORD rc = GetLastError();
1147 #else
1148   int rc = errno;
1149 #endif
1150   return rc;
1151 }
1152 
1153 #ifndef mdbx_memalign_alloc
1154 MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes,
1155                                            void **result);
1156 #endif
1157 #ifndef mdbx_memalign_free
1158 MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr);
1159 #endif
1160 
1161 MDBX_INTERNAL_FUNC int mdbx_condpair_init(mdbx_condpair_t *condpair);
1162 MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair);
1163 MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair);
1164 MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair,
1165                                             bool part);
1166 MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, bool part);
1167 MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair);
1168 
1169 MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex);
1170 MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex);
1171 MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex);
1172 MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex);
1173 
1174 MDBX_INTERNAL_FUNC int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov,
1175                                     int iovcnt, uint64_t offset,
1176                                     size_t expected_written);
1177 MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count,
1178                                   uint64_t offset);
1179 MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf,
1180                                    size_t count, uint64_t offset);
1181 MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf,
1182                                   size_t count);
1183 
1184 MDBX_INTERNAL_FUNC int
1185 mdbx_thread_create(mdbx_thread_t *thread,
1186                    THREAD_RESULT(THREAD_CALL *start_routine)(void *),
1187                    void *arg);
1188 MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread);
1189 
/* Bitmask of sync levels accepted by mdbx_fsync()/mdbx_msync(). */
enum mdbx_syncmode_bits {
  MDBX_SYNC_NONE = 0, /* no explicit syncing requested */
  MDBX_SYNC_DATA = 1, /* flush file data */
  MDBX_SYNC_SIZE = 2, /* make the file size durable */
  MDBX_SYNC_IODQ = 4  /* presumably: flush the device I/O queue (full
                         durability barrier) — confirm against implementation */
};
1196 
1197 MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd,
1198                                   const enum mdbx_syncmode_bits mode_bits);
1199 MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length);
1200 MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos);
1201 MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length);
1202 
/* Purpose of opening a file, which selects the open flags/modes used by
 * mdbx_openfile() (DXB = data file, LCK = lock file). */
enum mdbx_openfile_purpose {
  MDBX_OPEN_DXB_READ = 0,
  MDBX_OPEN_DXB_LAZY = 1,
  MDBX_OPEN_DXB_DSYNC = 2,
  MDBX_OPEN_LCK = 3,
  MDBX_OPEN_COPY = 4,
  MDBX_OPEN_DELETE = 5
};
1211 
1212 MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose,
1213                                      const MDBX_env *env, const char *pathname,
1214                                      mdbx_filehandle_t *fd,
1215                                      mdbx_mode_t unix_mode_bits);
1216 MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd);
1217 MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname);
1218 MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname);
1219 MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd);
1220 MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait);
1221 
1222 #define MMAP_OPTION_TRUNCATE 1
1223 #define MMAP_OPTION_SEMAPHORE 2
1224 MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map,
1225                                  const size_t must, const size_t limit,
1226                                  const unsigned options);
1227 MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map);
1228 #define MDBX_MRESIZE_MAY_MOVE 0x00000100
1229 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200
1230 MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map,
1231                                     size_t size, size_t limit);
1232 #if defined(_WIN32) || defined(_WIN64)
1233 typedef struct {
1234   unsigned limit, count;
1235   HANDLE handles[31];
1236 } mdbx_handle_array_t;
1237 MDBX_INTERNAL_FUNC int
1238 mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array);
1239 MDBX_INTERNAL_FUNC int
1240 mdbx_resume_threads_after_remap(mdbx_handle_array_t *array);
1241 #endif /* Windows */
1242 MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset,
1243                                   size_t length,
1244                                   enum mdbx_syncmode_bits mode_bits);
1245 MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle,
1246                                             const char *pathname, int err);
1247 
mdbx_getpid(void)1248 MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) {
1249   STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t));
1250 #if defined(_WIN32) || defined(_WIN64)
1251   return GetCurrentProcessId();
1252 #else
1253   return getpid();
1254 #endif
1255 }
1256 
mdbx_thread_self(void)1257 MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) {
1258   mdbx_tid_t thunk;
1259   STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk));
1260 #if defined(_WIN32) || defined(_WIN64)
1261   thunk = GetCurrentThreadId();
1262 #else
1263   thunk = pthread_self();
1264 #endif
1265   return (uintptr_t)thunk;
1266 }
1267 
1268 MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void);
1269 MDBX_INTERNAL_FUNC uint64_t
1270 mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16);
1271 MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime);
1272 
1273 MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void);
1274 /*----------------------------------------------------------------------------*/
1275 /* lck stuff */
1276 
1277 /// \brief Initialization of synchronization primitives linked with MDBX_env
1278 ///   instance both in LCK-file and within the current process.
1279 /// \param
1280 ///   global_uniqueness_flag = true - denotes that there are no other processes
1281 ///     working with DB and LCK-file. Thus the function MUST initialize
1282 ///     shared synchronization objects in memory-mapped LCK-file.
1283 ///   global_uniqueness_flag = false - denotes that at least one process is
1284 ///     already working with DB and LCK-file, including the case when DB
1285 ///     has already been opened in the current process. Thus the function
1286 ///     MUST NOT initialize shared synchronization objects in memory-mapped
1287 ///     LCK-file that are already in use.
1288 /// \return Error code or zero on success.
1289 MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env,
1290                                      MDBX_env *inprocess_neighbor,
1291                                      int global_uniqueness_flag);
1292 
1293 /// \brief Disconnects from shared interprocess objects and destructs
1294 ///   synchronization objects linked with MDBX_env instance
1295 ///   within the current process.
1296 /// \param
1297 ///   inprocess_neighbor = NULL - if the current process does not have other
1298 ///     instances of MDBX_env linked with the DB being closed.
1299 ///     Thus the function MUST check for other processes working with DB or
1300 ///     LCK-file, and keep or destroy shared synchronization objects in
1301 ///     memory-mapped LCK-file depending on the result.
1302 ///   inprocess_neighbor = not-NULL - pointer to another instance of MDBX_env
1303 ///     (anyone of there is several) working with DB or LCK-file within the
1304 ///     current process. Thus the function MUST NOT try to acquire exclusive
1305 ///     lock and/or try to destruct shared synchronization objects linked with
1306 ///     DB or LCK-file. Moreover, the implementation MUST ensure correct work
1307 ///     of other instances of MDBX_env within the current process, e.g.
1308 ///     restore POSIX-fcntl locks after the closing of file descriptors.
1309 /// \return Error code (MDBX_PANIC) or zero on success.
1310 MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env,
1311                                         MDBX_env *inprocess_neighbor);
1312 
1313 /// \brief Connects to shared interprocess locking objects and tries to acquire
1314 ///   the maximum lock level (shared if exclusive is not available)
1315 ///   Depending on implementation or/and platform (Windows) this function may
1316 ///   acquire the non-OS super-level lock (e.g. for shared synchronization
1317 ///   objects initialization), which will be downgraded to OS-exclusive or
1318 ///   shared via explicit calling of mdbx_lck_downgrade().
1319 /// \return
1320 ///   MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus
1321 ///     the current process is the first and only after the last use of DB.
1322 ///   MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus
1323 ///     DB has already been opened and now is used by other processes.
1324 ///   Otherwise (not 0 and not -1) - error code.
1325 MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env);
1326 
1327 /// \brief Downgrades the level of initially acquired lock to
1328 ///   operational level specified by argument. The reson for such downgrade:
1329 ///    - unblocking of other processes that are waiting for access, i.e.
1330 ///      if (env->me_flags & MDBX_EXCLUSIVE) != 0, then other processes
1331 ///      should be made aware that access is unavailable rather than
1332 ///      wait for it.
1333 ///    - freeing locks that interfere file operation (especially for Windows)
1334 ///   (env->me_flags & MDBX_EXCLUSIVE) == 0 - downgrade to shared lock.
1335 ///   (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive
1336 ///   operational lock.
1337 /// \return Error code or zero on success
1338 MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env);
1339 
1340 /// \brief Locks LCK-file or/and table of readers for (de)registering.
1341 /// \return Error code or zero on success
1342 MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env);
1343 
1344 /// \brief Unlocks LCK-file or/and table of readers after (de)registering.
1345 MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env);
1346 
1347 /// \brief Acquires lock for DB change (on writing transaction start)
1348 ///   Reading transactions will not be blocked.
1349 ///   Declared as LIBMDBX_API because it is used in mdbx_chk.
1350 /// \return Error code or zero on success
1351 LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait);
1352 
1353 /// \brief Releases lock once DB changes is made (after writing transaction
1354 ///   has finished).
1355 ///   Declared as LIBMDBX_API because it is used in mdbx_chk.
1356 LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env);
1357 
1358 /// \brief Sets alive-flag of reader presence (indicative lock) for PID of
1359 ///   the current process. The function does no more than needed for
1360 ///   the correct working of mdbx_rpid_check() in other processes.
1361 /// \return Error code or zero on success
1362 MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env);
1363 
1364 /// \brief Resets alive-flag of reader presence (indicative lock)
1365 ///   for PID of the current process. The function does no more than needed
1366 ///   for the correct working of mdbx_rpid_check() in other processes.
1367 /// \return Error code or zero on success
1368 MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env);
1369 
1370 /// \brief Checks for reading process status with the given pid with help of
1371 ///   alive-flag of presence (indicative lock) or using another way.
1372 /// \return
1373 ///   MDBX_RESULT_TRUE (-1) - if the reader process with the given PID is alive
1374 ///     and working with DB (indicative lock is present).
1375 ///   MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent
1376 ///     or not working with DB (indicative lock is not present).
1377 ///   Otherwise (not 0 and not -1) - error code.
1378 MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid);
1379 
1380 #if defined(_WIN32) || defined(_WIN64)
1381 
1382 typedef void(WINAPI *MDBX_srwlock_function)(MDBX_srwlock *);
1383 MDBX_INTERNAL_VAR MDBX_srwlock_function mdbx_srwlock_Init,
1384     mdbx_srwlock_AcquireShared, mdbx_srwlock_ReleaseShared,
1385     mdbx_srwlock_AcquireExclusive, mdbx_srwlock_ReleaseExclusive;
1386 
1387 #if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */
1388 typedef enum _FILE_INFO_BY_HANDLE_CLASS {
1389   FileBasicInfo,
1390   FileStandardInfo,
1391   FileNameInfo,
1392   FileRenameInfo,
1393   FileDispositionInfo,
1394   FileAllocationInfo,
1395   FileEndOfFileInfo,
1396   FileStreamInfo,
1397   FileCompressionInfo,
1398   FileAttributeTagInfo,
1399   FileIdBothDirectoryInfo,
1400   FileIdBothDirectoryRestartInfo,
1401   FileIoPriorityHintInfo,
1402   FileRemoteProtocolInfo,
1403   MaximumFileInfoByHandleClass
1404 } FILE_INFO_BY_HANDLE_CLASS,
1405     *PFILE_INFO_BY_HANDLE_CLASS;
1406 
1407 typedef struct _FILE_END_OF_FILE_INFO {
1408   LARGE_INTEGER EndOfFile;
1409 } FILE_END_OF_FILE_INFO, *PFILE_END_OF_FILE_INFO;
1410 
1411 #define REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK 0x00000001
1412 #define REMOTE_PROTOCOL_INFO_FLAG_OFFLINE 0x00000002
1413 
1414 typedef struct _FILE_REMOTE_PROTOCOL_INFO {
1415   USHORT StructureVersion;
1416   USHORT StructureSize;
1417   DWORD Protocol;
1418   USHORT ProtocolMajorVersion;
1419   USHORT ProtocolMinorVersion;
1420   USHORT ProtocolRevision;
1421   USHORT Reserved;
1422   DWORD Flags;
1423   struct {
1424     DWORD Reserved[8];
1425   } GenericReserved;
1426   struct {
1427     DWORD Reserved[16];
1428   } ProtocolSpecificReserved;
1429 } FILE_REMOTE_PROTOCOL_INFO, *PFILE_REMOTE_PROTOCOL_INFO;
1430 
1431 #endif /* _WIN32_WINNT < 0x0600 (prior to Windows Vista) */
1432 
1433 typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)(
1434     _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass,
1435     _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize);
1436 MDBX_INTERNAL_VAR MDBX_GetFileInformationByHandleEx
1437     mdbx_GetFileInformationByHandleEx;
1438 
1439 typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)(
1440     _In_ HANDLE hFile, _Out_opt_ LPWSTR lpVolumeNameBuffer,
1441     _In_ DWORD nVolumeNameSize, _Out_opt_ LPDWORD lpVolumeSerialNumber,
1442     _Out_opt_ LPDWORD lpMaximumComponentLength,
1443     _Out_opt_ LPDWORD lpFileSystemFlags,
1444     _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize);
1445 MDBX_INTERNAL_VAR MDBX_GetVolumeInformationByHandleW
1446     mdbx_GetVolumeInformationByHandleW;
1447 
1448 typedef DWORD(WINAPI *MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile,
1449                                                       _Out_ LPWSTR lpszFilePath,
1450                                                       _In_ DWORD cchFilePath,
1451                                                       _In_ DWORD dwFlags);
1452 MDBX_INTERNAL_VAR MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW;
1453 
1454 typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)(
1455     _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass,
1456     _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize);
1457 MDBX_INTERNAL_VAR MDBX_SetFileInformationByHandle
1458     mdbx_SetFileInformationByHandle;
1459 
1460 typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)(
1461     IN HANDLE FileHandle, IN OUT HANDLE Event,
1462     IN OUT PVOID /* PIO_APC_ROUTINE */ ApcRoutine, IN OUT PVOID ApcContext,
1463     OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode,
1464     IN OUT PVOID InputBuffer, IN ULONG InputBufferLength,
1465     OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength);
1466 MDBX_INTERNAL_VAR MDBX_NtFsControlFile mdbx_NtFsControlFile;
1467 
1468 typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void);
1469 MDBX_INTERNAL_VAR MDBX_GetTickCount64 mdbx_GetTickCount64;
1470 
1471 #if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8
1472 typedef struct _WIN32_MEMORY_RANGE_ENTRY {
1473   PVOID VirtualAddress;
1474   SIZE_T NumberOfBytes;
1475 } WIN32_MEMORY_RANGE_ENTRY, *PWIN32_MEMORY_RANGE_ENTRY;
1476 #endif /* Windows 8.x */
1477 
1478 typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)(
1479     HANDLE hProcess, ULONG_PTR NumberOfEntries,
1480     PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags);
1481 MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
1482 
1483 typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT;
1484 
1485 typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle,
1486                                               IN PLARGE_INTEGER NewSectionSize);
1487 MDBX_INTERNAL_VAR MDBX_NtExtendSection mdbx_NtExtendSection;
1488 
mdbx_RunningUnderWine(void)1489 static __inline bool mdbx_RunningUnderWine(void) {
1490   return !mdbx_NtExtendSection;
1491 }
1492 
1493 typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey,
1494                                            LPCSTR lpValue, DWORD dwFlags,
1495                                            LPDWORD pdwType, PVOID pvData,
1496                                            LPDWORD pcbData);
1497 MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA;
1498 
1499 #endif /* Windows */
1500 
1501 #endif /* !__cplusplus */
1502 
1503 /*----------------------------------------------------------------------------*/
1504 
1505 #if defined(_MSC_VER) && _MSC_VER >= 1900
1506 /* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros
1507  * for internal format-args checker. */
1508 #undef PRIuPTR
1509 #undef PRIiPTR
1510 #undef PRIdPTR
1511 #undef PRIxPTR
1512 #define PRIuPTR "Iu"
1513 #define PRIiPTR "Ii"
1514 #define PRIdPTR "Id"
1515 #define PRIxPTR "Ix"
1516 #define PRIuSIZE "zu"
1517 #define PRIiSIZE "zi"
1518 #define PRIdSIZE "zd"
1519 #define PRIxSIZE "zx"
1520 #endif /* fix PRI*PTR for _MSC_VER */
1521 
1522 #ifndef PRIuSIZE
1523 #define PRIuSIZE PRIuPTR
1524 #define PRIiSIZE PRIiPTR
1525 #define PRIdSIZE PRIdPTR
1526 #define PRIxSIZE PRIxPTR
1527 #endif /* PRI*SIZE macros for MSVC */
1528 
1529 #ifdef _MSC_VER
1530 #pragma warning(pop)
1531 #endif
1532 
1533 #define mdbx_sourcery_anchor XCONCAT(mdbx_sourcery_, MDBX_BUILD_SOURCERY)
1534 #if defined(xMDBX_TOOLS)
1535 extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
1536 #endif
1537 
1538 /*******************************************************************************
1539  *******************************************************************************
1540  *******************************************************************************
1541  *
1542  *
1543  *         ####   #####    #####     #     ####   #    #   ####
1544  *        #    #  #    #     #       #    #    #  ##   #  #
1545  *        #    #  #    #     #       #    #    #  # #  #   ####
1546  *        #    #  #####      #       #    #    #  #  # #       #
1547  *        #    #  #          #       #    #    #  #   ##  #    #
1548  *         ####   #          #       #     ####   #    #   ####
1549  *
1550  *
1551  */
1552 
1553 /** \defgroup build_option Build options
1554  * The libmdbx build options.
1555  @{ */
1556 
1557 /** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */
1558 #define MDBX_OSX_WANNA_DURABILITY 0
1559 /** Using fsync() with chance of data lost on power failure */
1560 #define MDBX_OSX_WANNA_SPEED 1
1561 
1562 #ifndef MDBX_OSX_SPEED_INSTEADOF_DURABILITY
/** Chooses between \ref MDBX_OSX_WANNA_DURABILITY and
 * \ref MDBX_OSX_WANNA_SPEED for OSX & iOS */
1565 #define MDBX_OSX_SPEED_INSTEADOF_DURABILITY MDBX_OSX_WANNA_DURABILITY
1566 #endif /* MDBX_OSX_SPEED_INSTEADOF_DURABILITY */
1567 
1568 /** Controls checking PID against reuse DB environment after the fork() */
1569 #ifndef MDBX_ENV_CHECKPID
1570 #if defined(MADV_DONTFORK) || defined(_WIN32) || defined(_WIN64)
1571 /* PID check could be omitted:
1572  *  - on Linux when madvise(MADV_DONTFORK) is available, i.e. after the fork()
1573  *    mapped pages will not be available for child process.
1574  *  - in Windows where fork() not available. */
1575 #define MDBX_ENV_CHECKPID 0
1576 #else
1577 #define MDBX_ENV_CHECKPID 1
1578 #endif
1579 #define MDBX_ENV_CHECKPID_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_ENV_CHECKPID)
1580 #else
1581 #define MDBX_ENV_CHECKPID_CONFIG MDBX_STRINGIFY(MDBX_ENV_CHECKPID)
1582 #endif /* MDBX_ENV_CHECKPID */
1583 
1584 /** Controls checking transaction owner thread against misuse transactions from
1585  * other threads. */
1586 #ifndef MDBX_TXN_CHECKOWNER
1587 #define MDBX_TXN_CHECKOWNER 1
1588 #define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER)
1589 #else
1590 #define MDBX_TXN_CHECKOWNER_CONFIG MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER)
1591 #endif /* MDBX_TXN_CHECKOWNER */
1592 
1593 /** Does a system have battery-backed Real-Time Clock or just a fake. */
1594 #ifndef MDBX_TRUST_RTC
1595 #if defined(__linux__) || defined(__gnu_linux__) || defined(__NetBSD__) ||     \
1596     defined(__OpenBSD__)
1597 #define MDBX_TRUST_RTC 0 /* a lot of embedded systems have a fake RTC */
1598 #else
1599 #define MDBX_TRUST_RTC 1
1600 #endif
1601 #define MDBX_TRUST_RTC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TRUST_RTC)
1602 #else
1603 #define MDBX_TRUST_RTC_CONFIG MDBX_STRINGIFY(MDBX_TRUST_RTC)
1604 #endif /* MDBX_TRUST_RTC */
1605 
1606 /** Controls online database auto-compactification during write-transactions. */
1607 #ifndef MDBX_ENABLE_REFUND
1608 #define MDBX_ENABLE_REFUND 1
1609 #elif !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1)
1610 #error MDBX_ENABLE_REFUND must be defined as 0 or 1
1611 #endif /* MDBX_ENABLE_REFUND */
1612 
1613 /** Controls gathering statistics for page operations. */
1614 #ifndef MDBX_ENABLE_PGOP_STAT
1615 #define MDBX_ENABLE_PGOP_STAT 1
1616 #elif !(MDBX_ENABLE_PGOP_STAT == 0 || MDBX_ENABLE_PGOP_STAT == 1)
1617 #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1
1618 #endif /* MDBX_ENABLE_PGOP_STAT */
1619 
1620 /** Controls use of POSIX madvise() hints and friends. */
1621 #ifndef MDBX_ENABLE_MADVISE
1622 #define MDBX_ENABLE_MADVISE 1
1623 #elif !(MDBX_ENABLE_MADVISE == 0 || MDBX_ENABLE_MADVISE == 1)
1624 #error MDBX_ENABLE_MADVISE must be defined as 0 or 1
1625 #endif /* MDBX_ENABLE_MADVISE */
1626 
/** Disables some checks to reduce overhead, lowering the probability of
 * detecting database corruption to values closer to LMDB's. */
1629 #ifndef MDBX_DISABLE_PAGECHECKS
1630 #define MDBX_DISABLE_PAGECHECKS 0
1631 #elif !(MDBX_DISABLE_PAGECHECKS == 0 || MDBX_DISABLE_PAGECHECKS == 1)
1632 #error MDBX_DISABLE_PAGECHECKS must be defined as 0 or 1
1633 #endif /* MDBX_DISABLE_PAGECHECKS */
1634 
1635 #ifndef MDBX_PNL_PREALLOC_FOR_RADIXSORT
1636 #define MDBX_PNL_PREALLOC_FOR_RADIXSORT 1
1637 #elif !(MDBX_PNL_PREALLOC_FOR_RADIXSORT == 0 ||                                \
1638         MDBX_PNL_PREALLOC_FOR_RADIXSORT == 1)
1639 #error MDBX_PNL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1
1640 #endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */
1641 
1642 #ifndef MDBX_DPL_PREALLOC_FOR_RADIXSORT
1643 #define MDBX_DPL_PREALLOC_FOR_RADIXSORT 1
1644 #elif !(MDBX_DPL_PREALLOC_FOR_RADIXSORT == 0 ||                                \
1645         MDBX_DPL_PREALLOC_FOR_RADIXSORT == 1)
1646 #error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1
1647 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
1648 
1649 /* Basically, this build-option is for TODO. Guess it should be replaced
1650  * with MDBX_ENABLE_WRITEMAP_SPILLING with the three variants:
1651  *  0/OFF = Don't track dirty pages at all and don't spilling ones.
1652  *          This should be by-default on Linux and may-be other systems
1653  *          (not sure: Darwin/OSX, FreeBSD, Windows 10) where kernel provides
1654  *          properly LRU tracking and async writing on-demand.
1655  *  1/ON  = Lite tracking of dirty pages but with LRU labels and explicit
1656  *          spilling with msync(MS_ASYNC). */
1657 #ifndef MDBX_FAKE_SPILL_WRITEMAP
1658 #if defined(__linux__) || defined(__gnu_linux__)
1659 #define MDBX_FAKE_SPILL_WRITEMAP 1 /* msync(MS_ASYNC) is no-op on Linux */
1660 #else
1661 #define MDBX_FAKE_SPILL_WRITEMAP 0
1662 #endif
1663 #elif !(MDBX_FAKE_SPILL_WRITEMAP == 0 || MDBX_FAKE_SPILL_WRITEMAP == 1)
1664 #error MDBX_FAKE_SPILL_WRITEMAP must be defined as 0 or 1
1665 #endif /* MDBX_FAKE_SPILL_WRITEMAP */
1666 
/** Controls the sort order of internal page number lists.
 * This is mostly an experimental/advanced option, not intended for regular
 * MDBX users.
 * \warning The database format depends on this option, and libmdbx builds
 * with different values of this option are incompatible. */
1671 #ifndef MDBX_PNL_ASCENDING
1672 #define MDBX_PNL_ASCENDING 0
1673 #elif !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1)
1674 #error MDBX_PNL_ASCENDING must be defined as 0 or 1
1675 #endif /* MDBX_PNL_ASCENDING */
1676 
1677 /** Avoid dependence from MSVC CRT and use ntdll.dll instead. */
1678 #ifndef MDBX_WITHOUT_MSVC_CRT
1679 #define MDBX_WITHOUT_MSVC_CRT 1
1680 #elif !(MDBX_WITHOUT_MSVC_CRT == 0 || MDBX_WITHOUT_MSVC_CRT == 1)
1681 #error MDBX_WITHOUT_MSVC_CRT must be defined as 0 or 1
1682 #endif /* MDBX_WITHOUT_MSVC_CRT */
1683 
/** Size of the buffer used while copying an environment/database file. */
1685 #ifndef MDBX_ENVCOPY_WRITEBUF
1686 #define MDBX_ENVCOPY_WRITEBUF 1048576u
1687 #elif MDBX_ENVCOPY_WRITEBUF < 65536u || MDBX_ENVCOPY_WRITEBUF > 1073741824u || \
1688     MDBX_ENVCOPY_WRITEBUF % 65536u
1689 #error MDBX_ENVCOPY_WRITEBUF must be defined in range 65536..1073741824 and be multiple of 65536
1690 #endif /* MDBX_ENVCOPY_WRITEBUF */
1691 
1692 /** Forces assertion checking */
1693 #ifndef MDBX_FORCE_ASSERTIONS
1694 #define MDBX_FORCE_ASSERTIONS 0
1695 #elif !(MDBX_FORCE_ASSERTIONS == 0 || MDBX_FORCE_ASSERTIONS == 1)
1696 #error MDBX_FORCE_ASSERTIONS must be defined as 0 or 1
1697 #endif /* MDBX_FORCE_ASSERTIONS */
1698 
1699 /** Presumed malloc size overhead for each allocation
1700  * to adjust allocations to be more aligned. */
1701 #ifndef MDBX_ASSUME_MALLOC_OVERHEAD
1702 #ifdef __SIZEOF_POINTER__
1703 #define MDBX_ASSUME_MALLOC_OVERHEAD (__SIZEOF_POINTER__ * 2u)
1704 #else
1705 #define MDBX_ASSUME_MALLOC_OVERHEAD (sizeof(void *) * 2u)
1706 #endif
1707 #elif MDBX_ASSUME_MALLOC_OVERHEAD < 0 || MDBX_ASSUME_MALLOC_OVERHEAD > 64 ||   \
1708     MDBX_ASSUME_MALLOC_OVERHEAD % 4
1709 #error MDBX_ASSUME_MALLOC_OVERHEAD must be defined in range 0..64 and be multiple of 4
1710 #endif /* MDBX_ASSUME_MALLOC_OVERHEAD */
1711 
1712 /** In case the MDBX_DEBUG is undefined set it corresponding to NDEBUG */
1713 #ifndef MDBX_DEBUG
1714 #ifdef NDEBUG
1715 #define MDBX_DEBUG 0
1716 #else
1717 #define MDBX_DEBUG 1
1718 #endif
1719 #endif /* MDBX_DEBUG */
1720 
1721 /** If defined then enables integration with Valgrind,
1722  * a memory analyzing tool. */
1723 #ifndef MDBX_USE_VALGRIND
1724 #endif /* MDBX_USE_VALGRIND */
1725 
1726 /** If defined then enables use C11 atomics,
1727  *  otherwise detects ones availability automatically. */
1728 #ifndef MDBX_HAVE_C11ATOMICS
1729 #endif /* MDBX_HAVE_C11ATOMICS */
1730 
1731 //------------------------------------------------------------------------------
1732 
1733 /** Win32 File Locking API for \ref MDBX_LOCKING */
1734 #define MDBX_LOCKING_WIN32FILES -1
1735 
1736 /** SystemV IPC semaphores for \ref MDBX_LOCKING */
1737 #define MDBX_LOCKING_SYSV 5
1738 
1739 /** POSIX-1 Shared anonymous semaphores for \ref MDBX_LOCKING */
1740 #define MDBX_LOCKING_POSIX1988 1988
1741 
1742 /** POSIX-2001 Shared Mutexes for \ref MDBX_LOCKING */
1743 #define MDBX_LOCKING_POSIX2001 2001
1744 
1745 /** POSIX-2008 Robust Mutexes for \ref MDBX_LOCKING */
1746 #define MDBX_LOCKING_POSIX2008 2008
1747 
1748 /** BeOS Benaphores, aka Futexes for \ref MDBX_LOCKING */
1749 #define MDBX_LOCKING_BENAPHORE 1995
1750 
/** Advanced: Chooses the locking implementation (autodetection by default). */
1752 #if defined(_WIN32) || defined(_WIN64)
1753 #define MDBX_LOCKING MDBX_LOCKING_WIN32FILES
1754 #else
1755 #ifndef MDBX_LOCKING
1756 #if defined(_POSIX_THREAD_PROCESS_SHARED) &&                                   \
1757     _POSIX_THREAD_PROCESS_SHARED >= 200112L && !defined(__FreeBSD__)
1758 
/* Some platforms define the EOWNERDEAD error code even though they
 * don't support Robust Mutexes. If in doubt, compile with -DMDBX_LOCKING=2001. */
1761 #if defined(EOWNERDEAD) && _POSIX_THREAD_PROCESS_SHARED >= 200809L &&          \
1762     ((defined(_POSIX_THREAD_ROBUST_PRIO_INHERIT) &&                            \
1763       _POSIX_THREAD_ROBUST_PRIO_INHERIT > 0) ||                                \
1764      (defined(_POSIX_THREAD_ROBUST_PRIO_PROTECT) &&                            \
1765       _POSIX_THREAD_ROBUST_PRIO_PROTECT > 0) ||                                \
1766      defined(PTHREAD_MUTEX_ROBUST) || defined(PTHREAD_MUTEX_ROBUST_NP)) &&     \
1767     (!defined(__GLIBC__) ||                                                    \
1768      __GLIBC_PREREQ(2, 10) /* troubles with Robust mutexes before 2.10 */)
1769 #define MDBX_LOCKING MDBX_LOCKING_POSIX2008
1770 #else
1771 #define MDBX_LOCKING MDBX_LOCKING_POSIX2001
1772 #endif
1773 #elif defined(__sun) || defined(__SVR4) || defined(__svr4__)
1774 #define MDBX_LOCKING MDBX_LOCKING_POSIX1988
1775 #else
1776 #define MDBX_LOCKING MDBX_LOCKING_SYSV
1777 #endif
1778 #define MDBX_LOCKING_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_LOCKING)
1779 #else
1780 #define MDBX_LOCKING_CONFIG MDBX_STRINGIFY(MDBX_LOCKING)
1781 #endif /* MDBX_LOCKING */
1782 #endif /* !Windows */
1783 
1784 /** Advanced: Using POSIX OFD-locks (autodetection by default). */
1785 #ifndef MDBX_USE_OFDLOCKS
1786 #if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) &&   \
1787     !defined(MDBX_SAFE4QEMU) &&                                                \
1788     !defined(__sun) /* OFD-lock are broken on Solaris */
1789 #define MDBX_USE_OFDLOCKS 1
1790 #else
1791 #define MDBX_USE_OFDLOCKS 0
1792 #endif
1793 #define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_USE_OFDLOCKS)
1794 #else
1795 #define MDBX_USE_OFDLOCKS_CONFIG MDBX_STRINGIFY(MDBX_USE_OFDLOCKS)
1796 #endif /* MDBX_USE_OFDLOCKS */
1797 
1798 /** Advanced: Using sendfile() syscall (autodetection by default). */
1799 #ifndef MDBX_USE_SENDFILE
1800 #if ((defined(__linux__) || defined(__gnu_linux__)) &&                         \
1801      !defined(__ANDROID_API__)) ||                                             \
1802     (defined(__ANDROID_API__) && __ANDROID_API__ >= 21)
1803 #define MDBX_USE_SENDFILE 1
1804 #else
1805 #define MDBX_USE_SENDFILE 0
1806 #endif
1807 #endif /* MDBX_USE_SENDFILE */
1808 
1809 /** Advanced: Using copy_file_range() syscall (autodetection by default). */
1810 #ifndef MDBX_USE_COPYFILERANGE
1811 #if __GLIBC_PREREQ(2, 27) && defined(_GNU_SOURCE)
1812 #define MDBX_USE_COPYFILERANGE 1
1813 #else
1814 #define MDBX_USE_COPYFILERANGE 0
1815 #endif
1816 #endif /* MDBX_USE_COPYFILERANGE */
1817 
1818 /** Advanced: Using sync_file_range() syscall (autodetection by default). */
1819 #ifndef MDBX_USE_SYNCFILERANGE
1820 #if ((defined(__linux__) || defined(__gnu_linux__)) &&                         \
1821      defined(SYNC_FILE_RANGE_WRITE) && !defined(__ANDROID_API__)) ||           \
1822     (defined(__ANDROID_API__) && __ANDROID_API__ >= 26)
1823 #define MDBX_USE_SYNCFILERANGE 1
1824 #else
1825 #define MDBX_USE_SYNCFILERANGE 0
1826 #endif
1827 #endif /* MDBX_USE_SYNCFILERANGE */
1828 
1829 //------------------------------------------------------------------------------
1830 
1831 #ifndef MDBX_CPU_WRITEBACK_INCOHERENT
1832 #if defined(__ia32__) || defined(__e2k__) || defined(__hppa) ||                \
1833     defined(__hppa__) || defined(DOXYGEN)
1834 #define MDBX_CPU_WRITEBACK_INCOHERENT 0
1835 #else
1836 #define MDBX_CPU_WRITEBACK_INCOHERENT 1
1837 #endif
1838 #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */
1839 
1840 #ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE
1841 #ifdef __OpenBSD__
1842 #define MDBX_MMAP_INCOHERENT_FILE_WRITE 1
1843 #else
1844 #define MDBX_MMAP_INCOHERENT_FILE_WRITE 0
1845 #endif
1846 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */
1847 
1848 #ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE
1849 #if defined(__mips) || defined(__mips__) || defined(__mips64) ||               \
1850     defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) ||            \
1851     defined(__MWERKS__) || defined(__sgi)
1852 /* MIPS has cache coherency issues. */
1853 #define MDBX_MMAP_INCOHERENT_CPU_CACHE 1
1854 #else
1855 /* LY: assume no relevant mmap/dcache issues. */
1856 #define MDBX_MMAP_INCOHERENT_CPU_CACHE 0
1857 #endif
1858 #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */
1859 
1860 #ifndef MDBX_64BIT_ATOMIC
1861 #if MDBX_WORDBITS >= 64 || defined(DOXYGEN)
1862 #define MDBX_64BIT_ATOMIC 1
1863 #else
1864 #define MDBX_64BIT_ATOMIC 0
1865 #endif
1866 #define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_ATOMIC)
1867 #else
1868 #define MDBX_64BIT_ATOMIC_CONFIG MDBX_STRINGIFY(MDBX_64BIT_ATOMIC)
1869 #endif /* MDBX_64BIT_ATOMIC */
1870 
1871 #ifndef MDBX_64BIT_CAS
1872 #if defined(ATOMIC_LLONG_LOCK_FREE)
1873 #if ATOMIC_LLONG_LOCK_FREE > 1
1874 #define MDBX_64BIT_CAS 1
1875 #else
1876 #define MDBX_64BIT_CAS 0
1877 #endif
1878 #elif defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
1879 #if __GCC_ATOMIC_LLONG_LOCK_FREE > 1
1880 #define MDBX_64BIT_CAS 1
1881 #else
1882 #define MDBX_64BIT_CAS 0
1883 #endif
1884 #elif defined(__CLANG_ATOMIC_LLONG_LOCK_FREE)
1885 #if __CLANG_ATOMIC_LLONG_LOCK_FREE > 1
1886 #define MDBX_64BIT_CAS 1
1887 #else
1888 #define MDBX_64BIT_CAS 0
1889 #endif
1890 #elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN)
1891 #define MDBX_64BIT_CAS 1
1892 #else
1893 #define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC
1894 #endif
1895 #define MDBX_64BIT_CAS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_CAS)
1896 #else
1897 #define MDBX_64BIT_CAS_CONFIG MDBX_STRINGIFY(MDBX_64BIT_CAS)
1898 #endif /* MDBX_64BIT_CAS */
1899 
1900 #ifndef MDBX_UNALIGNED_OK
1901 #ifdef _MSC_VER
1902 #define MDBX_UNALIGNED_OK 1 /* avoid MSVC misoptimization */
1903 #elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0)
1904 #define MDBX_UNALIGNED_OK 0 /* expecting optimization is well done */
1905 #elif (defined(__ia32__) || defined(__ARM_FEATURE_UNALIGNED)) &&               \
1906     !defined(__ALIGNED__)
1907 #define MDBX_UNALIGNED_OK 1
1908 #else
1909 #define MDBX_UNALIGNED_OK 0
1910 #endif
1911 #endif /* MDBX_UNALIGNED_OK */
1912 
1913 #ifndef MDBX_CACHELINE_SIZE
1914 #if defined(SYSTEM_CACHE_ALIGNMENT_SIZE)
1915 #define MDBX_CACHELINE_SIZE SYSTEM_CACHE_ALIGNMENT_SIZE
1916 #elif defined(__ia64__) || defined(__ia64) || defined(_M_IA64)
1917 #define MDBX_CACHELINE_SIZE 128
1918 #else
1919 #define MDBX_CACHELINE_SIZE 64
1920 #endif
1921 #endif /* MDBX_CACHELINE_SIZE */
1922 
1923 /** @} end of build options */
1924 /*******************************************************************************
1925  *******************************************************************************
1926  ******************************************************************************/
1927 
1928 #ifdef DOXYGEN
1929 /* !!! Actually this is a fake definitions     !!!
1930  * !!! for documentation generation by Doxygen !!! */
1931 
1932 /** Controls enabling of debugging features.
1933  *
1934  *  - `MDBX_DEBUG = 0` (by default) Disables any debugging features at all,
1935  *                     including logging and assertion controls.
1936  *                     Logging level and corresponding debug flags changing
1937  *                     by \ref mdbx_setup_debug() will not have effect.
1938  *  - `MDBX_DEBUG > 0` Enables code for the debugging features (logging,
1939  *                     assertions checking and internal audit).
1940  *                     Simultaneously sets the default logging level
1941  *                     to the `MDBX_DEBUG` value.
1942  *                     Also enables \ref MDBX_DBG_AUDIT if `MDBX_DEBUG >= 2`.
1943  *
1944  * \ingroup build_option */
1945 #define MDBX_DEBUG 0...7
1946 
1947 /** Disables using of GNU libc extensions. */
1948 #define MDBX_DISABLE_GNU_SOURCE 0 or 1
1949 
1950 #endif /* DOXYGEN */
1951 
1952 /* Undefine the NDEBUG if debugging is enforced by MDBX_DEBUG */
1953 #if MDBX_DEBUG
1954 #undef NDEBUG
1955 #endif
1956 
1957 /*----------------------------------------------------------------------------*/
1958 /* Atomics */
1959 
/* Memory-ordering levels accepted by the atomic helpers below.
 * When C11 atomics are available they are translated to the standard
 * memory_order values by mo_c11_store() / mo_c11_load(). */
enum MDBX_memory_order {
  mo_Relaxed,              /* atomicity only, no ordering constraints */
  mo_AcquireRelease,       /* acquire on loads, release on stores */
  mo_SequentialConsistency /* sequentially-consistent ordering */
};
1965 
/* 32-bit atomic cell: when the compiler provides C11 atomics the `c11a`
 * view is used; otherwise the plain volatile `weak` view is used together
 * with explicit barriers (see atomic_store32()/atomic_load32() below). */
typedef union {
  volatile uint32_t weak; /* non-C11 fallback view */
#ifdef MDBX_HAVE_C11ATOMICS
  volatile _Atomic uint32_t c11a; /* C11 atomic view of the same storage */
#endif /* MDBX_HAVE_C11ATOMICS */
} MDBX_atomic_uint32_t;
1972 
/* 64-bit atomic cell. Besides the C11 view (present only when 64-bit
 * atomics/CAS are usable), provides a pair of 32-bit halves whose order
 * follows the host byte order, so `low` always overlays the numerically
 * low half — presumably for updating the value as two 32-bit operations
 * on platforms without native 64-bit atomics (TODO: confirm at use sites). */
typedef union {
  volatile uint64_t weak;
#if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC)
  volatile _Atomic uint64_t c11a;
#endif
#if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC
  __anonymous_struct_extension__ struct {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    MDBX_atomic_uint32_t low, high;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    MDBX_atomic_uint32_t high, low;
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
  };
#endif
} MDBX_atomic_uint64_t;
1990 
1991 #ifdef MDBX_HAVE_C11ATOMICS
1992 
1993 /* Crutches for C11 atomic compiler's bugs */
1994 #if defined(__e2k__) && defined(__LCC__) && __LCC__ < /* FIXME */ 127
1995 #define MDBX_c11a_ro(type, ptr) (&(ptr)->weak)
1996 #define MDBX_c11a_rw(type, ptr) (&(ptr)->weak)
1997 #elif defined(__clang__) && __clang__ < 8
1998 #define MDBX_c11a_ro(type, ptr) ((volatile _Atomic(type) *)&(ptr)->c11a)
1999 #define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a)
2000 #else
2001 #define MDBX_c11a_ro(type, ptr) (&(ptr)->c11a)
2002 #define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a)
2003 #endif /* Crutches for C11 atomic compiler's bugs */
2004 
mo_c11_store(enum MDBX_memory_order fence)2005 static __always_inline memory_order mo_c11_store(enum MDBX_memory_order fence) {
2006   switch (fence) {
2007   default:
2008     assert(false);
2009     __unreachable();
2010   case mo_Relaxed:
2011     return memory_order_relaxed;
2012   case mo_AcquireRelease:
2013     return memory_order_release;
2014   case mo_SequentialConsistency:
2015     return memory_order_seq_cst;
2016   }
2017 }
2018 
mo_c11_load(enum MDBX_memory_order fence)2019 static __always_inline memory_order mo_c11_load(enum MDBX_memory_order fence) {
2020   switch (fence) {
2021   default:
2022     assert(false);
2023     __unreachable();
2024   case mo_Relaxed:
2025     return memory_order_relaxed;
2026   case mo_AcquireRelease:
2027     return memory_order_acquire;
2028   case mo_SequentialConsistency:
2029     return memory_order_seq_cst;
2030   }
2031 }
2032 #endif /* MDBX_HAVE_C11ATOMICS */
2033 
2034 #ifndef __cplusplus
2035 
/* Issues a memory fence of the requested strength; `write` selects the
 * store-side (release) mapping, otherwise the load-side (acquire) one.
 * Non-C11 path: always a compiler barrier, plus a hardware barrier for
 * writes when the order demands it — above mo_Relaxed on CPUs flagged by
 * MDBX_CPU_WRITEBACK_INCOHERENT, otherwise only for mo_SequentialConsistency
 * (i.e. order > mo_AcquireRelease). */
MDBX_MAYBE_UNUSED static __always_inline void
mdbx_memory_fence(enum MDBX_memory_order order, bool write) {
#ifdef MDBX_HAVE_C11ATOMICS
  atomic_thread_fence(write ? mo_c11_store(order) : mo_c11_load(order));
#else  /* MDBX_HAVE_C11ATOMICS */
  mdbx_compiler_barrier();
  if (write &&
      order > (MDBX_CPU_WRITEBACK_INCOHERENT ? mo_Relaxed : mo_AcquireRelease))
    mdbx_memory_barrier();
#endif /* MDBX_HAVE_C11ATOMICS */
}
2047 
/* Atomically stores `value` into *p with the requested ordering and
 * returns the value that was stored.
 * Non-C11 path: a compiler barrier before the store (skipped for
 * mo_Relaxed), then the volatile store, then a write-side fence of the
 * requested strength via mdbx_memory_fence(). */
MDBX_MAYBE_UNUSED static __always_inline uint32_t
atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value,
               enum MDBX_memory_order order) {
  STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4);
#ifdef MDBX_HAVE_C11ATOMICS
  /* a 32-bit cell is expected to be lock-free on all supported targets */
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
  atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order));
#else  /* MDBX_HAVE_C11ATOMICS */
  if (order != mo_Relaxed)
    mdbx_compiler_barrier();
  p->weak = value;
  mdbx_memory_fence(order, true);
#endif /* MDBX_HAVE_C11ATOMICS */
  return value;
}
2063 
/* Atomically loads the value of *p with the requested ordering.
 * Non-C11 path: a read-side fence first, then the volatile load, then a
 * compiler barrier (skipped for mo_Relaxed) to pin the read in place. */
MDBX_MAYBE_UNUSED static __always_inline uint32_t
atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) {
  STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4);
#ifdef MDBX_HAVE_C11ATOMICS
  /* a 32-bit cell is expected to be lock-free on all supported targets */
  assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p)));
  return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order));
#else  /* MDBX_HAVE_C11ATOMICS */
  mdbx_memory_fence(order, false);
  const uint32_t value = p->weak;
  if (order != mo_Relaxed)
    mdbx_compiler_barrier();
  return value;
#endif /* MDBX_HAVE_C11ATOMICS */
}
2078 
2079 #endif /* !__cplusplus */
2080 
2081 /*----------------------------------------------------------------------------*/
2082 /* Basic constants and types */
2083 
2084 /* A stamp that identifies a file as an MDBX file.
2085  * There's nothing special about this value other than that it is easily
2086  * recognizable, and it will reflect any byte order mismatches. */
2087 #define MDBX_MAGIC UINT64_C(/* 56-bit prime */ 0x59659DBDEF4C11)
2088 
2089 /* FROZEN: The version number for a database's datafile format. */
2090 #define MDBX_DATA_VERSION 3
2091 /* The version number for a database's lockfile format. */
2092 #define MDBX_LOCK_VERSION 4
2093 
2094 /* handle for the DB used to track free pages. */
2095 #define FREE_DBI 0
2096 /* handle for the default DB. */
2097 #define MAIN_DBI 1
2098 /* Number of DBs in metapage (free and main) - also hardcoded elsewhere */
2099 #define CORE_DBS 2
2100 
2101 /* Number of meta pages - also hardcoded elsewhere */
2102 #define NUM_METAS 3
2103 
2104 /* A page number in the database.
2105  *
2106  * MDBX uses 32 bit for page numbers. This limits database
2107  * size up to 2^44 bytes, in case of 4K pages. */
2108 typedef uint32_t pgno_t;
2109 typedef MDBX_atomic_uint32_t atomic_pgno_t;
2110 #define PRIaPGNO PRIu32
2111 #define MAX_PAGENO UINT32_C(0x7FFFffff)
2112 #define MIN_PAGENO NUM_METAS
2113 
2114 #define SAFE64_INVALID_THRESHOLD UINT64_C(0xffffFFFF00000000)
2115 
2116 /* A transaction ID. */
2117 typedef uint64_t txnid_t;
2118 typedef MDBX_atomic_uint64_t atomic_txnid_t;
2119 #define PRIaTXN PRIi64
2120 #define MIN_TXNID UINT64_C(1)
2121 #define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1)
2122 #define INITIAL_TXNID (MIN_TXNID + NUM_METAS - 1)
2123 #define INVALID_TXNID UINT64_MAX
2124 /* LY: for testing non-atomic 64-bit txnid on 32-bit arches.
2125  * #define xMDBX_TXNID_STEP (UINT32_MAX / 3) */
2126 #ifndef xMDBX_TXNID_STEP
2127 #if MDBX_64BIT_CAS
2128 #define xMDBX_TXNID_STEP 1u
2129 #else
2130 #define xMDBX_TXNID_STEP 2u
2131 #endif
2132 #endif /* xMDBX_TXNID_STEP */
2133 
/* Used for offsets within a single page.
 * Since memory pages are typically 4 or 8KB in size, 12-13 bits are
 * sufficient, so 16 bits is plenty. */
2137 typedef uint16_t indx_t;
2138 
2139 #define MEGABYTE ((size_t)1 << 20)
2140 
2141 /*----------------------------------------------------------------------------*/
2142 /* Core structures for database and shared memory (i.e. format definition) */
2143 #pragma pack(push, 4)
2144 
/* Information about a single database in the environment.
 * This is the persistent b-tree descriptor; copies live inside the meta
 * pages (see MDBX_meta.mm_dbs below). Part of the on-disk format, hence
 * inside the pack(4) region. */
typedef struct MDBX_db {
  uint16_t md_flags;        /* see mdbx_dbi_open */
  uint16_t md_depth;        /* depth of this tree */
  uint32_t md_xsize;        /* key-size for MDBX_DUPFIXED (LEAF2 pages) */
  pgno_t md_root;           /* the root page of this tree */
  pgno_t md_branch_pages;   /* number of internal pages */
  pgno_t md_leaf_pages;     /* number of leaf pages */
  pgno_t md_overflow_pages; /* number of overflow pages */
  uint64_t md_seq;          /* table sequence counter */
  uint64_t md_entries;      /* number of data items */
  uint64_t md_mod_txnid;    /* txnid of last committed modification */
} MDBX_db;
2158 
/* Database size-related parameters, embedded into each meta page
 * (see MDBX_meta.mm_geo). All sizes are expressed in pages. */
typedef struct MDBX_geo {
  uint16_t grow_pv;   /* datafile growth step as a 16-bit packed (exponential
                           quantized) value */
  uint16_t shrink_pv; /* datafile shrink threshold as a 16-bit packed
                           (exponential quantized) value */
  pgno_t lower;       /* minimal size of datafile in pages */
  pgno_t upper;       /* maximal size of datafile in pages */
  pgno_t now;         /* current size of datafile in pages */
  pgno_t next;        /* first unused page in the datafile,
                         but actually the file may be shorter. */
} MDBX_geo;
2171 
/* Meta page content.
 * A meta page is the start point for accessing a database snapshot.
 * The first NUM_METAS pages of the datafile are meta pages — three here,
 * not two as in LMDB (the former "Pages 0-1 ... (N % 2)" wording was stale
 * LMDB legacy). Presumably writers rotate among the NUM_METAS slots —
 * TODO confirm against the transaction-commit code. */
typedef struct MDBX_meta {
  /* Stamp identifying this as an MDBX file.
   * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */
  uint32_t mm_magic_and_version[2];

  /* txnid that committed this page, the first of a two-phase-update pair */
  uint32_t mm_txnid_a[2];

  uint16_t mm_extra_flags;  /* extra DB flags, zero (nothing) for now */
  uint8_t mm_validator_id;  /* ID of checksum and page validation method,
                             * zero (nothing) for now */
  uint8_t mm_extra_pagehdr; /* extra bytes in the page header,
                             * zero (nothing) for now */

  MDBX_geo mm_geo; /* database size-related parameters */

  MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */
                            /* The size of pages used in this DB */
#define mm_psize mm_dbs[FREE_DBI].md_xsize
  MDBX_canary mm_canary;

#define MDBX_DATASIGN_NONE 0u
#define MDBX_DATASIGN_WEAK 1u
#define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK)
#define META_IS_STEADY(meta)                                                   \
  SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign))
  /* Data-synchronization signature: any value above MDBX_DATASIGN_WEAK
   * marks this meta as "steady" (see SIGN_IS_STEADY/META_IS_STEADY). */
  uint32_t mm_datasync_sign[2];

  /* txnid that committed this page, the second of a two-phase-update pair */
  uint32_t mm_txnid_b[2];

  /* Number of non-meta pages which were put in GC after COW. May be 0 in case
   * DB was previously handled by libmdbx without corresponding feature.
   * This value in couple with mr_snapshot_pages_retired allows fast estimation
   * of "how much reader is restraining GC recycling". */
  uint32_t mm_pages_retired[2];

  /* The analogue /proc/sys/kernel/random/boot_id or similar to determine
   * whether the system was rebooted after the last use of the database files.
   * If there was no reboot, but there is no need to rollback to the last
   * steady sync point. Zeros mean that no relevant information is available
   * from the system. */
  bin128_t mm_bootid;

} MDBX_meta;
2220 
2221 #pragma pack(1)
2222 
/* Common header for all page types. The page type depends on mp_flags.
 *
 * P_BRANCH and P_LEAF pages have unsorted 'MDBX_node's at the end, with
 * sorted mp_ptrs[] entries referring to them. Exception: P_LEAF2 pages
 * omit mp_ptrs and pack sorted MDBX_DUPFIXED values after the page header.
 *
 * P_OVERFLOW records occupy one or more contiguous pages where only the
 * first has a page header. They hold the real data of F_BIGDATA nodes.
 *
 * P_SUBP sub-pages are small leaf "pages" with duplicate data.
 * A node with flag F_DUPDATA but not F_SUBDATA contains a sub-page.
 * (Duplicate data can also go in sub-databases, which use normal pages.)
 *
 * P_META pages contain MDBX_meta, the start point of an MDBX snapshot.
 *
 * Each non-metapage up to the first unused page (MDBX_meta.mm_geo.next;
 * NOTE(review): the original text said "mm_last_pg", a field this MDBX_meta
 * does not have) is reachable exactly once in the snapshot: Either used by
 * a database or listed in a GC record. */
typedef struct MDBX_page {
  union {
#define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid)
#define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid)
#define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid)
#define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front)
#define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front)
    uint64_t mp_txnid; /* page's txnid, classified by the IS_* macros above */
    struct MDBX_page *mp_next; /* for in-memory list of freed pages */
  };
  uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */
#define P_BRANCH 0x01      /* branch page */
#define P_LEAF 0x02        /* leaf page */
#define P_OVERFLOW 0x04    /* overflow page */
#define P_META 0x08        /* meta page */
#define P_BAD 0x10         /* explicit flag for invalid/bad page */
#define P_LEAF2 0x20       /* for MDBX_DUPFIXED records */
#define P_SUBP 0x40        /* for MDBX_DUPSORT sub-pages */
#define P_SPILLED 0x2000   /* spilled in parent txn */
#define P_LOOSE 0x4000     /* page was dirtied then freed, can be reused */
#define P_FROZEN 0x8000    /* used for retire page with known status */
#define P_ILL_BITS (~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED))
  uint16_t mp_flags;
  union {
    uint32_t mp_pages; /* number of overflow pages */
    __anonymous_struct_extension__ struct {
      indx_t mp_lower; /* lower bound of free space */
      indx_t mp_upper; /* upper bound of free space */
    };
  };
  pgno_t mp_pgno; /* page number */

#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) ||              \
    (!defined(__cplusplus) && defined(_MSC_VER))
  indx_t mp_ptrs[] /* dynamic size */;
#endif /* C99 */
} MDBX_page;
2277 
2278 /* Size of the page header, excluding dynamic data at the end */
2279 #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs))
2280 
2281 #pragma pack(pop)
2282 
2283 #if MDBX_ENABLE_PGOP_STAT
/* Statistics of page operations overall of all (running, completed and
 * aborted) transactions. All counters are 64-bit atomics, since the struct
 * is presumably shared between processes — TODO confirm placement. */
typedef struct {
  MDBX_atomic_uint64_t newly;   /* Quantity of a new pages added */
  MDBX_atomic_uint64_t cow;     /* Quantity of pages copied for update */
  MDBX_atomic_uint64_t clone;   /* Quantity of parent's dirty pages clones
                                   for nested transactions */
  MDBX_atomic_uint64_t split;   /* Page splits */
  MDBX_atomic_uint64_t merge;   /* Page merges */
  MDBX_atomic_uint64_t spill;   /* Quantity of spilled dirty pages */
  MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */
  MDBX_atomic_uint64_t
      wops; /* Number of explicit write operations (not a pages) to a disk */
} MDBX_pgop_stat_t;
2298 #endif /* MDBX_ENABLE_PGOP_STAT */
2299 
/* Inter-process lock primitive, selected by the MDBX_LOCKING build option.
 * MDBX_CLOCK_SIGN differs per scheme and feeds MDBX_LOCK_FORMAT (below), so a
 * lock file created with another locking scheme is rejected at open time. */
#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES
#define MDBX_CLOCK_SIGN UINT32_C(0xF10C)
/* Windows file-locking: no in-memory lock object is kept in the lock file. */
typedef void mdbx_ipclock_t;
#elif MDBX_LOCKING == MDBX_LOCKING_SYSV

#define MDBX_CLOCK_SIGN UINT32_C(0xF18D)
/* System V semaphores: the slot records the owner's pid. */
typedef mdbx_pid_t mdbx_ipclock_t;
#ifndef EOWNERDEAD
/* SysV has no native robust-mutex EOWNERDEAD; map it to an internal code. */
#define EOWNERDEAD MDBX_RESULT_TRUE
#endif

#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                                \
    MDBX_LOCKING == MDBX_LOCKING_POSIX2008
#define MDBX_CLOCK_SIGN UINT32_C(0x8017)
typedef pthread_mutex_t mdbx_ipclock_t;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988
#define MDBX_CLOCK_SIGN UINT32_C(0xFC29)
typedef sem_t mdbx_ipclock_t;
#else
#error "FIXME"
#endif /* MDBX_LOCKING */

/* Lock-object setup/teardown helpers; implementations are elsewhere in the
 * amalgamation (not visible in this chunk). */
#if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus)
MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc);
MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc);
#endif /* MDBX_LOCKING */
2326 
2327 /* Reader Lock Table
2328  *
2329  * Readers don't acquire any locks for their data access. Instead, they
2330  * simply record their transaction ID in the reader table. The reader
2331  * mutex is needed just to find an empty slot in the reader table. The
2332  * slot's address is saved in thread-specific data so that subsequent
2333  * read transactions started by the same thread need no further locking to
2334  * proceed.
2335  *
2336  * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data.
2337  * No reader table is used if the database is on a read-only filesystem.
2338  *
2339  * Since the database uses multi-version concurrency control, readers don't
2340  * actually need any locking. This table is used to keep track of which
2341  * readers are using data from which old transactions, so that we'll know
2342  * when a particular old transaction is no longer in use. Old transactions
2343  * that have discarded any data pages can then have those pages reclaimed
2344  * for use by a later write transaction.
2345  *
2346  * The lock table is constructed such that reader slots are aligned with the
2347  * processor's cache line size. Any slot is only ever used by one thread.
2348  * This alignment guarantees that there will be no contention or cache
2349  * thrashing as threads update their own slot info, and also eliminates
2350  * any need for locking when accessing a slot.
2351  *
2352  * A writer thread will scan every slot in the table to determine the oldest
2353  * outstanding reader transaction. Any freed pages older than this will be
2354  * reclaimed by the writer. The writer doesn't use any locks when scanning
2355  * this table. This means that there's no guarantee that the writer will
2356  * see the most up-to-date reader info, but that's not required for correct
2357  * operation - all we need is to know the upper bound on the oldest reader,
2358  * we don't care at all about the newest reader. So the only consequence of
2359  * reading stale information here is that old pages might hang around a
2360  * while longer before being reclaimed. That's actually good anyway, because
2361  * the longer we delay reclaiming old pages, the more likely it is that a
2362  * string of contiguous pages can be found after coalescing old pages from
2363  * many old transactions together. */
2364 
/* The actual reader record, with cacheline padding. */
typedef struct MDBX_reader {
  /* Current Transaction ID when this transaction began, or (txnid_t)-1.
   * Multiple readers that start at the same time will probably have the
   * same ID here. Again, it's not important to exclude them from
   * anything; all we need to know is which version of the DB they
   * started from so we can avoid overwriting any data used in that
   * particular version. */
  MDBX_atomic_uint64_t /* txnid_t */ mr_txnid;

  /* The information we store in a single slot of the reader table.
   * In addition to a transaction ID, we also record the process and
   * thread ID that owns a slot, so that we can detect stale information,
   * e.g. threads or processes that went away without cleaning up.
   *
   * NOTE: We currently don't check for stale records.
   * We simply re-init the table when we know that we're the only process
   * opening the lock file. */

  /* The thread ID of the thread owning this txn. */
  MDBX_atomic_uint64_t mr_tid;

  /* The process ID of the process owning this reader txn. */
  MDBX_atomic_uint32_t mr_pid;

  /* The number of pages used in the reader's MVCC snapshot,
   * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */
  atomic_pgno_t mr_snapshot_pages_used;
  /* Number of retired pages at the time this reader starts transaction. So,
   * at any time the difference mm_pages_retired - mr_snapshot_pages_retired
   * will give the number of pages which this reader is restraining from
   * reuse. */
  MDBX_atomic_uint64_t mr_snapshot_pages_retired;
} MDBX_reader;
2398 
/* The header for the reader table (a memory-mapped lock file).
 * Field offsets are part of the lock-file ABI: MDBX_LOCK_FORMAT (below) hashes
 * offsetof() of several members, so reordering fields breaks compatibility. */
typedef struct MDBX_lockinfo {
  /* Stamp identifying this as an MDBX file.
   * It must be set to MDBX_MAGIC with MDBX_LOCK_VERSION. */
  uint64_t mti_magic_and_version;

  /* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */
  uint32_t mti_os_and_format;

  /* Flags with which the environment was opened. */
  MDBX_atomic_uint32_t mti_envmode;

  /* Threshold of un-synced-with-disk pages for auto-sync feature,
   * zero means no-threshold, i.e. auto-sync is disabled. */
  atomic_pgno_t mti_autosync_threshold;

  /* Low 32-bit of txnid with which meta-pages was synced,
   * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */
  MDBX_atomic_uint32_t mti_meta_sync_txnid;

  /* Period for timed auto-sync feature, i.e. at the every steady checkpoint
   * the mti_unsynced_timeout sets to the current_time + mti_autosync_period.
   * The time value is represented in a suitable system-dependent form, for
   * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC).
   * Zero means timed auto-sync is disabled. */
  MDBX_atomic_uint64_t mti_autosync_period;

  /* Marker to distinguish uniqueness of DB/CLK. */
  MDBX_atomic_uint64_t mti_bait_uniqueness;

  alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/

#if MDBX_ENABLE_PGOP_STAT
      /* Statistics of costly ops of all (running, completed and aborted)
       * transactions */
      MDBX_pgop_stat_t mti_pgop_stat;
#endif /* MDBX_ENABLE_PGOP_STAT*/

  alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/

  /* Write transaction lock. */
#if MDBX_LOCKING > 0
      mdbx_ipclock_t mti_wlock;
#endif /* MDBX_LOCKING > 0 */

  /* Cached oldest alive reader txnid — NOTE(review): maintained elsewhere;
   * treat as a lower-bound hint, not an exact value. */
  atomic_txnid_t mti_oldest_reader;

  /* Timestamp of the last steady sync. Value is represented in a suitable
   * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or
   * clock_gettime(CLOCK_MONOTONIC). */
  MDBX_atomic_uint64_t mti_sync_timestamp;

  /* Number of un-synced-with-disk pages for auto-sync feature. */
  atomic_pgno_t mti_unsynced_pages;

  /* Number of pages which were discarded last time by madvise(MADV_FREE). */
  atomic_pgno_t mti_discarded_tail;

  /* Timestamp of the last readers check. */
  MDBX_atomic_uint64_t mti_reader_check_timestamp;

  /* Shared anchor for tracking readahead edge and enabled/disabled status. */
  pgno_t mti_readahead_anchor;

  alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/

  /* Readers registration lock. */
#if MDBX_LOCKING > 0
      mdbx_ipclock_t mti_rlock;
#endif /* MDBX_LOCKING > 0 */

  /* The number of slots that have been used in the reader table.
   * This always records the maximum count, it is not decremented
   * when readers release their slots. */
  MDBX_atomic_uint32_t mti_numreaders;
  MDBX_atomic_uint32_t mti_readers_refresh_flag;

#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) ||              \
    (!defined(__cplusplus) && defined(_MSC_VER))
  alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
      MDBX_reader mti_readers[] /* dynamic size */;
#endif /* C99 */
} MDBX_lockinfo;
2482 
/* Lockfile format signature: version, features and field layout.
 * Mixes the locking-scheme sign with sizes/offsets of the shared structures,
 * so any incompatible layout change yields a different signature. */
#define MDBX_LOCK_FORMAT                                                       \
  (MDBX_CLOCK_SIGN * 27733 + (unsigned)sizeof(MDBX_reader) * 13 +              \
   (unsigned)offsetof(MDBX_reader, mr_snapshot_pages_used) * 251 +             \
   (unsigned)offsetof(MDBX_lockinfo, mti_oldest_reader) * 83 +                 \
   (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 +                    \
   (unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29)

/* Magic stamp for the main datafile (current format version). */
#define MDBX_DATA_MAGIC                                                        \
  ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION)

/* Magic stamp of the previous, still-compatible datafile format. */
#define MDBX_DATA_MAGIC_LEGACY_COMPAT                                          \
  ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + 2)

/* Magic stamp of obsolete development builds. */
#define MDBX_DATA_MAGIC_LEGACY_DEVEL ((MDBX_MAGIC << 8) + 255)

/* Magic stamp for the lock file. */
#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
2500 
/* The maximum size of a database page.
 *
 * It is 64K, but value-PAGEHDRSZ must fit in MDBX_page.mp_upper.
 *
 * MDBX will use database pages < OS pages if needed.
 * That causes more I/O in write transactions: The OS must
 * know (read) the whole page before writing a partial page.
 *
 * Note that we don't currently support Huge pages. On Linux,
 * regular data files cannot use Huge pages, and in general
 * Huge pages aren't actually pageable. We rely on the OS
 * demand-pager to read our data and page it out when memory
 * pressure from other processes is high. So until OSs have
 * actual paging support for Huge pages, they're not viable. */
#define MAX_PAGESIZE MDBX_MAX_PAGESIZE
#define MIN_PAGESIZE MDBX_MIN_PAGESIZE

#define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO)
/* 32-bit address-space ceiling for the mapping; smaller on Windows to leave
 * room for other allocations in the process address space. */
#if defined(_WIN32) || defined(_WIN64)
#define MAX_MAPSIZE32 UINT32_C(0x38000000)
#else
#define MAX_MAPSIZE32 UINT32_C(0x7f000000)
#endif
#define MAX_MAPSIZE64 (MAX_PAGENO * (uint64_t)MAX_PAGESIZE)

#if MDBX_WORDBITS >= 64
#define MAX_MAPSIZE MAX_MAPSIZE64
#define MDBX_PGL_LIMIT ((size_t)MAX_PAGENO)
#else
#define MAX_MAPSIZE MAX_MAPSIZE32
#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE)
#endif /* MDBX_WORDBITS */

/* Hard cap on reader-table slots. */
#define MDBX_READERS_LIMIT 32767
/* Element count above which radix sort is used instead of comparison sort —
 * NOTE(review): presumably an empirically tuned crossover; confirm. */
#define MDBX_RADIXSORT_THRESHOLD 333
2536 
2537 /*----------------------------------------------------------------------------*/
2538 
/* A PNL is a Page Number List, a sorted array of IDs.
 * The first element of the array is a counter for how many actual page-numbers
 * are in the list. By default PNLs are sorted in descending order, which
 * allows cutting off a page with the lowest pgno (at the tail) by just
 * truncating the list. The sort order of PNLs is controlled by the
 * MDBX_PNL_ASCENDING build option. */
typedef pgno_t *MDBX_PNL;

/* Ordering predicates matching the configured PNL sort direction. */
#if MDBX_PNL_ASCENDING
#define MDBX_PNL_ORDERED(first, last) ((first) < (last))
#define MDBX_PNL_DISORDERED(first, last) ((first) >= (last))
#else
#define MDBX_PNL_ORDERED(first, last) ((first) > (last))
#define MDBX_PNL_DISORDERED(first, last) ((first) <= (last))
#endif

/* List of txnid, only for MDBX_txn.tw.lifo_reclaimed */
typedef txnid_t *MDBX_TXL;
2556 
/* A Dirty-Page list item is a pgno/pointer pair. */
typedef struct MDBX_dp {
  MDBX_page *ptr; /* in-memory copy of the dirty page */
  pgno_t pgno;    /* page number of the dirty page */
  union {
    unsigned extra; /* both bitfields below as one word */
    __anonymous_struct_extension__ struct {
      unsigned multi : 1; /* page belongs to a multi-page (overflow) chunk */
      unsigned lru : 31;  /* LRU sequence for the spilling policy */
    };
  };
} MDBX_dp;
2569 
/* A DPL (dirty-page list) is a sorted array of MDBX_DPs. */
typedef struct MDBX_dpl {
  unsigned sorted; /* number of leading items currently in sorted order */
  unsigned length; /* number of items in use */
  unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) ||              \
    (!defined(__cplusplus) && defined(_MSC_VER))
  MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
#endif
} MDBX_dpl;
2580 
/* PNL sizes */
#define MDBX_PNL_GRANULATE 1024
#define MDBX_PNL_INITIAL                                                       \
  (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))

#define MDBX_TXL_GRANULATE 32
#define MDBX_TXL_INITIAL                                                       \
  (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
#define MDBX_TXL_MAX                                                           \
  ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))

/* A PNL pointer addresses element [0] = count; element [-1] holds the
 * allocated capacity, and page numbers occupy [1]..[count]. */
#define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1])
#define MDBX_PNL_SIZE(pl) ((pl)[0])
#define MDBX_PNL_FIRST(pl) ((pl)[1])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_SIZE(pl)])
#define MDBX_PNL_BEGIN(pl) (&(pl)[1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1])

/* Smallest/largest pgno in the list, depending on configured sort order. */
#if MDBX_PNL_ASCENDING
#define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl)
#define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl)
#else
#define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl)
#define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl)
#endif

/* Bytes occupied by the list including the leading counter element. */
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0)
2609 
2610 /*----------------------------------------------------------------------------*/
2611 /* Internal structures */
2612 
/* Auxiliary DB info.
 * The information here is mostly static/read-only. There is
 * only a single copy of this record in the environment. */
typedef struct MDBX_dbx {
  MDBX_val md_name;                /* name of the database */
  MDBX_cmp_func *md_cmp;           /* function for comparing keys */
  MDBX_cmp_func *md_dcmp;          /* function for comparing data items */
  size_t md_klen_min, md_klen_max; /* min/max key length for the database */
  size_t md_vlen_min,
      md_vlen_max; /* min/max value/data length for the database */
} MDBX_dbx;
2624 
/* A database transaction.
 * Every operation requires a transaction handle. */
struct MDBX_txn {
#define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31)
  uint32_t mt_signature;

  /* Transaction Flags */
  /* mdbx_txn_begin() flags */
#define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE)
#define MDBX_TXN_RW_BEGIN_FLAGS                                                \
  (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY)
  /* Additional flag for mdbx_sync_locked() */
#define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000)

  /* internal txn flags */
#define MDBX_TXN_FINISHED 0x01  /* txn is finished or never began */
#define MDBX_TXN_ERROR 0x02     /* txn is unusable after an error */
#define MDBX_TXN_DIRTY 0x04     /* must write, even if dirty list is empty */
#define MDBX_TXN_SPILLS 0x08    /* txn or a parent has spilled pages */
#define MDBX_TXN_HAS_CHILD 0x10 /* txn has an MDBX_txn.mt_child */
  /* most operations on the txn are currently illegal */
#define MDBX_TXN_BLOCKED                                                       \
  (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_HAS_CHILD)

#define TXN_FLAGS                                                              \
  (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS |     \
   MDBX_TXN_HAS_CHILD)

  /* Compile-time guard: begin-flags, internal flags and MDBX_SHRINK_ALLOWED
   * must occupy disjoint bits of mt_flags. */
#if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) ||       \
    ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) &         \
     MDBX_SHRINK_ALLOWED)
#error "Oops, some flags overlapped or wrong"
#endif
  uint32_t mt_flags;

  MDBX_txn *mt_parent; /* parent of a nested txn */
  /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */
  MDBX_txn *mt_child;
  MDBX_geo mt_geo;
  /* next unallocated page */
#define mt_next_pgno mt_geo.next
  /* corresponding to the current size of datafile */
#define mt_end_pgno mt_geo.now

  /* The ID of this transaction. IDs are integers incrementing from 1.
   * Only committed write transactions increment the ID. If a transaction
   * aborts, the ID may be re-used by the next writer. */
  txnid_t mt_txnid;
  /* NOTE(review): related to mt_txnid for dirty-page ownership tracking —
   * exact semantics are established elsewhere in the file; confirm. */
  txnid_t mt_front;

  MDBX_env *mt_env; /* the DB environment */
  /* Array of records for each DB known in the environment. */
  MDBX_dbx *mt_dbxs;
  /* Array of MDBX_db records for each known DB */
  MDBX_db *mt_dbs;
  /* Array of sequence numbers for each DB handle */
  unsigned *mt_dbiseqs;

  /* Transaction DBI Flags */
#define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */
#define DBI_STALE MDBX_DBI_STALE /* Named-DB record is older than txnID */
#define DBI_FRESH MDBX_DBI_FRESH /* Named-DB handle opened in this txn */
#define DBI_CREAT MDBX_DBI_CREAT /* Named-DB handle created in this txn */
#define DBI_VALID 0x10           /* DB handle is valid, see also DB_VALID */
#define DBI_USRVALID 0x20        /* As DB_VALID, but not set for FREE_DBI */
#define DBI_AUDITED 0x40         /* Internal flag for accounting during audit */
  /* Array of flags for each DB */
  uint8_t *mt_dbistate;
  /* Number of DB records in use, or 0 when the txn is finished.
   * This number only ever increments until the txn finishes; we
   * don't decrement it when individual DB handles are closed. */
  MDBX_dbi mt_numdbs;
  size_t mt_owner; /* thread ID that owns this transaction */
  MDBX_canary mt_canary;
  void *mt_userctx; /* User-settable context */

  union {
    struct {
      /* For read txns: This thread/txn's reader table slot, or NULL. */
      MDBX_reader *reader;
    } to;
    struct {
      /* In write txns, array of cursors for each DB */
      MDBX_cursor **cursors;
      pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
      txnid_t last_reclaimed;   /* ID of last used record */
#if MDBX_ENABLE_REFUND
      pgno_t loose_refund_wl /* watermark for refunding loose pages —
                                NOTE(review): confirm exact meaning */;
#endif /* MDBX_ENABLE_REFUND */
      /* dirtylist room: Dirty array size - dirty pages visible to this txn.
       * Includes ancestor txns' dirty pages not hidden by other txns'
       * dirty/spilled pages. Thus commit(nested txn) has room to merge
       * dirtylist into mt_parent after freeing hidden mt_parent pages. */
      unsigned dirtyroom;
      /* a sequence to spilling dirty page with LRU policy */
      unsigned dirtylru;
      /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
      MDBX_dpl *dirtylist;
      /* The list of reclaimed txns from GC */
      MDBX_TXL lifo_reclaimed;
      /* The list of pages that became unused during this transaction. */
      MDBX_PNL retired_pages;
      /* The list of loose pages that became unused and may be reused
       * in this transaction, linked through `mp_next`. */
      MDBX_page *loose_pages;
      /* Number of loose pages (tw.loose_pages) */
      unsigned loose_count;
      /* The sorted list of dirty pages we temporarily wrote to disk
       * because the dirty list was full. page numbers in here are
       * shifted left by 1, deleted slots have the LSB set. */
      MDBX_PNL spill_pages;
      unsigned spill_least_removed;
    } tw;
  };
};
2740 
2741 #if MDBX_WORDBITS >= 64
2742 #define CURSOR_STACK 32
2743 #else
2744 #define CURSOR_STACK 24
2745 #endif
2746 
2747 struct MDBX_xcursor;
2748 
/* Cursors are used for all DB operations.
 * A cursor holds a path of (page pointer, key index) from the DB
 * root to a position in the DB, plus other state. MDBX_DUPSORT
 * cursors include an xcursor to the current data item. Write txns
 * track their cursors and keep them up to date when data moves.
 * Exception: An xcursor's pointer to a P_SUBP page can be stale.
 * (A node with F_DUPDATA but no F_SUBDATA contains a subpage). */
struct MDBX_cursor {
  /* Life-cycle signatures stored in mc_signature. */
#define MDBX_MC_LIVE UINT32_C(0xFE05D5B1)
#define MDBX_MC_READY4CLOSE UINT32_C(0x2817A047)
#define MDBX_MC_WAIT4EOT UINT32_C(0x90E297A7)
  uint32_t mc_signature;
  /* The database handle this cursor operates on */
  MDBX_dbi mc_dbi;
  /* Next cursor on this DB in this txn */
  MDBX_cursor *mc_next;
  /* Backup of the original cursor if this cursor is a shadow */
  MDBX_cursor *mc_backup;
  /* Context used for databases with MDBX_DUPSORT, otherwise NULL */
  struct MDBX_xcursor *mc_xcursor;
  /* The transaction that owns this cursor */
  MDBX_txn *mc_txn;
  /* The database record for this cursor */
  MDBX_db *mc_db;
  /* The database auxiliary record for this cursor */
  MDBX_dbx *mc_dbx;
  /* The mt_dbistate for this database */
  uint8_t *mc_dbistate;
  unsigned mc_snum; /* number of pushed pages */
  unsigned mc_top;  /* index of top page, normally mc_snum-1 */

  /* Cursor state flags. */
#define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */
#define C_EOF 0x02         /* No more data */
#define C_SUB 0x04         /* Cursor is a sub-cursor */
#define C_DEL 0x08         /* last op was a cursor_del */
#define C_UNTRACK 0x10     /* Un-track cursor when closing */
#define C_RECLAIMING 0x20  /* GC lookup is prohibited */
#define C_GCFREEZE 0x40    /* reclaimed_pglist must not be updated */

  /* Cursor checking flags. */
#define C_COPYING 0x100  /* skip key-value length check (copying simplify) */
#define C_UPDATING 0x200 /* update/rebalance pending */
#define C_RETIRING 0x400 /* refs to child pages may be invalid */
#define C_SKIPORD 0x800  /* don't check keys ordering */

  unsigned mc_flags;              /* see mdbx_cursor */
  MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */
  indx_t mc_ki[CURSOR_STACK];     /* stack of page indices */
};
2799 
/* Context for sorted-dup records.
 * We could have gone to a fully recursive design, with arbitrarily
 * deep nesting of sub-databases. But for now we only handle these
 * levels - main DB, optional sub-DB, sorted-duplicate DB. */
typedef struct MDBX_xcursor {
  /* A sub-cursor for traversing the Dup DB */
  MDBX_cursor mx_cursor;
  /* The database record for this Dup DB */
  MDBX_db mx_db;
  /* The auxiliary DB record for this Dup DB */
  MDBX_dbx mx_dbx;
} MDBX_xcursor;
2812 
/* An outer cursor paired with its inner (DUPSORT) sub-cursor, so both can be
 * allocated as a single unit. */
typedef struct MDBX_cursor_couple {
  MDBX_cursor outer;
  void *mc_userctx; /* User-settable context */
  MDBX_xcursor inner;
} MDBX_cursor_couple;
2818 
/* The database environment. */
struct MDBX_env {
  /* ----------------------------------------------------- mostly static part */
#define MDBX_ME_SIGNATURE UINT32_C(0x9A899641)
  MDBX_atomic_uint32_t me_signature;
  /* Failed to update the meta page. Probably an I/O error. */
#define MDBX_FATAL_ERROR UINT32_C(0x80000000)
  /* Some fields are initialized. */
#define MDBX_ENV_ACTIVE UINT32_C(0x20000000)
  /* me_txkey is set */
#define MDBX_ENV_TXKEY UINT32_C(0x10000000)
  /* Legacy MDBX_MAPASYNC (prior v0.9) */
#define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000)
#define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY)
  uint32_t me_flags;
  mdbx_mmap_t me_dxb_mmap; /* The main data file */
#define me_map me_dxb_mmap.dxb
#define me_lazy_fd me_dxb_mmap.fd
  mdbx_filehandle_t me_dsync_fd;
  mdbx_mmap_t me_lck_mmap; /* The lock file */
#define me_lfd me_lck_mmap.fd
  struct MDBX_lockinfo *me_lck;

  unsigned me_psize;        /* DB page size, initialized from me_os_psize */
  unsigned me_leaf_nodemax; /* max size of a leaf-node */
  uint8_t me_psize2log;     /* log2 of DB page size */
  int8_t me_stuck_meta; /* recovery-only: target meta page or less than zero */
  uint16_t me_merge_threshold,
      me_merge_threshold_gc;  /* pages emptier than this are candidates for
                                 merging */
  unsigned me_os_psize;       /* OS page size, from mdbx_syspagesize() */
  unsigned me_maxreaders;     /* size of the reader table */
  MDBX_dbi me_maxdbs;         /* size of the DB table */
  uint32_t me_pid;            /* process ID of this env */
  mdbx_thread_key_t me_txkey; /* thread-key for readers */
  char *me_pathname;          /* path to the DB files */
  void *me_pbuf;              /* scratch area for DUPSORT put() */
  MDBX_txn *me_txn0;          /* preallocated write transaction */

  MDBX_dbx *me_dbxs;    /* array of static DB info */
  uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */
  unsigned *me_dbiseqs; /* array of dbi sequence numbers */
  unsigned
      me_maxgc_ov1page;    /* Number of pgno_t fit in a single overflow page */
  uint32_t me_live_reader; /* have liveness lock in reader table */
  void *me_userctx;        /* User-settable context */
  MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */

  /* Runtime-tunable options; per flags.non_auto, dp_limit may carry a
   * user-chosen (non-auto) value. */
  struct {
    unsigned dp_reserve_limit;
    unsigned rp_augment_limit;
    unsigned dp_limit;
    unsigned dp_initial;
    uint8_t dp_loose_limit;
    uint8_t spill_max_denominator;
    uint8_t spill_min_denominator;
    uint8_t spill_parent4child_denominator;
    unsigned merge_threshold_16dot16_percent;
    union {
      unsigned all;
      /* tracks options with non-auto values but tuned by user */
      struct {
        unsigned dp_limit : 1;
      } non_auto;
    } flags;
  } me_options;

  /* struct me_dbgeo used for accepting db-geo params from user for the new
   * database creation, i.e. when mdbx_env_set_geometry() was called before
   * mdbx_env_open(). */
  struct {
    size_t lower;  /* minimal size of datafile */
    size_t upper;  /* maximal size of datafile */
    size_t now;    /* current size of datafile */
    size_t grow;   /* step to grow datafile */
    size_t shrink; /* threshold to shrink datafile */
  } me_dbgeo;

#if MDBX_LOCKING == MDBX_LOCKING_SYSV
  union {
    key_t key;
    int semid;
  } me_sysv_ipc;
#endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */

  MDBX_env *me_lcklist_next;

  /* --------------------------------------------------- mostly volatile part */

  MDBX_txn *me_txn; /* current write transaction */
  mdbx_fastmutex_t me_dbi_lock;
  MDBX_dbi me_numdbs; /* number of DBs opened */

  MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */
  unsigned me_dp_reserve_len;
  /* PNL of pages that became unused in a write txn */
  MDBX_PNL me_retired_pages;

#if defined(_WIN32) || defined(_WIN64)
  MDBX_srwlock me_remap_guard;
  /* Workaround for LockFileEx and WriteFile multithread bug */
  CRITICAL_SECTION me_windowsbug_lock;
#else
  mdbx_fastmutex_t me_remap_guard;
#endif

  /* -------------------------------------------------------------- debugging */

#if MDBX_DEBUG
  MDBX_assert_func *me_assert_func; /*  Callback for assertion failures */
#endif
#ifdef MDBX_USE_VALGRIND
  int me_valgrind_handle;
#endif
#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
  pgno_t me_poison_edge;
#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */

#ifndef xMDBX_DEBUG_SPILLING
#define xMDBX_DEBUG_SPILLING 0
#endif
#if xMDBX_DEBUG_SPILLING == 2
  unsigned debug_dirtied_est, debug_dirtied_act;
#endif /* xMDBX_DEBUG_SPILLING */

  /* ------------------------------------------------- stub for lck-less mode */
  MDBX_atomic_uint64_t
      x_lckless_stub[(sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) /
                     sizeof(MDBX_atomic_uint64_t)];
};
2949 
2950 #ifndef __cplusplus
2951 /*----------------------------------------------------------------------------*/
2952 /* Debug and Logging stuff */
2953 
2954 #define MDBX_RUNTIME_FLAGS_INIT                                                \
2955   ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT
2956 
2957 extern uint8_t mdbx_runtime_flags;
2958 extern uint8_t mdbx_loglevel;
2959 extern MDBX_debug_func *mdbx_debug_logger;
2960 
mdbx_jitter4testing(bool tiny)2961 MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) {
2962 #if MDBX_DEBUG
2963   if (MDBX_DBG_JITTER & mdbx_runtime_flags)
2964     mdbx_osal_jitter(tiny);
2965 #else
2966   (void)tiny;
2967 #endif
2968 }
2969 
/* Emit a log record (printf-style). NOTE(review): MDBX_PRINTF_ARGS(4, 5)
 * appears both before the declarator and after the parameter list —
 * redundant but harmless; confirm whether one occurrence can be dropped. */
MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5)
    mdbx_debug_log(int level, const char *function, int line, const char *fmt,
                   ...) MDBX_PRINTF_ARGS(4, 5);
/* va_list variant of mdbx_debug_log(). */
MDBX_INTERNAL_FUNC void mdbx_debug_log_va(int level, const char *function,
                                          int line, const char *fmt,
                                          va_list args);
2976 
/* True when messages of severity `msg` pass the current verbosity level. */
#define mdbx_log_enabled(msg) unlikely(msg <= mdbx_loglevel)

#if MDBX_DEBUG

/* In debug builds, assertions/audit follow the runtime flags. */
#define mdbx_assert_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_ASSERT)

#define mdbx_audit_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_AUDIT)

#else /* MDBX_DEBUG */

/* In non-debug builds auditing is compiled out; assertions depend on
 * NDEBUG/MDBX_FORCE_ASSERTIONS at compile time. */
#define mdbx_audit_enabled() (0)

#if !defined(NDEBUG) || MDBX_FORCE_ASSERTIONS
#define mdbx_assert_enabled() (1)
#else
#define mdbx_assert_enabled() (0)
#endif /* NDEBUG */

#endif /* MDBX_DEBUG */

/* Assertion-failure reporting: routed to the Android log on non-debug
 * Android builds, otherwise to the function defined elsewhere. */
#if !MDBX_DEBUG && defined(__ANDROID_API__)
#define mdbx_assert_fail(env, msg, func, line)                                 \
  __android_log_assert(msg, "mdbx", "%s:%u", func, line)
#else
void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func,
                      int line);
#endif
3004 
/* Leveled logging helpers. Each first checks the level (and, for the most
 * verbose ones, that this is a debug build) before calling mdbx_debug_log().
 * All except the *_extra* pair append a trailing newline to the format. */

/* Extra-verbose logging, debug builds only; no newline appended. */
#define mdbx_debug_extra(fmt, ...)                                             \
  do {                                                                         \
    if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_EXTRA))                        \
      mdbx_debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__);    \
  } while (0)

/* Continuation of mdbx_debug_extra(): no function/line prefix. */
#define mdbx_debug_extra_print(fmt, ...)                                       \
  do {                                                                         \
    if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_EXTRA))                        \
      mdbx_debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__);               \
  } while (0)

#define mdbx_trace(fmt, ...)                                                   \
  do {                                                                         \
    if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_TRACE))                        \
      mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n",             \
                     __VA_ARGS__);                                             \
  } while (0)

#define mdbx_debug(fmt, ...)                                                   \
  do {                                                                         \
    if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_DEBUG))                        \
      mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n",             \
                     __VA_ARGS__);                                             \
  } while (0)

#define mdbx_verbose(fmt, ...)                                                 \
  do {                                                                         \
    if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_VERBOSE))                      \
      mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n",           \
                     __VA_ARGS__);                                             \
  } while (0)

/* Notices are emitted even in non-debug builds when the level allows. */
#define mdbx_notice(fmt, ...)                                                  \
  do {                                                                         \
    if (mdbx_log_enabled(MDBX_LOG_NOTICE))                                     \
      mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n",            \
                     __VA_ARGS__);                                             \
  } while (0)
3044 
3045 #define mdbx_warning(fmt, ...)                                                 \
3046   do {                                                                         \
3047     if (mdbx_log_enabled(MDBX_LOG_WARN))                                       \
3048       mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n",              \
3049                      __VA_ARGS__);                                             \
3050   } while (0)
3051 
3052 #define mdbx_error(fmt, ...)                                                   \
3053   do {                                                                         \
3054     if (mdbx_log_enabled(MDBX_LOG_ERROR))                                      \
3055       mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n",             \
3056                      __VA_ARGS__);                                             \
3057   } while (0)
3058 
3059 #define mdbx_fatal(fmt, ...)                                                   \
3060   mdbx_debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__);
3061 
3062 #define mdbx_ensure_msg(env, expr, msg)                                        \
3063   do {                                                                         \
3064     if (unlikely(!(expr)))                                                     \
3065       mdbx_assert_fail(env, msg, __func__, __LINE__);                          \
3066   } while (0)
3067 
3068 #define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr)
3069 
3070 /* assert(3) variant in environment context */
3071 #define mdbx_assert(env, expr)                                                 \
3072   do {                                                                         \
3073     if (mdbx_assert_enabled())                                                 \
3074       mdbx_ensure(env, expr);                                                  \
3075   } while (0)
3076 
3077 /* assert(3) variant in cursor context */
3078 #define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr)
3079 
3080 /* assert(3) variant in transaction context */
3081 #define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr)
3082 
3083 #ifndef xMDBX_TOOLS /* Avoid using internal mdbx_assert() */
3084 #undef assert
3085 #define assert(expr) mdbx_assert(NULL, expr)
3086 #endif
3087 
3088 /*----------------------------------------------------------------------------*/
3089 /* Cache coherence and mmap invalidation */
3090 
3091 #if MDBX_CPU_WRITEBACK_INCOHERENT
3092 #define mdbx_flush_incoherent_cpu_writeback() mdbx_memory_barrier()
3093 #else
3094 #define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier()
3095 #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */
3096 
/* Flushes/invalidates caches for a span of a memory-mapped region on
 * platforms where mmap is not coherent with regular file writes and/or
 * with the CPU data cache.  On fully coherent platforms this compiles
 * down to nothing but the (void) casts. */
MDBX_MAYBE_UNUSED static __inline void
mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) {
#if MDBX_MMAP_INCOHERENT_FILE_WRITE
  /* Round [addr, addr + nbytes) outward to whole pages, since msync()
   * requires a page-aligned starting address. */
  char *const begin = (char *)(-pagesize & (intptr_t)addr);
  char *const end =
      (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1));
  /* MS_INVALIDATE drops stale cached copies so subsequent reads through
   * the mapping observe data written to the file via write(2). */
  int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? errno : 0;
  mdbx_assert(nullptr, err == 0);
  (void)err;
#else
  (void)pagesize;
#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */

#if MDBX_MMAP_INCOHERENT_CPU_CACHE
#ifdef DCACHE
  /* MIPS has cache coherency issues.
   * Note: for any nbytes >= on-chip cache size, the entire cache is
   * flushed. */
  cacheflush(addr, nbytes, DCACHE);
#else
#error "Oops, cacheflush() not available"
#endif /* DCACHE */
#endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */

#if !MDBX_MMAP_INCOHERENT_FILE_WRITE && !MDBX_MMAP_INCOHERENT_CPU_CACHE
  (void)addr;
  (void)nbytes;
#endif
}
3125 
3126 /*----------------------------------------------------------------------------*/
3127 /* Internal prototypes */
3128 
3129 MDBX_INTERNAL_FUNC int mdbx_cleanup_dead_readers(MDBX_env *env, int rlocked,
3130                                                  int *dead);
3131 MDBX_INTERNAL_FUNC int mdbx_rthc_alloc(mdbx_thread_key_t *key,
3132                                        MDBX_reader *begin, MDBX_reader *end);
3133 MDBX_INTERNAL_FUNC void mdbx_rthc_remove(const mdbx_thread_key_t key);
3134 
3135 MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void);
3136 MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void);
3137 MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr);
3138 
3139 #endif /* !__cplusplus */
3140 
3141 #define MDBX_IS_ERROR(rc)                                                      \
3142   ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE)
3143 
3144 /* Internal error codes, not exposed outside libmdbx */
3145 #define MDBX_NO_ROOT (MDBX_LAST_ADDED_ERRCODE + 10)
3146 
3147 /* Debugging output value of a cursor DBI: Negative in a sub-cursor. */
3148 #define DDBI(mc)                                                               \
3149   (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
3150 
3151 /* Key size which fits in a DKBUF (debug key buffer). */
3152 #define DKBUF_MAX 511
3153 #define DKBUF char _kbuf[DKBUF_MAX * 4 + 2]
3154 #define DKEY(x) mdbx_dump_val(x, _kbuf, DKBUF_MAX * 2 + 1)
3155 #define DVAL(x) mdbx_dump_val(x, _kbuf + DKBUF_MAX * 2 + 1, DKBUF_MAX * 2 + 1)
3156 
3157 #if MDBX_DEBUG
3158 #define DKBUF_DEBUG DKBUF
3159 #define DKEY_DEBUG(x) DKEY(x)
3160 #define DVAL_DEBUG(x) DVAL(x)
3161 #else
3162 #define DKBUF_DEBUG ((void)(0))
3163 #define DKEY_DEBUG(x) ("-")
3164 #define DVAL_DEBUG(x) ("-")
3165 #endif
3166 
3167 /* An invalid page number.
3168  * Mainly used to denote an empty tree. */
3169 #define P_INVALID (~(pgno_t)0)
3170 
3171 /* Test if the flags f are set in a flag word w. */
3172 #define F_ISSET(w, f) (((w) & (f)) == (f))
3173 
3174 /* Round n up to an even number. */
3175 #define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */
3176 
3177 /* Default size of memory map.
3178  * This is certainly too small for any actual applications. Apps should
3179  * always set the size explicitly using mdbx_env_set_geometry(). */
3180 #define DEFAULT_MAPSIZE MEGABYTE
3181 
3182 /* Number of slots in the reader table.
3183  * This value was chosen somewhat arbitrarily. The 61 is a prime number,
3184  * and such readers plus a couple mutexes fit into single 4KB page.
3185  * Applications should set the table size using mdbx_env_set_maxreaders(). */
3186 #define DEFAULT_READERS 61
3187 
3188 /* Test if a page is a leaf page */
3189 #define IS_LEAF(p) (((p)->mp_flags & P_LEAF) != 0)
3190 /* Test if a page is a LEAF2 page */
3191 #define IS_LEAF2(p) unlikely(((p)->mp_flags & P_LEAF2) != 0)
3192 /* Test if a page is a branch page */
3193 #define IS_BRANCH(p) (((p)->mp_flags & P_BRANCH) != 0)
3194 /* Test if a page is an overflow page */
3195 #define IS_OVERFLOW(p) unlikely(((p)->mp_flags & P_OVERFLOW) != 0)
3196 /* Test if a page is a sub page */
3197 #define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0)
3198 
3199 #define PAGETYPE(p) ((p)->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW))
3200 
3201 /* Header for a single key/data pair within a page.
3202  * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2.
3203  * We guarantee 2-byte alignment for 'MDBX_node's.
3204  *
3205  * Leaf node flags describe node contents.  F_BIGDATA says the node's
3206  * data part is the page number of an overflow page with actual data.
3207  * F_DUPDATA and F_SUBDATA can be combined giving duplicate data in
3208  * a sub-page/sub-database, and named databases (just F_SUBDATA). */
typedef struct MDBX_node {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  union {
    uint32_t mn_dsize;  /* leaf node: size of the data payload */
    uint32_t mn_pgno32; /* branch node: low 32 bits of the child pgno */
  };
  uint8_t mn_flags; /* see mdbx_node flags below */
  uint8_t mn_extra; /* bits 32..39 of the pgno when pgno_t is 64-bit,
                     * see node_pgno()/node_set_pgno() */
  uint16_t mn_ksize; /* key size */
#else
  uint16_t mn_ksize; /* key size */
  uint8_t mn_extra;  /* bits 32..39 of the pgno when pgno_t is 64-bit */
  uint8_t mn_flags;  /* see mdbx_node flags below */
  union {
    uint32_t mn_pgno32; /* branch node: low 32 bits of the child pgno */
    uint32_t mn_dsize;  /* leaf node: size of the data payload */
  };
#endif /* __BYTE_ORDER__ */

  /* mdbx_node Flags */
#define F_BIGDATA 0x01 /* data put on overflow page */
#define F_SUBDATA 0x02 /* data is a sub-database */
#define F_DUPDATA 0x04 /* data has duplicates */

  /* valid flags for mdbx_node_add() */
#define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDBX_RESERVE | MDBX_APPEND)

  /* C99 flexible array member; absent for compilers without support
   * (key/data bytes are then reached via NODESIZE pointer arithmetic) */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) ||              \
    (!defined(__cplusplus) && defined(_MSC_VER))
  uint8_t mn_data[] /* key and data are appended here */;
#endif /* C99 */
} MDBX_node;
3241 
3242 #define DB_PERSISTENT_FLAGS                                                    \
3243   (MDBX_REVERSEKEY | MDBX_DUPSORT | MDBX_INTEGERKEY | MDBX_DUPFIXED |          \
3244    MDBX_INTEGERDUP | MDBX_REVERSEDUP)
3245 
3246 /* mdbx_dbi_open() flags */
3247 #define DB_USABLE_FLAGS (DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE)
3248 
3249 #define DB_VALID 0x8000 /* DB handle is valid, for me_dbflags */
3250 #define DB_INTERNAL_FLAGS DB_VALID
3251 
3252 #if DB_INTERNAL_FLAGS & DB_USABLE_FLAGS
3253 #error "Oops, some flags overlapped or wrong"
3254 #endif
3255 #if DB_PERSISTENT_FLAGS & ~DB_USABLE_FLAGS
3256 #error "Oops, some flags overlapped or wrong"
3257 #endif
3258 
3259 /* max number of pages to commit in one writev() call */
3260 #define MDBX_COMMIT_PAGES 64
3261 #if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */
3262 #undef MDBX_COMMIT_PAGES
3263 #define MDBX_COMMIT_PAGES IOV_MAX
3264 #endif
3265 
3266 /*
3267  *                /
3268  *                | -1, a < b
3269  * CMP2INT(a,b) = <  0, a == b
3270  *                |  1, a > b
3271  *                \
3272  */
3273 #ifndef __e2k__
3274 /* LY: fast enough on most systems */
3275 #define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b))
3276 #else
3277 /* LY: more parallelable on VLIW Elbrus */
3278 #define CMP2INT(a, b) (((a) > (b)) - ((b) > (a)))
3279 #endif
3280 
3281 /* Do not spill pages to disk if txn is getting full, may fail instead */
3282 #define MDBX_NOSPILL 0x8000
3283 
3284 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t
pgno_add(pgno_t base,pgno_t augend)3285 pgno_add(pgno_t base, pgno_t augend) {
3286   assert(base <= MAX_PAGENO);
3287   return (augend < MAX_PAGENO - base) ? base + augend : MAX_PAGENO;
3288 }
3289 
3290 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t
pgno_sub(pgno_t base,pgno_t subtrahend)3291 pgno_sub(pgno_t base, pgno_t subtrahend) {
3292   assert(base >= MIN_PAGENO);
3293   return (subtrahend < base - MIN_PAGENO) ? base - subtrahend : MIN_PAGENO;
3294 }
3295 
3296 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool
is_powerof2(size_t x)3297 is_powerof2(size_t x) {
3298   return (x & (x - 1)) == 0;
3299 }
3300 
3301 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t
floor_powerof2(size_t value,size_t granularity)3302 floor_powerof2(size_t value, size_t granularity) {
3303   assert(is_powerof2(granularity));
3304   return value & ~(granularity - 1);
3305 }
3306 
3307 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t
ceil_powerof2(size_t value,size_t granularity)3308 ceil_powerof2(size_t value, size_t granularity) {
3309   return floor_powerof2(value + granularity - 1, granularity);
3310 }
3311 
/* Returns log2 of `value`, i.e. the index of its single set bit.
 * Precondition: value is a non-zero power of two below INT32_MAX. */
MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned
log2n_powerof2(size_t value) {
  assert(value > 0 && value < INT32_MAX && is_powerof2(value));
  assert((value & -(int32_t)value) == value);
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
  /* count-trailing-zeros == log2 for an exact power of two */
  return __builtin_ctzl(value);
#elif defined(_MSC_VER)
  unsigned long index;
  /* _BitScanForward operates on 32 bits; safe per the asserts above */
  _BitScanForward(&index, (unsigned long)value);
  return index;
#else
  /* Portable fallback: multiplying a 32-bit power of two by this de Bruijn
   * constant yields a unique top-5-bit pattern, used as a table index.
   * Only correct for 32-bit values, which the asserts above guarantee. */
  static const uint8_t debruijn_ctz32[32] = {
      0,  1,  28, 2,  29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4,  8,
      31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6,  11, 5,  10, 9};
  return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27];
#endif
}
3329 
3330 /* Only a subset of the mdbx_env flags can be changed
3331  * at runtime. Changing other flags requires closing the
3332  * environment and re-opening it with the new flags. */
3333 #define ENV_CHANGEABLE_FLAGS                                                   \
3334   (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC |             \
3335    MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE)
3336 #define ENV_CHANGELESS_FLAGS                                                   \
3337   (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \
3338    MDBX_LIFORECLAIM | MDBX_EXCLUSIVE)
3339 #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS)
3340 
#if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS
/* Container for compile-time consistency checks of the flag/constant
 * definitions above; never called at runtime. */
MDBX_MAYBE_UNUSED static void static_checks(void) {
  STATIC_ASSERT_MSG(INT16_MAX - CORE_DBS == MDBX_MAX_DBI,
                    "Oops, MDBX_MAX_DBI or CORE_DBS?");
  /* DB-handle flags and environment flags may only share ACCEDE/CREATE */
  STATIC_ASSERT_MSG((unsigned)(MDBX_DB_ACCEDE | MDBX_CREATE) ==
                        ((DB_USABLE_FLAGS | DB_INTERNAL_FLAGS) &
                         (ENV_USABLE_FLAGS | ENV_INTERNAL_FLAGS)),
                    "Oops, some flags overlapped or wrong");
  STATIC_ASSERT_MSG((ENV_INTERNAL_FLAGS & ENV_USABLE_FLAGS) == 0,
                    "Oops, some flags overlapped or wrong");
}
#endif /* Disabled for MSVC 19.0 (VisualStudio 2015) */
3353 
3354 #ifdef __cplusplus
3355 }
3356 #endif
3357 
3358 #define MDBX_ASAN_POISON_MEMORY_REGION(addr, size)                             \
3359   do {                                                                         \
3360     mdbx_trace("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr),          \
3361                (size_t)(size), __LINE__);                                      \
3362     ASAN_POISON_MEMORY_REGION(addr, size);                                     \
3363   } while (0)
3364 
3365 #define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size)                           \
3366   do {                                                                         \
3367     mdbx_trace("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr),        \
3368                (size_t)(size), __LINE__);                                      \
3369     ASAN_UNPOISON_MEMORY_REGION(addr, size);                                   \
3370   } while (0)
3371 /*
3372  * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>.
3373  * and other libmdbx authors: please see AUTHORS file.
3374  * All rights reserved.
3375  *
3376  * This code is derived from "LMDB engine" written by
3377  * Howard Chu (Symas Corporation), which itself derived from btree.c
3378  * written by Martin Hedenfalk.
3379  *
3380  * ---
3381  *
3382  * Portions Copyright 2011-2015 Howard Chu, Symas Corp. All rights reserved.
3383  *
3384  * Redistribution and use in source and binary forms, with or without
3385  * modification, are permitted only as authorized by the OpenLDAP
3386  * Public License.
3387  *
3388  * A copy of this license is available in the file LICENSE in the
3389  * top-level directory of the distribution or, alternatively, at
3390  * <http://www.OpenLDAP.org/license.html>.
3391  *
3392  * ---
3393  *
3394  * Portions Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
3395  *
3396  * Permission to use, copy, modify, and distribute this software for any
3397  * purpose with or without fee is hereby granted, provided that the above
3398  * copyright notice and this permission notice appear in all copies.
3399  *
3400  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
3401  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
3402  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
3403  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
3404  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
3405  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
3406  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
3407 
3408 
3409 /*------------------------------------------------------------------------------
3410  * Internal inline functions */
3411 
/* Computes |value| without a conditional branch.
 * Precondition: value != INT_MIN (its absolute value is unrepresentable). */
MDBX_NOTHROW_CONST_FUNCTION static unsigned branchless_abs(int value) {
  assert(value > INT_MIN);
  /* Arithmetic right shift replicates the sign bit: 0 for non-negative
   * values, all-ones for negative ones.  NOTE(review): relies on the
   * implementation-defined (but universal) arithmetic >> of negative ints. */
  const unsigned expanded_sign =
      (unsigned)(value >> (sizeof(value) * CHAR_BIT - 1));
  /* (v + mask) ^ mask is v when mask==0 and -v (two's-complement negate,
   * decrement-then-invert) when mask is all-ones */
  return ((unsigned)value + expanded_sign) ^ expanded_sign;
}
3418 
3419 /* Pack/Unpack 16-bit values for Grow step & Shrink threshold */
me2v(unsigned m,unsigned e)3420 MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t me2v(unsigned m,
3421                                                         unsigned e) {
3422   assert(m < 2048 && e < 8);
3423   return (pgno_t)(32768 + ((m + 1) << (e + 8)));
3424 }
3425 
/* Quantizes a page count `v` into the packed 16-bit form for exponent `e`,
 * rounding upward; inverse of me2v() within the range of that exponent. */
MDBX_NOTHROW_CONST_FUNCTION static __inline uint16_t v2me(size_t v,
                                                          unsigned e) {
  /* v must lie strictly inside the range covered by exponent e */
  assert(v > (e ? me2v(2047, e - 1) : 32768));
  assert(v <= me2v(2047, e));
  /* ceil((v - 32768) / 2^(e+8)) */
  size_t m = (v - 32768 + ((size_t)1 << (e + 8)) - 1) >> (e + 8);
  m -= m > 0; /* me2v() re-adds one: the stored mantissa is m-1 */
  assert(m < 2048 && e < 8);
  // f e d c b a 9 8 7 6 5 4 3 2 1 0
  // 1 e e e m m m m m m m m m m m 1
  const uint16_t pv = (uint16_t)(0x8001 + (e << 12) + (m << 1));
  assert(pv != 65535); /* 65535 is reserved (decodes as 65536, see pv2pages) */
  return pv;
}
3439 
3440 /* Convert 16-bit packed (exponential quantized) value to number of pages */
pv2pages(uint16_t pv)3441 MDBX_NOTHROW_CONST_FUNCTION static pgno_t pv2pages(uint16_t pv) {
3442   if ((pv & 0x8001) != 0x8001)
3443     return pv;
3444   if (pv == 65535)
3445     return 65536;
3446   // f e d c b a 9 8 7 6 5 4 3 2 1 0
3447   // 1 e e e m m m m m m m m m m m 1
3448   return me2v((pv >> 1) & 2047, (pv >> 12) & 7);
3449 }
3450 
3451 /* Convert number of pages to 16-bit packed (exponential quantized) value */
pages2pv(size_t pages)3452 MDBX_NOTHROW_CONST_FUNCTION static uint16_t pages2pv(size_t pages) {
3453   if (pages < 32769 || (pages < 65536 && (pages & 1) == 0))
3454     return (uint16_t)pages;
3455   if (pages <= me2v(2047, 0))
3456     return v2me(pages, 0);
3457   if (pages <= me2v(2047, 1))
3458     return v2me(pages, 1);
3459   if (pages <= me2v(2047, 2))
3460     return v2me(pages, 2);
3461   if (pages <= me2v(2047, 3))
3462     return v2me(pages, 3);
3463   if (pages <= me2v(2047, 4))
3464     return v2me(pages, 4);
3465   if (pages <= me2v(2047, 5))
3466     return v2me(pages, 5);
3467   if (pages <= me2v(2047, 6))
3468     return v2me(pages, 6);
3469   return (pages < me2v(2046, 7)) ? v2me(pages, 7) : 65533;
3470 }
3471 
3472 /*------------------------------------------------------------------------------
3473  * Unaligned access */
3474 
3475 MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline unsigned
field_alignment(unsigned alignment_baseline,size_t field_offset)3476 field_alignment(unsigned alignment_baseline, size_t field_offset) {
3477   unsigned merge = alignment_baseline | (unsigned)field_offset;
3478   return merge & -(int)merge;
3479 }
3480 
3481 /* read-thunk for UB-sanitizer */
3482 MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint8_t
peek_u8(const uint8_t * const __restrict ptr)3483 peek_u8(const uint8_t *const __restrict ptr) {
3484   return *ptr;
3485 }
3486 
3487 /* write-thunk for UB-sanitizer */
poke_u8(uint8_t * const __restrict ptr,const uint8_t v)3488 static __always_inline void poke_u8(uint8_t *const __restrict ptr,
3489                                     const uint8_t v) {
3490   *ptr = v;
3491 }
3492 
3493 MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint16_t
unaligned_peek_u16(const unsigned expected_alignment,const void * const ptr)3494 unaligned_peek_u16(const unsigned expected_alignment, const void *const ptr) {
3495   assert((uintptr_t)ptr % expected_alignment == 0);
3496   if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(uint16_t)) == 0)
3497     return *(const uint16_t *)ptr;
3498   else {
3499     uint16_t v;
3500     memcpy(&v, ptr, sizeof(v));
3501     return v;
3502   }
3503 }
3504 
3505 static __always_inline void
unaligned_poke_u16(const unsigned expected_alignment,void * const __restrict ptr,const uint16_t v)3506 unaligned_poke_u16(const unsigned expected_alignment,
3507                    void *const __restrict ptr, const uint16_t v) {
3508   assert((uintptr_t)ptr % expected_alignment == 0);
3509   if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(v)) == 0)
3510     *(uint16_t *)ptr = v;
3511   else
3512     memcpy(ptr, &v, sizeof(v));
3513 }
3514 
/* Reads a 32-bit value from a possibly unaligned location: direct load when
 * safe, two 16-bit halves when at least 2-byte aligned, otherwise memcpy. */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t unaligned_peek_u32(
    const unsigned expected_alignment, const void *const __restrict ptr) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(uint32_t)) == 0)
    return *(const uint32_t *)ptr;
  else if ((expected_alignment % sizeof(uint16_t)) == 0) {
    /* the 0/1 index is chosen so that `lo` is always the less-significant
     * half regardless of the host byte order */
    const uint16_t lo =
        ((const uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__];
    const uint16_t hi =
        ((const uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__];
    return lo | (uint32_t)hi << 16;
  } else {
    uint32_t v;
    memcpy(&v, ptr, sizeof(v));
    return v;
  }
}

/* Writes a 32-bit value to a possibly unaligned location; mirror of
 * unaligned_peek_u32(). */
static __always_inline void
unaligned_poke_u32(const unsigned expected_alignment,
                   void *const __restrict ptr, const uint32_t v) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(v)) == 0)
    *(uint32_t *)ptr = v;
  else if ((expected_alignment % sizeof(uint16_t)) == 0) {
    /* store less-significant half first, index adjusted for byte order */
    ((uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint16_t)v;
    ((uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] =
        (uint16_t)(v >> 16);
  } else
    memcpy(ptr, &v, sizeof(v));
}

/* Reads a 64-bit value from a possibly unaligned location: direct load when
 * safe, two 32-bit halves when at least 4-byte aligned, otherwise memcpy. */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t unaligned_peek_u64(
    const unsigned expected_alignment, const void *const __restrict ptr) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(uint64_t)) == 0)
    return *(const uint64_t *)ptr;
  else if ((expected_alignment % sizeof(uint32_t)) == 0) {
    /* `lo` is always the less-significant half, any host byte order */
    const uint32_t lo =
        ((const uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__];
    const uint32_t hi =
        ((const uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__];
    return lo | (uint64_t)hi << 32;
  } else {
    uint64_t v;
    memcpy(&v, ptr, sizeof(v));
    return v;
  }
}

/* Writes a 64-bit value to a possibly unaligned location; mirror of
 * unaligned_peek_u64(). */
static __always_inline void
unaligned_poke_u64(const unsigned expected_alignment,
                   void *const __restrict ptr, const uint64_t v) {
  assert((uintptr_t)ptr % expected_alignment == 0);
  if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(v)) == 0)
    *(uint64_t *)ptr = v;
  else if ((expected_alignment % sizeof(uint32_t)) == 0) {
    ((uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint32_t)v;
    ((uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] =
        (uint32_t)(v >> 32);
  } else
    memcpy(ptr, &v, sizeof(v));
}
3578 
3579 #define UNALIGNED_PEEK_8(ptr, struct, field)                                   \
3580   peek_u8((const uint8_t *)(ptr) + offsetof(struct, field))
3581 #define UNALIGNED_POKE_8(ptr, struct, field, value)                            \
3582   poke_u8((uint8_t *)(ptr) + offsetof(struct, field), value)
3583 
3584 #define UNALIGNED_PEEK_16(ptr, struct, field)                                  \
3585   unaligned_peek_u16(1, (const char *)(ptr) + offsetof(struct, field))
3586 #define UNALIGNED_POKE_16(ptr, struct, field, value)                           \
3587   unaligned_poke_u16(1, (char *)(ptr) + offsetof(struct, field), value)
3588 
3589 #define UNALIGNED_PEEK_32(ptr, struct, field)                                  \
3590   unaligned_peek_u32(1, (const char *)(ptr) + offsetof(struct, field))
3591 #define UNALIGNED_POKE_32(ptr, struct, field, value)                           \
3592   unaligned_poke_u32(1, (char *)(ptr) + offsetof(struct, field), value)
3593 
3594 #define UNALIGNED_PEEK_64(ptr, struct, field)                                  \
3595   unaligned_peek_u64(1, (const char *)(ptr) + offsetof(struct, field))
3596 #define UNALIGNED_POKE_64(ptr, struct, field, value)                           \
3597   unaligned_poke_u64(1, (char *)(ptr) + offsetof(struct, field), value)
3598 
/* Get the page number pointed to by a branch node */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t
node_pgno(const MDBX_node *const __restrict node) {
  /* the low 32 bits live in mn_pgno32... */
  pgno_t pgno = UNALIGNED_PEEK_32(node, MDBX_node, mn_pgno32);
  /* ...and, when pgno_t is 64-bit, bits 32..39 are kept in mn_extra
   * (dead code eliminated by the compiler for 32-bit pgno_t) */
  if (sizeof(pgno) > 4)
    pgno |= ((uint64_t)UNALIGNED_PEEK_8(node, MDBX_node, mn_extra)) << 32;
  return pgno;
}
3607 
3608 /* Set the page number in a branch node */
node_set_pgno(MDBX_node * const __restrict node,pgno_t pgno)3609 static __always_inline void node_set_pgno(MDBX_node *const __restrict node,
3610                                           pgno_t pgno) {
3611   assert(pgno >= MIN_PAGENO && pgno <= MAX_PAGENO);
3612 
3613   UNALIGNED_POKE_32(node, MDBX_node, mn_pgno32, (uint32_t)pgno);
3614   if (sizeof(pgno) > 4)
3615     UNALIGNED_POKE_8(node, MDBX_node, mn_extra,
3616                      (uint8_t)((uint64_t)pgno >> 32));
3617 }
3618 
3619 /* Get the size of the data in a leaf node */
3620 MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t
node_ds(const MDBX_node * const __restrict node)3621 node_ds(const MDBX_node *const __restrict node) {
3622   return UNALIGNED_PEEK_32(node, MDBX_node, mn_dsize);
3623 }
3624 
3625 /* Set the size of the data for a leaf node */
node_set_ds(MDBX_node * const __restrict node,size_t size)3626 static __always_inline void node_set_ds(MDBX_node *const __restrict node,
3627                                         size_t size) {
3628   assert(size < INT_MAX);
3629   UNALIGNED_POKE_32(node, MDBX_node, mn_dsize, (uint32_t)size);
3630 }
3631 
3632 /* The size of a key in a node */
3633 MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t
node_ks(const MDBX_node * const __restrict node)3634 node_ks(const MDBX_node *const __restrict node) {
3635   return UNALIGNED_PEEK_16(node, MDBX_node, mn_ksize);
3636 }
3637 
3638 /* Set the size of the key for a leaf node */
node_set_ks(MDBX_node * const __restrict node,size_t size)3639 static __always_inline void node_set_ks(MDBX_node *const __restrict node,
3640                                         size_t size) {
3641   assert(size < INT16_MAX);
3642   UNALIGNED_POKE_16(node, MDBX_node, mn_ksize, (uint16_t)size);
3643 }
3644 
/* Get the flags byte (F_BIGDATA/F_SUBDATA/F_DUPDATA) of a node */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint8_t
node_flags(const MDBX_node *const __restrict node) {
  return UNALIGNED_PEEK_8(node, MDBX_node, mn_flags);
}

/* Overwrite the flags byte of a node */
static __always_inline void node_set_flags(MDBX_node *const __restrict node,
                                           uint8_t flags) {
  UNALIGNED_POKE_8(node, MDBX_node, mn_flags, flags);
}
3654 
3655 /* Size of the node header, excluding dynamic data at the end */
3656 #define NODESIZE offsetof(MDBX_node, mn_data)
3657 
3658 /* Address of the key for the node */
3659 MDBX_NOTHROW_PURE_FUNCTION static __always_inline void *
node_key(const MDBX_node * const __restrict node)3660 node_key(const MDBX_node *const __restrict node) {
3661   return (char *)node + NODESIZE;
3662 }
3663 
3664 /* Address of the data for a node */
3665 MDBX_NOTHROW_PURE_FUNCTION static __always_inline void *
node_data(const MDBX_node * const __restrict node)3666 node_data(const MDBX_node *const __restrict node) {
3667   return (char *)node_key(node) + node_ks(node);
3668 }
3669 
3670 /* Size of a node in a leaf page with a given key and data.
3671  * This is node header plus key plus data size. */
3672 MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t
node_size_len(const size_t key_len,const size_t value_len)3673 node_size_len(const size_t key_len, const size_t value_len) {
3674   return NODESIZE + EVEN(key_len + value_len);
3675 }
3676 MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t
node_size(const MDBX_val * key,const MDBX_val * value)3677 node_size(const MDBX_val *key, const MDBX_val *value) {
3678   return node_size_len(key ? key->iov_len : 0, value ? value->iov_len : 0);
3679 }
3680 
3681 MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t
peek_pgno(const void * const __restrict ptr)3682 peek_pgno(const void *const __restrict ptr) {
3683   if (sizeof(pgno_t) == sizeof(uint32_t))
3684     return (pgno_t)unaligned_peek_u32(1, ptr);
3685   else if (sizeof(pgno_t) == sizeof(uint64_t))
3686     return (pgno_t)unaligned_peek_u64(1, ptr);
3687   else {
3688     pgno_t pgno;
3689     memcpy(&pgno, ptr, sizeof(pgno));
3690     return pgno;
3691   }
3692 }
3693 
/* Write a page number to a possibly unaligned location. */
static __always_inline void poke_pgno(void *const __restrict ptr,
                                      const pgno_t pgno) {
  /* Resolved at compile time: exactly one branch survives. */
  if (sizeof(pgno) == sizeof(uint32_t)) {
    unaligned_poke_u32(1, ptr, pgno);
  } else if (sizeof(pgno) == sizeof(uint64_t)) {
    unaligned_poke_u64(1, ptr, pgno);
  } else {
    memcpy(ptr, &pgno, sizeof(pgno));
  }
}
3703 
3704 MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t
node_largedata_pgno(const MDBX_node * const __restrict node)3705 node_largedata_pgno(const MDBX_node *const __restrict node) {
3706   assert(node_flags(node) & F_BIGDATA);
3707   return peek_pgno(node_data(node));
3708 }
3709 
3710 /*------------------------------------------------------------------------------
3711  * Nodes, Keys & Values length limitation factors:
3712  *
3713  * BRANCH_NODE_MAX
 *   Branch-page must contain at least two nodes, each with a key and a child
 *   page number. But a page can't be split if it contains fewer than 4 keys,
 *   i.e. a page should not overflow before adding the fourth key. Therefore,
 *   at least 3 branch-nodes should fit in a single branch-page. Further, the
 *   first node of a branch-page doesn't contain a key, i.e. the first node
 *   always requires space just for itself. Thus:
3720  *       PAGEROOM = pagesize - page_hdr_len;
3721  *       BRANCH_NODE_MAX = even_floor(
3722  *         (PAGEROOM - sizeof(indx_t) - NODESIZE) / (3 - 1) - sizeof(indx_t));
3723  *       KEYLEN_MAX = BRANCH_NODE_MAX - node_hdr_len;
3724  *
3725  * LEAF_NODE_MAX
 *   A leaf-node must fit into a single leaf-page, where a value could be
 *   placed on a large/overflow page. However, it may be necessary to insert
 *   a nearly page-sized node between two large nodes that already fill up a
 *   page. In this case the page must be split in two if some pair of nodes
 *   fits on one page, or otherwise the page should be split into THREE, with
 *   a single node per each of them. Such 1-into-3 page splitting is costly
 *   and complex since it requires TWO insertions into the parent page, which
 *   could lead to splitting it and so on up to the root. Therefore
 *   double-splitting is avoided here and the maximum node size is half of a
 *   leaf page space:
3735  *       LEAF_NODE_MAX = even_floor(PAGEROOM / 2 - sizeof(indx_t));
3736  *       DATALEN_NO_OVERFLOW = LEAF_NODE_MAX - KEYLEN_MAX;
3737  *
3738  *  - SubDatabase-node must fit into one leaf-page:
3739  *       SUBDB_NAME_MAX = LEAF_NODE_MAX - node_hdr_len - sizeof(MDBX_db);
3740  *
 *  - Dupsort values are themselves keys in a dupsort-subdb and couldn't be
 *    longer than KEYLEN_MAX. But a dupsort node must be no greater than
 *    LEAF_NODE_MAX, since a dupsort value couldn't be placed on a
 *    large/overflow page:
 *       DUPSORT_DATALEN_MAX = min(KEYLEN_MAX,
 *                                 max(DATALEN_NO_OVERFLOW, sizeof(MDBX_db)));
3746  */
3747 
/* Usable bytes in a page after the page header. */
#define PAGEROOM(pagesize) ((pagesize)-PAGEHDRSZ)
/* Round down to an even number, preserving 2-byte node alignment. */
#define EVEN_FLOOR(n) ((n) & ~(size_t)1)
/* Max branch-node size so that 3 nodes (first one keyless) fit per page;
 * see the "length limitation factors" derivation above. */
#define BRANCH_NODE_MAX(pagesize)                                              \
  (EVEN_FLOOR((PAGEROOM(pagesize) - sizeof(indx_t) - NODESIZE) / (3 - 1) -     \
              sizeof(indx_t)))
/* Max leaf-node size: half of the page space, to avoid 1-into-3 splits. */
#define LEAF_NODE_MAX(pagesize)                                                \
  (EVEN_FLOOR(PAGEROOM(pagesize) / 2) - sizeof(indx_t))
/* Max number of pgno_t entries that fit into a single GC/overflow page. */
#define MAX_GC1OVPAGE(pagesize) (PAGEROOM(pagesize) / sizeof(pgno_t) - 1)
3756 
/* Maximum usable key length for the given page size and db flags. */
static __inline unsigned keysize_max(size_t pagesize, MDBX_db_flags_t flags) {
  assert(pagesize >= MIN_PAGESIZE && pagesize <= MAX_PAGESIZE &&
         is_powerof2(pagesize));
  STATIC_ASSERT(BRANCH_NODE_MAX(MIN_PAGESIZE) - NODESIZE >= 8);
  /* Integer keys are fixed-width 64-bit values. */
  if (flags & MDBX_INTEGERKEY)
    return 8 /* sizeof(uint64_t) */;

  /* Any key must fit into a branch-node. */
  const intptr_t branch_limit = BRANCH_NODE_MAX(pagesize) - NODESIZE;
  STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) - NODESIZE -
                    /* sizeof(uint64) as a key */ 8 >
                sizeof(MDBX_db));
  if ((flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP |
                MDBX_INTEGERDUP)) == 0)
    return (unsigned)branch_limit;

  /* For dupsort tables the key must also share a leaf-node with the
   * embedded MDBX_db record of the nested sub-db. */
  const intptr_t dupsort_leaf_limit =
      LEAF_NODE_MAX(pagesize) - NODESIZE - sizeof(MDBX_db);
  return (unsigned)((branch_limit < dupsort_leaf_limit) ? branch_limit
                                                        : dupsort_leaf_limit);
}
3778 
/* Maximum usable value length for the given page size and db flags. */
static __inline size_t valsize_max(size_t pagesize, MDBX_db_flags_t flags) {
  assert(pagesize >= MIN_PAGESIZE && pagesize <= MAX_PAGESIZE &&
         is_powerof2(pagesize));

  /* Integer dup-values are fixed-width 64-bit values. */
  if (flags & MDBX_INTEGERDUP)
    return 8 /* sizeof(uint64_t) */;

  /* Dupsort values behave like keys of the nested sub-db. */
  if (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP))
    return keysize_max(pagesize, 0);

  /* Plain values may spill onto overflow pages: cap by a hard byte budget,
   * by a quarter of the page-list limit, and by half of the map size. */
  const unsigned shift = log2n_powerof2(pagesize);
  const size_t hard = 0x7FF00000ul;
  const size_t hard_pages = hard >> shift;
  STATIC_ASSERT(MDBX_PGL_LIMIT <= MAX_PAGENO);
  const size_t pages_limit = MDBX_PGL_LIMIT / 4;
  size_t limit = (hard_pages < pages_limit) ? hard : (pages_limit << shift);
  if (limit > MAX_MAPSIZE / 2)
    limit = MAX_MAPSIZE / 2;
  return limit;
}
3798 
/* Legacy API: maximum key size for this environment, assuming the most
 * restrictive case (MDBX_DUPSORT). */
__cold int mdbx_env_get_maxkeysize(const MDBX_env *env) {
  return mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT);
}
3802 
mdbx_env_get_maxkeysize_ex(const MDBX_env * env,MDBX_db_flags_t flags)3803 __cold int mdbx_env_get_maxkeysize_ex(const MDBX_env *env,
3804                                       MDBX_db_flags_t flags) {
3805   if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE))
3806     return -1;
3807 
3808   return (int)mdbx_limits_keysize_max((intptr_t)env->me_psize, flags);
3809 }
3810 
mdbx_default_pagesize(void)3811 size_t mdbx_default_pagesize(void) {
3812   size_t pagesize = mdbx_syspagesize();
3813   mdbx_ensure(nullptr, is_powerof2(pagesize));
3814   pagesize = (pagesize >= MIN_PAGESIZE) ? pagesize : MIN_PAGESIZE;
3815   pagesize = (pagesize <= MAX_PAGESIZE) ? pagesize : MAX_PAGESIZE;
3816   return pagesize;
3817 }
3818 
mdbx_limits_keysize_max(intptr_t pagesize,MDBX_db_flags_t flags)3819 __cold intptr_t mdbx_limits_keysize_max(intptr_t pagesize,
3820                                         MDBX_db_flags_t flags) {
3821   if (pagesize < 1)
3822     pagesize = (intptr_t)mdbx_default_pagesize();
3823   if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE ||
3824                pagesize > (intptr_t)MAX_PAGESIZE ||
3825                !is_powerof2((size_t)pagesize)))
3826     return -1;
3827 
3828   return keysize_max(pagesize, flags);
3829 }
3830 
mdbx_env_get_maxvalsize_ex(const MDBX_env * env,MDBX_db_flags_t flags)3831 __cold int mdbx_env_get_maxvalsize_ex(const MDBX_env *env,
3832                                       MDBX_db_flags_t flags) {
3833   if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE))
3834     return -1;
3835 
3836   return (int)mdbx_limits_valsize_max((intptr_t)env->me_psize, flags);
3837 }
3838 
mdbx_limits_valsize_max(intptr_t pagesize,MDBX_db_flags_t flags)3839 __cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize,
3840                                         MDBX_db_flags_t flags) {
3841   if (pagesize < 1)
3842     pagesize = (intptr_t)mdbx_default_pagesize();
3843   if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE ||
3844                pagesize > (intptr_t)MAX_PAGESIZE ||
3845                !is_powerof2((size_t)pagesize)))
3846     return -1;
3847 
3848   return valsize_max(pagesize, flags);
3849 }
3850 
3851 /* Calculate the size of a leaf node.
3852  *
3853  * The size depends on the environment's page size; if a data item
3854  * is too large it will be put onto an overflow page and the node
3855  * size will only include the key and not the data. Sizes are always
3856  * rounded up to an even number of bytes, to guarantee 2-byte alignment
3857  * of the MDBX_node headers. */
3858 MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t
leaf_size(const MDBX_env * env,const MDBX_val * key,const MDBX_val * data)3859 leaf_size(const MDBX_env *env, const MDBX_val *key, const MDBX_val *data) {
3860   size_t node_bytes = node_size(key, data);
3861   if (node_bytes > env->me_leaf_nodemax) {
3862     /* put on overflow page */
3863     node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t);
3864   }
3865 
3866   return node_bytes + sizeof(indx_t);
3867 }
3868 
3869 /* Calculate the size of a branch node.
3870  *
3871  * The size should depend on the environment's page size but since
3872  * we currently don't support spilling large keys onto overflow
3873  * pages, it's simply the size of the MDBX_node header plus the
3874  * size of the key. Sizes are always rounded up to an even number
3875  * of bytes, to guarantee 2-byte alignment of the MDBX_node headers.
3876  *
3877  * [in] env The environment handle.
3878  * [in] key The key for the node.
3879  *
3880  * Returns The number of bytes needed to store the node. */
3881 MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t
branch_size(const MDBX_env * env,const MDBX_val * key)3882 branch_size(const MDBX_env *env, const MDBX_val *key) {
3883   /* Size of a node in a branch page with a given key.
3884    * This is just the node header plus the key, there is no data. */
3885   size_t node_bytes = node_size(key, nullptr);
3886   if (unlikely(node_bytes > env->me_leaf_nodemax)) {
3887     /* put on overflow page */
3888     /* not implemented */
3889     mdbx_assert_fail(env, "INDXSIZE(key) <= env->me_nodemax", __func__,
3890                      __LINE__);
3891     node_bytes = node_size(key, nullptr) + sizeof(pgno_t);
3892   }
3893 
3894   return node_bytes + sizeof(indx_t);
3895 }
3896 
3897 MDBX_NOTHROW_CONST_FUNCTION static __always_inline uint16_t
flags_db2sub(uint16_t db_flags)3898 flags_db2sub(uint16_t db_flags) {
3899   uint16_t sub_flags = db_flags & MDBX_DUPFIXED;
3900 
3901   /* MDBX_INTEGERDUP => MDBX_INTEGERKEY */
3902 #define SHIFT_INTEGERDUP_TO_INTEGERKEY 2
3903   STATIC_ASSERT((MDBX_INTEGERDUP >> SHIFT_INTEGERDUP_TO_INTEGERKEY) ==
3904                 MDBX_INTEGERKEY);
3905   sub_flags |= (db_flags & MDBX_INTEGERDUP) >> SHIFT_INTEGERDUP_TO_INTEGERKEY;
3906 
3907   /* MDBX_REVERSEDUP => MDBX_REVERSEKEY */
3908 #define SHIFT_REVERSEDUP_TO_REVERSEKEY 5
3909   STATIC_ASSERT((MDBX_REVERSEDUP >> SHIFT_REVERSEDUP_TO_REVERSEKEY) ==
3910                 MDBX_REVERSEKEY);
3911   sub_flags |= (db_flags & MDBX_REVERSEDUP) >> SHIFT_REVERSEDUP_TO_REVERSEKEY;
3912 
3913   return sub_flags;
3914 }
3915 
3916 /*----------------------------------------------------------------------------*/
3917 
3918 MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t
pgno2bytes(const MDBX_env * env,pgno_t pgno)3919 pgno2bytes(const MDBX_env *env, pgno_t pgno) {
3920   mdbx_assert(env, (1u << env->me_psize2log) == env->me_psize);
3921   return ((size_t)pgno) << env->me_psize2log;
3922 }
3923 
3924 MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_page *
pgno2page(const MDBX_env * env,pgno_t pgno)3925 pgno2page(const MDBX_env *env, pgno_t pgno) {
3926   return (MDBX_page *)(env->me_map + pgno2bytes(env, pgno));
3927 }
3928 
3929 MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t
bytes2pgno(const MDBX_env * env,size_t bytes)3930 bytes2pgno(const MDBX_env *env, size_t bytes) {
3931   mdbx_assert(env, (env->me_psize >> env->me_psize2log) == 1);
3932   return (pgno_t)(bytes >> env->me_psize2log);
3933 }
3934 
3935 MDBX_NOTHROW_PURE_FUNCTION static size_t
pgno_align2os_bytes(const MDBX_env * env,pgno_t pgno)3936 pgno_align2os_bytes(const MDBX_env *env, pgno_t pgno) {
3937   return ceil_powerof2(pgno2bytes(env, pgno), env->me_os_psize);
3938 }
3939 
pgno_align2os_pgno(const MDBX_env * env,pgno_t pgno)3940 MDBX_NOTHROW_PURE_FUNCTION static pgno_t pgno_align2os_pgno(const MDBX_env *env,
3941                                                             pgno_t pgno) {
3942   return bytes2pgno(env, pgno_align2os_bytes(env, pgno));
3943 }
3944 
3945 MDBX_NOTHROW_PURE_FUNCTION static size_t
bytes_align2os_bytes(const MDBX_env * env,size_t bytes)3946 bytes_align2os_bytes(const MDBX_env *env, size_t bytes) {
3947   return ceil_powerof2(ceil_powerof2(bytes, env->me_psize), env->me_os_psize);
3948 }
3949 
3950 /* Address of first usable data byte in a page, after the header */
3951 MDBX_NOTHROW_PURE_FUNCTION static __always_inline void *
page_data(const MDBX_page * mp)3952 page_data(const MDBX_page *mp) {
3953   return (char *)mp + PAGEHDRSZ;
3954 }
3955 
/* Recover the page-header address from a pointer to the page's payload
 * area (the inverse of page_data, via the mp_ptrs member offset). */
MDBX_NOTHROW_PURE_FUNCTION static __always_inline const MDBX_page *
data_page(const void *data) {
  return container_of(data, MDBX_page, mp_ptrs);
}
3960 
3961 MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_meta *
page_meta(MDBX_page * mp)3962 page_meta(MDBX_page *mp) {
3963   return (MDBX_meta *)page_data(mp);
3964 }
3965 
3966 /* Number of nodes on a page */
3967 MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned
page_numkeys(const MDBX_page * mp)3968 page_numkeys(const MDBX_page *mp) {
3969   return mp->mp_lower >> 1;
3970 }
3971 
3972 /* The amount of space remaining in the page */
3973 MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned
page_room(const MDBX_page * mp)3974 page_room(const MDBX_page *mp) {
3975   return mp->mp_upper - mp->mp_lower;
3976 }
3977 
3978 MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned
page_space(const MDBX_env * env)3979 page_space(const MDBX_env *env) {
3980   STATIC_ASSERT(PAGEHDRSZ % 2 == 0);
3981   return env->me_psize - PAGEHDRSZ;
3982 }
3983 
3984 MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned
page_used(const MDBX_env * env,const MDBX_page * mp)3985 page_used(const MDBX_env *env, const MDBX_page *mp) {
3986   return page_space(env) - page_room(mp);
3987 }
3988 
3989 /* The percentage of space used in the page, in a percents. */
3990 MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __inline double
page_fill(const MDBX_env * env,const MDBX_page * mp)3991 page_fill(const MDBX_env *env, const MDBX_page *mp) {
3992   return page_used(env, mp) * 100.0 / page_space(env);
3993 }
3994 
3995 /* The number of overflow pages needed to store the given size. */
3996 MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t
number_of_ovpages(const MDBX_env * env,size_t bytes)3997 number_of_ovpages(const MDBX_env *env, size_t bytes) {
3998   return bytes2pgno(env, PAGEHDRSZ - 1 + bytes) + 1;
3999 }
4000 
/* Report a corrupted page and return MDBX_CORRUPTED.
 *
 * When error-level logging is enabled, a one-line header describing the
 * page (type, number, mod-txnid) is logged the first time a given page is
 * reported; the caller-supplied printf-style details are logged on every
 * call. */
__cold static int MDBX_PRINTF_ARGS(2, 3)
    bad_page(const MDBX_page *mp, const char *fmt, ...) {
  if (mdbx_log_enabled(MDBX_LOG_ERROR)) {
    /* Remember the last reported page to avoid repeating the header for
     * consecutive complaints about the same page.
     * NOTE(review): `prev` is unsynchronized static state; concurrent
     * reporters could duplicate or skip the header line — presumably
     * acceptable for diagnostics, confirm if race-free logging matters. */
    static const MDBX_page *prev;
    if (prev != mp) {
      prev = mp;
      /* Decode the page-type flag bits into a human-readable name. */
      const char *type;
      switch (mp->mp_flags & (P_BRANCH | P_LEAF | P_OVERFLOW | P_META |
                              P_LEAF2 | P_BAD | P_SUBP)) {
      case P_BRANCH:
        type = "branch";
        break;
      case P_LEAF:
        type = "leaf";
        break;
      case P_LEAF | P_SUBP:
        type = "subleaf";
        break;
      case P_LEAF | P_LEAF2:
        type = "dupfixed-leaf";
        break;
      case P_LEAF | P_LEAF2 | P_SUBP:
        type = "dupfixed-subleaf";
        break;
      case P_OVERFLOW:
        type = "large";
        break;
      default:
        /* any other combination of flags is invalid */
        type = "broken";
      }
      mdbx_debug_log(MDBX_LOG_ERROR, "badpage", 0,
                     "corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n", type,
                     mp->mp_pgno, mp->mp_txnid);
    }

    /* Forward the caller's formatted details to the log. */
    va_list args;
    va_start(args, fmt);
    mdbx_debug_log_va(MDBX_LOG_ERROR, "badpage", 0, fmt, args);
    va_end(args);
  }
  return MDBX_CORRUPTED;
}
4043 
4044 /* Address of node i in page p */
4045 MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_node *
page_node(const MDBX_page * mp,unsigned i)4046 page_node(const MDBX_page *mp, unsigned i) {
4047   assert((mp->mp_flags & (P_LEAF2 | P_OVERFLOW | P_META)) == 0);
4048   assert(page_numkeys(mp) > (unsigned)(i));
4049   assert(mp->mp_ptrs[i] % 2 == 0);
4050   return (MDBX_node *)((char *)mp + mp->mp_ptrs[i] + PAGEHDRSZ);
4051 }
4052 
4053 /* The address of a key in a LEAF2 page.
4054  * LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs.
4055  * There are no node headers, keys are stored contiguously. */
4056 MDBX_NOTHROW_PURE_FUNCTION static __always_inline void *
page_leaf2key(const MDBX_page * mp,unsigned i,size_t keysize)4057 page_leaf2key(const MDBX_page *mp, unsigned i, size_t keysize) {
4058   assert((mp->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_META)) ==
4059          (P_LEAF | P_LEAF2));
4060   assert(mp->mp_leaf2_ksize == keysize);
4061   (void)keysize;
4062   return (char *)mp + PAGEHDRSZ + (i * mp->mp_leaf2_ksize);
4063 }
4064 
4065 /* Set the node's key into keyptr. */
get_key(const MDBX_node * node,MDBX_val * keyptr)4066 static __always_inline void get_key(const MDBX_node *node, MDBX_val *keyptr) {
4067   keyptr->iov_len = node_ks(node);
4068   keyptr->iov_base = node_key(node);
4069 }
4070 
4071 /* Set the node's key into keyptr, if requested. */
4072 static __always_inline void
get_key_optional(const MDBX_node * node,MDBX_val * keyptr)4073 get_key_optional(const MDBX_node *node, MDBX_val *keyptr /* __may_null */) {
4074   if (keyptr)
4075     get_key(node, keyptr);
4076 }
4077 
4078 /*------------------------------------------------------------------------------
4079  * Workaround for mmaped-lookahead-cross-page-boundary bug
4080  * in an obsolete versions of Elbrus's libc and kernels. */
4081 #if defined(__e2k__) && defined(MDBX_E2K_MLHCPB_WORKAROUND) &&                 \
4082     MDBX_E2K_MLHCPB_WORKAROUND
/* memcmp() replacement for buggy Elbrus libc/kernels: compares in
 * descending-width chunks whose big-endian byte-swapped values preserve
 * memcmp ordering, avoiding the lookahead-across-page-boundary bug.
 * Returns <0, 0 or >0 like memcmp. */
int __hot mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2,
                                         size_t n) {
  /* Only worth aligning for longer buffers, and only possible when both
   * pointers share the same misalignment modulo 8. */
  if (unlikely(n > 42
               /* LY: align followed access if reasonable possible */
               && (((uintptr_t)s1) & 7) != 0 &&
               (((uintptr_t)s1) & 7) == (((uintptr_t)s2) & 7))) {
    /* Peel 1, 2 and 4 bytes as needed to reach 8-byte alignment. */
    if (((uintptr_t)s1) & 1) {
      const int diff = *(uint8_t *)s1 - *(uint8_t *)s2;
      if (diff)
        return diff;
      s1 = (char *)s1 + 1;
      s2 = (char *)s2 + 1;
      n -= 1;
    }

    if (((uintptr_t)s1) & 2) {
      const uint16_t a = *(uint16_t *)s1;
      const uint16_t b = *(uint16_t *)s2;
      if (likely(a != b))
        return (__builtin_bswap16(a) > __builtin_bswap16(b)) ? 1 : -1;
      s1 = (char *)s1 + 2;
      s2 = (char *)s2 + 2;
      n -= 2;
    }

    if (((uintptr_t)s1) & 4) {
      const uint32_t a = *(uint32_t *)s1;
      const uint32_t b = *(uint32_t *)s2;
      if (likely(a != b))
        return (__builtin_bswap32(a) > __builtin_bswap32(b)) ? 1 : -1;
      s1 = (char *)s1 + 4;
      s2 = (char *)s2 + 4;
      n -= 4;
    }
  }

  /* Main loop: 8 bytes at a time; byte-swap turns the native-endian load
   * into a big-endian value so plain integer comparison matches memcmp. */
  while (n >= 8) {
    const uint64_t a = *(uint64_t *)s1;
    const uint64_t b = *(uint64_t *)s2;
    if (likely(a != b))
      return (__builtin_bswap64(a) > __builtin_bswap64(b)) ? 1 : -1;
    s1 = (char *)s1 + 8;
    s2 = (char *)s2 + 8;
    n -= 8;
  }

  /* Tail: at most 4 + 2 + 1 remaining bytes. */
  if (n & 4) {
    const uint32_t a = *(uint32_t *)s1;
    const uint32_t b = *(uint32_t *)s2;
    if (likely(a != b))
      return (__builtin_bswap32(a) > __builtin_bswap32(b)) ? 1 : -1;
    s1 = (char *)s1 + 4;
    s2 = (char *)s2 + 4;
  }

  if (n & 2) {
    const uint16_t a = *(uint16_t *)s1;
    const uint16_t b = *(uint16_t *)s2;
    if (likely(a != b))
      return (__builtin_bswap16(a) > __builtin_bswap16(b)) ? 1 : -1;
    s1 = (char *)s1 + 2;
    s2 = (char *)s2 + 2;
  }

  return (n & 1) ? *(uint8_t *)s1 - *(uint8_t *)s2 : 0;
}
4149 
mdbx_e2k_strcmp_bug_workaround(const char * s1,const char * s2)4150 int __hot mdbx_e2k_strcmp_bug_workaround(const char *s1, const char *s2) {
4151   while (true) {
4152     int diff = *(uint8_t *)s1 - *(uint8_t *)s2;
4153     if (likely(diff != 0) || *s1 == '\0')
4154       return diff;
4155     s1 += 1;
4156     s2 += 1;
4157   }
4158 }
4159 
mdbx_e2k_strncmp_bug_workaround(const char * s1,const char * s2,size_t n)4160 int __hot mdbx_e2k_strncmp_bug_workaround(const char *s1, const char *s2,
4161                                           size_t n) {
4162   while (n > 0) {
4163     int diff = *(uint8_t *)s1 - *(uint8_t *)s2;
4164     if (likely(diff != 0) || *s1 == '\0')
4165       return diff;
4166     s1 += 1;
4167     s2 += 1;
4168     n -= 1;
4169   }
4170   return 0;
4171 }
4172 
mdbx_e2k_strlen_bug_workaround(const char * s)4173 size_t __hot mdbx_e2k_strlen_bug_workaround(const char *s) {
4174   size_t n = 0;
4175   while (*s) {
4176     s += 1;
4177     n += 1;
4178   }
4179   return n;
4180 }
4181 
mdbx_e2k_strnlen_bug_workaround(const char * s,size_t maxlen)4182 size_t __hot mdbx_e2k_strnlen_bug_workaround(const char *s, size_t maxlen) {
4183   size_t n = 0;
4184   while (maxlen > n && *s) {
4185     s += 1;
4186     n += 1;
4187   }
4188   return n;
4189 }
4190 #endif /* MDBX_E2K_MLHCPB_WORKAROUND */
4191 
4192 /*------------------------------------------------------------------------------
4193  * safe read/write volatile 64-bit fields on 32-bit architectures. */
4194 
/* Store a 64-bit value with the requested memory order, working even on
 * 32-bit targets that lack native 64-bit atomics. Returns the stored
 * value for caller convenience. */
MDBX_MAYBE_UNUSED static __always_inline uint64_t
atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value,
               enum MDBX_memory_order order) {
  STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8);
#if MDBX_64BIT_ATOMIC
#ifdef MDBX_HAVE_C11ATOMICS
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p)));
  atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value, mo_c11_store(order));
#else  /* MDBX_HAVE_C11ATOMICS */
  /* Native 64-bit store; fences/barriers emulate the ordering. */
  if (order != mo_Relaxed)
    mdbx_compiler_barrier();
  p->weak = value;
  mdbx_memory_fence(order, true);
#endif /* MDBX_HAVE_C11ATOMICS */
#else  /* !MDBX_64BIT_ATOMIC */
  /* 32-bit fallback: write the low half first, then the high half with the
   * requested order; atomic_load64() re-reads both halves to detect tears. */
  mdbx_compiler_barrier();
  atomic_store32(&p->low, (uint32_t)value, mo_Relaxed);
  mdbx_jitter4testing(true);
  atomic_store32(&p->high, (uint32_t)(value >> 32), order);
  mdbx_jitter4testing(true);
#endif /* !MDBX_64BIT_ATOMIC */
  return value;
}
4218 
/* Load a 64-bit value with the requested memory order. On 32-bit targets
 * without native 64-bit atomics the two halves are re-read until two
 * consecutive reads agree, to rule out a torn value. */
MDBX_MAYBE_UNUSED static
#if MDBX_64BIT_ATOMIC
    __always_inline
#endif /* MDBX_64BIT_ATOMIC */
        uint64_t
        atomic_load64(const MDBX_atomic_uint64_t *p,
                      enum MDBX_memory_order order) {
  STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8);
#if MDBX_64BIT_ATOMIC
#ifdef MDBX_HAVE_C11ATOMICS
  assert(atomic_is_lock_free(MDBX_c11a_ro(uint64_t, p)));
  return atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order));
#else  /* MDBX_HAVE_C11ATOMICS */
  /* Native 64-bit load; fences/barriers emulate the ordering. */
  mdbx_memory_fence(order, false);
  const uint64_t value = p->weak;
  if (order != mo_Relaxed)
    mdbx_compiler_barrier();
  return value;
#endif /* MDBX_HAVE_C11ATOMICS */
#else  /* !MDBX_64BIT_ATOMIC */
  /* 32-bit fallback: read high then low, and repeat until a stable
   * (untorn) combination is observed. */
  mdbx_compiler_barrier();
  uint64_t value = (uint64_t)atomic_load32(&p->high, order) << 32;
  mdbx_jitter4testing(true);
  value |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed
                                                        : mo_AcquireRelease);
  mdbx_jitter4testing(true);
  for (;;) {
    mdbx_compiler_barrier();
    uint64_t again = (uint64_t)atomic_load32(&p->high, order) << 32;
    mdbx_jitter4testing(true);
    again |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed
                                                          : mo_AcquireRelease);
    mdbx_jitter4testing(true);
    if (likely(value == again))
      return value;
    value = again;
  }
#endif /* !MDBX_64BIT_ATOMIC */
}
4258 
/* Spin-wait pause hint: emits the platform's cheapest "I'm busy-waiting"
 * instruction, falling back to a scheduler yield where none exists. */
static __always_inline void atomic_yield(void) {
#if defined(_WIN32) || defined(_WIN64)
  YieldProcessor();
#elif defined(__ia32__) || defined(__e2k__)
  __builtin_ia32_pause();
#elif defined(__ia64__)
#if defined(__HP_cc__) || defined(__HP_aCC__)
  _Asm_hint(_HINT_PAUSE);
#else
  __asm__ __volatile__("hint @pause");
#endif
#elif defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH > 6) ||       \
    defined(__ARM_ARCH_6K__)
#ifdef __CC_ARM
  __yield();
#else
  __asm__ __volatile__("yield");
#endif
#elif (defined(__mips64) || defined(__mips64__)) && defined(__mips_isa_rev) && \
    __mips_isa_rev >= 2
  __asm__ __volatile__("pause");
#elif defined(__mips) || defined(__mips__) || defined(__mips64) ||             \
    defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) ||            \
    defined(__MWERKS__) || defined(__sgi)
  /* opcode for the MIPS `pause` instruction on pre-R2 toolchains */
  __asm__ __volatile__(".word 0x00000140");
#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE)
  sched_yield();
#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS)
  pthread_yield();
#endif
}
4290 
#if MDBX_64BIT_CAS
/* 64-bit strong compare-and-swap: replace *p with v iff *p == c.
 * Returns true on success. */
static __always_inline bool atomic_cas64(MDBX_atomic_uint64_t *p, uint64_t c,
                                         uint64_t v) {
#ifdef MDBX_HAVE_C11ATOMICS
  STATIC_ASSERT(sizeof(long long) >= sizeof(uint64_t));
#ifdef ATOMIC_LLONG_LOCK_FREE
  STATIC_ASSERT(ATOMIC_LLONG_LOCK_FREE > 0);
#if ATOMIC_LLONG_LOCK_FREE < 2
  /* lock-freedom not guaranteed at compile time — verify at runtime */
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p)));
#endif /* ATOMIC_LLONG_LOCK_FREE < 2 */
#else  /* defined(ATOMIC_LLONG_LOCK_FREE) */
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p)));
#endif
  return atomic_compare_exchange_strong(MDBX_c11a_rw(uint64_t, p), &c, v);
#elif defined(__GNUC__) || defined(__clang__)
  return __sync_bool_compare_and_swap(&p->weak, c, v);
#elif defined(_MSC_VER)
  return c == (uint64_t)_InterlockedCompareExchange64(
                  (volatile __int64 *)&p->weak, v, c);
#elif defined(__APPLE__)
  return OSAtomicCompareAndSwap64Barrier(c, v, &p->weak);
#else
#error FIXME: Unsupported compiler
#endif
}
#endif /* MDBX_64BIT_CAS */
4317 
/* 32-bit strong compare-and-swap: replace *p with v iff *p == c.
 * Returns true on success. */
static __always_inline bool atomic_cas32(MDBX_atomic_uint32_t *p, uint32_t c,
                                         uint32_t v) {
#ifdef MDBX_HAVE_C11ATOMICS
  STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t));
#ifdef ATOMIC_INT_LOCK_FREE
  STATIC_ASSERT(ATOMIC_INT_LOCK_FREE > 0);
#if ATOMIC_INT_LOCK_FREE < 2
  /* lock-freedom not guaranteed at compile time — verify at runtime */
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
#endif
#else
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
#endif
  return atomic_compare_exchange_strong(MDBX_c11a_rw(uint32_t, p), &c, v);
#elif defined(__GNUC__) || defined(__clang__)
  return __sync_bool_compare_and_swap(&p->weak, c, v);
#elif defined(_MSC_VER)
  STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t));
  return c ==
         (uint32_t)_InterlockedCompareExchange((volatile long *)&p->weak, v, c);
#elif defined(__APPLE__)
  return OSAtomicCompareAndSwap32Barrier(c, v, &p->weak);
#else
#error FIXME: Unsupported compiler
#endif
}
4343 
/* Atomic 32-bit fetch-and-add: adds v to *p and returns the PREVIOUS
 * value. */
static __always_inline uint32_t atomic_add32(MDBX_atomic_uint32_t *p,
                                             uint32_t v) {
#ifdef MDBX_HAVE_C11ATOMICS
  STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t));
#ifdef ATOMIC_INT_LOCK_FREE
  STATIC_ASSERT(ATOMIC_INT_LOCK_FREE > 0);
#if ATOMIC_INT_LOCK_FREE < 2
  /* lock-freedom not guaranteed at compile time — verify at runtime */
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
#endif
#else
  assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p)));
#endif
  return atomic_fetch_add(MDBX_c11a_rw(uint32_t, p), v);
#elif defined(__GNUC__) || defined(__clang__)
  return __sync_fetch_and_add(&p->weak, v);
#elif defined(_MSC_VER)
  STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t));
  return (uint32_t)_InterlockedExchangeAdd((volatile long *)&p->weak, v);
#elif defined(__APPLE__)
  return OSAtomicAdd32Barrier(v, &p->weak);
#else
#error FIXME: Unsupported compiler
#endif
}

/* Atomic subtraction expressed via unsigned-wraparound addition. */
#define atomic_sub32(p, v) atomic_add32(p, 0 - (v))
4370 
/* Advance a txnid by one step. Without 64-bit CAS, additionally skips
 * values whose low 32 bits equal UINT32_MAX, so the low-part increments
 * performed by safe64_reset() cannot overflow into the high part. */
static __always_inline uint64_t safe64_txnid_next(uint64_t txnid) {
  txnid += xMDBX_TXNID_STEP;
#if !MDBX_64BIT_CAS
  /* avoid overflow of low-part in safe64_reset() */
  txnid += (UINT32_MAX == (uint32_t)txnid);
#endif
  return txnid;
}
4379 
/* Invalidate a safe64 slot by forcing its value to be at least
 * SAFE64_INVALID_THRESHOLD. With multiple concurrent writers and no
 * 64-bit CAS, the low part is bumped before and after poisoning the high
 * part so safe64_reset_compare() can detect the intermediate states. */
static __always_inline void safe64_reset(MDBX_atomic_uint64_t *p,
                                         bool single_writer) {
#if !MDBX_64BIT_CAS
  if (!single_writer) {
    STATIC_ASSERT(xMDBX_TXNID_STEP > 1);
    /* it is safe to increment low-part to avoid ABA, since xMDBX_TXNID_STEP > 1
     * and overflow was preserved in safe64_txnid_next() */
    atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */;
    atomic_store32(
        &p->high, UINT32_MAX,
        mo_Relaxed) /* atomically make >= SAFE64_INVALID_THRESHOLD */;
    atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */;
  } else
#endif /* !MDBX_64BIT_CAS */
#if MDBX_64BIT_ATOMIC
    /* atomically make value >= SAFE64_INVALID_THRESHOLD by 64-bit operation */
    atomic_store64(p, UINT64_MAX,
                   single_writer ? mo_AcquireRelease
                                 : mo_SequentialConsistency);
#else
  /* atomically make value >= SAFE64_INVALID_THRESHOLD by 32-bit operation */
  atomic_store32(&p->high, UINT32_MAX,
                 single_writer ? mo_AcquireRelease : mo_SequentialConsistency);
#endif /* MDBX_64BIT_ATOMIC */
  assert(p->weak >= SAFE64_INVALID_THRESHOLD);
  mdbx_jitter4testing(true);
}
4407 
/* Invalidate the slot only if it still holds `compare`; returns true when
 * the reset was performed. */
static __always_inline bool safe64_reset_compare(MDBX_atomic_uint64_t *p,
                                                 txnid_t compare) {
  /* LY: This function is used to reset `mr_txnid` from hsr-handler in case
   *     the asynchronously cancellation of read transaction. Therefore,
   *     there may be a collision between the cleanup performed here and
   *     asynchronous termination and restarting of the read transaction
   *     in another process/thread. In general we MUST NOT reset the `mr_txnid`
   *     if a new transaction was started (i.e. if `mr_txnid` was changed). */
#if MDBX_64BIT_CAS
  bool rc = atomic_cas64(p, compare, UINT64_MAX);
#else
  /* LY: There is no gold ratio here since shared mutex is too costly,
   *     in such way we must acquire/release it for every update of mr_txnid,
   *     i.e. twice for each read transaction). */
  /* Two-phase emulation: CAS the high part, then re-check the low part;
   * roll the high part back if the low part changed underneath us. */
  bool rc = false;
  if (likely(atomic_load32(&p->low, mo_AcquireRelease) == (uint32_t)compare &&
             atomic_cas32(&p->high, (uint32_t)(compare >> 32), UINT32_MAX))) {
    if (unlikely(atomic_load32(&p->low, mo_AcquireRelease) !=
                 (uint32_t)compare))
      atomic_cas32(&p->high, UINT32_MAX, (uint32_t)(compare >> 32));
    else
      rc = true;
  }
#endif /* MDBX_64BIT_CAS */
  mdbx_jitter4testing(true);
  return rc;
}
4435 
/* Publishes a valid value into a previously-invalidated "safe64" slot.
 * Precondition: the slot is invalid (>= SAFE64_INVALID_THRESHOLD), so readers
 * cannot mistake the intermediate half-written state for a valid value. */
static __always_inline void safe64_write(MDBX_atomic_uint64_t *p,
                                         const uint64_t v) {
  assert(p->weak >= SAFE64_INVALID_THRESHOLD);
#if MDBX_64BIT_ATOMIC
  atomic_store64(p, v, mo_AcquireRelease);
#else  /* MDBX_64BIT_ATOMIC */
  mdbx_compiler_barrier();
  /* update low-part but still value >= SAFE64_INVALID_THRESHOLD */
  atomic_store32(&p->low, (uint32_t)v, mo_Relaxed);
  assert(p->weak >= SAFE64_INVALID_THRESHOLD);
  mdbx_jitter4testing(true);
  /* update high-part from SAFE64_INVALID_THRESHOLD to actual value */
  atomic_store32(&p->high, (uint32_t)(v >> 32), mo_AcquireRelease);
#endif /* MDBX_64BIT_ATOMIC */
  assert(p->weak == v);
  mdbx_jitter4testing(true);
}
4453 
safe64_read(const MDBX_atomic_uint64_t * p)4454 static __always_inline uint64_t safe64_read(const MDBX_atomic_uint64_t *p) {
4455   mdbx_jitter4testing(true);
4456   uint64_t v = atomic_load64(p, mo_AcquireRelease);
4457   mdbx_jitter4testing(true);
4458   return v;
4459 }
4460 
#if 0 /* unused for now */
/* True when `v` is a valid (non-reset) safe64 value. */
MDBX_MAYBE_UNUSED static __always_inline bool safe64_is_valid(uint64_t v) {
#if MDBX_WORDBITS >= 64
  return v < SAFE64_INVALID_THRESHOLD;
#else
  /* on 32-bit targets only the high half needs inspection */
  return (v >> 32) != UINT32_MAX;
#endif /* MDBX_WORDBITS */
}

/* Same validity check, but reading the slot atomically in place. */
MDBX_MAYBE_UNUSED static __always_inline bool
safe64_is_valid_ptr(const MDBX_atomic_uint64_t *p) {
#if MDBX_64BIT_ATOMIC
  return atomic_load64(p, mo_AcquireRelease) < SAFE64_INVALID_THRESHOLD;
#else
  return atomic_load32(&p->high, mo_AcquireRelease) != UINT32_MAX;
#endif /* MDBX_64BIT_ATOMIC */
}
#endif /* unused for now */
4479 
/* non-atomic write with safety for reading a half-updated value */
static __always_inline void safe64_update(MDBX_atomic_uint64_t *p,
                                          const uint64_t v) {
#if MDBX_64BIT_ATOMIC
  atomic_store64(p, v, mo_Relaxed);
#else
  /* no 64-bit store: invalidate first, then publish the halves in order */
  safe64_reset(p, true);
  safe64_write(p, v);
#endif /* MDBX_64BIT_ATOMIC */
}
4490 
/* non-atomic increment with safety for reading a half-updated value */
MDBX_MAYBE_UNUSED static
#if MDBX_64BIT_ATOMIC
    __always_inline
#endif /* MDBX_64BIT_ATOMIC */
    void
    safe64_inc(MDBX_atomic_uint64_t *p, const uint64_t v) {
  assert(v > 0);
  /* single-writer assumption: the relaxed load cannot race another writer */
  safe64_update(p, atomic_load64(p, mo_Relaxed) + v);
}
4501 
4502 /*----------------------------------------------------------------------------*/
4503 /* rthc (tls keys and destructors) */
4504 
/* One registered slice of the reader table, tracked so that thread/process
 * destructors can clear stale reader slots belonging to this process. */
typedef struct rthc_entry_t {
  MDBX_reader *begin;          /* first reader slot of the slice */
  MDBX_reader *end;            /* one past the last reader slot */
  mdbx_thread_key_t thr_tls_key; /* per-env TLS key, meaningful iff key_valid */
  bool key_valid;              /* false when the env runs without TLS keys */
} rthc_entry_t;
4511 
/* Initial capacity of the static rthc table; 1 in debug builds to force the
 * realloc-growth path to be exercised. */
#if MDBX_DEBUG
#define RTHC_INITIAL_LIMIT 1
#else
#define RTHC_INITIAL_LIMIT 16
#endif
4517 
/* System boot identifier, sampled once at global init. */
static bin128_t bootid;

#if defined(_WIN32) || defined(_WIN64)
static CRITICAL_SECTION rthc_critical_section;
static CRITICAL_SECTION lcklist_critical_section;
#else
/* Weak reference: present in glibc, lets us register per-thread dtors tied to
 * this DSO; when absent (or stubbed below) we fall back to pthread TSD. */
int __cxa_thread_atexit_impl(void (*dtor)(void *), void *obj, void *dso_symbol)
    __attribute__((__weak__));
#ifdef __APPLE__ /* FIXME: Thread-Local Storage destructors & DSO-unloading */
/* Stub that always fails, forcing the pthread_setspecific() fallback path. */
int __cxa_thread_atexit_impl(void (*dtor)(void *), void *obj,
                             void *dso_symbol) {
  (void)dtor;
  (void)obj;
  (void)dso_symbol;
  return -1;
}
#endif           /* __APPLE__ */

static pthread_mutex_t lcklist_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t rthc_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t rthc_cond = PTHREAD_COND_INITIALIZER;
static mdbx_thread_key_t rthc_key;
/* number of live threads registered via the pthread-TSD fallback */
static MDBX_atomic_uint32_t rthc_pending;
4541 
workaround_glibc_bug21031(void)4542 __cold static void workaround_glibc_bug21031(void) {
4543   /* Workaround for https://sourceware.org/bugzilla/show_bug.cgi?id=21031
4544    *
4545    * Due race between pthread_key_delete() and __nptl_deallocate_tsd()
4546    * The destructor(s) of thread-local-storage object(s) may be running
4547    * in another thread(s) and be blocked or not finished yet.
4548    * In such case we get a SEGFAULT after unload this library DSO.
4549    *
4550    * So just by yielding a few timeslices we give a chance
4551    * to such destructor(s) for completion and avoids segfault. */
4552   sched_yield();
4553   sched_yield();
4554   sched_yield();
4555 }
4556 #endif
4557 
/* Registry of reader-table slices; starts on the static array and moves to a
 * heap allocation when it outgrows RTHC_INITIAL_LIMIT. Guarded by rthc_lock. */
static unsigned rthc_count, rthc_limit;
static rthc_entry_t *rthc_table;
static rthc_entry_t rthc_table_static[RTHC_INITIAL_LIMIT];
4561 
/* Acquires the global rthc registry lock (critical section on Windows,
 * pthread mutex elsewhere). */
static __inline void rthc_lock(void) {
#if defined(_WIN32) || defined(_WIN64)
  EnterCriticalSection(&rthc_critical_section);
#else
  mdbx_ensure(nullptr, pthread_mutex_lock(&rthc_mutex) == 0);
#endif
}
4569 
/* Releases the global rthc registry lock. */
static __inline void rthc_unlock(void) {
#if defined(_WIN32) || defined(_WIN64)
  LeaveCriticalSection(&rthc_critical_section);
#else
  mdbx_ensure(nullptr, pthread_mutex_unlock(&rthc_mutex) == 0);
#endif
}
4577 
/* Creates a TLS key (TlsAlloc on Windows, pthread_key_create elsewhere).
 * Returns MDBX_SUCCESS or a platform error code. No destructor is attached:
 * cleanup is driven by mdbx_rthc_thread_dtor via the rthc machinery. */
static __inline int thread_key_create(mdbx_thread_key_t *key) {
  int rc;
#if defined(_WIN32) || defined(_WIN64)
  *key = TlsAlloc();
  rc = (*key != TLS_OUT_OF_INDEXES) ? MDBX_SUCCESS : GetLastError();
#else
  rc = pthread_key_create(key, nullptr);
#endif
  mdbx_trace("&key = %p, value %" PRIuPTR ", rc %d",
             __Wpedantic_format_voidptr(key), (uintptr_t)*key, rc);
  return rc;
}
4590 
/* Destroys a TLS key. On POSIX also yields a few timeslices to let racing
 * TLS destructors in other threads finish (glibc bug 21031). */
static __inline void thread_key_delete(mdbx_thread_key_t key) {
  mdbx_trace("key = %" PRIuPTR, (uintptr_t)key);
#if defined(_WIN32) || defined(_WIN64)
  mdbx_ensure(nullptr, TlsFree(key));
#else
  mdbx_ensure(nullptr, pthread_key_delete(key) == 0);
  workaround_glibc_bug21031();
#endif
}
4600 
/* Returns the calling thread's value for `key` (nullptr when unset). */
static __inline void *thread_rthc_get(mdbx_thread_key_t key) {
#if defined(_WIN32) || defined(_WIN64)
  return TlsGetValue(key);
#else
  return pthread_getspecific(key);
#endif
}
4608 
/* Stores `value` into the calling thread's slot for `key`.
 *
 * On POSIX, the first non-null store also registers this thread for cleanup:
 * preferably via glibc's __cxa_thread_atexit_impl (dtor tied to this DSO);
 * when that is unavailable or fails, via the shared rthc_key TSD slot, in
 * which case the thread is counted in rthc_pending so that
 * mdbx_rthc_global_dtor() can wait for all such threads to finish. */
static void thread_rthc_set(mdbx_thread_key_t key, const void *value) {
#if defined(_WIN32) || defined(_WIN64)
  mdbx_ensure(nullptr, TlsSetValue(key, (void *)value));
#else
/* per-thread registration state machine */
#define MDBX_THREAD_RTHC_ZERO 0
#define MDBX_THREAD_RTHC_REGISTERED 1
#define MDBX_THREAD_RTHC_COUNTED 2
  static __thread char thread_registration_state;
  if (value && unlikely(thread_registration_state == MDBX_THREAD_RTHC_ZERO)) {
    thread_registration_state = MDBX_THREAD_RTHC_REGISTERED;
    mdbx_trace("thread registered 0x%" PRIxPTR, mdbx_thread_self());
    /* weak symbol may be absent; the address check guards the call */
    if (&__cxa_thread_atexit_impl == nullptr ||
        __cxa_thread_atexit_impl(mdbx_rthc_thread_dtor,
                                 &thread_registration_state,
                                 (void *)&mdbx_version /* dso_anchor */)) {
      mdbx_ensure(nullptr, pthread_setspecific(
                               rthc_key, &thread_registration_state) == 0);
      thread_registration_state = MDBX_THREAD_RTHC_COUNTED;
      const unsigned count_before = atomic_add32(&rthc_pending, 1);
      mdbx_ensure(nullptr, count_before < INT_MAX);
      mdbx_trace("fallback to pthreads' tsd, key %" PRIuPTR ", count %u",
                 (uintptr_t)rthc_key, count_before);
      (void)count_before;
    }
  }
  mdbx_ensure(nullptr, pthread_setspecific(key, value) == 0);
#endif
}
4637 
/* One-time process-wide initialization of the rthc machinery: sets up the
 * registry, the platform locks and (on POSIX) the shared cleanup TSD key,
 * then sanity-checks the monotime<->16dot16 conversion and samples bootid. */
__cold void mdbx_rthc_global_init(void) {
  rthc_limit = RTHC_INITIAL_LIMIT;
  rthc_table = rthc_table_static;
#if defined(_WIN32) || defined(_WIN64)
  InitializeCriticalSection(&rthc_critical_section);
  InitializeCriticalSection(&lcklist_critical_section);
#else
  mdbx_ensure(nullptr,
              pthread_key_create(&rthc_key, mdbx_rthc_thread_dtor) == 0);
  mdbx_trace("pid %d, &mdbx_rthc_key = %p, value 0x%x", mdbx_getpid(),
             __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key);
#endif
  /* checking time conversion, this also avoids racing on 32-bit architectures
   * during writing calculated 64-bit ratio(s) into memory. */
  uint32_t proba = UINT32_MAX;
  while (true) {
    unsigned time_conversion_checkup =
        mdbx_osal_monotime_to_16dot16(mdbx_osal_16dot16_to_monotime(proba));
    unsigned one_more = (proba < UINT32_MAX) ? proba + 1 : proba;
    unsigned one_less = (proba > 0) ? proba - 1 : proba;
    /* round-trip must be exact to within one ulp over the whole range */
    mdbx_ensure(nullptr, time_conversion_checkup >= one_less &&
                             time_conversion_checkup <= one_more);
    if (proba == 0)
      break;
    proba >>= 1;
  }

  bootid = mdbx_osal_bootid();
#if 0 /* debug */
  for (unsigned i = 0; i < 65536; ++i) {
    size_t pages = pv2pages(i);
    unsigned x = pages2pv(pages);
    size_t xp = pv2pages(x);
    if (!(x == i || (x % 2 == 0 && x < 65536)) || pages != xp)
      printf("%u => %zu => %u => %zu\n", i, pages, x, xp);
    assert(pages == xp);
  }
  fflush(stdout);
#endif
}
4678 
/* dtor called for thread, i.e. for all mdbx's environment objects */
/* Scans the rthc registry for reader slots this thread owns (found via each
 * env's TLS key) and releases them by zeroing mr_pid. On POSIX it then
 * unwinds the fallback registration made by thread_rthc_set(), waking
 * mdbx_rthc_global_dtor() when the last counted thread exits.
 * NOTE(review): several trace calls pass unsigned values to "%i" — harmless
 * on common ABIs but formally mismatched; confirm against upstream. */
__cold void mdbx_rthc_thread_dtor(void *ptr) {
  rthc_lock();
  mdbx_trace(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", mdbx_getpid(),
             mdbx_thread_self(), ptr);

  const uint32_t self_pid = mdbx_getpid();
  for (unsigned i = 0; i < rthc_count; ++i) {
    if (!rthc_table[i].key_valid)
      continue;
    const mdbx_thread_key_t key = rthc_table[i].thr_tls_key;
    MDBX_reader *const rthc = thread_rthc_get(key);
    /* this thread has no reader slot within this entry's slice */
    if (rthc < rthc_table[i].begin || rthc >= rthc_table[i].end)
      continue;
#if !defined(_WIN32) && !defined(_WIN64)
    if (pthread_setspecific(key, nullptr) != 0) {
      mdbx_trace("== thread 0x%" PRIxPTR
                 ", rthc %p: ignore race with tsd-key deletion",
                 mdbx_thread_self(), ptr);
      continue /* ignore race with tsd-key deletion by mdbx_env_close() */;
    }
#endif

    mdbx_trace("== thread 0x%" PRIxPTR
               ", rthc %p, [%i], %p ... %p (%+i), rtch-pid %i, "
               "current-pid %i",
               mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), i,
               __Wpedantic_format_voidptr(rthc_table[i].begin),
               __Wpedantic_format_voidptr(rthc_table[i].end),
               (int)(rthc - rthc_table[i].begin), rthc->mr_pid.weak, self_pid);
    if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) {
      mdbx_trace("==== thread 0x%" PRIxPTR ", rthc %p, cleanup",
                 mdbx_thread_self(), __Wpedantic_format_voidptr(rthc));
      /* release the reader slot owned by this thread */
      atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease);
    }
  }

#if defined(_WIN32) || defined(_WIN64)
  mdbx_trace("<< thread 0x%" PRIxPTR ", rthc %p", mdbx_thread_self(), ptr);
  rthc_unlock();
#else
  /* ptr points at this thread's registration-state byte (thread_rthc_set) */
  const char self_registration = *(volatile char *)ptr;
  *(volatile char *)ptr = MDBX_THREAD_RTHC_ZERO;
  mdbx_trace("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status %d",
             mdbx_thread_self(), ptr, mdbx_getpid(), self_registration);
  if (self_registration == MDBX_THREAD_RTHC_COUNTED)
    mdbx_ensure(nullptr, atomic_sub32(&rthc_pending, 1) > 0);

  if (atomic_load32(&rthc_pending, mo_AcquireRelease) == 0) {
    mdbx_trace("== thread 0x%" PRIxPTR ", rthc %p, pid %d, wake",
               mdbx_thread_self(), ptr, mdbx_getpid());
    mdbx_ensure(nullptr, pthread_cond_broadcast(&rthc_cond) == 0);
  }

  mdbx_trace("<< thread 0x%" PRIxPTR ", rthc %p", mdbx_thread_self(), ptr);
  /* Allow tail call optimization, i.e. gcc should generate the jmp instruction
   * instead of a call for pthread_mutex_unlock() and therefore CPU could not
   * return to current DSO's code section, which may be unloaded immediately
   * after the mutex got released. */
  pthread_mutex_unlock(&rthc_mutex);
#endif
}
4741 
/* Process-wide teardown of the rthc machinery (DSO unload / atexit):
 * unwinds this thread's own registration, waits (bounded, ~100 ms; +600 s in
 * debug builds) for the remaining counted threads to run their dtors, then
 * deletes all TLS keys, clears any reader slots still owned by this process,
 * and frees the registry. */
__cold void mdbx_rthc_global_dtor(void) {
  mdbx_trace(">> pid %d", mdbx_getpid());

  rthc_lock();
#if !defined(_WIN32) && !defined(_WIN64)
  /* first, deregister the calling thread itself */
  char *rthc = pthread_getspecific(rthc_key);
  mdbx_trace(
      "== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status %d, left %d",
      mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), mdbx_getpid(),
      rthc ? *rthc : -1, atomic_load32(&rthc_pending, mo_Relaxed));
  if (rthc) {
    const char self_registration = *rthc;
    *rthc = MDBX_THREAD_RTHC_ZERO;
    if (self_registration == MDBX_THREAD_RTHC_COUNTED)
      mdbx_ensure(nullptr, atomic_sub32(&rthc_pending, 1) > 0);
  }

  /* absolute deadline: now + 100 ms (plus 10 min for debug builds) */
  struct timespec abstime;
  mdbx_ensure(nullptr, clock_gettime(CLOCK_REALTIME, &abstime) == 0);
  abstime.tv_nsec += 1000000000l / 10;
  if (abstime.tv_nsec >= 1000000000l) {
    abstime.tv_nsec -= 1000000000l;
    abstime.tv_sec += 1;
  }
#if MDBX_DEBUG > 0
  abstime.tv_sec += 600;
#endif

  for (unsigned left;
       (left = atomic_load32(&rthc_pending, mo_AcquireRelease)) > 0;) {
    mdbx_trace("pid %d, pending %u, wait for...", mdbx_getpid(), left);
    const int rc = pthread_cond_timedwait(&rthc_cond, &rthc_mutex, &abstime);
    if (rc && rc != EINTR)
      break /* give up on timeout or error; proceed with forced cleanup */;
  }
  thread_key_delete(rthc_key);
#endif

  const uint32_t self_pid = mdbx_getpid();
  for (unsigned i = 0; i < rthc_count; ++i) {
    if (!rthc_table[i].key_valid)
      continue;
    const mdbx_thread_key_t key = rthc_table[i].thr_tls_key;
    thread_key_delete(key);
    for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end;
         ++rthc) {
      mdbx_trace(
          "== [%i] = key %" PRIuPTR ", %p ... %p, rthc %p (%+i), "
          "rthc-pid %i, current-pid %i",
          i, (uintptr_t)key, __Wpedantic_format_voidptr(rthc_table[i].begin),
          __Wpedantic_format_voidptr(rthc_table[i].end),
          __Wpedantic_format_voidptr(rthc), (int)(rthc - rthc_table[i].begin),
          rthc->mr_pid.weak, self_pid);
      if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) {
        atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease);
        mdbx_trace("== cleanup %p", __Wpedantic_format_voidptr(rthc));
      }
    }
  }

  rthc_limit = rthc_count = 0;
  if (rthc_table != rthc_table_static)
    mdbx_free(rthc_table);
  rthc_table = nullptr;
  rthc_unlock();

#if defined(_WIN32) || defined(_WIN64)
  DeleteCriticalSection(&lcklist_critical_section);
  DeleteCriticalSection(&rthc_critical_section);
#else
  /* LY: yielding a few timeslices to give a more chance
   * to racing destructor(s) for completion. */
  workaround_glibc_bug21031();
#endif

  mdbx_trace("<< pid %d\n", mdbx_getpid());
}
4819 
/* Registers a reader-table slice [begin, end) in the rthc registry, creating
 * a TLS key for it when `key` is non-null (the key is returned via *key).
 * Grows the registry (doubling) on demand. Returns MDBX_SUCCESS, or
 * MDBX_ENOMEM / a key-creation error; on failure any created key is freed. */
__cold int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDBX_reader *begin,
                           MDBX_reader *end) {
  int rc;
  if (key) {
#ifndef NDEBUG
    /* poison so that use of an unassigned key is conspicuous */
    *key = (mdbx_thread_key_t)0xBADBADBAD;
#endif /* NDEBUG */
    rc = thread_key_create(key);
    if (rc != MDBX_SUCCESS)
      return rc;
  }

  rthc_lock();
  const mdbx_thread_key_t new_key = key ? *key : 0;
  mdbx_trace(">> key %" PRIuPTR ", rthc_count %u, rthc_limit %u",
             (uintptr_t)new_key, rthc_count, rthc_limit);
  if (rthc_count == rthc_limit) {
    /* grow: the static table cannot be realloc'ed, so copy it out once */
    rthc_entry_t *new_table =
        mdbx_realloc((rthc_table == rthc_table_static) ? nullptr : rthc_table,
                     sizeof(rthc_entry_t) * rthc_limit * 2);
    if (new_table == nullptr) {
      rc = MDBX_ENOMEM;
      goto bailout;
    }
    if (rthc_table == rthc_table_static)
      memcpy(new_table, rthc_table_static, sizeof(rthc_table_static));
    rthc_table = new_table;
    rthc_limit *= 2;
  }
  mdbx_trace("== [%i] = key %" PRIuPTR ", %p ... %p", rthc_count,
             (uintptr_t)new_key, __Wpedantic_format_voidptr(begin),
             __Wpedantic_format_voidptr(end));
  rthc_table[rthc_count].key_valid = key ? true : false;
  rthc_table[rthc_count].thr_tls_key = key ? new_key : 0;
  rthc_table[rthc_count].begin = begin;
  rthc_table[rthc_count].end = end;
  ++rthc_count;
  mdbx_trace("<< key %" PRIuPTR ", rthc_count %u, rthc_limit %u",
             (uintptr_t)new_key, rthc_count, rthc_limit);
  rthc_unlock();
  return MDBX_SUCCESS;

bailout:
  if (key)
    thread_key_delete(*key);
  rthc_unlock();
  return rc;
}
4868 
/* Deregisters the slice bound to `key`: deletes the TLS key, clears any of
 * its reader slots still owned by this process, and compacts the registry
 * (swap-with-last removal; shrinks back to the static table when empty). */
__cold void mdbx_rthc_remove(const mdbx_thread_key_t key) {
  thread_key_delete(key);
  rthc_lock();
  mdbx_trace(">> key %zu, rthc_count %u, rthc_limit %u", (size_t)key,
             rthc_count, rthc_limit);

  for (unsigned i = 0; i < rthc_count; ++i) {
    if (rthc_table[i].key_valid && key == rthc_table[i].thr_tls_key) {
      const uint32_t self_pid = mdbx_getpid();
      mdbx_trace("== [%i], %p ...%p, current-pid %d", i,
                 __Wpedantic_format_voidptr(rthc_table[i].begin),
                 __Wpedantic_format_voidptr(rthc_table[i].end), self_pid);

      for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end;
           ++rthc) {
        if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) {
          atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease);
          mdbx_trace("== cleanup %p", __Wpedantic_format_voidptr(rthc));
        }
      }
      /* unordered removal: move the last entry into the vacated slot */
      if (--rthc_count > 0)
        rthc_table[i] = rthc_table[rthc_count];
      else if (rthc_table != rthc_table_static) {
        mdbx_free(rthc_table);
        rthc_table = rthc_table_static;
        rthc_limit = RTHC_INITIAL_LIMIT;
      }
      break;
    }
  }

  mdbx_trace("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)key,
             rthc_count, rthc_limit);
  rthc_unlock();
}
4904 
4905 //------------------------------------------------------------------------------
4906 
/* Singly-linked list of MDBX_env objects open within this process, used to
 * detect several envs sharing one LCK file. The sentinel is an arbitrary
 * non-null, non-dereferenceable address so an empty list differs from a
 * detached (nullptr-linked) env. */
#define RTHC_ENVLIST_END ((MDBX_env *)((uintptr_t)50459))
static MDBX_env *inprocess_lcklist_head = RTHC_ENVLIST_END;
4909 
/* Acquires the in-process env-list lock. */
static __inline void lcklist_lock(void) {
#if defined(_WIN32) || defined(_WIN64)
  EnterCriticalSection(&lcklist_critical_section);
#else
  mdbx_ensure(nullptr, pthread_mutex_lock(&lcklist_mutex) == 0);
#endif
}
4917 
/* Releases the in-process env-list lock. */
static __inline void lcklist_unlock(void) {
#if defined(_WIN32) || defined(_WIN64)
  LeaveCriticalSection(&lcklist_critical_section);
#else
  mdbx_ensure(nullptr, pthread_mutex_unlock(&lcklist_mutex) == 0);
#endif
}
4925 
rrxmrrxmsx_0(uint64_t v)4926 MDBX_NOTHROW_CONST_FUNCTION static uint64_t rrxmrrxmsx_0(uint64_t v) {
4927   /* Pelle Evensen's mixer, https://bit.ly/2HOfynt */
4928   v ^= (v << 39 | v >> 25) ^ (v << 14 | v >> 50);
4929   v *= UINT64_C(0xA24BAED4963EE407);
4930   v ^= (v << 40 | v >> 24) ^ (v << 15 | v >> 49);
4931   v *= UINT64_C(0x9FB21C651E98DF25);
4932   return v ^ v >> 28;
4933 }
4934 
/* Checks whether `pending`'s bait value equals the one stored in `scan`'s
 * LCK. Reads the bait from pending's mapped lck when available, otherwise
 * msync's scan and pread's the bait straight from pending's file descriptor.
 * Returns MDBX_RESULT_TRUE on a match (same underlying LCK file),
 * MDBX_SUCCESS on a clean mismatch, or an error code. */
static int uniq_peek(const mdbx_mmap_t *pending, mdbx_mmap_t *scan) {
  int rc;
  uint64_t bait;
  MDBX_lockinfo *const pending_lck = pending->lck;
  MDBX_lockinfo *const scan_lck = scan->lck;
  if (pending_lck) {
    bait = atomic_load64(&pending_lck->mti_bait_uniqueness, mo_AcquireRelease);
    rc = MDBX_SUCCESS;
  } else {
    bait = 0 /* hush MSVC warning */;
    rc = mdbx_msync(scan, 0, sizeof(MDBX_lockinfo), MDBX_SYNC_DATA);
    if (rc == MDBX_SUCCESS)
      rc = mdbx_pread(pending->fd, &bait, sizeof(scan_lck->mti_bait_uniqueness),
                      offsetof(MDBX_lockinfo, mti_bait_uniqueness));
  }
  if (likely(rc == MDBX_SUCCESS) &&
      bait == atomic_load64(&scan_lck->mti_bait_uniqueness, mo_AcquireRelease))
    rc = MDBX_RESULT_TRUE;

  mdbx_trace("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d",
             pending_lck ? "mem" : "file", bait,
             (rc == MDBX_RESULT_TRUE) ? " found," : (rc ? " FAILED," : ""), rc);
  return rc;
}
4959 
/* Writes a fresh pseudo-random bait value into `scan`'s LCK and re-checks
 * via uniq_peek() whether it becomes visible through `pending` (which would
 * prove both map the same LCK file). `*abra` is the caller-held PRNG state:
 * seeded lazily from monotime and thread id, advanced by an LCG each call. */
static int uniq_poke(const mdbx_mmap_t *pending, mdbx_mmap_t *scan,
                     uint64_t *abra) {
  if (*abra == 0) {
    /* lazy seed: mix monotonic clock with the thread id */
    const uintptr_t tid = mdbx_thread_self();
    uintptr_t uit = 0;
    memcpy(&uit, &tid, (sizeof(tid) < sizeof(uit)) ? sizeof(tid) : sizeof(uit));
    *abra =
        rrxmrrxmsx_0(mdbx_osal_monotime() + UINT64_C(5873865991930747) * uit);
  }
  /* derive the bait from the state plus the pid, so different processes
   * poking the same file produce different values */
  const uint64_t cadabra =
      rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)mdbx_getpid())
          << 24 |
      *abra >> 40;
  MDBX_lockinfo *const scan_lck = scan->lck;
  atomic_store64(&scan_lck->mti_bait_uniqueness, cadabra,
                 mo_SequentialConsistency);
  *abra = *abra * UINT64_C(6364136223846793005) + 1;
  return uniq_peek(pending, scan);
}
4979 
/* Decides whether `pending` refers to an LCK file already opened by another
 * env in this process. Walks the in-process env list, using bait write/read
 * round-trips (uniq_poke/uniq_peek) repeated to filter out transient
 * coincidences. Returns MDBX_RESULT_TRUE when unique, MDBX_RESULT_FALSE with
 * *found set to the colliding env, or an error code. */
__cold static int uniq_check(const mdbx_mmap_t *pending, MDBX_env **found) {
  *found = nullptr;
  uint64_t salt = 0;
  for (MDBX_env *scan = inprocess_lcklist_head; scan != RTHC_ENVLIST_END;
       scan = scan->me_lcklist_next) {
    MDBX_lockinfo *const scan_lck = scan->me_lck_mmap.lck;
    /* peek if a bait is already planted, otherwise plant one first */
    int err = atomic_load64(&scan_lck->mti_bait_uniqueness, mo_AcquireRelease)
                  ? uniq_peek(pending, &scan->me_lck_mmap)
                  : uniq_poke(pending, &scan->me_lck_mmap, &salt);
    if (err == MDBX_ENODATA) {
      uint64_t length;
      if (likely(mdbx_filesize(pending->fd, &length) == MDBX_SUCCESS &&
                 length == 0)) {
        /* LY: skip checking since LCK-file is empty, i.e. just created. */
        mdbx_debug("uniq-probe: %s", "unique (new/empty lck)");
        return MDBX_RESULT_TRUE;
      }
    }
    /* a match could be a stale coincidence: re-poke (and once with an msync
     * in between) before trusting it */
    if (err == MDBX_RESULT_TRUE)
      err = uniq_poke(pending, &scan->me_lck_mmap, &salt);
    if (err == MDBX_RESULT_TRUE) {
      (void)mdbx_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo),
                       MDBX_SYNC_NONE);
      err = uniq_poke(pending, &scan->me_lck_mmap, &salt);
    }
    if (err == MDBX_RESULT_TRUE) {
      err = uniq_poke(pending, &scan->me_lck_mmap, &salt);
      *found = scan;
      mdbx_debug("uniq-probe: found %p", __Wpedantic_format_voidptr(*found));
      return MDBX_RESULT_FALSE;
    }
    if (unlikely(err != MDBX_SUCCESS)) {
      mdbx_debug("uniq-probe: failed rc %d", err);
      return err;
    }
  }

  mdbx_debug("uniq-probe: %s", "unique");
  return MDBX_RESULT_TRUE;
}
5020 
/* Unlinks `env` from the in-process env list (caller holds lcklist lock),
 * probes for another in-process env sharing the same LCK, clears this
 * process's reader pid-lock when no neighbor remains, and finally destroys
 * the LCK resources. Returns the first non-recoverable error encountered. */
static int lcklist_detach_locked(MDBX_env *env) {
  MDBX_env *inprocess_neighbor = nullptr;
  int rc = MDBX_SUCCESS;
  if (env->me_lcklist_next != nullptr) {
    mdbx_ensure(env, env->me_lcklist_next != nullptr);
    mdbx_ensure(env, inprocess_lcklist_head != RTHC_ENVLIST_END);
    /* unlink via pointer-to-pointer walk; nullptr link marks "detached" */
    for (MDBX_env **ptr = &inprocess_lcklist_head; *ptr != RTHC_ENVLIST_END;
         ptr = &(*ptr)->me_lcklist_next) {
      if (*ptr == env) {
        *ptr = env->me_lcklist_next;
        env->me_lcklist_next = nullptr;
        break;
      }
    }
    mdbx_ensure(env, env->me_lcklist_next == nullptr);
  }

  /* a forked child must not touch the parent's locks */
  rc = likely(mdbx_getpid() == env->me_pid)
           ? uniq_check(&env->me_lck_mmap, &inprocess_neighbor)
           : MDBX_PANIC;
  if (!inprocess_neighbor && env->me_live_reader)
    (void)mdbx_rpid_clear(env);
  if (!MDBX_IS_ERROR(rc))
    rc = mdbx_lck_destroy(env, inprocess_neighbor);
  return rc;
}
5047 
/*------------------------------------------------------------------------------
 * LY: State of the art quicksort-based sorting, with internal stack
 * and network-sort for small chunks.
 * Thanks to John M. Gamble for the http://pages.ripco.net/~jgamble/nw.html */

/* Conditional exchange: after expansion CMP(a, b) holds for the pair.
 * Written as two conditional selects (branch-predictor friendly) rather
 * than an if/swap; do/while(0) makes it safe as a single statement. */
#define SORT_CMP_SWAP(TYPE, CMP, a, b)                                         \
  do {                                                                         \
    const TYPE swap_tmp = (a);                                                 \
    const bool swap_cmp = CMP(swap_tmp, b);                                    \
    (a) = swap_cmp ? swap_tmp : b;                                             \
    (b) = swap_cmp ? b : swap_tmp;                                             \
  } while (0)
5060 
/* Fixed sorting network for 3 elements (begin[0..2]). */
//  3 comparators, 3 parallel operations
//  o-----^--^--o
//        |  |
//  o--^--|--v--o
//     |  |
//  o--v--v-----o
//
//  [[1,2]]
//  [[0,2]]
//  [[0,1]]
#define SORT_NETWORK_3(TYPE, CMP, begin)                                       \
  do {                                                                         \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
  } while (0)
5077 
/* Fixed sorting network for 4 elements (begin[0..3]). */
//  5 comparators, 3 parallel operations
//  o--^--^--------o
//     |  |
//  o--v--|--^--^--o
//        |  |  |
//  o--^--v--|--v--o
//     |     |
//  o--v-----v-----o
//
//  [[0,1],[2,3]]
//  [[0,2],[1,3]]
//  [[1,2]]
#define SORT_NETWORK_4(TYPE, CMP, begin)                                       \
  do {                                                                         \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
  } while (0)
5098 
/* Fixed sorting network for 5 elements (begin[0..4]). */
//  9 comparators, 5 parallel operations
//  o--^--^-----^-----------o
//     |  |     |
//  o--|--|--^--v-----^--^--o
//     |  |  |        |  |
//  o--|--v--|--^--^--|--v--o
//     |     |  |  |  |
//  o--|-----v--|--v--|--^--o
//     |        |     |  |
//  o--v--------v-----v--v--o
//
//  [[0,4],[1,3]]
//  [[0,2]]
//  [[2,4],[0,1]]
//  [[2,3],[1,4]]
//  [[1,2],[3,4]]
#define SORT_NETWORK_5(TYPE, CMP, begin)                                       \
  do {                                                                         \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]);                              \
  } while (0)
5127 
/* Fixed sorting network for 6 elements (begin[0..5]). */
//  12 comparators, 6 parallel operations
//  o-----^--^--^-----------------o
//        |  |  |
//  o--^--|--v--|--^--------^-----o
//     |  |     |  |        |
//  o--v--v-----|--|--^--^--|--^--o
//              |  |  |  |  |  |
//  o-----^--^--v--|--|--|--v--v--o
//        |  |     |  |  |
//  o--^--|--v-----v--|--v--------o
//     |  |           |
//  o--v--v-----------v-----------o
//
//  [[1,2],[4,5]]
//  [[0,2],[3,5]]
//  [[0,1],[3,4],[2,5]]
//  [[0,3],[1,4]]
//  [[2,4],[1,3]]
//  [[2,3]]
#define SORT_NETWORK_6(TYPE, CMP, begin)                                       \
  do {                                                                         \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]);                              \
  } while (0)
5162 
5163 //  16 comparators, 6 parallel operations
5164 //  o--^--------^-----^-----------------o
5165 //     |        |     |
5166 //  o--|--^-----|--^--v--------^--^-----o
5167 //     |  |     |  |           |  |
5168 //  o--|--|--^--v--|--^-----^--|--v-----o
5169 //     |  |  |     |  |     |  |
5170 //  o--|--|--|-----v--|--^--v--|--^--^--o
5171 //     |  |  |        |  |     |  |  |
5172 //  o--v--|--|--^-----v--|--^--v--|--v--o
5173 //        |  |  |        |  |     |
5174 //  o-----v--|--|--------v--v-----|--^--o
5175 //           |  |                 |  |
5176 //  o--------v--v-----------------v--v--o
5177 //
5178 //  [[0,4],[1,5],[2,6]]
5179 //  [[0,2],[1,3],[4,6]]
5180 //  [[2,4],[3,5],[0,1]]
5181 //  [[2,3],[4,5]]
5182 //  [[1,4],[3,6]]
5183 //  [[1,2],[3,4],[5,6]]
/* Sorting network for 7 elements: 16 compare-exchange ops in 6 parallel layers. */
#define SORT_NETWORK_7(TYPE, CMP, begin)                    \
  do {                                                      \
    TYPE *const it_ = (begin); /* evaluate base only once */\
    /* layer 1: [0,4] [1,5] [2,6] */                        \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[5]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[6]);               \
    /* layer 2: [0,2] [1,3] [4,6] */                        \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[2]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[6]);               \
    /* layer 3: [2,4] [3,5] [0,1] */                        \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[5]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[1]);               \
    /* layer 4: [2,3] [4,5] */                              \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[5]);               \
    /* layer 5: [1,4] [3,6] */                              \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[6]);               \
    /* layer 6: [1,2] [3,4] [5,6] */                        \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[2]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[6]);               \
  } while (0)
5203 
5204 //  19 comparators, 6 parallel operations
5205 //  o--^--------^-----^-----------------o
5206 //     |        |     |
5207 //  o--|--^-----|--^--v--------^--^-----o
5208 //     |  |     |  |           |  |
5209 //  o--|--|--^--v--|--^-----^--|--v-----o
5210 //     |  |  |     |  |     |  |
5211 //  o--|--|--|--^--v--|--^--v--|--^--^--o
5212 //     |  |  |  |     |  |     |  |  |
5213 //  o--v--|--|--|--^--v--|--^--v--|--v--o
5214 //        |  |  |  |     |  |     |
5215 //  o-----v--|--|--|--^--v--v-----|--^--o
5216 //           |  |  |  |           |  |
5217 //  o--------v--|--v--|--^--------v--v--o
5218 //              |     |  |
5219 //  o-----------v-----v--v--------------o
5220 //
5221 //  [[0,4],[1,5],[2,6],[3,7]]
5222 //  [[0,2],[1,3],[4,6],[5,7]]
5223 //  [[2,4],[3,5],[0,1],[6,7]]
5224 //  [[2,3],[4,5]]
5225 //  [[1,4],[3,6]]
5226 //  [[1,2],[3,4],[5,6]]
/* Sorting network for 8 elements: 19 compare-exchange ops in 6 parallel layers. */
#define SORT_NETWORK_8(TYPE, CMP, begin)                    \
  do {                                                      \
    TYPE *const it_ = (begin); /* evaluate base only once */\
    /* layer 1: [0,4] [1,5] [2,6] [3,7] */                  \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[5]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[6]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[7]);               \
    /* layer 2: [0,2] [1,3] [4,6] [5,7] */                  \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[2]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[6]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[7]);               \
    /* layer 3: [2,4] [3,5] [0,1] [6,7] */                  \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[5]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[1]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[7]);               \
    /* layer 4: [2,3] [4,5] */                              \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[5]);               \
    /* layer 5: [1,4] [3,6] */                              \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[6]);               \
    /* layer 6: [1,2] [3,4] [5,6] */                        \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[2]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[6]);               \
  } while (0)
5249 
5250 //  25 comparators, 9 parallel operations
5251 //  o--^-----^--^-----^-----------------------------------o
5252 //     |     |  |     |
5253 //  o--v--^--v--|-----|--^-----^-----------^--------------o
5254 //        |     |     |  |     |           |
5255 //  o-----v-----|-----|--|-----|--^-----^--|--^-----^--^--o
5256 //              |     |  |     |  |     |  |  |     |  |
5257 //  o--^-----^--v--^--v--|-----|--|-----|--v--|-----|--v--o
5258 //     |     |     |     |     |  |     |     |     |
5259 //  o--v--^--v-----|-----v--^--v--|-----|-----|--^--v-----o
5260 //        |        |        |     |     |     |  |
5261 //  o-----v--------|--------|-----v--^--v--^--|--|--^-----o
5262 //                 |        |        |     |  |  |  |
5263 //  o--^-----^-----v--------|--------|-----|--v--v--v-----o
5264 //     |     |              |        |     |
5265 //  o--v--^--v--------------v--------|-----v--------------o
5266 //        |                          |
5267 //  o-----v--------------------------v--------------------o
5268 //
5269 //  [[0,1],[3,4],[6,7]]
5270 //  [[1,2],[4,5],[7,8]]
5271 //  [[0,1],[3,4],[6,7],[2,5]]
5272 //  [[0,3],[1,4],[5,8]]
5273 //  [[3,6],[4,7],[2,5]]
5274 //  [[0,3],[1,4],[5,7],[2,6]]
5275 //  [[1,3],[4,6]]
5276 //  [[2,4],[5,6]]
5277 //  [[2,3]]
/* Sorting network for 9 elements: 25 compare-exchange ops in 9 parallel layers. */
#define SORT_NETWORK_9(TYPE, CMP, begin)                    \
  do {                                                      \
    TYPE *const it_ = (begin); /* evaluate base only once */\
    /* layer 1: [0,1] [3,4] [6,7] */                        \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[1]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[7]);               \
    /* layer 2: [1,2] [4,5] [7,8] */                        \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[2]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[5]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[7], it_[8]);               \
    /* layer 3: [0,1] [3,4] [6,7] [2,5] */                  \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[1]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[7]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[5]);               \
    /* layer 4: [0,3] [1,4] [5,8] */                        \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[8]);               \
    /* layer 5: [3,6] [4,7] [2,5] */                        \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[6]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[7]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[5]);               \
    /* layer 6: [0,3] [1,4] [5,7] [2,6] */                  \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[7]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[6]);               \
    /* layer 7: [1,3] [4,6] */                              \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[6]);               \
    /* layer 8: [2,4] [5,6] */                              \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[6]);               \
    /* layer 9: [2,3] */                                    \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[3]);               \
  } while (0)
5306 
5307 //  29 comparators, 9 parallel operations
5308 //  o--------------^-----^--^--^-----------------------o
5309 //                 |     |  |  |
5310 //  o-----------^--|--^--|--|--v--^--------^-----------o
5311 //              |  |  |  |  |     |        |
5312 //  o--------^--|--|--|--|--v--^--v-----^--|--^--------o
5313 //           |  |  |  |  |     |        |  |  |
5314 //  o-----^--|--|--|--|--v--^--|-----^--|--v--v--^-----o
5315 //        |  |  |  |  |     |  |     |  |        |
5316 //  o--^--|--|--|--|--v-----|--v--^--|--|--^-----v--^--o
5317 //     |  |  |  |  |        |     |  |  |  |        |
5318 //  o--|--|--|--|--v--^-----|--^--|--v--v--|-----^--v--o
5319 //     |  |  |  |     |     |  |  |        |     |
5320 //  o--|--|--|--v--^--|-----v--|--v--^-----|--^--v-----o
5321 //     |  |  |     |  |        |     |     |  |
5322 //  o--|--|--v-----|--|--^-----v--^--|-----v--v--------o
5323 //     |  |        |  |  |        |  |
5324 //  o--|--v--------|--v--|--^-----v--v-----------------o
5325 //     |           |     |  |
5326 //  o--v-----------v-----v--v--------------------------o
5327 //
5328 //  [[4,9],[3,8],[2,7],[1,6],[0,5]]
5329 //  [[1,4],[6,9],[0,3],[5,8]]
5330 //  [[0,2],[3,6],[7,9]]
5331 //  [[0,1],[2,4],[5,7],[8,9]]
5332 //  [[1,2],[4,6],[7,8],[3,5]]
5333 //  [[2,5],[6,8],[1,3],[4,7]]
5334 //  [[2,3],[6,7]]
5335 //  [[3,4],[5,6]]
5336 //  [[4,5]]
/* Sorting network for 10 elements: 29 compare-exchange ops in 9 parallel layers. */
#define SORT_NETWORK_10(TYPE, CMP, begin)                   \
  do {                                                      \
    TYPE *const it_ = (begin); /* evaluate base only once */\
    /* layer 1: [4,9] [3,8] [2,7] [1,6] [0,5] */            \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[9]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[8]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[7]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[6]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[5]);               \
    /* layer 2: [1,4] [6,9] [0,3] [5,8] */                  \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[9]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[8]);               \
    /* layer 3: [0,2] [3,6] [7,9] */                        \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[2]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[6]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[7], it_[9]);               \
    /* layer 4: [0,1] [2,4] [5,7] [8,9] */                  \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[1]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[7]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[8], it_[9]);               \
    /* layer 5: [1,2] [4,6] [7,8] [3,5] */                  \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[2]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[6]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[7], it_[8]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[5]);               \
    /* layer 6: [2,5] [6,8] [1,3] [4,7] */                  \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[5]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[8]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[7]);               \
    /* layer 7: [2,3] [6,7] */                              \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[7]);               \
    /* layer 8: [3,4] [5,6] */                              \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[6]);               \
    /* layer 9: [4,5] */                                    \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[5]);               \
  } while (0)
5369 
5370 //  35 comparators, 9 parallel operations
5371 //  o--^-----^-----------------^--------^--------------------o
5372 //     |     |                 |        |
5373 //  o--v--^--|--^--^--------^--|--------|--^-----------------o
5374 //        |  |  |  |        |  |        |  |
5375 //  o--^--|--v--v--|-----^--|--|--------|--|-----^--^--------o
5376 //     |  |        |     |  |  |        |  |     |  |
5377 //  o--v--v--------|-----|--|--|--^-----|--|--^--v--|--^--^--o
5378 //                 |     |  |  |  |     |  |  |     |  |  |
5379 //  o--^-----^-----|-----|--|--v--|--^--v--v--|-----v--|--v--o
5380 //     |     |     |     |  |     |  |        |        |
5381 //  o--v--^--|--^--v--^--|--v-----|--|--------|--------v--^--o
5382 //        |  |  |     |  |        |  |        |           |
5383 //  o--^--|--v--v--^--|--v--^-----|--|--------|--------^--v--o
5384 //     |  |        |  |     |     |  |        |        |
5385 //  o--v--v--------|--|-----|-----v--|--^-----|-----^--|--^--o
5386 //                 |  |     |        |  |     |     |  |  |
5387 //  o--^--^--------|--|-----|--------v--|-----v--^--|--v--v--o
5388 //     |  |        |  |     |           |        |  |
5389 //  o--v--|--^-----|--v-----|-----------|--------v--v--------o
5390 //        |  |     |        |           |
5391 //  o-----v--v-----v--------v-----------v--------------------o
5392 //
5393 //  [[0,1],[2,3],[4,5],[6,7],[8,9]]
5394 //  [[1,3],[5,7],[0,2],[4,6],[8,10]]
5395 //  [[1,2],[5,6],[9,10],[0,4],[3,7]]
5396 //  [[1,5],[6,10],[4,8]]
5397 //  [[5,9],[2,6],[0,4],[3,8]]
5398 //  [[1,5],[6,10],[2,3],[8,9]]
5399 //  [[1,4],[7,10],[3,5],[6,8]]
5400 //  [[2,4],[7,9],[5,6]]
5401 //  [[3,4],[7,8]]
/* Sorting network for 11 elements: 35 compare-exchange ops in 9 parallel layers. */
#define SORT_NETWORK_11(TYPE, CMP, begin)                   \
  do {                                                      \
    TYPE *const it_ = (begin); /* evaluate base only once */\
    /* layer 1: [0,1] [2,3] [4,5] [6,7] [8,9] */            \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[1]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[5]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[7]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[8], it_[9]);               \
    /* layer 2: [1,3] [5,7] [0,2] [4,6] [8,10] */           \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[7]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[2]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[6]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[8], it_[10]);              \
    /* layer 3: [1,2] [5,6] [9,10] [0,4] [3,7] */           \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[2]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[6]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[9], it_[10]);              \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[7]);               \
    /* layer 4: [1,5] [6,10] [4,8] */                       \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[5]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[10]);              \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[8]);               \
    /* layer 5: [5,9] [2,6] [0,4] [3,8] */                  \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[9]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[6]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[8]);               \
    /* layer 6: [1,5] [6,10] [2,3] [8,9] */                 \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[5]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[10]);              \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[8], it_[9]);               \
    /* layer 7: [1,4] [7,10] [3,5] [6,8] */                 \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[7], it_[10]);              \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[5]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[8]);               \
    /* layer 8: [2,4] [7,9] [5,6] */                        \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[7], it_[9]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[6]);               \
    /* layer 9: [3,4] [7,8] */                              \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[7], it_[8]);               \
  } while (0)
5440 
//  39 comparators, 9 parallel operations
5442 //  o--^-----^-----------------^--------^--------------------o
5443 //     |     |                 |        |
5444 //  o--v--^--|--^--^--------^--|--------|--^-----------------o
5445 //        |  |  |  |        |  |        |  |
5446 //  o--^--|--v--v--|-----^--|--|--------|--|-----^--^--------o
5447 //     |  |        |     |  |  |        |  |     |  |
5448 //  o--v--v--------|-----|--|--|--^-----|--|--^--v--|--^--^--o
5449 //                 |     |  |  |  |     |  |  |     |  |  |
5450 //  o--^-----^-----|-----|--|--v--|--^--v--v--|-----v--|--v--o
5451 //     |     |     |     |  |     |  |        |        |
5452 //  o--v--^--|--^--v--^--|--v-----|--|--------|--------v--^--o
5453 //        |  |  |     |  |        |  |        |           |
5454 //  o--^--|--v--v--^--|--v--^-----|--|--------|--------^--v--o
5455 //     |  |        |  |     |     |  |        |        |
5456 //  o--v--v--------|--|-----|--^--v--|--^--^--|-----^--|--^--o
5457 //                 |  |     |  |     |  |  |  |     |  |  |
5458 //  o--^-----^-----|--|-----|--|-----v--|--|--v--^--|--v--v--o
5459 //     |     |     |  |     |  |        |  |     |  |
5460 //  o--v--^--|--^--|--v-----|--|--------|--|-----v--v--------o
5461 //        |  |  |  |        |  |        |  |
5462 //  o--^--|--v--v--v--------v--|--------|--v-----------------o
5463 //     |  |                    |        |
5464 //  o--v--v--------------------v--------v--------------------o
5465 //
5466 //  [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11]]
5467 //  [[1,3],[5,7],[9,11],[0,2],[4,6],[8,10]]
5468 //  [[1,2],[5,6],[9,10],[0,4],[7,11]]
5469 //  [[1,5],[6,10],[3,7],[4,8]]
5470 //  [[5,9],[2,6],[0,4],[7,11],[3,8]]
5471 //  [[1,5],[6,10],[2,3],[8,9]]
5472 //  [[1,4],[7,10],[3,5],[6,8]]
5473 //  [[2,4],[7,9],[5,6]]
5474 //  [[3,4],[7,8]]
/* Sorting network for 12 elements: 39 compare-exchange ops in 9 parallel layers. */
#define SORT_NETWORK_12(TYPE, CMP, begin)                   \
  do {                                                      \
    TYPE *const it_ = (begin); /* evaluate base only once */\
    /* layer 1: [0,1] [2,3] [4,5] [6,7] [8,9] [10,11] */    \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[1]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[5]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[7]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[8], it_[9]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[10], it_[11]);             \
    /* layer 2: [1,3] [5,7] [9,11] [0,2] [4,6] [8,10] */    \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[7]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[9], it_[11]);              \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[2]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[6]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[8], it_[10]);              \
    /* layer 3: [1,2] [5,6] [9,10] [0,4] [7,11] */          \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[2]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[6]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[9], it_[10]);              \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[7], it_[11]);              \
    /* layer 4: [1,5] [6,10] [3,7] [4,8] */                 \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[5]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[10]);              \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[7]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[8]);               \
    /* layer 5: [5,9] [2,6] [0,4] [7,11] [3,8] */           \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[9]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[6]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[7], it_[11]);              \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[8]);               \
    /* layer 6: [1,5] [6,10] [2,3] [8,9] */                 \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[5]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[10]);              \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[8], it_[9]);               \
    /* layer 7: [1,4] [7,10] [3,5] [6,8] */                 \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[7], it_[10]);              \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[5]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[8]);               \
    /* layer 8: [2,4] [7,9] [5,6] */                        \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[7], it_[9]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[6]);               \
    /* layer 9: [3,4] [7,8] */                              \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[7], it_[8]);               \
  } while (0)
5517 
5518 //  45 comparators, 10 parallel operations
5519 //  o--------^--^-----^-----------------------------^-----------------o
5520 //           |  |     |                             |
5521 //  o--^-----|--v-----|-----^--------------^-----^--|-----^-----------o
5522 //     |     |        |     |              |     |  |     |
5523 //  o--|-----|--^--^--v-----|--------------|--^--|--|--^--v--^--------o
5524 //     |     |  |  |        |              |  |  |  |  |     |
5525 //  o--|--^--|--|--v-----^--|--------^-----|--|--v--|--|--^--v-----^--o
5526 //     |  |  |  |        |  |        |     |  |     |  |  |        |
5527 //  o--|--v--|--|--^-----|--v-----^--v-----|--|--^--|--|--|--^--^--v--o
5528 //     |     |  |  |     |        |        |  |  |  |  |  |  |  |
5529 //  o--|--^--|--|--|--^--|--------|-----^--|--|--|--v--v--v--|--v--^--o
5530 //     |  |  |  |  |  |  |        |     |  |  |  |           |     |
5531 //  o--|--|--|--v--v--|--|--^-----|--^--v--|--v--|--^--------v--^--v--o
5532 //     |  |  |        |  |  |     |  |     |     |  |           |
5533 //  o--v--|--|-----^--|--v--|--^--|--|-----v-----v--|--^--------v-----o
5534 //        |  |     |  |     |  |  |  |              |  |
5535 //  o-----v--|--^--|--|-----|--v--|--|--^-----^-----v--v--^-----------o
5536 //           |  |  |  |     |     |  |  |     |           |
5537 //  o--^-----|--|--|--v-----|-----v--|--v--^--|--^--------v-----------o
5538 //     |     |  |  |        |        |     |  |  |
5539 //  o--|-----|--|--|--^-----|--------v--^--|--v--v--------------------o
5540 //     |     |  |  |  |     |           |  |
5541 //  o--v-----|--v--|--v-----|--^--------v--v--------------------------o
5542 //           |     |        |  |
5543 //  o--------v-----v--------v--v--------------------------------------o
5544 //
5545 //  [[1,7],[9,11],[3,4],[5,8],[0,12],[2,6]]
5546 //  [[0,1],[2,3],[4,6],[8,11],[7,12],[5,9]]
5547 //  [[0,2],[3,7],[10,11],[1,4],[6,12]]
5548 //  [[7,8],[11,12],[4,9],[6,10]]
5549 //  [[3,4],[5,6],[8,9],[10,11],[1,7]]
5550 //  [[2,6],[9,11],[1,3],[4,7],[8,10],[0,5]]
5551 //  [[2,5],[6,8],[9,10]]
5552 //  [[1,2],[3,5],[7,8],[4,6]]
5553 //  [[2,3],[4,5],[6,7],[8,9]]
5554 //  [[3,4],[5,6]]
/* Sorting network for 13 elements: 45 compare-exchange ops in 10 parallel layers. */
#define SORT_NETWORK_13(TYPE, CMP, begin)                   \
  do {                                                      \
    TYPE *const it_ = (begin); /* evaluate base only once */\
    /* layer 1: [1,7] [9,11] [3,4] [5,8] [0,12] [2,6] */    \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[7]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[9], it_[11]);              \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[8]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[12]);              \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[6]);               \
    /* layer 2: [0,1] [2,3] [4,6] [8,11] [7,12] [5,9] */    \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[1]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[6]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[8], it_[11]);              \
    SORT_CMP_SWAP(TYPE, CMP, it_[7], it_[12]);              \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[9]);               \
    /* layer 3: [0,2] [3,7] [10,11] [1,4] [6,12] */         \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[2]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[7]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[10], it_[11]);             \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[12]);              \
    /* layer 4: [7,8] [11,12] [4,9] [6,10] */               \
    SORT_CMP_SWAP(TYPE, CMP, it_[7], it_[8]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[11], it_[12]);             \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[9]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[10]);              \
    /* layer 5: [3,4] [5,6] [8,9] [10,11] [1,7] */          \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[6]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[8], it_[9]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[10], it_[11]);             \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[7]);               \
    /* layer 6: [2,6] [9,11] [1,3] [4,7] [8,10] [0,5] */    \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[6]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[9], it_[11]);              \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[7]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[8], it_[10]);              \
    SORT_CMP_SWAP(TYPE, CMP, it_[0], it_[5]);               \
    /* layer 7: [2,5] [6,8] [9,10] */                       \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[5]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[8]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[9], it_[10]);              \
    /* layer 8: [1,2] [3,5] [7,8] [4,6] */                  \
    SORT_CMP_SWAP(TYPE, CMP, it_[1], it_[2]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[5]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[7], it_[8]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[6]);               \
    /* layer 9: [2,3] [4,5] [6,7] [8,9] */                  \
    SORT_CMP_SWAP(TYPE, CMP, it_[2], it_[3]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[4], it_[5]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[6], it_[7]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[8], it_[9]);               \
    /* layer 10: [3,4] [5,6] */                             \
    SORT_CMP_SWAP(TYPE, CMP, it_[3], it_[4]);               \
    SORT_CMP_SWAP(TYPE, CMP, it_[5], it_[6]);               \
  } while (0)
5603 
5604 /* *INDENT-OFF* */
5605 /* clang-format off */
5606 
5607 //  51 comparators, 10 parallel operations
5608 //  o--^--^-----^-----------^-----------------------------------------------------------o
5609 //     |  |     |           |
5610 //  o--v--|--^--|--^--------|--^-----^-----------------------^--------------------------o
5611 //        |  |  |  |        |  |     |                       |
5612 //  o--^--v--|--|--|--^-----|--|--^--v-----------------------|--^--^--------------------o
5613 //     |     |  |  |  |     |  |  |                          |  |  |
5614 //  o--v-----v--|--|--|--^--|--|--|--^--------------^--------|--|--|--^--^--^-----------o
5615 //              |  |  |  |  |  |  |  |              |        |  |  |  |  |  |
5616 //  o--^--^-----v--|--|--|--|--|--|--|--^-----------|-----^--v--|--v--|--|--v-----------o
5617 //     |  |        |  |  |  |  |  |  |  |           |     |     |     |  |
5618 //  o--v--|--^-----v--|--|--|--|--|--|--|--^--^-----|-----|-----|--^--|--v-----^--------o
5619 //        |  |        |  |  |  |  |  |  |  |  |     |     |     |  |  |        |
5620 //  o--^--v--|--------v--|--|--|--|--|--|--|--|--^--|-----|-----|--v--|-----^--v-----^--o
5621 //     |     |           |  |  |  |  |  |  |  |  |  |     |     |     |     |        |
5622 //  o--v-----v-----------v--|--|--|--|--|--|--|--|--|--^--|--^--|-----|--^--|--^--^--v--o
5623 //                          |  |  |  |  |  |  |  |  |  |  |  |  |     |  |  |  |  |
5624 //  o--^--^-----^-----------v--|--|--|--|--|--|--|--|--|--v--|--v-----v--|--v--|--v--^--o
5625 //     |  |     |              |  |  |  |  |  |  |  |  |     |           |     |     |
5626 //  o--v--|--^--|--^-----------v--|--|--|--|--|--v--|--|-----|--^--------|-----v--^--v--o
5627 //        |  |  |  |              |  |  |  |  |     |  |     |  |        |        |
5628 //  o--^--v--|--|--|--------------v--|--|--|--v-----|--|-----|--v--------|--^-----v-----o
5629 //     |     |  |  |                 |  |  |        |  |     |           |  |
5630 //  o--v-----v--|--|-----------------v--|--|--------|--v-----|--^--------|--|--^--------o
5631 //              |  |                    |  |        |        |  |        |  |  |
5632 //  o--^--------v--|--------------------v--|--------v--------|--|--------v--v--v--------o
5633 //     |           |                       |                 |  |
5634 //  o--v-----------v-----------------------v-----------------v--v-----------------------o
5635 //
5636 //  [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13]]
5637 //  [[0,2],[4,6],[8,10],[1,3],[5,7],[9,11]]
5638 //  [[0,4],[8,12],[1,5],[9,13],[2,6],[3,7]]
5639 //  [[0,8],[1,9],[2,10],[3,11],[4,12],[5,13]]
5640 //  [[5,10],[6,9],[3,12],[7,11],[1,2],[4,8]]
5641 //  [[1,4],[7,13],[2,8],[5,6],[9,10]]
5642 //  [[2,4],[11,13],[3,8],[7,12]]
5643 //  [[6,8],[10,12],[3,5],[7,9]]
5644 //  [[3,4],[5,6],[7,8],[9,10],[11,12]]
5645 //  [[6,7],[8,9]]
5646 
5647 /* *INDENT-ON* */
5648 /* clang-format on */
5649 
/* Sorting network for 14 items: 51 compare-exchange units arranged in 10
 * data-parallel layers. Comparators inside one layer touch pairwise-disjoint
 * indices, so they are listed by ascending first index within each layer. */
#define SORT_NETWORK_14(TYPE, CMP, begin)                                      \
  do {                                                                         \
    /* layer 1 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]);                            \
    SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[13]);                            \
    /* layer 2 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]);                             \
    /* layer 3 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[12]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[13]);                             \
    /* layer 4 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[9]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[10]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[11]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[12]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[13]);                             \
    /* layer 5 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[12]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[10]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]);                             \
    /* layer 6 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[13]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]);                             \
    /* layer 7 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[13]);                            \
    /* layer 8 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[12]);                            \
    /* layer 9 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]);                            \
    /* layer 10 */                                                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]);                              \
  } while (0)
5704 
5705 /* *INDENT-OFF* */
5706 /* clang-format off */
5707 
5708 //  56 comparators, 10 parallel operations
5709 //  o--^--^-----^-----------^--------------------------------------------------------------o
5710 //     |  |     |           |
5711 //  o--v--|--^--|--^--------|--^-----^--------------------------^--------------------------o
5712 //        |  |  |  |        |  |     |                          |
5713 //  o--^--v--|--|--|--^-----|--|--^--v--------------------------|--^--^--------------------o
5714 //     |     |  |  |  |     |  |  |                             |  |  |
5715 //  o--v-----v--|--|--|--^--|--|--|--^-----------------^--------|--|--|--^--^--^-----------o
5716 //              |  |  |  |  |  |  |  |                 |        |  |  |  |  |  |
5717 //  o--^--^-----v--|--|--|--|--|--|--|--^--------------|-----^--v--|--v--|--|--v-----------o
5718 //     |  |        |  |  |  |  |  |  |  |              |     |     |     |  |
5719 //  o--v--|--^-----v--|--|--|--|--|--|--|--^-----^-----|-----|-----|--^--|--v-----^--------o
5720 //        |  |        |  |  |  |  |  |  |  |     |     |     |     |  |  |        |
5721 //  o--^--v--|--------v--|--|--|--|--|--|--|--^--|--^--|-----|-----|--v--|-----^--v-----^--o
5722 //     |     |           |  |  |  |  |  |  |  |  |  |  |     |     |     |     |        |
5723 //  o--v-----v-----------v--|--|--|--|--|--|--|--|--|--|--^--|--^--|-----|--^--|--^--^--v--o
5724 //                          |  |  |  |  |  |  |  |  |  |  |  |  |  |     |  |  |  |  |
5725 //  o--^--^-----^-----------v--|--|--|--|--|--|--|--|--|--|--v--|--v-----v--|--v--|--v--^--o
5726 //     |  |     |              |  |  |  |  |  |  |  |  |  |     |           |     |     |
5727 //  o--v--|--^--|--^-----------v--|--|--|--|--|--|--v--|--|-----|--^--------|-----v--^--v--o
5728 //        |  |  |  |              |  |  |  |  |  |     |  |     |  |        |        |
5729 //  o--^--v--|--|--|--^-----------v--|--|--|--|--v-----|--|-----|--v--------|--^-----v-----o
5730 //     |     |  |  |  |              |  |  |  |        |  |     |           |  |
5731 //  o--v-----v--|--|--|--------------v--|--|--|--------|--v-----|--^--^-----|--|--^--------o
5732 //              |  |  |                 |  |  |        |        |  |  |     |  |  |
5733 //  o--^--^-----v--|--|-----------------v--|--|--------v--------|--|--|-----v--v--v--------o
5734 //     |  |        |  |                    |  |                 |  |  |
5735 //  o--v--|--------v--|--------------------v--|--^--------------v--|--v--------------------o
5736 //        |           |                       |  |                 |
5737 //  o-----v-----------v-----------------------v--v-----------------v-----------------------o
5738 //
5739 //  [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13]]
5740 //  [[0,2],[4,6],[8,10],[12,14],[1,3],[5,7],[9,11]]
5741 //  [[0,4],[8,12],[1,5],[9,13],[2,6],[10,14],[3,7]]
5742 //  [[0,8],[1,9],[2,10],[3,11],[4,12],[5,13],[6,14]]
5743 //  [[5,10],[6,9],[3,12],[13,14],[7,11],[1,2],[4,8]]
5744 //  [[1,4],[7,13],[2,8],[11,14],[5,6],[9,10]]
5745 //  [[2,4],[11,13],[3,8],[7,12]]
5746 //  [[6,8],[10,12],[3,5],[7,9]]
5747 //  [[3,4],[5,6],[7,8],[9,10],[11,12]]
5748 //  [[6,7],[8,9]]
5749 
5750 /* *INDENT-ON* */
5751 /* clang-format on */
5752 
/* Sorting network for 15 items: 56 compare-exchange units arranged in 10
 * data-parallel layers. Comparators inside one layer touch pairwise-disjoint
 * indices, so they are listed by ascending first index within each layer. */
#define SORT_NETWORK_15(TYPE, CMP, begin)                                      \
  do {                                                                         \
    /* layer 1 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]);                            \
    SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[13]);                            \
    /* layer 2 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[14]);                            \
    /* layer 3 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[12]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[13]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[14]);                            \
    /* layer 4 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[9]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[10]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[11]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[12]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[13]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[14]);                             \
    /* layer 5 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[12]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[10]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[13], begin[14]);                            \
    /* layer 6 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[13]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[14]);                            \
    /* layer 7 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[13]);                            \
    /* layer 8 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[12]);                            \
    /* layer 9 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]);                            \
    /* layer 10 */                                                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]);                              \
  } while (0)
5812 
5813 /* *INDENT-OFF* */
5814 /* clang-format off */
5815 
5816 //  60 comparators, 10 parallel operations
5817 //  o--^--^-----^-----------^-----------------------------------------------------------------o
5818 //     |  |     |           |
5819 //  o--v--|--^--|--^--------|--^-----^-----------------------------^--------------------------o
5820 //        |  |  |  |        |  |     |                             |
5821 //  o--^--v--|--|--|--^-----|--|--^--v-----------------------------|--^--^--------------------o
5822 //     |     |  |  |  |     |  |  |                                |  |  |
5823 //  o--v-----v--|--|--|--^--|--|--|--^--------------------^--------|--|--|--^--^--^-----------o
5824 //              |  |  |  |  |  |  |  |                    |        |  |  |  |  |  |
5825 //  o--^--^-----v--|--|--|--|--|--|--|--^-----------------|-----^--v--|--v--|--|--v-----------o
5826 //     |  |        |  |  |  |  |  |  |  |                 |     |     |     |  |
5827 //  o--v--|--^-----v--|--|--|--|--|--|--|--^--------^-----|-----|-----|--^--|--v-----^--------o
5828 //        |  |        |  |  |  |  |  |  |  |        |     |     |     |  |  |        |
5829 //  o--^--v--|--------v--|--|--|--|--|--|--|--^-----|--^--|-----|-----|--v--|-----^--v-----^--o
5830 //     |     |           |  |  |  |  |  |  |  |     |  |  |     |     |     |     |        |
5831 //  o--v-----v-----------v--|--|--|--|--|--|--|--^--|--|--|--^--|--^--|-----|--^--|--^--^--v--o
5832 //                          |  |  |  |  |  |  |  |  |  |  |  |  |  |  |     |  |  |  |  |
5833 //  o--^--^-----^-----------v--|--|--|--|--|--|--|--|--|--|--|--v--|--v-----v--|--v--|--v--^--o
5834 //     |  |     |              |  |  |  |  |  |  |  |  |  |  |     |           |     |     |
5835 //  o--v--|--^--|--^-----------v--|--|--|--|--|--|--|--v--|--|-----|--^--------|-----v--^--v--o
5836 //        |  |  |  |              |  |  |  |  |  |  |     |  |     |  |        |        |
5837 //  o--^--v--|--|--|--^-----------v--|--|--|--|--|--v-----|--|-----|--v--------|--^-----v-----o
5838 //     |     |  |  |  |              |  |  |  |  |        |  |     |           |  |
5839 //  o--v-----v--|--|--|--^-----------v--|--|--|--|--------|--v-----|--^--^-----|--|--^--------o
5840 //              |  |  |  |              |  |  |  |        |        |  |  |     |  |  |
5841 //  o--^--^-----v--|--|--|--------------v--|--|--|--------v--------|--|--|-----v--v--v--------o
5842 //     |  |        |  |  |                 |  |  |                 |  |  |
5843 //  o--v--|--^-----v--|--|-----------------v--|--|--^--------------v--|--v--------------------o
5844 //        |  |        |  |                    |  |  |                 |
5845 //  o--^--v--|--------v--|--------------------v--|--v-----------------v-----------------------o
5846 //     |     |           |                       |
5847 //  o--v-----v-----------v-----------------------v--------------------------------------------o
5848 //
5849 //  [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13],[14,15]]
5850 //  [[0,2],[4,6],[8,10],[12,14],[1,3],[5,7],[9,11],[13,15]]
5851 //  [[0,4],[8,12],[1,5],[9,13],[2,6],[10,14],[3,7],[11,15]]
5852 //  [[0,8],[1,9],[2,10],[3,11],[4,12],[5,13],[6,14],[7,15]]
5853 //  [[5,10],[6,9],[3,12],[13,14],[7,11],[1,2],[4,8]]
5854 //  [[1,4],[7,13],[2,8],[11,14],[5,6],[9,10]]
5855 //  [[2,4],[11,13],[3,8],[7,12]]
5856 //  [[6,8],[10,12],[3,5],[7,9]]
5857 //  [[3,4],[5,6],[7,8],[9,10],[11,12]]
5858 //  [[6,7],[8,9]]
5859 
5860 /* *INDENT-ON* */
5861 /* clang-format on */
5862 
/* Sorting network for 16 items: 60 compare-exchange units arranged in 10
 * data-parallel layers. Comparators inside one layer touch pairwise-disjoint
 * indices, so they are listed by ascending first index within each layer. */
#define SORT_NETWORK_16(TYPE, CMP, begin)                                      \
  do {                                                                         \
    /* layer 1 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]);                            \
    SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[13]);                            \
    SORT_CMP_SWAP(TYPE, CMP, begin[14], begin[15]);                            \
    /* layer 2 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[14]);                            \
    SORT_CMP_SWAP(TYPE, CMP, begin[13], begin[15]);                            \
    /* layer 3 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[12]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[13]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[14]);                            \
    SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[15]);                            \
    /* layer 4 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[9]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[10]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[11]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[12]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[13]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[14]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[15]);                             \
    /* layer 5 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[12]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[10]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[13], begin[14]);                            \
    /* layer 6 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[13]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[14]);                            \
    /* layer 7 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[13]);                            \
    /* layer 8 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[12]);                            \
    /* layer 9 */                                                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]);                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]);                            \
    /* layer 10 */                                                             \
    SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]);                              \
    SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]);                              \
  } while (0)
5926 
/* Finishes a short run by dispatching on its length to the matching
 * fixed-size sorting network. Lengths 0 and 1 are trivially sorted; any
 * other length outside 2..16 is a caller contract violation (the quicksort
 * driver only hands over runs of at most 16 items), hence __unreachable(). */
#define SORT_INNER(TYPE, CMP, begin, end, len)                                 \
  switch (len) {                                                               \
  case 0:                                                                      \
  case 1:                                                                      \
    break;                                                                     \
  case 2: SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); break;                 \
  case 3: SORT_NETWORK_3(TYPE, CMP, begin); break;                             \
  case 4: SORT_NETWORK_4(TYPE, CMP, begin); break;                             \
  case 5: SORT_NETWORK_5(TYPE, CMP, begin); break;                             \
  case 6: SORT_NETWORK_6(TYPE, CMP, begin); break;                             \
  case 7: SORT_NETWORK_7(TYPE, CMP, begin); break;                             \
  case 8: SORT_NETWORK_8(TYPE, CMP, begin); break;                             \
  case 9: SORT_NETWORK_9(TYPE, CMP, begin); break;                             \
  case 10: SORT_NETWORK_10(TYPE, CMP, begin); break;                           \
  case 11: SORT_NETWORK_11(TYPE, CMP, begin); break;                           \
  case 12: SORT_NETWORK_12(TYPE, CMP, begin); break;                           \
  case 13: SORT_NETWORK_13(TYPE, CMP, begin); break;                           \
  case 14: SORT_NETWORK_14(TYPE, CMP, begin); break;                           \
  case 15: SORT_NETWORK_15(TYPE, CMP, begin); break;                           \
  case 16: SORT_NETWORK_16(TYPE, CMP, begin); break;                           \
  default:                                                                     \
    __unreachable();                                                           \
  }
5980 
/* Unconditionally exchanges two lvalues of the given TYPE.
 * The temporary carries a deliberately unusual, underscore-suffixed name so
 * it cannot shadow an identifier used inside the `a`/`b` argument
 * expressions (with the old name `swap_tmp`, passing a variable called
 * `swap_tmp` produced a shadowed self-initialization and an assignment to
 * a const object). */
#define SORT_SWAP(TYPE, a, b)                                                  \
  do {                                                                         \
    const TYPE sort_swap_tmp_ = (a);                                           \
    (a) = (b);                                                                 \
    (b) = sort_swap_tmp_;                                                      \
  } while (0)
5987 
/* Records the inclusive [low, high] range in the slot addressed by the
 * caller-scope `top` cursor, then advances the cursor past that slot. */
#define SORT_PUSH(low, high)                                                   \
  do {                                                                         \
    top->hi = (high);                                                          \
    top->lo = (low);                                                           \
    ++top;                                                                     \
  } while (0)
5994 
/* Retreats the caller-scope `top` cursor and copies the most recently
 * pushed [low, high] range into the given output lvalues.
 * The output arguments are parenthesized — mirroring SORT_PUSH — so the
 * macro stays well-behaved even for non-trivial lvalue expressions. */
#define SORT_POP(low, high)                                                    \
  do {                                                                         \
    --top;                                                                     \
    (low) = top->lo;                                                           \
    (high) = top->hi;                                                          \
  } while (0)
6001 
/* Generates a non-recursive quicksort `static void NAME(begin, end)` over
 * TYPE items, ordered by the two-argument predicate CMP.
 *
 * Runs of fewer than 17 elements are finished by the fixed-size sorting
 * networks via SORT_INNER; larger runs are partitioned around a
 * median-of-three pivot, the larger half being deferred onto an explicit
 * stack (SORT_PUSH/SORT_POP) while iteration continues on the smaller one.
 * When EXPECT_LOW_CARDINALITY_OR_PRESORTED is true, sub-ranges that turn
 * out to be already ordered after a partition are skipped entirely. */
#define SORT_IMPL(NAME, EXPECT_LOW_CARDINALITY_OR_PRESORTED, TYPE, CMP)        \
                                                                               \
  /* Returns true iff CMP never holds for an element and its predecessor      \
   * over the inclusive range [first, last], i.e. the range is already        \
   * ordered; trivially true for ranges of fewer than two elements. */        \
  static __inline bool NAME##_is_sorted(const TYPE *first, const TYPE *last) { \
    while (++first <= last)                                                    \
      if (CMP(first[0], first[-1]))                                           \
        return false;                                                          \
    return true;                                                               \
  }                                                                            \
                                                                               \
  /* One deferred sub-range: its inclusive [lo, hi] bounds. */                 \
  typedef struct {                                                             \
    TYPE *lo, *hi;                                                             \
  } NAME##_stack;                                                              \
                                                                               \
  static __hot void NAME(TYPE *const begin, TYPE *const end) {                 \
    /* Fixed-size work stack (typically 32 slots). Always pushing the         \
     * larger half while iterating on the smaller keeps the depth bounded. */ \
    NAME##_stack stack[sizeof(unsigned) * CHAR_BIT], *top = stack;             \
                                                                               \
    /* `lo`/`hi` are the inclusive bounds of the range being processed. */     \
    TYPE *hi = end - 1;                                                        \
    TYPE *lo = begin;                                                          \
    while (true) {                                                             \
      const ptrdiff_t len = hi - lo;                                           \
      if (len < 16) {                                                          \
        /* Short run: finish with a sorting network, then resume a            \
         * deferred range, or stop once the stack is empty. */                \
        SORT_INNER(TYPE, CMP, lo, hi + 1, len + 1);                            \
        if (unlikely(top == stack))                                            \
          break;                                                               \
        SORT_POP(lo, hi);                                                      \
        continue;                                                              \
      }                                                                        \
                                                                               \
      /* Median-of-three pivot selection: order *lo, *mid, *hi so the         \
       * median ends up in *mid. */                                           \
      TYPE *mid = lo + (len >> 1);                                             \
      SORT_CMP_SWAP(TYPE, CMP, *lo, *mid);                                     \
      SORT_CMP_SWAP(TYPE, CMP, *mid, *hi);                                     \
      SORT_CMP_SWAP(TYPE, CMP, *lo, *mid);                                     \
                                                                               \
      /* Hoare-style partition around the pivot *mid; *lo and *hi already    \
       * bracket it, so the inner scans cannot run off the range. */          \
      TYPE *right = hi - 1;                                                    \
      TYPE *left = lo + 1;                                                     \
      while (1) {                                                              \
        while (CMP(*left, *mid))                                               \
          ++left;                                                              \
        while (CMP(*mid, *right))                                              \
          --right;                                                             \
        if (unlikely(left > right)) {                                          \
          if (EXPECT_LOW_CARDINALITY_OR_PRESORTED) {                           \
            /* Cheap shortcut: shrink the pending work if either side of     \
             * the split already happens to be in order. */                   \
            if (NAME##_is_sorted(lo, right))                                   \
              lo = right + 1;                                                  \
            if (NAME##_is_sorted(left, hi))                                    \
              hi = left;                                                       \
          }                                                                    \
          break;                                                               \
        }                                                                      \
        SORT_SWAP(TYPE, *left, *right);                                        \
        /* Keep `mid` pointing at the pivot element if the swap moved it. */   \
        mid = (mid == left) ? right : (mid == right) ? left : mid;             \
        ++left;                                                                \
        --right;                                                               \
      }                                                                        \
                                                                               \
      /* Defer the larger partition, keep iterating on the smaller one. */     \
      if (right - lo > hi - left) {                                            \
        SORT_PUSH(lo, right);                                                  \
        lo = left;                                                             \
      } else {                                                                 \
        SORT_PUSH(left, hi);                                                   \
        hi = right;                                                            \
      }                                                                        \
    }                                                                          \
                                                                               \
    /* Post-condition under auditing: every adjacent pair of the full        \
     * range must satisfy CMP. */                                             \
    if (mdbx_audit_enabled()) {                                                \
      for (TYPE *scan = begin + 1; scan < end; ++scan)                         \
        assert(CMP(scan[-1], scan[0]));                                        \
    }                                                                          \
  }
6071 
6072 /*------------------------------------------------------------------------------
6073  * LY: radix sort for large chunks */
6074 
6075 #define RADIXSORT_IMPL(NAME, TYPE, EXTRACT_KEY, BUFFER_PREALLOCATED, END_GAP)  \
6076                                                                                \
6077   __hot static bool NAME##_radixsort(TYPE *const begin,                        \
6078                                      const unsigned length) {                  \
6079     TYPE *tmp;                                                                 \
6080     if (BUFFER_PREALLOCATED) {                                                 \
6081       tmp = begin + length + END_GAP;                                          \
6082       /* memset(tmp, 0xDeadBeef, sizeof(TYPE) * length); */                    \
6083     } else {                                                                   \
6084       tmp = mdbx_malloc(sizeof(TYPE) * length);                                \
6085       if (unlikely(!tmp))                                                      \
6086         return false;                                                          \
6087     }                                                                          \
6088                                                                                \
6089     unsigned key_shift = 0, key_diff_mask;                                     \
6090     do {                                                                       \
6091       struct {                                                                 \
6092         unsigned a[256], b[256];                                               \
6093       } counters;                                                              \
6094       memset(&counters, 0, sizeof(counters));                                  \
6095                                                                                \
6096       key_diff_mask = 0;                                                       \
6097       unsigned prev_key = EXTRACT_KEY(begin) >> key_shift;                     \
6098       TYPE *r = begin, *end = begin + length;                                  \
6099       do {                                                                     \
6100         const unsigned key = EXTRACT_KEY(r) >> key_shift;                      \
6101         counters.a[key & 255]++;                                               \
6102         counters.b[(key >> 8) & 255]++;                                        \
6103         key_diff_mask |= prev_key ^ key;                                       \
6104         prev_key = key;                                                        \
6105       } while (++r != end);                                                    \
6106                                                                                \
6107       unsigned ta = 0, tb = 0;                                                 \
6108       for (unsigned i = 0; i < 256; ++i) {                                     \
6109         const unsigned ia = counters.a[i];                                     \
6110         counters.a[i] = ta;                                                    \
6111         ta += ia;                                                              \
6112         const unsigned ib = counters.b[i];                                     \
6113         counters.b[i] = tb;                                                    \
6114         tb += ib;                                                              \
6115       }                                                                        \
6116                                                                                \
6117       r = begin;                                                               \
6118       do {                                                                     \
6119         const unsigned key = EXTRACT_KEY(r) >> key_shift;                      \
6120         tmp[counters.a[key & 255]++] = *r;                                     \
6121       } while (++r != end);                                                    \
6122                                                                                \
6123       if (unlikely(key_diff_mask < 256)) {                                     \
6124         memcpy(begin, tmp, (char *)end - (char *)begin);                       \
6125         break;                                                                 \
6126       }                                                                        \
6127       end = (r = tmp) + length;                                                \
6128       do {                                                                     \
6129         const unsigned key = EXTRACT_KEY(r) >> key_shift;                      \
6130         begin[counters.b[(key >> 8) & 255]++] = *r;                            \
6131       } while (++r != end);                                                    \
6132                                                                                \
6133       key_shift += 16;                                                         \
6134     } while (key_diff_mask >> 16);                                             \
6135                                                                                \
6136     if (!(BUFFER_PREALLOCATED))                                                \
6137       mdbx_free(tmp);                                                          \
6138     return true;                                                               \
6139   }
6140 
6141 /*------------------------------------------------------------------------------
6142  * LY: Binary search */
6143 
6144 #define SEARCH_IMPL(NAME, TYPE_LIST, TYPE_ARG, CMP)                            \
6145   static __always_inline const TYPE_LIST *NAME(                                \
6146       const TYPE_LIST *first, unsigned length, const TYPE_ARG item) {          \
6147     const TYPE_LIST *const begin = first, *const end = begin + length;         \
6148                                                                                \
6149     while (length > 3) {                                                       \
6150       const unsigned whole = length;                                           \
6151       length >>= 1;                                                            \
6152       const TYPE_LIST *const middle = first + length;                          \
6153       const unsigned left = whole - length - 1;                                \
6154       const bool cmp = CMP(*middle, item);                                     \
6155       length = cmp ? left : length;                                            \
6156       first = cmp ? middle + 1 : first;                                        \
6157     }                                                                          \
6158                                                                                \
6159     switch (length) {                                                          \
6160     case 3:                                                                    \
6161       if (!CMP(*first, item))                                                  \
6162         break;                                                                 \
6163       ++first;                                                                 \
6164       __fallthrough /* fall through */;                                        \
6165     case 2:                                                                    \
6166       if (!CMP(*first, item))                                                  \
6167         break;                                                                 \
6168       ++first;                                                                 \
6169       __fallthrough /* fall through */;                                        \
6170     case 1:                                                                    \
6171       if (!CMP(*first, item))                                                  \
6172         break;                                                                 \
6173       ++first;                                                                 \
6174       __fallthrough /* fall through */;                                        \
6175     case 0:                                                                    \
6176       break;                                                                   \
6177     default:                                                                   \
6178       __unreachable();                                                         \
6179     }                                                                          \
6180                                                                                \
6181     if (mdbx_audit_enabled()) {                                                \
6182       for (const TYPE_LIST *scan = begin; scan < first; ++scan)                \
6183         assert(CMP(*scan, item));                                              \
6184       for (const TYPE_LIST *scan = first; scan < end; ++scan)                  \
6185         assert(!CMP(*scan, item));                                             \
6186       (void)begin, (void)end;                                                  \
6187     }                                                                          \
6188                                                                                \
6189     return first;                                                              \
6190   }
6191 
6192 /*----------------------------------------------------------------------------*/
6193 
pnl2bytes(size_t size)6194 static __always_inline size_t pnl2bytes(size_t size) {
6195   assert(size > 0 && size <= MDBX_PGL_LIMIT);
6196 #if MDBX_PNL_PREALLOC_FOR_RADIXSORT
6197   size += size;
6198 #endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */
6199   STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD +
6200                     (MDBX_PGL_LIMIT * (MDBX_PNL_PREALLOC_FOR_RADIXSORT + 1) +
6201                      MDBX_PNL_GRANULATE + 2) *
6202                         sizeof(pgno_t) <
6203                 SIZE_MAX / 4 * 3);
6204   size_t bytes =
6205       ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 2),
6206                     MDBX_PNL_GRANULATE * sizeof(pgno_t)) -
6207       MDBX_ASSUME_MALLOC_OVERHEAD;
6208   return bytes;
6209 }
6210 
bytes2pnl(const size_t bytes)6211 static __always_inline pgno_t bytes2pnl(const size_t bytes) {
6212   size_t size = bytes / sizeof(pgno_t);
6213   assert(size > 2 && size <= MDBX_PGL_LIMIT);
6214   size -= 2;
6215 #if MDBX_PNL_PREALLOC_FOR_RADIXSORT
6216   size >>= 1;
6217 #endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */
6218   return (pgno_t)size;
6219 }
6220 
mdbx_pnl_alloc(size_t size)6221 static MDBX_PNL mdbx_pnl_alloc(size_t size) {
6222   size_t bytes = pnl2bytes(size);
6223   MDBX_PNL pl = mdbx_malloc(bytes);
6224   if (likely(pl)) {
6225 #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
6226     bytes = malloc_usable_size(pl);
6227 #endif /* malloc_usable_size */
6228     pl[0] = bytes2pnl(bytes);
6229     assert(pl[0] >= size);
6230     pl[1] = 0;
6231     pl += 1;
6232   }
6233   return pl;
6234 }
6235 
mdbx_pnl_free(MDBX_PNL pl)6236 static void mdbx_pnl_free(MDBX_PNL pl) {
6237   if (likely(pl))
6238     mdbx_free(pl - 1);
6239 }
6240 
6241 /* Shrink the PNL to the default size if it has grown larger */
mdbx_pnl_shrink(MDBX_PNL * ppl)6242 static void mdbx_pnl_shrink(MDBX_PNL *ppl) {
6243   assert(bytes2pnl(pnl2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL &&
6244          bytes2pnl(pnl2bytes(MDBX_PNL_INITIAL)) < MDBX_PNL_INITIAL * 3 / 2);
6245   assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT &&
6246          MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl));
6247   MDBX_PNL_SIZE(*ppl) = 0;
6248   if (unlikely(MDBX_PNL_ALLOCLEN(*ppl) >
6249                MDBX_PNL_INITIAL * 2 - MDBX_CACHELINE_SIZE / sizeof(pgno_t))) {
6250     size_t bytes = pnl2bytes(MDBX_PNL_INITIAL);
6251     MDBX_PNL pl = mdbx_realloc(*ppl - 1, bytes);
6252     if (likely(pl)) {
6253 #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
6254       bytes = malloc_usable_size(pl);
6255 #endif /* malloc_usable_size */
6256       *pl = bytes2pnl(bytes);
6257       *ppl = pl + 1;
6258     }
6259   }
6260 }
6261 
6262 /* Grow the PNL to the size growed to at least given size */
mdbx_pnl_reserve(MDBX_PNL * ppl,const size_t wanna)6263 static int mdbx_pnl_reserve(MDBX_PNL *ppl, const size_t wanna) {
6264   const size_t allocated = MDBX_PNL_ALLOCLEN(*ppl);
6265   assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT &&
6266          MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl));
6267   if (likely(allocated >= wanna))
6268     return MDBX_SUCCESS;
6269 
6270   if (unlikely(wanna > /* paranoia */ MDBX_PGL_LIMIT)) {
6271     mdbx_error("PNL too long (%zu > %zu)", wanna, (size_t)MDBX_PGL_LIMIT);
6272     return MDBX_TXN_FULL;
6273   }
6274 
6275   const size_t size = (wanna + wanna - allocated < MDBX_PGL_LIMIT)
6276                           ? wanna + wanna - allocated
6277                           : MDBX_PGL_LIMIT;
6278   size_t bytes = pnl2bytes(size);
6279   MDBX_PNL pl = mdbx_realloc(*ppl - 1, bytes);
6280   if (likely(pl)) {
6281 #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
6282     bytes = malloc_usable_size(pl);
6283 #endif /* malloc_usable_size */
6284     *pl = bytes2pnl(bytes);
6285     assert(*pl >= wanna);
6286     *ppl = pl + 1;
6287     return MDBX_SUCCESS;
6288   }
6289   return MDBX_ENOMEM;
6290 }
6291 
6292 /* Make room for num additional elements in an PNL */
mdbx_pnl_need(MDBX_PNL * ppl,size_t num)6293 static __always_inline int __must_check_result mdbx_pnl_need(MDBX_PNL *ppl,
6294                                                              size_t num) {
6295   assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT &&
6296          MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl));
6297   assert(num <= MDBX_PGL_LIMIT);
6298   const size_t wanna = MDBX_PNL_SIZE(*ppl) + num;
6299   return likely(MDBX_PNL_ALLOCLEN(*ppl) >= wanna)
6300              ? MDBX_SUCCESS
6301              : mdbx_pnl_reserve(ppl, wanna);
6302 }
6303 
mdbx_pnl_xappend(MDBX_PNL pl,pgno_t pgno)6304 static __always_inline void mdbx_pnl_xappend(MDBX_PNL pl, pgno_t pgno) {
6305   assert(MDBX_PNL_SIZE(pl) < MDBX_PNL_ALLOCLEN(pl));
6306   if (mdbx_audit_enabled()) {
6307     for (unsigned i = MDBX_PNL_SIZE(pl); i > 0; --i)
6308       assert(pgno != pl[i]);
6309   }
6310   MDBX_PNL_SIZE(pl) += 1;
6311   MDBX_PNL_LAST(pl) = pgno;
6312 }
6313 
6314 /* Append an pgno range onto an unsorted PNL */
6315 __always_inline static int __must_check_result
mdbx_pnl_append_range(bool spilled,MDBX_PNL * ppl,pgno_t pgno,unsigned n)6316 mdbx_pnl_append_range(bool spilled, MDBX_PNL *ppl, pgno_t pgno, unsigned n) {
6317   assert(n > 0);
6318   int rc = mdbx_pnl_need(ppl, n);
6319   if (unlikely(rc != MDBX_SUCCESS))
6320     return rc;
6321 
6322   const MDBX_PNL pnl = *ppl;
6323 #if MDBX_PNL_ASCENDING
6324   unsigned w = MDBX_PNL_SIZE(pnl);
6325   do {
6326     pnl[++w] = pgno;
6327     pgno += spilled ? 2 : 1;
6328   } while (--n);
6329   MDBX_PNL_SIZE(pnl) = w;
6330 #else
6331   unsigned w = MDBX_PNL_SIZE(pnl) + n;
6332   MDBX_PNL_SIZE(pnl) = w;
6333   do {
6334     pnl[w--] = pgno;
6335     pgno += spilled ? 2 : 1;
6336   } while (--n);
6337 #endif
6338 
6339   return MDBX_SUCCESS;
6340 }
6341 
6342 /* Append an pgno range into the sorted PNL */
mdbx_pnl_insert_range(MDBX_PNL * ppl,pgno_t pgno,unsigned n)6343 static __hot int __must_check_result mdbx_pnl_insert_range(MDBX_PNL *ppl,
6344                                                            pgno_t pgno,
6345                                                            unsigned n) {
6346   assert(n > 0);
6347   int rc = mdbx_pnl_need(ppl, n);
6348   if (unlikely(rc != MDBX_SUCCESS))
6349     return rc;
6350 
6351   const MDBX_PNL pnl = *ppl;
6352   unsigned r = MDBX_PNL_SIZE(pnl), w = r + n;
6353   MDBX_PNL_SIZE(pnl) = w;
6354   while (r && MDBX_PNL_DISORDERED(pnl[r], pgno))
6355     pnl[w--] = pnl[r--];
6356 
6357   for (pgno_t fill = MDBX_PNL_ASCENDING ? pgno + n : pgno; w > r; --w)
6358     pnl[w] = MDBX_PNL_ASCENDING ? --fill : fill++;
6359 
6360   return MDBX_SUCCESS;
6361 }
6362 
mdbx_pnl_check(const MDBX_PNL pl,const pgno_t limit)6363 static bool mdbx_pnl_check(const MDBX_PNL pl, const pgno_t limit) {
6364   if (likely(MDBX_PNL_SIZE(pl))) {
6365     assert(MDBX_PNL_LEAST(pl) >= MIN_PAGENO);
6366     assert(MDBX_PNL_MOST(pl) < limit);
6367     assert(MDBX_PNL_SIZE(pl) <= MDBX_PGL_LIMIT);
6368     if (unlikely(MDBX_PNL_SIZE(pl) > MDBX_PGL_LIMIT))
6369       return false;
6370     if (unlikely(MDBX_PNL_LEAST(pl) < MIN_PAGENO))
6371       return false;
6372     if (unlikely(MDBX_PNL_MOST(pl) >= limit))
6373       return false;
6374     if (mdbx_audit_enabled()) {
6375       for (const pgno_t *scan = &MDBX_PNL_LAST(pl); --scan > pl;) {
6376         assert(MDBX_PNL_ORDERED(scan[0], scan[1]));
6377         if (unlikely(!MDBX_PNL_ORDERED(scan[0], scan[1])))
6378           return false;
6379       }
6380     }
6381   }
6382   return true;
6383 }
6384 
mdbx_pnl_check4assert(const MDBX_PNL pl,const pgno_t limit)6385 static __always_inline bool mdbx_pnl_check4assert(const MDBX_PNL pl,
6386                                                   const pgno_t limit) {
6387   if (unlikely(pl == nullptr))
6388     return true;
6389   assert(MDBX_PNL_ALLOCLEN(pl) >= MDBX_PNL_SIZE(pl));
6390   if (unlikely(MDBX_PNL_ALLOCLEN(pl) < MDBX_PNL_SIZE(pl)))
6391     return false;
6392   return mdbx_pnl_check(pl, limit);
6393 }
6394 
6395 /* Merge an PNL onto an PNL. The destination PNL must be big enough */
mdbx_pnl_xmerge(MDBX_PNL dst,const MDBX_PNL src)6396 static void __hot mdbx_pnl_xmerge(MDBX_PNL dst, const MDBX_PNL src) {
6397   assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1));
6398   assert(mdbx_pnl_check(src, MAX_PAGENO + 1));
6399   const size_t total = MDBX_PNL_SIZE(dst) + MDBX_PNL_SIZE(src);
6400   assert(MDBX_PNL_ALLOCLEN(dst) >= total);
6401   pgno_t *w = dst + total;
6402   pgno_t *d = dst + MDBX_PNL_SIZE(dst);
6403   const pgno_t *s = src + MDBX_PNL_SIZE(src);
6404   dst[0] = /* detent for scan below */ (MDBX_PNL_ASCENDING ? 0 : ~(pgno_t)0);
6405   while (s > src) {
6406     while (MDBX_PNL_ORDERED(*s, *d))
6407       *w-- = *d--;
6408     *w-- = *s--;
6409   }
6410   MDBX_PNL_SIZE(dst) = (pgno_t)total;
6411   assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1));
6412 }
6413 
mdbx_spill_remove(MDBX_txn * txn,unsigned idx,unsigned npages)6414 static void mdbx_spill_remove(MDBX_txn *txn, unsigned idx, unsigned npages) {
6415   mdbx_tassert(txn, idx > 0 && idx <= MDBX_PNL_SIZE(txn->tw.spill_pages) &&
6416                         txn->tw.spill_least_removed > 0);
6417   txn->tw.spill_least_removed =
6418       (idx < txn->tw.spill_least_removed) ? idx : txn->tw.spill_least_removed;
6419   txn->tw.spill_pages[idx] |= 1;
6420   MDBX_PNL_SIZE(txn->tw.spill_pages) -=
6421       (idx == MDBX_PNL_SIZE(txn->tw.spill_pages));
6422 
6423   while (unlikely(npages > 1)) {
6424     const pgno_t pgno = (txn->tw.spill_pages[idx] >> 1) + 1;
6425     if (MDBX_PNL_ASCENDING) {
6426       if (++idx > MDBX_PNL_SIZE(txn->tw.spill_pages) ||
6427           (txn->tw.spill_pages[idx] >> 1) != pgno)
6428         return;
6429     } else {
6430       if (--idx < 1 || (txn->tw.spill_pages[idx] >> 1) != pgno)
6431         return;
6432       txn->tw.spill_least_removed = (idx < txn->tw.spill_least_removed)
6433                                         ? idx
6434                                         : txn->tw.spill_least_removed;
6435     }
6436     txn->tw.spill_pages[idx] |= 1;
6437     MDBX_PNL_SIZE(txn->tw.spill_pages) -=
6438         (idx == MDBX_PNL_SIZE(txn->tw.spill_pages));
6439     --npages;
6440   }
6441 }
6442 
mdbx_spill_purge(MDBX_txn * txn)6443 static MDBX_PNL mdbx_spill_purge(MDBX_txn *txn) {
6444   mdbx_tassert(txn, txn->tw.spill_least_removed > 0);
6445   const MDBX_PNL sl = txn->tw.spill_pages;
6446   if (txn->tw.spill_least_removed != INT_MAX) {
6447     unsigned len = MDBX_PNL_SIZE(sl), r, w;
6448     for (w = r = txn->tw.spill_least_removed; r <= len; ++r) {
6449       sl[w] = sl[r];
6450       w += 1 - (sl[r] & 1);
6451     }
6452     for (size_t i = 1; i < w; ++i)
6453       mdbx_tassert(txn, (sl[i] & 1) == 0);
6454     MDBX_PNL_SIZE(sl) = w - 1;
6455     txn->tw.spill_least_removed = INT_MAX;
6456   } else {
6457     for (size_t i = 1; i <= MDBX_PNL_SIZE(sl); ++i)
6458       mdbx_tassert(txn, (sl[i] & 1) == 0);
6459   }
6460   return sl;
6461 }
6462 
6463 #if MDBX_PNL_ASCENDING
6464 #define MDBX_PNL_EXTRACT_KEY(ptr) (*(ptr))
6465 #else
6466 #define MDBX_PNL_EXTRACT_KEY(ptr) (P_INVALID - *(ptr))
6467 #endif
6468 RADIXSORT_IMPL(pgno, pgno_t, MDBX_PNL_EXTRACT_KEY,
6469                MDBX_PNL_PREALLOC_FOR_RADIXSORT, 0)
6470 
SORT_IMPL(pgno_sort,false,pgno_t,MDBX_PNL_ORDERED)6471 SORT_IMPL(pgno_sort, false, pgno_t, MDBX_PNL_ORDERED)
6472 static __hot void mdbx_pnl_sort(MDBX_PNL pnl) {
6473   if (likely(MDBX_PNL_SIZE(pnl) < MDBX_RADIXSORT_THRESHOLD) ||
6474       unlikely(!pgno_radixsort(&MDBX_PNL_FIRST(pnl), MDBX_PNL_SIZE(pnl))))
6475     pgno_sort(MDBX_PNL_BEGIN(pnl), MDBX_PNL_END(pnl));
6476   assert(mdbx_pnl_check(pnl, MAX_PAGENO + 1));
6477 }
6478 
6479 /* Search for an pgno in an PNL.
6480  * Returns The index of the first item greater than or equal to pgno. */
SEARCH_IMPL(pgno_bsearch,pgno_t,pgno_t,MDBX_PNL_ORDERED)6481 SEARCH_IMPL(pgno_bsearch, pgno_t, pgno_t, MDBX_PNL_ORDERED)
6482 
6483 static __hot unsigned mdbx_pnl_search(const MDBX_PNL pnl, pgno_t pgno) {
6484   assert(mdbx_pnl_check4assert(pnl, MAX_PAGENO + 1));
6485   const pgno_t *begin = MDBX_PNL_BEGIN(pnl);
6486   const pgno_t *it = pgno_bsearch(begin, MDBX_PNL_SIZE(pnl), pgno);
6487   const pgno_t *end = begin + MDBX_PNL_SIZE(pnl);
6488   assert(it >= begin && it <= end);
6489   if (it != begin)
6490     assert(MDBX_PNL_ORDERED(it[-1], pgno));
6491   if (it != end)
6492     assert(!MDBX_PNL_ORDERED(it[0], pgno));
6493   return (unsigned)(it - begin + 1);
6494 }
6495 
mdbx_pnl_exist(const MDBX_PNL pnl,pgno_t pgno)6496 static __inline unsigned mdbx_pnl_exist(const MDBX_PNL pnl, pgno_t pgno) {
6497   unsigned n = mdbx_pnl_search(pnl, pgno);
6498   return (n <= MDBX_PNL_SIZE(pnl) && pnl[n] == pgno) ? n : 0;
6499 }
6500 
mdbx_pnl_intersect(const MDBX_PNL pnl,pgno_t pgno,unsigned npages)6501 static __inline unsigned mdbx_pnl_intersect(const MDBX_PNL pnl, pgno_t pgno,
6502                                             unsigned npages) {
6503   const unsigned len = MDBX_PNL_SIZE(pnl);
6504   if (mdbx_log_enabled(MDBX_LOG_EXTRA)) {
6505     mdbx_debug_extra("PNL len %u [", len);
6506     for (unsigned i = 1; i <= len; ++i)
6507       mdbx_debug_extra_print(" %" PRIaPGNO, pnl[i]);
6508     mdbx_debug_extra_print("%s\n", "]");
6509   }
6510   const pgno_t range_last = pgno + npages - 1;
6511 #if MDBX_PNL_ASCENDING
6512   const unsigned n = mdbx_pnl_search(pnl, pgno);
6513   assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || pgno <= pnl[n]));
6514   const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] <= range_last;
6515 #else
6516   const unsigned n = mdbx_pnl_search(pnl, range_last);
6517   assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || range_last >= pnl[n]));
6518   const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] >= pgno;
6519 #endif
6520   if (mdbx_assert_enabled()) {
6521     bool check = false;
6522     for (unsigned i = 0; i < npages; ++i)
6523       check |= mdbx_pnl_exist(pnl, pgno + i) != 0;
6524     assert(check == rc);
6525   }
6526   return rc;
6527 }
6528 
6529 /*----------------------------------------------------------------------------*/
6530 
txl2bytes(const size_t size)6531 static __always_inline size_t txl2bytes(const size_t size) {
6532   assert(size > 0 && size <= MDBX_TXL_MAX * 2);
6533   size_t bytes =
6534       ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(txnid_t) * (size + 2),
6535                     MDBX_TXL_GRANULATE * sizeof(txnid_t)) -
6536       MDBX_ASSUME_MALLOC_OVERHEAD;
6537   return bytes;
6538 }
6539 
bytes2txl(const size_t bytes)6540 static __always_inline size_t bytes2txl(const size_t bytes) {
6541   size_t size = bytes / sizeof(txnid_t);
6542   assert(size > 2 && size <= MDBX_TXL_MAX * 2);
6543   return size - 2;
6544 }
6545 
mdbx_txl_alloc(void)6546 static MDBX_TXL mdbx_txl_alloc(void) {
6547   size_t bytes = txl2bytes(MDBX_TXL_INITIAL);
6548   MDBX_TXL tl = mdbx_malloc(bytes);
6549   if (likely(tl)) {
6550 #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
6551     bytes = malloc_usable_size(tl);
6552 #endif /* malloc_usable_size */
6553     tl[0] = bytes2txl(bytes);
6554     assert(tl[0] >= MDBX_TXL_INITIAL);
6555     tl[1] = 0;
6556     tl += 1;
6557   }
6558   return tl;
6559 }
6560 
mdbx_txl_free(MDBX_TXL tl)6561 static void mdbx_txl_free(MDBX_TXL tl) {
6562   if (likely(tl))
6563     mdbx_free(tl - 1);
6564 }
6565 
mdbx_txl_reserve(MDBX_TXL * ptl,const size_t wanna)6566 static int mdbx_txl_reserve(MDBX_TXL *ptl, const size_t wanna) {
6567   const size_t allocated = (size_t)MDBX_PNL_ALLOCLEN(*ptl);
6568   assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX &&
6569          MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl));
6570   if (likely(allocated >= wanna))
6571     return MDBX_SUCCESS;
6572 
6573   if (unlikely(wanna > /* paranoia */ MDBX_TXL_MAX)) {
6574     mdbx_error("TXL too long (%zu > %zu)", wanna, (size_t)MDBX_TXL_MAX);
6575     return MDBX_TXN_FULL;
6576   }
6577 
6578   const size_t size = (wanna + wanna - allocated < MDBX_TXL_MAX)
6579                           ? wanna + wanna - allocated
6580                           : MDBX_TXL_MAX;
6581   size_t bytes = txl2bytes(size);
6582   MDBX_TXL tl = mdbx_realloc(*ptl - 1, bytes);
6583   if (likely(tl)) {
6584 #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
6585     bytes = malloc_usable_size(tl);
6586 #endif /* malloc_usable_size */
6587     *tl = bytes2txl(bytes);
6588     assert(*tl >= wanna);
6589     *ptl = tl + 1;
6590     return MDBX_SUCCESS;
6591   }
6592   return MDBX_ENOMEM;
6593 }
6594 
mdbx_txl_need(MDBX_TXL * ptl,size_t num)6595 static __always_inline int __must_check_result mdbx_txl_need(MDBX_TXL *ptl,
6596                                                              size_t num) {
6597   assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX &&
6598          MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl));
6599   assert(num <= MDBX_PGL_LIMIT);
6600   const size_t wanna = (size_t)MDBX_PNL_SIZE(*ptl) + num;
6601   return likely(MDBX_PNL_ALLOCLEN(*ptl) >= wanna)
6602              ? MDBX_SUCCESS
6603              : mdbx_txl_reserve(ptl, wanna);
6604 }
6605 
mdbx_txl_xappend(MDBX_TXL tl,txnid_t id)6606 static __always_inline void mdbx_txl_xappend(MDBX_TXL tl, txnid_t id) {
6607   assert(MDBX_PNL_SIZE(tl) < MDBX_PNL_ALLOCLEN(tl));
6608   MDBX_PNL_SIZE(tl) += 1;
6609   MDBX_PNL_LAST(tl) = id;
6610 }
6611 
6612 #define TXNID_SORT_CMP(first, last) ((first) > (last))
SORT_IMPL(txnid_sort,false,txnid_t,TXNID_SORT_CMP)6613 SORT_IMPL(txnid_sort, false, txnid_t, TXNID_SORT_CMP)
6614 static void mdbx_txl_sort(MDBX_TXL tl) {
6615   txnid_sort(MDBX_PNL_BEGIN(tl), MDBX_PNL_END(tl));
6616 }
6617 
mdbx_txl_append(MDBX_TXL * ptl,txnid_t id)6618 static int __must_check_result mdbx_txl_append(MDBX_TXL *ptl, txnid_t id) {
6619   if (unlikely(MDBX_PNL_SIZE(*ptl) == MDBX_PNL_ALLOCLEN(*ptl))) {
6620     int rc = mdbx_txl_need(ptl, MDBX_TXL_GRANULATE);
6621     if (unlikely(rc != MDBX_SUCCESS))
6622       return rc;
6623   }
6624   mdbx_txl_xappend(*ptl, id);
6625   return MDBX_SUCCESS;
6626 }
6627 
6628 /*----------------------------------------------------------------------------*/
6629 
6630 #define MDBX_DPL_UNSORTED_BACKLOG 16
6631 #define MDBX_DPL_GAP_FOR_MERGESORT MDBX_DPL_UNSORTED_BACKLOG
6632 #define MDBX_DPL_GAP_FOR_EDGING 2
6633 #define MDBX_DPL_RESERVE_GAP                                                   \
6634   (MDBX_DPL_GAP_FOR_MERGESORT + MDBX_DPL_GAP_FOR_EDGING)
6635 
dpl2bytes(ptrdiff_t size)6636 static __always_inline size_t dpl2bytes(ptrdiff_t size) {
6637   assert(size > CURSOR_STACK && (size_t)size <= MDBX_PGL_LIMIT);
6638 #if MDBX_DPL_PREALLOC_FOR_RADIXSORT
6639   size += size;
6640 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
6641   STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(MDBX_dpl) +
6642                     (MDBX_PGL_LIMIT * (MDBX_DPL_PREALLOC_FOR_RADIXSORT + 1) +
6643                      MDBX_DPL_RESERVE_GAP) *
6644                         sizeof(MDBX_dp) +
6645                     MDBX_PNL_GRANULATE * sizeof(void *) * 2 <
6646                 SIZE_MAX / 4 * 3);
6647   size_t bytes =
6648       ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(MDBX_dpl) +
6649                         ((size_t)size + MDBX_DPL_RESERVE_GAP) * sizeof(MDBX_dp),
6650                     MDBX_PNL_GRANULATE * sizeof(void *) * 2) -
6651       MDBX_ASSUME_MALLOC_OVERHEAD;
6652   return bytes;
6653 }
6654 
bytes2dpl(const ptrdiff_t bytes)6655 static __always_inline unsigned bytes2dpl(const ptrdiff_t bytes) {
6656   size_t size = (bytes - sizeof(MDBX_dpl)) / sizeof(MDBX_dp);
6657   assert(size > CURSOR_STACK + MDBX_DPL_RESERVE_GAP &&
6658          size <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE);
6659   size -= MDBX_DPL_RESERVE_GAP;
6660 #if MDBX_DPL_PREALLOC_FOR_RADIXSORT
6661   size >>= 1;
6662 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
6663   return (unsigned)size;
6664 }
6665 
/* Sets the DPL length and (re)installs the end sentinel just past the last
 * item: a P_BAD stub page with pgno == P_INVALID that bounds searches and
 * the sort. Returns the new length. */
static __always_inline unsigned dpl_setlen(MDBX_dpl *dl, unsigned len) {
  /* immutable end-sentinel page: flags P_BAD, pgno ~0 (== P_INVALID) */
  static const MDBX_page dpl_stub_pageE = {
      {0}, 0, P_BAD, {0}, /* pgno */ ~(pgno_t)0};
  assert(dpl_stub_pageE.mp_flags == P_BAD &&
         dpl_stub_pageE.mp_pgno == P_INVALID);
  dl->length = len;
  dl->items[len + 1].ptr = (MDBX_page *)&dpl_stub_pageE;
  dl->items[len + 1].pgno = P_INVALID;
  dl->items[len + 1].extra = 0;
  return len;
}
6677 
/* Resets the DPL to empty with both sentinels in place: items[0] is a stub
 * with pgno 0 (lower bound) and items[length+1] has pgno P_INVALID (upper
 * bound), so real items always sit strictly between them. */
static __always_inline void dpl_clear(MDBX_dpl *dl) {
  /* immutable begin-sentinel page: flags P_BAD, pgno 0 */
  static const MDBX_page dpl_stub_pageB = {{0}, 0, P_BAD, {0}, /* pgno */ 0};
  assert(dpl_stub_pageB.mp_flags == P_BAD && dpl_stub_pageB.mp_pgno == 0);
  dl->sorted = dpl_setlen(dl, 0);
  dl->items[0].ptr = (MDBX_page *)&dpl_stub_pageB;
  dl->items[0].pgno = 0;
  dl->items[0].extra = 0;
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
}
6687 
/* Releases the transaction's dirty-page list (if any) and clears the
 * pointer so a subsequent allocation starts from scratch. */
static void mdbx_dpl_free(MDBX_txn *txn) {
  MDBX_dpl *const dl = txn->tw.dirtylist;
  if (unlikely(dl == NULL))
    return;
  txn->tw.dirtylist = NULL;
  mdbx_free(dl);
}
6694 
/* (Re)allocates the transaction's dirty-page list to hold at least `size`
 * items (clamped to MDBX_PGL_LIMIT). On success updates txn->tw.dirtylist
 * and the list's capacity (`detent`) and returns the list; returns NULL on
 * allocation failure, leaving the previous list untouched. */
static MDBX_dpl *mdbx_dpl_reserve(MDBX_txn *txn, size_t size) {
  size_t bytes = dpl2bytes((size < MDBX_PGL_LIMIT) ? size : MDBX_PGL_LIMIT);
  MDBX_dpl *const dl = mdbx_realloc(txn->tw.dirtylist, bytes);
  if (likely(dl)) {
#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
    /* exploit the full usable size the allocator actually handed out */
    bytes = malloc_usable_size(dl);
#endif /* malloc_usable_size */
    dl->detent = bytes2dpl(bytes);
    mdbx_tassert(txn, txn->tw.dirtylist == NULL || dl->length <= dl->detent);
    txn->tw.dirtylist = dl;
  }
  return dl;
}
6708 
/* Prepares an empty dirty-page list for a write transaction: reuses the
 * current buffer when its capacity is close enough to the configured
 * initial size, otherwise (re)allocates it. */
static int mdbx_dpl_alloc(MDBX_txn *txn) {
  mdbx_tassert(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0);
  const int wanna = (txn->mt_env->me_options.dp_initial < txn->mt_geo.upper)
                        ? txn->mt_env->me_options.dp_initial
                        : txn->mt_geo.upper;
  if (txn->tw.dirtylist) {
    dpl_clear(txn->tw.dirtylist);
    /* keep the existing buffer when |capacity - wanted| is small enough */
    const int realloc_threshold = 64;
    const int delta = (int)(txn->tw.dirtylist->detent - wanna);
    if (likely(delta <= realloc_threshold && delta >= -realloc_threshold))
      return MDBX_SUCCESS;
  }
  if (unlikely(!mdbx_dpl_reserve(txn, wanna)))
    return MDBX_ENOMEM;
  dpl_clear(txn->tw.dirtylist);
  return MDBX_SUCCESS;
}
6727 
/* Key extractor for sorting dirty-page items by page number. */
#define MDBX_DPL_EXTRACT_KEY(ptr) ((ptr)->pgno)
/* Instantiates dpl_radixsort() over MDBX_dp items. */
RADIXSORT_IMPL(dpl, MDBX_dp, MDBX_DPL_EXTRACT_KEY,
               MDBX_DPL_PREALLOC_FOR_RADIXSORT, 1)

/* Instantiates dp_sort(): comparison-based sort of MDBX_dp items by pgno. */
#define DP_SORT_CMP(first, last) ((first).pgno < (last).pgno)
SORT_IMPL(dp_sort, false, MDBX_dp, DP_SORT_CMP)
6734 
/* Slow path of mdbx_dpl_sort(): brings the whole list into ascending pgno
 * order. Strategy:
 *  - a large unsorted tail first tries a radix sort of the entire list;
 *  - otherwise, when the sorted head is big enough and scratch space is
 *    available, only the unsorted tail is sorted (in the spare space at the
 *    end of the allocation) and then merged with the head, in-place from
 *    the tail towards the head;
 *  - else the whole list is simply re-sorted. */
__hot __noinline static MDBX_dpl *mdbx_dpl_sort_slowpath(const MDBX_txn *txn) {
  MDBX_dpl *dl = txn->tw.dirtylist;
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  const unsigned unsorted = dl->length - dl->sorted;
  if (likely(unsorted < MDBX_RADIXSORT_THRESHOLD) ||
      unlikely(!dpl_radixsort(dl->items + 1, dl->length))) {
    if (dl->sorted > unsorted / 4 + 4 &&
        (MDBX_DPL_PREALLOC_FOR_RADIXSORT ||
         dl->length + unsorted < dl->detent + MDBX_DPL_GAP_FOR_MERGESORT)) {
      MDBX_dp *const sorted_begin = dl->items + 1;
      MDBX_dp *const sorted_end = sorted_begin + dl->sorted;
      /* end of usable scratch space: the radix-sort shadow area when
       * preallocated, otherwise the reserve gap past the capacity */
      MDBX_dp *const end =
          dl->items + (MDBX_DPL_PREALLOC_FOR_RADIXSORT
                           ? dl->length + dl->length + 1
                           : dl->detent + MDBX_DPL_RESERVE_GAP);
      MDBX_dp *const tmp = end - unsorted;
      assert(dl->items + dl->length + 1 < tmp);
      /* copy unsorted to the end of allocated space and sort it */
      memcpy(tmp, sorted_end, unsorted * sizeof(MDBX_dp));
      dp_sort(tmp, tmp + unsorted);
      /* merge two parts from end to begin */
      MDBX_dp *w = dl->items + dl->length;
      MDBX_dp *l = dl->items + dl->sorted;
      MDBX_dp *r = end - 1;
      do {
        /* branchless pick of the larger key; the begin sentinel (pgno 0)
         * keeps `l` from underrunning the array */
        const bool cmp = l->pgno > r->pgno;
        *w = cmp ? *l : *r;
        l -= cmp;
        r += cmp - 1;
      } while (likely(--w > l));
      assert(r == tmp - 1);
      assert(dl->items[0].pgno == 0 &&
             dl->items[dl->length + 1].pgno == P_INVALID);
      if (mdbx_assert_enabled())
        for (unsigned i = 0; i <= dl->length; ++i)
          assert(dl->items[i].pgno < dl->items[i + 1].pgno);
    } else {
      dp_sort(dl->items + 1, dl->items + dl->length + 1);
      assert(dl->items[0].pgno == 0 &&
             dl->items[dl->length + 1].pgno == P_INVALID);
    }
  } else {
    assert(dl->items[0].pgno == 0 &&
           dl->items[dl->length + 1].pgno == P_INVALID);
  }
  dl->sorted = dl->length;
  return dl;
}
6783 
mdbx_dpl_sort(const MDBX_txn * txn)6784 static __always_inline MDBX_dpl *mdbx_dpl_sort(const MDBX_txn *txn) {
6785   MDBX_dpl *dl = txn->tw.dirtylist;
6786   assert(dl->length <= MDBX_PGL_LIMIT);
6787   assert(dl->sorted <= dl->length);
6788   assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
6789   return likely(dl->sorted == dl->length) ? dl : mdbx_dpl_sort_slowpath(txn);
6790 }
6791 
/* Returns the index of the first dirty-page whose pgno
 * member is greater than or equal to id. */
#define DP_SEARCH_CMP(dp, id) ((dp).pgno < (id))
/* Instantiates dp_bsearch(): lower-bound binary search over MDBX_dp. */
SEARCH_IMPL(dp_bsearch, MDBX_dp, pgno_t, DP_SEARCH_CMP)
6796 
/* Finds the lower-bound position of `pgno` in the dirty list: the index of
 * the first item whose pgno is >= the given one. A short unsorted tail
 * (up to 16 items) is scanned linearly; a longer one forces a full sort. */
static unsigned __hot mdbx_dpl_search(const MDBX_txn *txn, pgno_t pgno) {
  MDBX_dpl *dl = txn->tw.dirtylist;
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  if (mdbx_audit_enabled()) {
    /* expensive invariant check: sorted head is strictly ascending and
     * contains no meta pages */
    for (const MDBX_dp *ptr = dl->items + dl->sorted; --ptr > dl->items;) {
      assert(ptr[0].pgno < ptr[1].pgno);
      assert(ptr[0].pgno >= NUM_METAS);
    }
  }

  switch (dl->length - dl->sorted) {
  default:
    /* sort a whole */
    mdbx_dpl_sort_slowpath(txn);
    break;
  case 0:
    /* whole sorted cases */
    break;

/* Checks the (length - N + 1)-th item, walking the unsorted tail from its
 * oldest entry towards the end; each case falls through to the next. */
#define LINEAR_SEARCH_CASE(N)                                                  \
  case N:                                                                      \
    if (dl->items[dl->length - N + 1].pgno == pgno)                            \
      return dl->length - N + 1;                                               \
    __fallthrough

    /* try linear search until the threshold */
    LINEAR_SEARCH_CASE(16); /* fall through */
    LINEAR_SEARCH_CASE(15); /* fall through */
    LINEAR_SEARCH_CASE(14); /* fall through */
    LINEAR_SEARCH_CASE(13); /* fall through */
    LINEAR_SEARCH_CASE(12); /* fall through */
    LINEAR_SEARCH_CASE(11); /* fall through */
    LINEAR_SEARCH_CASE(10); /* fall through */
    LINEAR_SEARCH_CASE(9);  /* fall through */
    LINEAR_SEARCH_CASE(8);  /* fall through */
    LINEAR_SEARCH_CASE(7);  /* fall through */
    LINEAR_SEARCH_CASE(6);  /* fall through */
    LINEAR_SEARCH_CASE(5);  /* fall through */
    LINEAR_SEARCH_CASE(4);  /* fall through */
    LINEAR_SEARCH_CASE(3);  /* fall through */
    LINEAR_SEARCH_CASE(2);  /* fall through */
  case 1:
    if (dl->items[dl->length].pgno == pgno)
      return dl->length;
    /* continue bsearch on the sorted part */
    break;
  }
  return (unsigned)(dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items);
}
6846 
6847 MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned
dpl_npages(const MDBX_dpl * dl,unsigned i)6848 dpl_npages(const MDBX_dpl *dl, unsigned i) {
6849   assert(0 <= (int)i && i <= dl->length);
6850   unsigned n = likely(!dl->items[i].multi) ? 1 : dl->items[i].ptr->mp_pages;
6851   assert(n == (IS_OVERFLOW(dl->items[i].ptr) ? dl->items[i].ptr->mp_pages : 1));
6852   return n;
6853 }
6854 
6855 MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned
dpl_endpgno(const MDBX_dpl * dl,unsigned i)6856 dpl_endpgno(const MDBX_dpl *dl, unsigned i) {
6857   return dpl_npages(dl, i) + dl->items[i].pgno;
6858 }
6859 
/* Checks whether the page span [pgno, pgno+npages) overlaps any entry of
 * the dirty list; the list must already be fully sorted. */
static __inline bool mdbx_dpl_intersect(const MDBX_txn *txn, pgno_t pgno,
                                        unsigned npages) {
  MDBX_dpl *dl = txn->tw.dirtylist;
  assert(dl->sorted == dl->length);
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  unsigned const n = mdbx_dpl_search(txn, pgno);
  assert(n >= 1 && n <= dl->length + 1);
  assert(pgno <= dl->items[n].pgno);
  assert(pgno > dl->items[n - 1].pgno);
  /* only the item found and its predecessor can possibly overlap */
  const bool rc =
      /* intersection with founded */ pgno + npages > dl->items[n].pgno ||
      /* intersection with prev */ dpl_endpgno(dl, n - 1) > pgno;
  if (mdbx_assert_enabled()) {
    /* cross-check the answer against a brute-force scan of the whole list */
    bool check = false;
    for (unsigned i = 1; i <= dl->length; ++i) {
      const MDBX_page *const dp = dl->items[i].ptr;
      if (!(dp->mp_pgno /* begin */ >= /* end */ pgno + npages ||
            dpl_endpgno(dl, i) /* end */ <= /* begin */ pgno))
        check |= true;
    }
    assert(check == rc);
  }
  return rc;
}
6884 
/* Looks up `pgno` in the dirty list; returns its index, or 0 when absent. */
static __always_inline unsigned mdbx_dpl_exist(MDBX_txn *txn, pgno_t pgno) {
  MDBX_dpl *const dl = txn->tw.dirtylist;
  const unsigned slot = mdbx_dpl_search(txn, pgno);
  assert((int)slot > 0);
  return (dl->items[slot].pgno != pgno) ? 0 : slot;
}
6891 
debug_dpl_find(const MDBX_txn * txn,const pgno_t pgno)6892 MDBX_MAYBE_UNUSED static const MDBX_page *debug_dpl_find(const MDBX_txn *txn,
6893                                                          const pgno_t pgno) {
6894   const MDBX_dpl *dl = txn->tw.dirtylist;
6895   assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
6896   for (unsigned i = dl->length; i > dl->sorted; --i)
6897     if (dl->items[i].pgno == pgno)
6898       return dl->items[i].ptr;
6899 
6900   if (dl->sorted) {
6901     const unsigned i =
6902         (unsigned)(dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items);
6903     if (dl->items[i].pgno == pgno)
6904       return dl->items[i].ptr;
6905   }
6906   return nullptr;
6907 }
6908 
/* Removes item `i` from the dirty list, shifting the tail (together with
 * the end-sentinel stub) down by one and adjusting the sorted watermark. */
static void mdbx_dpl_remove(const MDBX_txn *txn, unsigned i) {
  MDBX_dpl *dl = txn->tw.dirtylist;
  assert((int)i > 0 && i <= dl->length);
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  /* if the removed item was inside the sorted head, shrink the watermark */
  dl->sorted -= dl->sorted >= i;
  dl->length -= 1;
  /* length is already decremented: the count covers the remaining tail
   * items plus the end-sentinel stub */
  memmove(dl->items + i, dl->items + i + 1,
          (dl->length - i + 2) * sizeof(dl->items[0]));
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
}
6919 
/* Appends a dirty page (pgno, backing buffer, npages) to the transaction's
 * list, growing the list when full. Keeps the sorted watermark when the new
 * pgno extends the ascending order; otherwise the item lands in the
 * unsorted tail. Returns MDBX_SUCCESS, MDBX_TXN_FULL, MDBX_ENOMEM, or
 * MDBX_PROBLEM (audit-only duplicate detection). */
static __always_inline int __must_check_result
mdbx_dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, unsigned npages) {
  MDBX_dpl *dl = txn->tw.dirtylist;
  assert(dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE);
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  if (mdbx_audit_enabled()) {
    /* expensive check: the page must not already be present */
    for (unsigned i = dl->length; i > 0; --i) {
      assert(dl->items[i].pgno != pgno);
      if (unlikely(dl->items[i].pgno == pgno)) {
        mdbx_error("Page %u already exist in the DPL at %u", pgno, i);
        return MDBX_PROBLEM;
      }
    }
  }

  const unsigned length = dl->length + 1;
  /* keep the list marked fully-sorted when appending in ascending order */
  const unsigned sorted =
      (dl->sorted == dl->length && dl->items[dl->length].pgno < pgno)
          ? length
          : dl->sorted;

  if (unlikely(dl->length == dl->detent)) {
    if (unlikely(dl->detent >= MDBX_PGL_LIMIT)) {
      mdbx_error("DPL is full (MDBX_PGL_LIMIT %zu)", MDBX_PGL_LIMIT);
      return MDBX_TXN_FULL;
    }
    /* growth policy: double while small, then grow by 50% */
    const size_t size = (dl->detent < MDBX_PNL_INITIAL * 42)
                            ? dl->detent + dl->detent
                            : dl->detent + dl->detent / 2;
    dl = mdbx_dpl_reserve(txn, size);
    if (unlikely(!dl))
      return MDBX_ENOMEM;
    mdbx_tassert(txn, dl->length < dl->detent);
  }

  /* copy the stub beyond the end */
  dl->items[length + 1] = dl->items[length];
  /* append page */
  dl->items[length].ptr = page;
  dl->items[length].pgno = pgno;
  dl->items[length].multi = npages > 1;
  dl->items[length].lru = txn->tw.dirtylru++;
  dl->length = length;
  dl->sorted = sorted;
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  return MDBX_SUCCESS;
}
6967 
mdbx_dpl_age(const MDBX_txn * txn,unsigned i)6968 static __inline uint32_t mdbx_dpl_age(const MDBX_txn *txn, unsigned i) {
6969   const MDBX_dpl *dl = txn->tw.dirtylist;
6970   assert((int)i > 0 && i <= dl->length);
6971   /* overflow could be here */
6972   return (txn->tw.dirtylru - dl->items[i].lru) & UINT32_C(0x7fffFFFF);
6973 }
6974 
6975 /*----------------------------------------------------------------------------*/
6976 
/* Library-wide debug/logging state: runtime flags, current log level, and
 * the user-installed logger callback (NULL until set). */
uint8_t mdbx_runtime_flags = MDBX_RUNTIME_FLAGS_INIT;
uint8_t mdbx_loglevel = MDBX_LOG_FATAL;
MDBX_debug_func *mdbx_debug_logger;
6980 
/*-- Forward declarations: page allocation, retirement and touching ---------*/
static __must_check_result __inline int mdbx_page_retire(MDBX_cursor *mc,
                                                         MDBX_page *mp);

static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp,
                                               unsigned npages);
/* Result of a page allocation/fetch: the page together with an error code. */
struct page_result {
  MDBX_page *page;
  int err;
};

static struct page_result mdbx_page_alloc(MDBX_cursor *mc, const unsigned num,
                                          int flags);
static txnid_t mdbx_kick_longlived_readers(MDBX_env *env,
                                           const txnid_t laggard);

static struct page_result mdbx_page_new(MDBX_cursor *mc, const unsigned flags,
                                        const unsigned npages);
static int mdbx_page_touch(MDBX_cursor *mc);
static int mdbx_cursor_touch(MDBX_cursor *mc);
static int mdbx_touch_dbi(MDBX_cursor *mc);
7001 
/* Human-readable names matching the MDBX_END_* operation numbers below. */
#define MDBX_END_NAMES                                                         \
  {                                                                            \
    "committed", "empty-commit", "abort", "reset", "reset-tmp", "fail-begin",  \
        "fail-beginchild"                                                      \
  }
enum {
  /* mdbx_txn_end operation number, for logging */
  MDBX_END_COMMITTED,
  MDBX_END_PURE_COMMIT,
  MDBX_END_ABORT,
  MDBX_END_RESET,
  MDBX_END_RESET_TMP,
  MDBX_END_FAIL_BEGIN,
  MDBX_END_FAIL_BEGINCHILD
};
#define MDBX_END_OPMASK 0x0F  /* mask for mdbx_txn_end() operation number */
#define MDBX_END_UPDATE 0x10  /* update env state (DBIs) */
#define MDBX_END_FREE 0x20    /* free txn unless it is MDBX_env.me_txn0 */
#define MDBX_END_EOTDONE 0x40 /* txn's cursors already closed */
#define MDBX_END_SLOT 0x80    /* release any reader slot if MDBX_NOTLS */
static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode);
7023 
__hot static struct page_result __must_check_result
mdbx_page_get_ex(MDBX_cursor *const mc, const pgno_t pgno, txnid_t front);
/* Thin wrapper around mdbx_page_get_ex(): stores the fetched page into *mp
 * and returns only the error code. */
static __inline int __must_check_result mdbx_page_get(MDBX_cursor *mc,
                                                      pgno_t pgno,
                                                      MDBX_page **mp,
                                                      txnid_t front) {

  struct page_result ret = mdbx_page_get_ex(mc, pgno, front);
  *mp = ret.page;
  return ret.err;
}
7035 
/*-- Forward declarations: B-tree page search, split and merge --------------*/
static int __must_check_result mdbx_page_search_root(MDBX_cursor *mc,
                                                     const MDBX_val *key,
                                                     int flags);

/* Flags for mdbx_page_search() */
#define MDBX_PS_MODIFY 1
#define MDBX_PS_ROOTONLY 2
#define MDBX_PS_FIRST 4
#define MDBX_PS_LAST 8
static int __must_check_result mdbx_page_search(MDBX_cursor *mc,
                                                const MDBX_val *key, int flags);
static int __must_check_result mdbx_page_merge(MDBX_cursor *csrc,
                                               MDBX_cursor *cdst);

#define MDBX_SPLIT_REPLACE MDBX_APPENDDUP /* newkey is not new */
static int __must_check_result mdbx_page_split(MDBX_cursor *mc,
                                               const MDBX_val *const newkey,
                                               MDBX_val *const newdata,
                                               pgno_t newpgno, unsigned nflags);

/*-- Forward declarations: meta-page handling and environment teardown ------*/
static int __must_check_result mdbx_validate_meta_copy(MDBX_env *env,
                                                       const MDBX_meta *meta,
                                                       MDBX_meta *dest);
static int __must_check_result mdbx_override_meta(MDBX_env *env,
                                                  unsigned target,
                                                  txnid_t txnid,
                                                  const MDBX_meta *shape);
static int __must_check_result mdbx_read_header(MDBX_env *env, MDBX_meta *meta,
                                                const int lck_exclusive,
                                                const mdbx_mode_t mode_bits);
static int __must_check_result mdbx_sync_locked(MDBX_env *env, unsigned flags,
                                                MDBX_meta *const pending);
static int mdbx_env_close0(MDBX_env *env);

/* Result of a node lookup: the node plus an exact-match flag. */
struct node_result {
  MDBX_node *node;
  bool exact;
};

/*-- Forward declarations: node-level operations ----------------------------*/
static struct node_result mdbx_node_search(MDBX_cursor *mc,
                                           const MDBX_val *key);

static int __must_check_result mdbx_node_add_branch(MDBX_cursor *mc,
                                                    unsigned indx,
                                                    const MDBX_val *key,
                                                    pgno_t pgno);
static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc,
                                                  unsigned indx,
                                                  const MDBX_val *key,
                                                  MDBX_val *data,
                                                  unsigned flags);
static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc,
                                                   unsigned indx,
                                                   const MDBX_val *key);

static void mdbx_node_del(MDBX_cursor *mc, size_t ksize);
static void mdbx_node_shrink(MDBX_page *mp, unsigned indx);
static int __must_check_result mdbx_node_move(MDBX_cursor *csrc,
                                              MDBX_cursor *cdst, bool fromleft);
static int __must_check_result mdbx_node_read(MDBX_cursor *mc, MDBX_node *leaf,
                                              MDBX_val *data,
                                              const txnid_t front);
static int __must_check_result mdbx_rebalance(MDBX_cursor *mc);
static int __must_check_result mdbx_update_key(MDBX_cursor *mc,
                                               const MDBX_val *key);

/*-- Forward declarations: cursor movement, checking and DB setup -----------*/
static void mdbx_cursor_pop(MDBX_cursor *mc);
static int __must_check_result mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp);

static int __must_check_result mdbx_audit_ex(MDBX_txn *txn,
                                             unsigned retired_stored,
                                             bool dont_filter_gc);

static int __must_check_result mdbx_page_check(MDBX_cursor *const mc,
                                               const MDBX_page *const mp,
                                               unsigned options);
static int __must_check_result mdbx_cursor_check(MDBX_cursor *mc,
                                                 unsigned options);
static int __must_check_result mdbx_cursor_del0(MDBX_cursor *mc);
static int __must_check_result mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi,
                                         const MDBX_val *key,
                                         const MDBX_val *data, unsigned flags);
#define SIBLING_LEFT 0
#define SIBLING_RIGHT 2
static int __must_check_result mdbx_cursor_sibling(MDBX_cursor *mc, int dir);
static int __must_check_result mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key,
                                                MDBX_val *data,
                                                MDBX_cursor_op op);
static int __must_check_result mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key,
                                                MDBX_val *data,
                                                MDBX_cursor_op op);
/* Result of a cursor positioning: error code plus an exact-match flag. */
struct cursor_set_result {
  int err;
  bool exact;
};

static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key,
                                                MDBX_val *data,
                                                MDBX_cursor_op op);
static int __must_check_result mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key,
                                                 MDBX_val *data);
static int __must_check_result mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key,
                                                MDBX_val *data);

static int __must_check_result mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn,
                                                MDBX_dbi dbi);
static int __must_check_result mdbx_xcursor_init0(MDBX_cursor *mc);
static int __must_check_result mdbx_xcursor_init1(MDBX_cursor *mc,
                                                  MDBX_node *node,
                                                  const MDBX_page *mp);
static int __must_check_result mdbx_xcursor_init2(MDBX_cursor *mc,
                                                  MDBX_xcursor *src_mx,
                                                  bool new_dupdata);
static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst);

static int __must_check_result mdbx_drop_tree(MDBX_cursor *mc,
                                              const bool may_have_subDBs);
static int __must_check_result mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi);
static int __must_check_result mdbx_setup_dbx(MDBX_dbx *const dbx,
                                              const MDBX_db *const db,
                                              const unsigned pagesize);

/* Built-in key/data comparators; selected via get_default_*cmp() below. */
static MDBX_cmp_func cmp_lexical, cmp_reverse, cmp_int_align4, cmp_int_align2,
    cmp_int_unaligned, cmp_lenfast;

static __inline MDBX_cmp_func *get_default_keycmp(unsigned flags);
static __inline MDBX_cmp_func *get_default_datacmp(unsigned flags);
7162 
mdbx_liberr2str(int errnum)7163 __cold const char *mdbx_liberr2str(int errnum) {
7164   /* Table of descriptions for MDBX errors */
7165   static const char *const tbl[] = {
7166       "MDBX_KEYEXIST: Key/data pair already exists",
7167       "MDBX_NOTFOUND: No matching key/data pair found",
7168       "MDBX_PAGE_NOTFOUND: Requested page not found",
7169       "MDBX_CORRUPTED: Database is corrupted",
7170       "MDBX_PANIC: Environment had fatal error",
7171       "MDBX_VERSION_MISMATCH: DB version mismatch libmdbx",
7172       "MDBX_INVALID: File is not an MDBX file",
7173       "MDBX_MAP_FULL: Environment mapsize limit reached",
7174       "MDBX_DBS_FULL: Too many DBI-handles (maxdbs reached)",
7175       "MDBX_READERS_FULL: Too many readers (maxreaders reached)",
7176       NULL /* MDBX_TLS_FULL (-30789): unused in MDBX */,
7177       "MDBX_TXN_FULL: Transaction has too many dirty pages,"
7178       " i.e transaction is too big",
7179       "MDBX_CURSOR_FULL: Cursor stack limit reachedn - this usually indicates"
7180       " corruption, i.e branch-pages loop",
7181       "MDBX_PAGE_FULL: Internal error - Page has no more space",
7182       "MDBX_UNABLE_EXTEND_MAPSIZE: Database engine was unable to extend"
7183       " mapping, e.g. since address space is unavailable or busy,"
7184       " or Operation system not supported such operations",
7185       "MDBX_INCOMPATIBLE: Environment or database is not compatible"
7186       " with the requested operation or the specified flags",
7187       "MDBX_BAD_RSLOT: Invalid reuse of reader locktable slot,"
7188       " e.g. read-transaction already run for current thread",
7189       "MDBX_BAD_TXN: Transaction is not valid for requested operation,"
7190       " e.g. had errored and be must aborted, has a child, or is invalid",
7191       "MDBX_BAD_VALSIZE: Invalid size or alignment of key or data"
7192       " for target database, either invalid subDB name",
7193       "MDBX_BAD_DBI: The specified DBI-handle is invalid"
7194       " or changed by another thread/transaction",
7195       "MDBX_PROBLEM: Unexpected internal error, transaction should be aborted",
7196       "MDBX_BUSY: Another write transaction is running,"
7197       " or environment is already used while opening with MDBX_EXCLUSIVE flag",
7198   };
7199 
7200   if (errnum >= MDBX_KEYEXIST && errnum <= MDBX_BUSY) {
7201     int i = errnum - MDBX_KEYEXIST;
7202     return tbl[i];
7203   }
7204 
7205   switch (errnum) {
7206   case MDBX_SUCCESS:
7207     return "MDBX_SUCCESS: Successful";
7208   case MDBX_EMULTIVAL:
7209     return "MDBX_EMULTIVAL: The specified key has"
7210            " more than one associated value";
7211   case MDBX_EBADSIGN:
7212     return "MDBX_EBADSIGN: Wrong signature of a runtime object(s),"
7213            " e.g. memory corruption or double-free";
7214   case MDBX_WANNA_RECOVERY:
7215     return "MDBX_WANNA_RECOVERY: Database should be recovered,"
7216            " but this could NOT be done automatically for now"
7217            " since it opened in read-only mode";
7218   case MDBX_EKEYMISMATCH:
7219     return "MDBX_EKEYMISMATCH: The given key value is mismatched to the"
7220            " current cursor position";
7221   case MDBX_TOO_LARGE:
7222     return "MDBX_TOO_LARGE: Database is too large for current system,"
7223            " e.g. could NOT be mapped into RAM";
7224   case MDBX_THREAD_MISMATCH:
7225     return "MDBX_THREAD_MISMATCH: A thread has attempted to use a not"
7226            " owned object, e.g. a transaction that started by another thread";
7227   case MDBX_TXN_OVERLAPPING:
7228     return "MDBX_TXN_OVERLAPPING: Overlapping read and write transactions for"
7229            " the current thread";
7230   default:
7231     return NULL;
7232   }
7233 }
7234 
/* Thread-safe variant of mdbx_strerror(): for non-MDBX codes formats the
 * platform's error text into the caller-provided buffer. Returns a pointer
 * either to a static description or into `buf`. */
__cold const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen) {
  const char *msg = mdbx_liberr2str(errnum);
  if (!msg && buflen > 0 && buflen < INT_MAX) {
#if defined(_WIN32) || defined(_WIN64)
    const DWORD size = FormatMessageA(
        FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL,
        errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen,
        NULL);
    return size ? buf : "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed";
#elif defined(_GNU_SOURCE) && defined(__GLIBC__)
    /* GNU-specific strerror_r(): returns a char* which may point to a
     * static string rather than into buf */
    if (errnum > 0)
      msg = strerror_r(errnum, buf, buflen);
#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600)
    /* XSI-compliant strerror_r(): fills buf and returns 0 on success */
    if (errnum > 0 && strerror_r(errnum, buf, buflen) == 0)
      msg = buf;
#else
    /* last-resort fallback: copy from (possibly non-reentrant) strerror() */
    if (errnum > 0) {
      msg = strerror(errnum);
      if (msg) {
        strncpy(buf, msg, buflen);
        msg = buf;
      }
    }
#endif
    if (!msg) {
      (void)snprintf(buf, buflen, "error %d", errnum);
      msg = buf;
    }
    /* the strncpy() above may leave buf unterminated */
    buf[buflen - 1] = '\0';
  }
  return msg;
}
7269 
/* Returns a description for an error code. The fallback paths use a static
 * buffer, so this is not thread-safe; prefer mdbx_strerror_r(). */
__cold const char *mdbx_strerror(int errnum) {
#if defined(_WIN32) || defined(_WIN64)
  static char buf[1024];
  return mdbx_strerror_r(errnum, buf, sizeof(buf));
#else
  const char *msg = mdbx_liberr2str(errnum);
  if (!msg) {
    if (errnum > 0)
      msg = strerror(errnum);
    if (!msg) {
      static char buf[32];
      (void)snprintf(buf, sizeof(buf) - 1, "error %d", errnum);
      msg = buf;
    }
  }
  return msg;
#endif
}
7288 
#if defined(_WIN32) || defined(_WIN64) /* Bit of madness for Windows */
/* As mdbx_strerror_r(), but converts the ANSI system message to the OEM
 * codepage (for console output). */
const char *mdbx_strerror_r_ANSI2OEM(int errnum, char *buf, size_t buflen) {
  const char *msg = mdbx_liberr2str(errnum);
  if (!msg && buflen > 0 && buflen < INT_MAX) {
    const DWORD size = FormatMessageA(
        FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL,
        errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen,
        NULL);
    if (!size)
      msg = "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed";
    else if (!CharToOemBuffA(buf, buf, size))
      msg = "CharToOemBuffA() failed";
    else
      msg = buf;
  }
  return msg;
}

/* As mdbx_strerror(), but with OEM-codepage output; uses a static buffer
 * and is therefore not thread-safe. */
const char *mdbx_strerror_ANSI2OEM(int errnum) {
  static char buf[1024];
  return mdbx_strerror_r_ANSI2OEM(errnum, buf, sizeof(buf));
}
#endif /* Bit of madness for Windows */
7312 
/* Core logging sink: forwards to the user-installed logger when one is set,
 * otherwise writes to the attached debugger (Windows) or to stderr. */
__cold void mdbx_debug_log_va(int level, const char *function, int line,
                              const char *fmt, va_list args) {
  if (mdbx_debug_logger)
    mdbx_debug_logger(level, function, line, fmt, args);
  else {
#if defined(_WIN32) || defined(_WIN64)
    if (IsDebuggerPresent()) {
      /* emit an optional "function:line " prefix, then the message */
      int prefix_len = 0;
      char *prefix = nullptr;
      if (function && line > 0)
        prefix_len = mdbx_asprintf(&prefix, "%s:%d ", function, line);
      else if (function)
        prefix_len = mdbx_asprintf(&prefix, "%s: ", function);
      else if (line > 0)
        prefix_len = mdbx_asprintf(&prefix, "%d: ", line);
      if (prefix_len > 0 && prefix) {
        OutputDebugStringA(prefix);
        mdbx_free(prefix);
      }
      char *msg = nullptr;
      int msg_len = mdbx_vasprintf(&msg, fmt, args);
      if (msg_len > 0 && msg) {
        OutputDebugStringA(msg);
        mdbx_free(msg);
      }
    }
#else
    if (function && line > 0)
      fprintf(stderr, "%s:%d ", function, line);
    else if (function)
      fprintf(stderr, "%s: ", function);
    else if (line > 0)
      fprintf(stderr, "%d: ", line);
    vfprintf(stderr, fmt, args);
    fflush(stderr);
#endif
  }
}
7351 
/* Variadic front-end for mdbx_debug_log_va(): packages the `...` arguments
 * into a va_list and forwards them unchanged. */
__cold void mdbx_debug_log(int level, const char *function, int line,
                           const char *fmt, ...) {
  va_list args;
  va_start(args, fmt);
  mdbx_debug_log_va(level, function, line, fmt, args);
  va_end(args);
}
7359 
7360 /* Dump a key in ascii or hexadecimal. */
mdbx_dump_val(const MDBX_val * key,char * const buf,const size_t bufsize)7361 const char *mdbx_dump_val(const MDBX_val *key, char *const buf,
7362                           const size_t bufsize) {
7363   if (!key)
7364     return "<null>";
7365   if (!key->iov_len)
7366     return "<empty>";
7367   if (!buf || bufsize < 4)
7368     return nullptr;
7369 
7370   bool is_ascii = true;
7371   const uint8_t *const data = key->iov_base;
7372   for (unsigned i = 0; i < key->iov_len; i++)
7373     if (data[i] < ' ' || data[i] > '~') {
7374       is_ascii = false;
7375       break;
7376     }
7377 
7378   if (is_ascii) {
7379     int len =
7380         snprintf(buf, bufsize, "%.*s",
7381                  (key->iov_len > INT_MAX) ? INT_MAX : (int)key->iov_len, data);
7382     assert(len > 0 && (unsigned)len < bufsize);
7383     (void)len;
7384   } else {
7385     char *const detent = buf + bufsize - 2;
7386     char *ptr = buf;
7387     *ptr++ = '<';
7388     for (unsigned i = 0; i < key->iov_len; i++) {
7389       const ptrdiff_t left = detent - ptr;
7390       assert(left > 0);
7391       int len = snprintf(ptr, left, "%02x", data[i]);
7392       if (len < 0 || len >= left)
7393         break;
7394       ptr += len;
7395     }
7396     if (ptr < detent) {
7397       ptr[0] = '>';
7398       ptr[1] = '\0';
7399     }
7400   }
7401   return buf;
7402 }
7403 
7404 /*------------------------------------------------------------------------------
7405  LY: debug stuff */
7406 
/* Describe the payload kind of a leaf node, as a suffix for page-listing
 * diagnostics: big-data (overflow) reference, sub-page, sub-DB, named DB,
 * or plain data (empty string). */
static const char *mdbx_leafnode_type(MDBX_node *n) {
  if (F_ISSET(node_flags(n), F_BIGDATA))
    return ": overflow page";
  const bool dup = F_ISSET(node_flags(n), F_DUPDATA);
  const bool sub = F_ISSET(node_flags(n), F_SUBDATA);
  if (dup)
    return sub ? ": sub-DB" : ": sub-page";
  return sub ? ": DB" : "";
}
7415 
/* Display all the keys in the page via the verbose log (debug aid only).
 * Classifies the page by flags, then prints each key; for branch pages the
 * child pgno is shown, for leaf pages the node size and payload kind, and a
 * final line totals header + content + unused bytes. */
MDBX_MAYBE_UNUSED static void mdbx_page_list(MDBX_page *mp) {
  pgno_t pgno = mp->mp_pgno;
  const char *type;
  MDBX_node *node;
  unsigned i, nkeys, nsize, total = 0;
  MDBX_val key;
  DKBUF;

  /* Overflow/meta/unknown pages carry no key nodes — report and return. */
  switch (mp->mp_flags &
          (P_BRANCH | P_LEAF | P_LEAF2 | P_META | P_OVERFLOW | P_SUBP)) {
  case P_BRANCH:
    type = "Branch page";
    break;
  case P_LEAF:
    type = "Leaf page";
    break;
  case P_LEAF | P_SUBP:
    type = "Leaf sub-page";
    break;
  case P_LEAF | P_LEAF2:
    type = "Leaf2 page";
    break;
  case P_LEAF | P_LEAF2 | P_SUBP:
    type = "Leaf2 sub-page";
    break;
  case P_OVERFLOW:
    mdbx_verbose("Overflow page %" PRIaPGNO " pages %u\n", pgno, mp->mp_pages);
    return;
  case P_META:
    mdbx_verbose("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno,
                 unaligned_peek_u64(4, page_meta(mp)->mm_txnid_a));
    return;
  default:
    mdbx_verbose("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->mp_flags);
    return;
  }

  nkeys = page_numkeys(mp);
  mdbx_verbose("%s %" PRIaPGNO " numkeys %u\n", type, pgno, nkeys);

  for (i = 0; i < nkeys; i++) {
    if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */
      key.iov_len = nsize = mp->mp_leaf2_ksize;
      key.iov_base = page_leaf2key(mp, i, nsize);
      total += nsize;
      mdbx_verbose("key %u: nsize %u, %s\n", i, nsize, DKEY(&key));
      continue;
    }
    node = page_node(mp, i);
    key.iov_len = node_ks(node);
    key.iov_base = node->mn_data;
    nsize = (unsigned)(NODESIZE + key.iov_len);
    if (IS_BRANCH(mp)) {
      mdbx_verbose("key %u: page %" PRIaPGNO ", %s\n", i, node_pgno(node),
                   DKEY(&key));
      total += nsize;
    } else {
      /* Leaf node: big-data payload costs a pgno reference, otherwise the
       * inline data size; the mp_ptrs[] slot (indx_t) is counted for the
       * per-key report but excluded from `total` (accounted via mp_lower). */
      if (F_ISSET(node_flags(node), F_BIGDATA))
        nsize += sizeof(pgno_t);
      else
        nsize += (unsigned)node_ds(node);
      total += nsize;
      nsize += sizeof(indx_t);
      mdbx_verbose("key %u: nsize %u, %s%s\n", i, nsize, DKEY(&key),
                   mdbx_leafnode_type(node));
    }
    total = EVEN(total); /* nodes are even-aligned on the page */
  }
  mdbx_verbose("Total: header %u + contents %u + unused %u\n",
               IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total,
               page_room(mp));
}
7489 
7490 /*----------------------------------------------------------------------------*/
7491 
/* True when the cursor has an initialized sub-cursor (xcursor), i.e. when
 * XCURSOR_REFRESH() may be applied to it safely. */
#define XCURSOR_INITED(mc)                                                     \
  ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
7495 
/* Update sub-page pointer, if any, in mc->mc_xcursor.
 * Needed when the node which contains the sub-page may have moved (e.g.
 * after page rebalance/copy-on-write). Only pure F_DUPDATA nodes embed a
 * sub-page; F_DUPDATA|F_SUBDATA means a sub-DB rooted elsewhere, which
 * needs no refresh.
 * Called with mp = mc->mc_pg[mc->mc_top], ki = mc->mc_ki[mc->mc_top]. */
#define XCURSOR_REFRESH(mc, mp, ki)                                            \
  do {                                                                         \
    MDBX_page *xr_pg = (mp);                                                   \
    MDBX_node *xr_node = page_node(xr_pg, ki);                                 \
    if ((node_flags(xr_node) & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA)          \
      (mc)->mc_xcursor->mx_cursor.mc_pg[0] = node_data(xr_node);               \
  } while (0)
7506 
cursor_is_tracked(const MDBX_cursor * mc)7507 MDBX_MAYBE_UNUSED static bool cursor_is_tracked(const MDBX_cursor *mc) {
7508   for (MDBX_cursor *scan = mc->mc_txn->tw.cursors[mc->mc_dbi]; scan;
7509        scan = scan->mc_next)
7510     if (mc == ((mc->mc_flags & C_SUB) ? &scan->mc_xcursor->mx_cursor : scan))
7511       return true;
7512   return false;
7513 }
7514 
/* Perform `act` while the temporary cursor `mn` is tracked in the
 * transaction's cursor list, then unlink it again. A write transaction is
 * required (tw.cursors must exist). Since the tracking list links outer
 * cursors only, a C_SUB cursor is wrapped in a stack-allocated dummy outer
 * cursor whose xcursor points at `mn`. */
#define WITH_CURSOR_TRACKING(mn, act)                                          \
  do {                                                                         \
    mdbx_cassert(&(mn),                                                        \
                 mn.mc_txn->tw.cursors != NULL /* must be not rdonly txt */);  \
    mdbx_cassert(&(mn), !cursor_is_tracked(&(mn)));                            \
    MDBX_cursor mc_dummy;                                                      \
    MDBX_cursor **tracking_head = &(mn).mc_txn->tw.cursors[mn.mc_dbi];         \
    MDBX_cursor *tracked = &(mn);                                              \
    if ((mn).mc_flags & C_SUB) {                                               \
      mc_dummy.mc_flags = C_INITIALIZED;                                       \
      mc_dummy.mc_top = 0;                                                     \
      mc_dummy.mc_snum = 0;                                                    \
      mc_dummy.mc_xcursor = (MDBX_xcursor *)&(mn);                             \
      tracked = &mc_dummy;                                                     \
    }                                                                          \
    tracked->mc_next = *tracking_head;                                         \
    *tracking_head = tracked;                                                  \
    { act; }                                                                   \
    *tracking_head = tracked->mc_next;                                         \
  } while (0)
7536 
/* Public API: compare two keys with the key comparator of table `dbi`.
 * Only the txn signature is asserted; `dbi` itself is NOT validated here —
 * the caller must pass a valid open handle. */
int mdbx_cmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a,
             const MDBX_val *b) {
  mdbx_assert(NULL, txn->mt_signature == MDBX_MT_SIGNATURE);
  return txn->mt_dbxs[dbi].md_cmp(a, b);
}
7542 
/* Public API: compare two data items with the data (dupsort) comparator of
 * table `dbi`. As with mdbx_cmp(), `dbi` is not validated here. */
int mdbx_dcmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a,
              const MDBX_val *b) {
  mdbx_assert(NULL, txn->mt_signature == MDBX_MT_SIGNATURE);
  return txn->mt_dbxs[dbi].md_dcmp(a, b);
}
7548 
/* Allocate memory for a page.
 * Re-use old malloc'ed pages first for singletons (taken from the
 * env->me_dp_reserve free-list), otherwise just malloc.
 * Set MDBX_TXN_ERROR on failure.
 * Unless MDBX_NOMEMINIT is set, everything past the page header is zeroed
 * (for multi-page allocations: only the tail beyond the first num-1 pages,
 * since the caller is expected to fill those). */
static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) {
  MDBX_env *env = txn->mt_env;
  MDBX_page *np = env->me_dp_reserve;
  size_t size = env->me_psize;
  if (likely(num == 1 && np)) {
    /* Pop a single page from the reserve list; it was ASAN-poisoned and
     * valgrind-freed when parked, so un-poison before touching mp_next. */
    mdbx_assert(env, env->me_dp_reserve_len > 0);
    MDBX_ASAN_UNPOISON_MEMORY_REGION(np, size);
    VALGRIND_MEMPOOL_ALLOC(env, np, size);
    VALGRIND_MAKE_MEM_DEFINED(&np->mp_next, sizeof(np->mp_next));
    env->me_dp_reserve = np->mp_next;
    env->me_dp_reserve_len -= 1;
  } else {
    size = pgno2bytes(env, num);
    np = mdbx_malloc(size);
    if (unlikely(!np)) {
      txn->mt_flags |= MDBX_TXN_ERROR;
      return np;
    }
    VALGRIND_MEMPOOL_ALLOC(env, np, size);
  }

  if ((env->me_flags & MDBX_NOMEMINIT) == 0) {
    /* For a single page alloc, we init everything after the page header.
     * For multi-page, we init the final page; if the caller needed that
     * many pages they will be filling in at least up to the last page. */
    size_t skip = PAGEHDRSZ;
    if (num > 1)
      skip += pgno2bytes(env, num - 1);
    memset((char *)np + skip, 0, size - skip);
  }
#if MDBX_DEBUG
  np->mp_pgno = 0;
#endif
  /* Mark the whole page undefined for valgrind, then set the header fields
   * the caller may rely on. */
  VALGRIND_MAKE_MEM_UNDEFINED(np, size);
  np->mp_flags = 0;
  np->mp_pages = num;
  return np;
}
7590 
/* Free a shadow dirty page.
 * Single pages are parked on the env->me_dp_reserve free-list (up to
 * dp_reserve_limit entries) for cheap reuse by mdbx_page_malloc();
 * multi-page allocations, or overflow past the limit, are freed outright.
 * In debug builds, or with MDBX_PAGEPERTURB, the memory is first filled
 * with 0xFF to catch use-after-free. */
static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp, unsigned npages) {
  VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages));
  MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages));
  if (MDBX_DEBUG != 0 || unlikely(env->me_flags & MDBX_PAGEPERTURB))
    memset(dp, -1, pgno2bytes(env, npages));
  if (npages == 1 &&
      env->me_dp_reserve_len < env->me_options.dp_reserve_limit) {
    /* Park on the reserve list: poison everything except the mp_next link
     * that threads the list. */
    MDBX_ASAN_POISON_MEMORY_REGION((char *)dp + sizeof(dp->mp_next),
                                   pgno2bytes(env, npages) -
                                       sizeof(dp->mp_next));
    dp->mp_next = env->me_dp_reserve;
    VALGRIND_MEMPOOL_FREE(env, dp);
    env->me_dp_reserve = dp;
    env->me_dp_reserve_len += 1;
  } else {
    /* large pages just get freed directly */
    VALGRIND_MEMPOOL_FREE(env, dp);
    mdbx_free(dp);
  }
}
7612 
7613 /* Return all dirty pages to dpage list */
mdbx_dlist_free(MDBX_txn * txn)7614 static void mdbx_dlist_free(MDBX_txn *txn) {
7615   MDBX_env *env = txn->mt_env;
7616   MDBX_dpl *const dl = txn->tw.dirtylist;
7617 
7618   for (unsigned i = 1; i <= dl->length; i++) {
7619     MDBX_page *dp = dl->items[i].ptr;
7620     mdbx_dpage_free(env, dp, dpl_npages(dl, i));
7621   }
7622 
7623   dpl_clear(dl);
7624 }
7625 
/* For a sub-cursor (C_SUB), recover the MDBX_db of the OUTER cursor.
 * A sub-cursor's mc_db points at the mx_db embedded in an MDBX_xcursor,
 * which itself is the `inner` member of an MDBX_cursor_couple — two
 * container_of() hops lead back to the couple, whose `outer` cursor holds
 * the enclosing database. The asserts verify the recovered layout. */
static __always_inline MDBX_db *mdbx_outer_db(MDBX_cursor *mc) {
  mdbx_cassert(mc, (mc->mc_flags & C_SUB) != 0);
  MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db);
  MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner);
  mdbx_cassert(mc, mc->mc_db == &couple->outer.mc_xcursor->mx_db);
  mdbx_cassert(mc, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx);
  return couple->outer.mc_db;
}
7634 
/* Audit-mode consistency check of the transaction's dirty list.
 * Always verifies the sentinels and the dirtyroom invariant; the per-page
 * checks below run only when auditing is enabled. Each check is expressed
 * twice — an assert for debug builds, plus a `return false` so release
 * builds with auditing still report the violation. Returns true when the
 * dirty list is consistent. */
MDBX_MAYBE_UNUSED __cold static bool mdbx_dirtylist_check(MDBX_txn *txn) {
  const MDBX_dpl *const dl = txn->tw.dirtylist;
  assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
  /* dirtyroom + used entries must equal the budget inherited from the
   * parent txn (nested) or the env-wide dp_limit (top-level). */
  mdbx_tassert(txn, txn->tw.dirtyroom + dl->length ==
                        (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
                                        : txn->mt_env->me_options.dp_limit));

  if (!mdbx_audit_enabled())
    return true;

  unsigned loose = 0;
  for (unsigned i = dl->length; i > 0; --i) {
    const MDBX_page *const dp = dl->items[i].ptr;
    if (!dp)
      continue;

    /* entry key must match the page's own number */
    mdbx_tassert(txn, dp->mp_pgno == dl->items[i].pgno);
    if (unlikely(dp->mp_pgno != dl->items[i].pgno))
      return false;

    /* age counter sanity */
    const uint32_t age = mdbx_dpl_age(txn, i);
    mdbx_tassert(txn, age < UINT32_MAX / 3);
    if (unlikely(age > UINT32_MAX / 3))
      return false;

    /* every entry is either a loose page or modifiable in this txn */
    mdbx_tassert(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp));
    if (dp->mp_flags == P_LOOSE) {
      loose += 1;
    } else if (unlikely(!IS_MODIFIABLE(txn, dp)))
      return false;

    /* the page span must lie inside the allocated space */
    const unsigned num = dpl_npages(dl, i);
    mdbx_tassert(txn, txn->mt_next_pgno >= dp->mp_pgno + num);
    if (unlikely(txn->mt_next_pgno < dp->mp_pgno + num))
      return false;

    /* within the sorted prefix, spans must not overlap their successor */
    if (i < dl->sorted) {
      mdbx_tassert(txn, dl->items[i + 1].pgno >= dp->mp_pgno + num);
      if (unlikely(dl->items[i + 1].pgno < dp->mp_pgno + num))
        return false;
    }

    /* a dirty page must not also be on the reclaimed (GC) list */
    const unsigned rpa = mdbx_pnl_search(txn->tw.reclaimed_pglist, dp->mp_pgno);
    mdbx_tassert(txn, rpa > MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) ||
                          txn->tw.reclaimed_pglist[rpa] != dp->mp_pgno);
    if (rpa <= MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) &&
        unlikely(txn->tw.reclaimed_pglist[rpa] == dp->mp_pgno))
      return false;
    if (num > 1) {
      /* ...nor may any interior page of a multi-page span be reclaimed */
      const unsigned rpb =
          mdbx_pnl_search(txn->tw.reclaimed_pglist, dp->mp_pgno + num - 1);
      mdbx_tassert(txn, rpa == rpb);
      if (unlikely(rpa != rpb))
        return false;
    }
  }

  /* counted loose entries must match the txn's bookkeeping */
  mdbx_tassert(txn, loose == txn->tw.loose_count);
  if (unlikely(loose != txn->tw.loose_count))
    return false;

  /* retired pages must no longer appear in the dirty list */
  for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.retired_pages); ++i) {
    const MDBX_page *const dp = debug_dpl_find(txn, txn->tw.retired_pages[i]);
    mdbx_tassert(txn, !dp);
    if (unlikely(dp))
      return false;
  }

  return true;
}
7705 
7706 #if MDBX_ENABLE_REFUND
/* Refund reclaimed (GC) pages adjoining the end of the allocated space:
 * strip the maximal run of consecutive page numbers ending at
 * mt_next_pgno-1 from tw.reclaimed_pglist and pull mt_next_pgno back by
 * that amount. Precondition (asserted): the list's MOST entry equals
 * mt_next_pgno-1. The two branches handle the compile-time PNL ordering. */
static void mdbx_refund_reclaimed(MDBX_txn *txn) {
  /* Scanning in descend order */
  pgno_t next_pgno = txn->mt_next_pgno;
  const MDBX_PNL pnl = txn->tw.reclaimed_pglist;
  mdbx_tassert(txn, MDBX_PNL_SIZE(pnl) && MDBX_PNL_MOST(pnl) == next_pgno - 1);
#if MDBX_PNL_ASCENDING
  /* Largest entries at the tail: walk backwards and truncate. */
  unsigned i = MDBX_PNL_SIZE(pnl);
  mdbx_tassert(txn, pnl[i] == next_pgno - 1);
  while (--next_pgno, --i > 0 && pnl[i] == next_pgno - 1)
    ;
  MDBX_PNL_SIZE(pnl) = i;
#else
  /* Largest entries at the head: skip the consumed prefix, then shift the
   * survivors down to keep the list 1-based and contiguous. */
  unsigned i = 1;
  mdbx_tassert(txn, pnl[i] == next_pgno - 1);
  unsigned len = MDBX_PNL_SIZE(pnl);
  while (--next_pgno, ++i <= len && pnl[i] == next_pgno - 1)
    ;
  MDBX_PNL_SIZE(pnl) = len -= i - 1;
  for (unsigned move = 0; move < len; ++move)
    pnl[1 + move] = pnl[i + move];
#endif
  mdbx_verbose("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO,
               txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno);
  txn->mt_next_pgno = next_pgno;
  mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
                                          txn->mt_next_pgno - 1));
}
7734 
/* Refund loose pages adjoining the end of the allocated space: shrink
 * mt_next_pgno past any trailing run of loose pages, remove those pages
 * from the dirty list, unlink them from the loose chain and dispose of
 * their shadows. Two strategies depending on how unsorted the dirty list
 * is; both converge on the shared `unlink_loose` cleanup. */
static void mdbx_refund_loose(MDBX_txn *txn) {
  mdbx_tassert(txn, txn->tw.loose_pages != nullptr);
  mdbx_tassert(txn, txn->tw.loose_count > 0);

  MDBX_dpl *const dl = txn->tw.dirtylist;
  mdbx_tassert(txn, dl->length >= txn->tw.loose_count);

  /* Small scratch PNL on the stack; heap-allocated only when the loose
   * count doesn't fit. */
  pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)];
  MDBX_PNL suitable = onstack;

  if (dl->length - dl->sorted > txn->tw.loose_count) {
    /* Dirty list is useless since unsorted. */
    if (bytes2pnl(sizeof(onstack)) < txn->tw.loose_count) {
      suitable = mdbx_pnl_alloc(txn->tw.loose_count);
      if (unlikely(!suitable))
        return /* this is not a reason for transaction fail */;
    }

    /* Collect loose-pages which may be refunded, i.e. those numbered within
     * loose_count of the end of the allocated space. */
    mdbx_tassert(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count);
    pgno_t most = MIN_PAGENO;
    unsigned w = 0;
    for (const MDBX_page *lp = txn->tw.loose_pages; lp; lp = lp->mp_next) {
      mdbx_tassert(txn, lp->mp_flags == P_LOOSE);
      mdbx_tassert(txn, txn->mt_next_pgno > lp->mp_pgno);
      if (likely(txn->mt_next_pgno - txn->tw.loose_count <= lp->mp_pgno)) {
        mdbx_tassert(txn,
                     w < ((suitable == onstack) ? bytes2pnl(sizeof(onstack))
                                                : MDBX_PNL_ALLOCLEN(suitable)));
        suitable[++w] = lp->mp_pgno;
        most = (lp->mp_pgno > most) ? lp->mp_pgno : most;
      }
    }

    if (most + 1 == txn->mt_next_pgno) {
      /* Sort suitable list and refund pages at the tail. */
      MDBX_PNL_SIZE(suitable) = w;
      mdbx_pnl_sort(suitable);

      /* Scanning in descend order */
      const int step = MDBX_PNL_ASCENDING ? -1 : 1;
      const int begin = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(suitable) : 1;
      const int end = MDBX_PNL_ASCENDING ? 0 : MDBX_PNL_SIZE(suitable) + 1;
      mdbx_tassert(txn, suitable[begin] >= suitable[end - step]);
      mdbx_tassert(txn, most == suitable[begin]);

      /* Extend `most` downward over the consecutive run ending at the top
       * of the allocated space. */
      for (int i = begin + step; i != end; i += step) {
        if (suitable[i] != most - 1)
          break;
        most -= 1;
      }
      const unsigned refunded = txn->mt_next_pgno - most;
      mdbx_debug("refund-suitable %u pages %" PRIaPGNO " -> %" PRIaPGNO,
                 refunded, most, txn->mt_next_pgno);
      txn->tw.loose_count -= refunded;
      txn->tw.dirtyroom += refunded;
      assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit);
      txn->mt_next_pgno = most;

      /* Filter-out dirty list: drop every entry with pgno >= most,
       * compacting in place (separately over the sorted prefix, then the
       * unsorted tail). */
      unsigned r = 0;
      w = 0;
      if (dl->sorted) {
        do {
          if (dl->items[++r].pgno < most) {
            if (++w != r)
              dl->items[w] = dl->items[r];
          }
        } while (r < dl->sorted);
        dl->sorted = w;
      }
      while (r < dl->length) {
        if (dl->items[++r].pgno < most) {
          if (++w != r)
            dl->items[w] = dl->items[r];
        }
      }
      dpl_setlen(dl, w);
      mdbx_tassert(txn,
                   txn->tw.dirtyroom + txn->tw.dirtylist->length ==
                       (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
                                       : txn->mt_env->me_options.dp_limit));

      goto unlink_loose;
    }
  } else {
    /* Dirtylist is mostly sorted, just refund loose pages at the end. */
    mdbx_dpl_sort(txn);
    mdbx_tassert(txn, dl->length < 2 ||
                          dl->items[1].pgno < dl->items[dl->length].pgno);
    mdbx_tassert(txn, dl->sorted == dl->length);

    /* Scan dirtylist tail-forward and cutoff suitable pages. */
    unsigned n;
    for (n = dl->length; dl->items[n].pgno == txn->mt_next_pgno - 1 &&
                         dl->items[n].ptr->mp_flags == P_LOOSE;
         --n) {
      mdbx_tassert(txn, n > 0);
      MDBX_page *dp = dl->items[n].ptr;
      mdbx_debug("refund-sorted page %" PRIaPGNO, dp->mp_pgno);
      mdbx_tassert(txn, dp->mp_pgno == dl->items[n].pgno);
      txn->mt_next_pgno -= 1;
    }
    dpl_setlen(dl, n);

    if (dl->sorted != dl->length) {
      /* dpl_setlen() shrank the list, i.e. something was refunded. */
      const unsigned refunded = dl->sorted - dl->length;
      dl->sorted = dl->length;
      txn->tw.loose_count -= refunded;
      txn->tw.dirtyroom += refunded;
      mdbx_tassert(txn,
                   txn->tw.dirtyroom + txn->tw.dirtylist->length ==
                       (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
                                       : txn->mt_env->me_options.dp_limit));

      /* Filter-out loose chain & dispose refunded pages. */
    unlink_loose:
      for (MDBX_page **link = &txn->tw.loose_pages; *link;) {
        MDBX_page *dp = *link;
        mdbx_tassert(txn, dp->mp_flags == P_LOOSE);
        if (txn->mt_next_pgno > dp->mp_pgno) {
          link = &dp->mp_next; /* still allocated — keep in chain */
        } else {
          *link = dp->mp_next; /* refunded — unlink and free the shadow */
          if ((txn->mt_flags & MDBX_WRITEMAP) == 0)
            mdbx_dpage_free(txn->mt_env, dp, 1);
        }
      }
    }
  }

  mdbx_tassert(txn, mdbx_dirtylist_check(txn));
  if (suitable != onstack)
    mdbx_pnl_free(suitable);
  /* Remember the watermark: no further refund possible below this point. */
  txn->tw.loose_refund_wl = txn->mt_next_pgno;
}
7871 
/* Shrink mt_next_pgno by giving back loose and reclaimed pages that adjoin
 * the end of the allocated space, alternating between the two refund kinds
 * until neither makes progress. Returns true when anything was refunded
 * (in which case stale entries are also purged from the spill list). */
static bool mdbx_refund(MDBX_txn *txn) {
  const pgno_t before = txn->mt_next_pgno;

  if (txn->tw.loose_pages && txn->tw.loose_refund_wl > txn->mt_next_pgno)
    mdbx_refund_loose(txn);

  for (;;) {
    /* reclaimed pages can be refunded only while the list's topmost entry
     * touches the end of the allocated space */
    if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) == 0 ||
        MDBX_PNL_MOST(txn->tw.reclaimed_pglist) != txn->mt_next_pgno - 1)
      break;
    mdbx_refund_reclaimed(txn);

    if (!txn->tw.loose_pages || txn->tw.loose_refund_wl <= txn->mt_next_pgno)
      break;
    const pgno_t memo = txn->mt_next_pgno;
    mdbx_refund_loose(txn);
    if (memo == txn->mt_next_pgno)
      break; /* no progress — done */
  }

  const bool refunded = (before != txn->mt_next_pgno);
  if (refunded && txn->tw.spill_pages)
    /* Squash deleted pagenums if we refunded any */
    mdbx_spill_purge(txn);
  return refunded;
}
7902 #else  /* MDBX_ENABLE_REFUND */
/* Stub when refund support is compiled out: never refunds anything. */
static __inline bool mdbx_refund(MDBX_txn *txn) {
  (void)txn;
  /* No online auto-compactification. */
  return false;
}
7908 #endif /* MDBX_ENABLE_REFUND */
7909 
/* Overwrite ("kill") the content of retired page(s) with 0xFF garbage,
 * for debug builds / MDBX_PAGEPERTURB.
 * Non-frozen pages are mutated in place (and written through me_lazy_fd
 * unless the map is writable); frozen pages must not be touched via the
 * map, so the prepared garbage buffer past env->me_pbuf is pwritev'ed over
 * them, in batches of MDBX_COMMIT_PAGES iovec entries. */
__cold static void mdbx_kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno,
                                  unsigned npages) {
  MDBX_env *const env = txn->mt_env;
  mdbx_debug("kill %u page(s) %" PRIaPGNO, npages, pgno);
  mdbx_assert(env, pgno >= NUM_METAS && npages);
  if (!IS_FROZEN(txn, mp)) {
    const size_t bytes = pgno2bytes(env, npages);
    memset(mp, -1, bytes);
    mp->mp_pgno = pgno;
    if ((env->me_flags & MDBX_WRITEMAP) == 0)
      mdbx_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno));
  } else {
    struct iovec iov[MDBX_COMMIT_PAGES];
    /* every iovec entry points at the same one-page garbage buffer */
    iov[0].iov_len = env->me_psize;
    iov[0].iov_base = (char *)env->me_pbuf + env->me_psize;
    size_t iov_off = pgno2bytes(env, pgno);
    unsigned n = 1;
    while (--npages) {
      iov[n] = iov[0];
      if (++n == MDBX_COMMIT_PAGES) {
        /* flush a full batch, then restart the iovec array */
        mdbx_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off,
                     pgno2bytes(env, MDBX_COMMIT_PAGES));
        iov_off += pgno2bytes(env, MDBX_COMMIT_PAGES);
        n = 0;
      }
    }
    /* write the remaining partial batch */
    mdbx_pwritev(env->me_lazy_fd, iov, n, iov_off, pgno2bytes(env, n));
  }
}
7939 
/* Remove page from dirty list.
 * `di` is the page's 1-based dirty-list index (must match `mp`). After
 * unlinking, the page header is invalidated; the page body is then either
 * poisoned (WRITEMAP — memory belongs to the map) or the shadow copy is
 * freed back to the dpage reserve. */
static __inline void mdbx_page_wash(MDBX_txn *txn, const unsigned di,
                                    MDBX_page *const mp,
                                    const unsigned npages) {
  mdbx_tassert(txn, di && di <= txn->tw.dirtylist->length &&
                        txn->tw.dirtylist->items[di].ptr == mp);
  mdbx_dpl_remove(txn, di);
  txn->tw.dirtyroom++;
  mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
                        (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
                                        : txn->mt_env->me_options.dp_limit));
  /* poison the header so stale use trips page checks */
  mp->mp_txnid = INVALID_TXNID;
  mp->mp_flags = 0xFFFF;
  VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ);
  if (txn->mt_flags & MDBX_WRITEMAP) {
    VALGRIND_MAKE_MEM_NOACCESS(page_data(mp),
                               pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ);
    MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp),
                                   pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ);
  } else
    mdbx_dpage_free(txn->mt_env, mp, npages);
}
7962 
/* Page txnid used for validity checks: the page's own mp_txnid, or 0 when
 * page checking is compiled out (MDBX_DISABLE_PAGECHECKS). The txn argument
 * is currently unused but kept in the signature for call-site uniformity. */
static __inline txnid_t pp_txnid4chk(const MDBX_page *mp, const MDBX_txn *txn) {
  (void)txn;
#if MDBX_DISABLE_PAGECHECKS
  (void)mp;
  return 0;
#else
  return /* maybe zero in legacy DB */ mp->mp_txnid;
#endif /* !MDBX_DISABLE_PAGECHECKS */
}
7972 
7973 /* Retire, loosen or free a single page.
7974  *
7975  * For dirty pages, saves single pages to a list for future reuse in this same
7976  * txn. It has been pulled from the GC and already resides on the dirty list,
7977  * but has been deleted. Use these pages first before pulling again from the GC.
7978  *
7979  * If the page wasn't dirtied in this txn, just add it
7980  * to this txn's free list. */
mdbx_page_retire_ex(MDBX_cursor * mc,const pgno_t pgno,MDBX_page * mp,int pagetype)7981 static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno,
7982                                MDBX_page *mp /* maybe null */,
7983                                int pagetype /* maybe unknown/zero */) {
7984   int rc;
7985   MDBX_txn *const txn = mc->mc_txn;
7986   mdbx_tassert(txn, !mp || (mp->mp_pgno == pgno && PAGETYPE(mp) == pagetype));
7987 
7988   /* During deleting entire subtrees, it is reasonable and possible to avoid
7989    * reading leaf pages, i.e. significantly reduce hard page-faults & IOPs:
7990    *  - mp is null, i.e. the page has not yet been read;
7991    *  - pagetype is known and the P_LEAF bit is set;
7992    *  - we can determine the page status via scanning the lists
7993    *    of dirty and spilled pages.
7994    *
7995    *  On the other hand, this could be suboptimal for WRITEMAP mode, since
7996    *  requires support the list of dirty pages and avoid explicit spilling.
7997    *  So for flexibility and avoid extra internal dependencies we just
7998    *  fallback to reading if dirty list was not allocated yet. */
7999   unsigned di = 0, si = 0, npages = 1;
8000   bool is_frozen = false, is_spilled = false, is_shadowed = false;
8001   if (unlikely(!mp)) {
8002     if (mdbx_assert_enabled() && pagetype) {
8003       MDBX_page *check;
8004       rc = mdbx_page_get(mc, pgno, &check, txn->mt_front);
8005       if (unlikely(rc != MDBX_SUCCESS))
8006         return rc;
8007       mdbx_tassert(txn, (PAGETYPE(check) & ~P_LEAF2) == (pagetype & ~P_FROZEN));
8008       mdbx_tassert(txn, !(pagetype & P_FROZEN) || IS_FROZEN(txn, check));
8009     }
8010     if (pagetype & P_FROZEN) {
8011       is_frozen = true;
8012       if (mdbx_assert_enabled()) {
8013         for (MDBX_txn *scan = txn; scan; scan = scan->mt_parent) {
8014           mdbx_tassert(txn,
8015                        !scan->tw.spill_pages ||
8016                            !mdbx_pnl_exist(scan->tw.spill_pages, pgno << 1));
8017           mdbx_tassert(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno));
8018         }
8019       }
8020       goto status_done;
8021     } else if (pagetype && txn->tw.dirtylist) {
8022       if ((di = mdbx_dpl_exist(txn, pgno)) != 0) {
8023         mp = txn->tw.dirtylist->items[di].ptr;
8024         mdbx_tassert(txn, IS_MODIFIABLE(txn, mp));
8025         goto status_done;
8026       }
8027       if (txn->tw.spill_pages &&
8028           (si = mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1)) != 0) {
8029         is_spilled = true;
8030         goto status_done;
8031       }
8032       for (MDBX_txn *parent = txn->mt_parent; parent;
8033            parent = parent->mt_parent) {
8034         if (mdbx_dpl_exist(parent, pgno)) {
8035           is_shadowed = true;
8036           goto status_done;
8037         }
8038         if (parent->tw.spill_pages &&
8039             mdbx_pnl_exist(parent->tw.spill_pages, pgno << 1)) {
8040           is_spilled = true;
8041           goto status_done;
8042         }
8043       }
8044       is_frozen = true;
8045       goto status_done;
8046     }
8047 
8048     rc = mdbx_page_get(mc, pgno, &mp, txn->mt_front);
8049     if (unlikely(rc != MDBX_SUCCESS))
8050       return rc;
8051     mdbx_tassert(txn, !pagetype || PAGETYPE(mp) == pagetype);
8052     pagetype = PAGETYPE(mp);
8053   }
8054 
8055   is_frozen = IS_FROZEN(txn, mp);
8056   if (!is_frozen) {
8057     const bool is_dirty = IS_MODIFIABLE(txn, mp);
8058     is_spilled = IS_SPILLED(txn, mp) && !(txn->mt_flags & MDBX_WRITEMAP);
8059     is_shadowed = IS_SHADOWED(txn, mp);
8060     if (is_dirty) {
8061       mdbx_tassert(txn, !is_spilled);
8062       mdbx_tassert(txn, !txn->tw.spill_pages ||
8063                             !mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1));
8064       mdbx_tassert(txn, debug_dpl_find(txn, pgno) == mp || txn->mt_parent ||
8065                             (txn->mt_flags & MDBX_WRITEMAP));
8066     } else {
8067       mdbx_tassert(txn, !debug_dpl_find(txn, pgno));
8068     }
8069 
8070     di = is_dirty ? mdbx_dpl_exist(txn, pgno) : 0;
8071     si = (is_spilled && txn->tw.spill_pages)
8072              ? mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1)
8073              : 0;
8074     mdbx_tassert(txn, !is_dirty || di || (txn->mt_flags & MDBX_WRITEMAP));
8075   } else {
8076     mdbx_tassert(txn, !IS_MODIFIABLE(txn, mp));
8077     mdbx_tassert(txn, !IS_SPILLED(txn, mp));
8078     mdbx_tassert(txn, !IS_SHADOWED(txn, mp));
8079   }
8080 
8081 status_done:
8082   if (likely((pagetype & P_OVERFLOW) == 0)) {
8083     STATIC_ASSERT(P_BRANCH == 1);
8084     const bool is_branch = pagetype & P_BRANCH;
8085     if (unlikely(mc->mc_flags & C_SUB)) {
8086       MDBX_db *outer = mdbx_outer_db(mc);
8087       mdbx_cassert(mc, !is_branch || outer->md_branch_pages > 0);
8088       outer->md_branch_pages -= is_branch;
8089       mdbx_cassert(mc, is_branch || outer->md_leaf_pages > 0);
8090       outer->md_leaf_pages -= 1 - is_branch;
8091     }
8092     mdbx_cassert(mc, !is_branch || mc->mc_db->md_branch_pages > 0);
8093     mc->mc_db->md_branch_pages -= is_branch;
8094     mdbx_cassert(mc, (pagetype & P_LEAF) == 0 || mc->mc_db->md_leaf_pages > 0);
8095     mc->mc_db->md_leaf_pages -= (pagetype & P_LEAF) != 0;
8096   } else {
8097     npages = mp->mp_pages;
8098     mdbx_cassert(mc, mc->mc_db->md_overflow_pages >= npages);
8099     mc->mc_db->md_overflow_pages -= npages;
8100   }
8101 
8102   if (is_frozen) {
8103   retire:
8104     mdbx_debug("retire %u page %" PRIaPGNO, npages, pgno);
8105     rc = mdbx_pnl_append_range(false, &txn->tw.retired_pages, pgno, npages);
8106     mdbx_tassert(txn, mdbx_dirtylist_check(txn));
8107     return rc;
8108   }
8109 
8110   /* Возврат страниц в нераспределенный "хвост" БД.
8111    * Содержимое страниц не уничтожается, а для вложенных транзакций граница
8112    * нераспределенного "хвоста" БД сдвигается только при их коммите. */
8113   if (MDBX_ENABLE_REFUND && unlikely(pgno + npages == txn->mt_next_pgno)) {
8114     const char *kind = nullptr;
8115     if (di) {
8116       /* Страница испачкана в этой транзакции, но до этого могла быть
8117        * аллоцирована, испачкана и пролита в одной из родительских транзакций.
8118        * Её МОЖНО вытолкнуть в нераспределенный хвост. */
8119       kind = "dirty";
8120       /* Remove from dirty list */
8121       mdbx_page_wash(txn, di, mp, npages);
8122     } else if (si) {
8123       /* Страница пролита в этой транзакции, т.е. она аллоцирована
8124        * и запачкана в этой или одной из родительских транзакций.
8125        * Её МОЖНО вытолкнуть в нераспределенный хвост. */
8126       kind = "spilled";
8127       mdbx_spill_remove(txn, si, npages);
8128     } else if ((txn->mt_flags & MDBX_WRITEMAP)) {
8129       kind = "writemap";
8130       mdbx_tassert(txn, mp && IS_MODIFIABLE(txn, mp));
8131     } else {
8132       /* Страница аллоцирована, запачкана и возможно пролита в одной
8133        * из родительских транзакций.
8134        * Её МОЖНО вытолкнуть в нераспределенный хвост. */
8135       kind = "parent's";
8136       if (mdbx_assert_enabled() && mp) {
8137         kind = nullptr;
8138         for (MDBX_txn *parent = txn->mt_parent; parent;
8139              parent = parent->mt_parent) {
8140           if (parent->tw.spill_pages &&
8141               mdbx_pnl_exist(parent->tw.spill_pages, pgno << 1)) {
8142             kind = "parent-spilled";
8143             mdbx_tassert(txn, is_spilled);
8144             break;
8145           }
8146           if (mp == debug_dpl_find(parent, pgno)) {
8147             kind = "parent-dirty";
8148             mdbx_tassert(txn, !is_spilled);
8149             break;
8150           }
8151         }
8152         mdbx_tassert(txn, kind != nullptr);
8153       }
8154       mdbx_tassert(txn,
8155                    is_spilled || is_shadowed || (mp && IS_SHADOWED(txn, mp)));
8156     }
8157     mdbx_debug("refunded %u %s page %" PRIaPGNO, npages, kind, pgno);
8158     txn->mt_next_pgno = pgno;
8159     mdbx_refund(txn);
8160     return MDBX_SUCCESS;
8161   }
8162 
8163   if (di) {
8164     /* Dirty page from this transaction */
8165     /* If suitable we can reuse it through loose list */
8166     if (likely(npages == 1 &&
8167                txn->tw.loose_count < txn->mt_env->me_options.dp_loose_limit &&
8168                (!MDBX_ENABLE_REFUND ||
8169                 /* skip pages near to the end in favor of compactification */
8170                 txn->mt_next_pgno >
8171                     pgno + txn->mt_env->me_options.dp_loose_limit ||
8172                 txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit))) {
8173       mdbx_debug("loosen dirty page %" PRIaPGNO, pgno);
8174       mp->mp_flags = P_LOOSE;
8175       mp->mp_next = txn->tw.loose_pages;
8176       txn->tw.loose_pages = mp;
8177       txn->tw.loose_count++;
8178 #if MDBX_ENABLE_REFUND
8179       txn->tw.loose_refund_wl = (pgno + 2 > txn->tw.loose_refund_wl)
8180                                     ? pgno + 2
8181                                     : txn->tw.loose_refund_wl;
8182 #endif /* MDBX_ENABLE_REFUND */
8183       if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB))
8184         memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ);
8185       VALGRIND_MAKE_MEM_NOACCESS(page_data(mp),
8186                                  txn->mt_env->me_psize - PAGEHDRSZ);
8187       MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp),
8188                                      txn->mt_env->me_psize - PAGEHDRSZ);
8189       return MDBX_SUCCESS;
8190     }
8191 
8192 #if !MDBX_DEBUG && !defined(MDBX_USE_VALGRIND) && !defined(__SANITIZE_ADDRESS__)
8193     if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB))
8194 #endif
8195     {
8196       /* Страница могла быть изменена в одной из родительских транзакций,
8197        * в том числе, позже выгружена и затем снова загружена и изменена.
8198        * В обоих случаях её нельзя затирать на диске и помечать недоступной
8199        * в asan и/или valgrind */
8200       for (MDBX_txn *parent = txn->mt_parent;
8201            parent && (parent->mt_flags & MDBX_TXN_SPILLS);
8202            parent = parent->mt_parent) {
8203         if (parent->tw.spill_pages &&
8204             mdbx_pnl_intersect(parent->tw.spill_pages, pgno << 1, npages << 1))
8205           goto skip_invalidate;
8206         if (mdbx_dpl_intersect(parent, pgno, npages))
8207           goto skip_invalidate;
8208       }
8209 
8210 #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
8211       if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB))
8212 #endif
8213         mdbx_kill_page(txn, mp, pgno, npages);
8214       if (!(txn->mt_flags & MDBX_WRITEMAP)) {
8215         VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->mt_env, pgno)),
8216                                    pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ);
8217         MDBX_ASAN_POISON_MEMORY_REGION(page_data(pgno2page(txn->mt_env, pgno)),
8218                                        pgno2bytes(txn->mt_env, npages) -
8219                                            PAGEHDRSZ);
8220       }
8221     }
8222   skip_invalidate:
8223     /* Remove from dirty list */
8224     mdbx_page_wash(txn, di, mp, npages);
8225 
8226   reclaim:
8227     mdbx_debug("reclaim %u %s page %" PRIaPGNO, npages, "dirty", pgno);
8228     rc = mdbx_pnl_insert_range(&txn->tw.reclaimed_pglist, pgno, npages);
8229     mdbx_tassert(txn,
8230                  mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
8231                                        txn->mt_next_pgno - MDBX_ENABLE_REFUND));
8232     mdbx_tassert(txn, mdbx_dirtylist_check(txn));
8233     return rc;
8234   }
8235 
8236   if (si) {
8237     /* Page ws spilled in this txn */
8238     mdbx_spill_remove(txn, si, npages);
8239     /* Страница могла быть выделена и затем пролита в этой транзакции,
8240      * тогда её необходимо поместить в reclaimed-список.
8241      * Либо она могла быть выделена в одной из родительских транзакций и затем
8242      * пролита в этой транзакции, тогда её необходимо поместить в
8243      * retired-список для последующей фильтрации при коммите. */
8244     for (MDBX_txn *parent = txn->mt_parent; parent;
8245          parent = parent->mt_parent) {
8246       if (mdbx_dpl_exist(parent, pgno))
8247         goto retire;
8248     }
8249     /* Страница точно была выделена в этой транзакции
8250      * и теперь может быть использована повторно. */
8251     goto reclaim;
8252   }
8253 
8254   if (is_shadowed) {
8255     /* Dirty page MUST BE a clone from (one of) parent transaction(s). */
8256     if (mdbx_assert_enabled()) {
8257       const MDBX_page *parent_dp = nullptr;
8258       /* Check parent(s)'s dirty lists. */
8259       for (MDBX_txn *parent = txn->mt_parent; parent && !parent_dp;
8260            parent = parent->mt_parent) {
8261         mdbx_tassert(txn,
8262                      !parent->tw.spill_pages ||
8263                          !mdbx_pnl_exist(parent->tw.spill_pages, pgno << 1));
8264         parent_dp = debug_dpl_find(parent, pgno);
8265       }
8266       mdbx_tassert(txn, parent_dp && (!mp || parent_dp == mp));
8267     }
8268     /* Страница была выделена в родительской транзакции и теперь может быть
8269      * использована повторно, но только внутри этой транзакции, либо дочерних.
8270      */
8271     goto reclaim;
8272   }
8273 
8274   /* Страница может входить в доступный читателям MVCC-снимок, либо же она
8275    * могла быть выделена, а затем пролита в одной из родительских
8276    * транзакций. Поэтому пока помещаем её в retired-список, который будет
8277    * фильтроваться относительно dirty- и spilled-списков родительских
8278    * транзакций при коммите дочерних транзакций, либо же будет записан
8279    * в GC в неизменном виде. */
8280   goto retire;
8281 }
8282 
/* Convenience wrapper: retire a page that is already loaded in memory,
 * deriving its page-number and type from the page header itself. */
static __inline int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp) {
  const pgno_t pgno = mp->mp_pgno;
  const unsigned type = PAGETYPE(mp);
  return mdbx_page_retire_ex(mc, pgno, mp, type);
}
8286 
/* Context for gathering dirty pages into a batched scatter/gather write. */
struct mdbx_iov_ctx {
  unsigned iov_items; /* number of used slots in iov[] */
  size_t iov_bytes;   /* total bytes currently queued in iov[] */
  size_t iov_off;     /* file offset of the first queued byte */
  pgno_t flush_begin; /* lowest page number touched by the batch */
  pgno_t flush_end;   /* one past the highest page number touched */
  struct iovec iov[MDBX_COMMIT_PAGES];
};
8295 
/* Reset an I/O batching context to its empty state: no queued items and an
 * inverted (empty) flush range that any page will widen on first use. */
static __inline void mdbx_iov_init(MDBX_txn *const txn,
                                   struct mdbx_iov_ctx *ctx) {
  (void)txn;
  ctx->iov_items = 0;
  ctx->iov_bytes = 0;
  ctx->iov_off = 0;
  ctx->flush_begin = MAX_PAGENO;
  ctx->flush_end = MIN_PAGENO;
}
8305 
/* Finalize a write batch. On Linux with pre-2.6.11 kernels additionally
 * flushes the mmap range covered by the batch, to keep the mapping coherent
 * with the write()-based I/O performed in non-WRITEMAP mode. */
static __inline void mdbx_iov_done(MDBX_txn *const txn,
                                   struct mdbx_iov_ctx *ctx) {
  /* all queued items must have been written out already */
  mdbx_tassert(txn, ctx->iov_items == 0);
#if defined(__linux__) || defined(__gnu_linux__)
  MDBX_env *const env = txn->mt_env;
  if (!(txn->mt_flags & MDBX_WRITEMAP) &&
      mdbx_linux_kernel_version < 0x02060b00)
    /* Linux kernels older than version 2.6.11 ignore the addr and nbytes
     * arguments, making this function fairly expensive. Therefore, the
     * whole cache is always flushed. */
    mdbx_flush_incoherent_mmap(
        env->me_map + pgno2bytes(env, ctx->flush_begin),
        pgno2bytes(env, ctx->flush_end - ctx->flush_begin), env->me_os_psize);
#endif /* Linux */
}
8321 
/* Write the gathered batch to the data file (pwrite for a single item,
 * pwritev otherwise), release the shadow copies of the written pages and
 * reset the batch counters. Returns the write status. */
static int mdbx_iov_write(MDBX_txn *const txn, struct mdbx_iov_ctx *ctx) {
  mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP));
  mdbx_tassert(txn, ctx->iov_items > 0);

  MDBX_env *const env = txn->mt_env;
  int rc;
  if (likely(ctx->iov_items == 1)) {
    /* single item: a plain pwrite avoids pwritev overhead */
    mdbx_assert(env, ctx->iov_bytes == (size_t)ctx->iov[0].iov_len);
    rc = mdbx_pwrite(env->me_lazy_fd, ctx->iov[0].iov_base, ctx->iov[0].iov_len,
                     ctx->iov_off);
  } else {
    rc = mdbx_pwritev(env->me_lazy_fd, ctx->iov, ctx->iov_items, ctx->iov_off,
                      ctx->iov_bytes);
  }

  if (unlikely(rc != MDBX_SUCCESS))
    mdbx_error("Write error: %s", mdbx_strerror(rc));
  else {
    /* the corresponding range of the map now holds valid, written-out data */
    VALGRIND_MAKE_MEM_DEFINED(txn->mt_env->me_map + ctx->iov_off,
                              ctx->iov_bytes);
    MDBX_ASAN_UNPOISON_MEMORY_REGION(txn->mt_env->me_map + ctx->iov_off,
                                     ctx->iov_bytes);
  }

  /* free the shadow pages regardless of the write result */
  for (unsigned i = 0; i < ctx->iov_items; i++)
    mdbx_dpage_free(env, (MDBX_page *)ctx->iov[i].iov_base,
                    bytes2pgno(env, ctx->iov[i].iov_len));

#if MDBX_ENABLE_PGOP_STAT
  txn->mt_env->me_lck->mti_pgop_stat.wops.weak += ctx->iov_items;
#endif /* MDBX_ENABLE_PGOP_STAT */
  ctx->iov_items = 0;
  ctx->iov_bytes = 0;
  return rc;
}
8357 
/* Queue a dirty page for writing. In non-WRITEMAP mode the page's shadow copy
 * is appended to the batch, flushing the accumulated batch first when this
 * page is not contiguous with it or a batch limit would be exceeded. In
 * WRITEMAP mode only the flush range and counters are updated. */
static int iov_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp,
                    unsigned npages) {
  MDBX_env *const env = txn->mt_env;
  mdbx_tassert(txn,
               dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno);
  mdbx_tassert(txn, IS_MODIFIABLE(txn, dp));
  mdbx_tassert(txn,
               !(dp->mp_flags & ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)));

  /* widen the flush range to cover this page's span */
  ctx->flush_begin =
      (ctx->flush_begin < dp->mp_pgno) ? ctx->flush_begin : dp->mp_pgno;
  ctx->flush_end = (ctx->flush_end > dp->mp_pgno + npages)
                       ? ctx->flush_end
                       : dp->mp_pgno + npages;
  env->me_lck->mti_unsynced_pages.weak += npages;

  if (IS_SHADOWED(txn, dp)) {
    mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP));
    dp->mp_txnid = txn->mt_txnid;
    mdbx_tassert(txn, IS_SPILLED(txn, dp));
    const size_t size = pgno2bytes(env, npages);
    /* flush the batch when this page is not adjacent to its tail, or the
     * iov array is full, or the total size would exceed MAX_WRITE */
    if (ctx->iov_off + ctx->iov_bytes != pgno2bytes(env, dp->mp_pgno) ||
        ctx->iov_items == ARRAY_LENGTH(ctx->iov) ||
        ctx->iov_bytes + size > MAX_WRITE) {
      if (ctx->iov_items) {
        int err = mdbx_iov_write(txn, ctx);
        if (unlikely(err != MDBX_SUCCESS))
          return err;
#if defined(__linux__) || defined(__gnu_linux__)
        if (mdbx_linux_kernel_version >= 0x02060b00)
        /* Linux kernels older than version 2.6.11 ignore the addr and nbytes
         * arguments, making this function fairly expensive. Therefore, the
         * whole cache is always flushed. */
#endif /* Linux */
          mdbx_flush_incoherent_mmap(env->me_map + ctx->iov_off, ctx->iov_bytes,
                                     env->me_os_psize);
      }
      /* start a new batch at this page's file offset */
      ctx->iov_off = pgno2bytes(env, dp->mp_pgno);
    }
    ctx->iov[ctx->iov_items].iov_base = (void *)dp;
    ctx->iov[ctx->iov_items].iov_len = size;
    ctx->iov_items += 1;
    ctx->iov_bytes += size;
  } else {
    /* WRITEMAP mode: the page was modified directly in the mapping,
     * nothing to queue here — the caller msync's the flush range */
    mdbx_tassert(txn, txn->mt_flags & MDBX_WRITEMAP);
  }
  return MDBX_SUCCESS;
}
8406 
/* Spill a single dirty page: queue it for writing and record it in the
 * transaction's spill list (page numbers are kept shifted left by one). */
static int spill_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp,
                      unsigned npages) {
  mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP));
  /* capture pgno before queuing: the shadow page may be released later */
  const pgno_t pgno = dp->mp_pgno;
  int err = iov_page(txn, ctx, dp, npages);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  err = mdbx_pnl_append_range(true, &txn->tw.spill_pages, pgno << 1, npages);
#if MDBX_ENABLE_PGOP_STAT
  if (likely(err == MDBX_SUCCESS))
    txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages;
#endif /* MDBX_ENABLE_PGOP_STAT */
  return err;
}
8421 
/* Set unspillable LRU-label for dirty pages watched by txn.
 * Returns the number of pages marked as unspillable. */
static unsigned mdbx_cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) {
  unsigned keep = 0;
  while (mc->mc_flags & C_INITIALIZED) {
    /* refresh the LRU-label of every dirty page on the cursor's page stack */
    for (unsigned i = 0; i < mc->mc_snum; ++i) {
      const MDBX_page *mp = mc->mc_pg[i];
      if (IS_MODIFIABLE(txn, mp) && !IS_SUBP(mp)) {
        unsigned const n = mdbx_dpl_search(txn, mp->mp_pgno);
        /* count only pages actually present in the dirty list whose
         * label is not already fresh (age != 0) */
        if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno &&
            mdbx_dpl_age(txn, n)) {
          txn->tw.dirtylist->items[n].lru = txn->tw.dirtylru;
          ++keep;
        }
      }
    }
    /* descend into the nested dupsort cursor, if any */
    if (!mc->mc_xcursor)
      break;
    mc = &mc->mc_xcursor->mx_cursor;
  }
  return keep;
}
8444 
/* Mark as unspillable the dirty pages referenced by every tracked cursor of
 * the txn (plus the extra cursor m0, if given). Returns the total count. */
static unsigned mdbx_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) {
  unsigned kept = 0;
  if (m0)
    kept = mdbx_cursor_keep(txn, m0);
  for (unsigned dbi = FREE_DBI; dbi < txn->mt_numdbs; ++dbi) {
    /* only DBs that are valid, dirty and non-empty can hold dirty pages */
    if (!F_ISSET(txn->mt_dbistate[dbi], DBI_DIRTY | DBI_VALID))
      continue;
    if (txn->mt_dbs[dbi].md_root == P_INVALID)
      continue;
    for (MDBX_cursor *scan = txn->tw.cursors[dbi]; scan; scan = scan->mc_next)
      if (scan != m0)
        kept += mdbx_cursor_keep(txn, scan);
  }
  return kept;
}
8455 
8456 /* Returns the spilling priority (0..255) for a dirty page:
8457  *      0 = should be spilled;
8458  *    ...
8459  *  > 255 = must not be spilled. */
spill_prio(const MDBX_txn * txn,const unsigned i,const uint32_t reciprocal)8460 static unsigned spill_prio(const MDBX_txn *txn, const unsigned i,
8461                            const uint32_t reciprocal) {
8462   MDBX_dpl *const dl = txn->tw.dirtylist;
8463   const uint32_t age = mdbx_dpl_age(txn, i);
8464   const unsigned npages = dpl_npages(dl, i);
8465   const pgno_t pgno = dl->items[i].pgno;
8466   if (age == 0) {
8467     mdbx_debug("skip %s %u page %" PRIaPGNO, "keep", npages, pgno);
8468     return 256;
8469   }
8470 
8471   MDBX_page *const dp = dl->items[i].ptr;
8472   if (dp->mp_flags & (P_LOOSE | P_SPILLED)) {
8473     mdbx_debug("skip %s %u page %" PRIaPGNO,
8474                (dp->mp_flags & P_LOOSE)   ? "loose"
8475                : (dp->mp_flags & P_LOOSE) ? "loose"
8476                                           : "parent-spilled",
8477                npages, pgno);
8478     return 256;
8479   }
8480 
8481   /* Can't spill twice,
8482    * make sure it's not already in a parent's spill list(s). */
8483   MDBX_txn *parent = txn->mt_parent;
8484   if (parent && (parent->mt_flags & MDBX_TXN_SPILLS)) {
8485     do
8486       if (parent->tw.spill_pages &&
8487           mdbx_pnl_intersect(parent->tw.spill_pages, pgno << 1, npages << 1)) {
8488         mdbx_debug("skip-2 parent-spilled %u page %" PRIaPGNO, npages, pgno);
8489         dp->mp_flags |= P_SPILLED;
8490         return 256;
8491       }
8492     while ((parent = parent->mt_parent) != nullptr);
8493   }
8494 
8495   mdbx_tassert(txn, age * (uint64_t)reciprocal < UINT32_MAX);
8496   unsigned prio = age * reciprocal >> 24;
8497   mdbx_tassert(txn, prio < 256);
8498   if (likely(npages == 1))
8499     return prio = 256 - prio;
8500 
8501   /* make a large/overflow pages be likely to spill */
8502   uint32_t factor = npages | npages >> 1;
8503   factor |= factor >> 2;
8504   factor |= factor >> 4;
8505   factor |= factor >> 8;
8506   factor |= factor >> 16;
8507   factor = prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157;
8508   factor = (factor < 256) ? 255 - factor : 0;
8509   mdbx_tassert(txn, factor < 256 && factor < (256 - prio));
8510   return prio = factor;
8511 }
8512 
8513 /* Spill pages from the dirty list back to disk.
8514  * This is intended to prevent running into MDBX_TXN_FULL situations,
8515  * but note that they may still occur in a few cases:
8516  *
8517  * 1) our estimate of the txn size could be too small. Currently this
8518  *  seems unlikely, except with a large number of MDBX_MULTIPLE items.
8519  *
8520  * 2) child txns may run out of space if their parents dirtied a
8521  *  lot of pages and never spilled them. TODO: we probably should do
8522  *  a preemptive spill during mdbx_txn_begin() of a child txn, if
8523  *  the parent's dirtyroom is below a given threshold.
8524  *
8525  * Otherwise, if not using nested txns, it is expected that apps will
8526  * not run into MDBX_TXN_FULL any more. The pages are flushed to disk
8527  * the same way as for a txn commit, e.g. their dirty status is cleared.
8528  * If the txn never references them again, they can be left alone.
8529  * If the txn only reads them, they can be used without any fuss.
8530  * If the txn writes them again, they can be dirtied immediately without
8531  * going thru all of the work of mdbx_page_touch(). Such references are
8532  * handled by mdbx_page_unspill().
8533  *
8534  * Also note, we never spill DB root pages, nor pages of active cursors,
8535  * because we'll need these back again soon anyway. And in nested txns,
8536  * we can't spill a page in a child txn if it was already spilled in a
8537  * parent txn. That would alter the parent txns' data even though
8538  * the child hasn't committed yet, and we'd have no way to undo it if
8539  * the child aborted. */
static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
                          const unsigned need) {
#if xMDBX_DEBUG_SPILLING != 1
  /* production mode */
  if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need))
    return MDBX_SUCCESS;
  unsigned wanna_spill = need - txn->tw.dirtyroom;
#else
  /* debug mode: spill at least one page if xMDBX_DEBUG_SPILLING == 1 */
  unsigned wanna_spill =
      (need > txn->tw.dirtyroom) ? need - txn->tw.dirtyroom : 1;
#endif /* xMDBX_DEBUG_SPILLING */

  /* clamp the amount to spill between the configured min/max fractions
   * of the dirty list length */
  const unsigned dirty = txn->tw.dirtylist->length;
  const unsigned spill_min =
      txn->mt_env->me_options.spill_min_denominator
          ? dirty / txn->mt_env->me_options.spill_min_denominator
          : 0;
  const unsigned spill_max =
      dirty - (txn->mt_env->me_options.spill_max_denominator
                   ? dirty / txn->mt_env->me_options.spill_max_denominator
                   : 0);
  wanna_spill = (wanna_spill > spill_min) ? wanna_spill : spill_min;
  wanna_spill = (wanna_spill < spill_max) ? wanna_spill : spill_max;
  if (!wanna_spill)
    return MDBX_SUCCESS;

  mdbx_notice("spilling %u dirty-entries (have %u dirty-room, need %u)",
              wanna_spill, txn->tw.dirtyroom, need);
  mdbx_tassert(txn, txn->tw.dirtylist->length >= wanna_spill);

  struct mdbx_iov_ctx ctx;
  mdbx_iov_init(txn, &ctx);
  int rc = MDBX_SUCCESS;
  if (txn->mt_flags & MDBX_WRITEMAP) {
    /* WRITEMAP mode: pages live directly in the mapping, so "spilling"
     * means dropping everything but loose pages from the dirty list and
     * msync'ing the touched range */
    MDBX_dpl *const dl = txn->tw.dirtylist;
    const unsigned span = dl->length - txn->tw.loose_count;
    txn->tw.dirtyroom += span;
    unsigned r, w;
    for (w = 0, r = 1; r <= dl->length; ++r) {
      MDBX_page *dp = dl->items[r].ptr;
      if (dp->mp_flags & P_LOOSE)
        dl->items[++w] = dl->items[r];
      else if (!MDBX_FAKE_SPILL_WRITEMAP) {
        rc = iov_page(txn, &ctx, dp, dpl_npages(dl, r));
        mdbx_tassert(txn, rc == MDBX_SUCCESS);
      }
    }

    mdbx_tassert(txn, span == r - 1 - w && w == txn->tw.loose_count);
    dl->sorted = (dl->sorted == dl->length) ? w : 0;
    dpl_setlen(dl, w);
    mdbx_tassert(txn, mdbx_dirtylist_check(txn));

    if (!MDBX_FAKE_SPILL_WRITEMAP && ctx.flush_end > ctx.flush_begin) {
      MDBX_env *const env = txn->mt_env;
#if MDBX_ENABLE_PGOP_STAT
      env->me_lck->mti_pgop_stat.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
      rc = mdbx_msync(&env->me_dxb_mmap,
                      pgno_align2os_bytes(env, ctx.flush_begin),
                      pgno_align2os_bytes(env, ctx.flush_end - ctx.flush_begin),
                      MDBX_SYNC_NONE);
    }
    return rc;
  }

  mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP));
  if (!txn->tw.spill_pages) {
    txn->tw.spill_least_removed = INT_MAX;
    txn->tw.spill_pages = mdbx_pnl_alloc(wanna_spill);
    if (unlikely(!txn->tw.spill_pages)) {
      rc = MDBX_ENOMEM;
    bailout:
      txn->mt_flags |= MDBX_TXN_ERROR;
      return rc;
    }
  } else {
    /* purge deleted slots */
    mdbx_spill_purge(txn);
    rc = mdbx_pnl_reserve(&txn->tw.spill_pages, wanna_spill);
    (void)rc /* ignore since the resulting list may be shorter
     and mdbx_pnl_append() will increase pnl on demand */
        ;
  }

  /* Sort the dirty list so that writing to disk is more sequential */
  MDBX_dpl *const dl = mdbx_dpl_sort(txn);

  /* Preserve pages which may soon be dirtied again */
  const unsigned unspillable = mdbx_txn_keep(txn, m0);
  if (unspillable + txn->tw.loose_count >= dl->length) {
#if xMDBX_DEBUG_SPILLING == 1 /* avoid false failure in debug mode  */
    if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need))
      return MDBX_SUCCESS;
#endif /* xMDBX_DEBUG_SPILLING */
    mdbx_error("all %u dirty pages are unspillable  since referenced "
               "by a cursor(s), use fewer cursors or increase "
               "MDBX_opt_txn_dp_limit",
               unspillable);
    goto done;
  }

  /* Subtask: push some pages out to disk in accordance with LRU,
   * taking the following important corrections into account:
   *  - it is preferable to push out old large/overflow pages, as this frees
   *    more memory and because (in the current understanding) they are much
   *    less likely to be modified again;
   *  - all else being equal, it is better to push out adjacent pages, as
   *    this results in fewer I/O operations;
   *  - it is desirable to spend less time on this than
   *    std::partial_sort_copy would;
   *
   * Solution:
   *  - Quantize the whole range of lru-labels down to 256 values and use one
   *    pass of an 8-bit radix-sort. As a result we get 256 levels of
   *    "freshness", including the lru-label value older than which pages
   *    must be spilled;
   *  - Move sequentially towards increasing page numbers and push out pages
   *    whose lru-label is older than the cutoff value, until enough pages
   *    have been pushed out;
   *  - When meeting pages adjacent to those being pushed out, push them out
   *    too (to reduce the number of I/O operations) if they fall into the
   *    first half between the spill cutoff and the freshest lru-labels;
   *  - additionally, during sorting, large/overflow pages are deliberately
   *    aged, thereby raising their chances of being pushed out. */

  /* get min/max of LRU-labels */
  uint32_t age_max = 0;
  for (unsigned i = 1; i <= dl->length; ++i) {
    const uint32_t age = mdbx_dpl_age(txn, i);
    age_max = (age_max >= age) ? age_max : age;
  }

  mdbx_verbose("lru-head %u, age-max %u", txn->tw.dirtylru, age_max);

  /* half of 8-bit radix-sort */
  unsigned radix_counters[256], spillable = 0, spilled = 0;
  memset(&radix_counters, 0, sizeof(radix_counters));
  const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1);
  for (unsigned i = 1; i <= dl->length; ++i) {
    unsigned prio = spill_prio(txn, i, reciprocal);
    if (prio < 256) {
      radix_counters[prio] += 1;
      spillable += 1;
    }
  }

  if (likely(spillable > 0)) {
    /* walk the histogram to find the spill / co-spill priority cutoffs */
    unsigned prio2spill = 0, prio2adjacent = 128, amount = radix_counters[0];
    for (unsigned i = 1; i < 256; i++) {
      if (amount < wanna_spill) {
        prio2spill = i;
        prio2adjacent = i + (257 - i) / 2;
        amount += radix_counters[i];
      } else if (amount + amount < spillable + wanna_spill
                 /* EQUIVALENT TO: amount - wanna_spill < spillable - amount */) {
        prio2adjacent = i;
        amount += radix_counters[i];
      } else
        break;
    }

    mdbx_verbose("prio2spill %u, prio2adjacent %u, amount %u, spillable %u, "
                 "wanna_spill %u",
                 prio2spill, prio2adjacent, amount, spillable, wanna_spill);
    mdbx_tassert(txn, prio2spill < prio2adjacent && prio2adjacent <= 256);

    unsigned prev_prio = 256;
    unsigned r, w, prio;
    for (w = 0, r = 1; r <= dl->length && spilled < wanna_spill;
         prev_prio = prio, ++r) {
      prio = spill_prio(txn, r, reciprocal);
      MDBX_page *const dp = dl->items[r].ptr;
      if (prio < prio2adjacent) {
        const pgno_t pgno = dl->items[r].pgno;
        const unsigned npages = dpl_npages(dl, r);
        if (prio <= prio2spill) {
          /* also co-spill the preceding adjacent page that was kept
           * only because its priority fell between the two cutoffs */
          if (prev_prio < prio2adjacent && prev_prio > prio2spill &&
              dpl_endpgno(dl, r - 1) == pgno) {
            mdbx_debug("co-spill %u prev-adjacent page %" PRIaPGNO
                       " (age %d, prio %u)",
                       dpl_npages(dl, w), dl->items[r - 1].pgno,
                       mdbx_dpl_age(txn, r - 1), prev_prio);
            --w;
            rc = spill_page(txn, &ctx, dl->items[r - 1].ptr,
                            dpl_npages(dl, r - 1));
            if (unlikely(rc != MDBX_SUCCESS))
              break;
            ++spilled;
          }

          mdbx_debug("spill %u page %" PRIaPGNO " (age %d, prio %u)", npages,
                     dp->mp_pgno, mdbx_dpl_age(txn, r), prio);
          rc = spill_page(txn, &ctx, dp, npages);
          if (unlikely(rc != MDBX_SUCCESS))
            break;
          ++spilled;
          continue;
        }

        /* co-spill a page adjacent to the just-spilled one */
        if (prev_prio <= prio2spill && dpl_endpgno(dl, r - 1) == pgno) {
          mdbx_debug("co-spill %u next-adjacent page %" PRIaPGNO
                     " (age %d, prio %u)",
                     npages, dp->mp_pgno, mdbx_dpl_age(txn, r), prio);
          rc = spill_page(txn, &ctx, dp, npages);
          if (unlikely(rc != MDBX_SUCCESS))
            break;
          prio = prev_prio /* to continue co-spilling next adjacent pages */;
          ++spilled;
          continue;
        }
      }
      /* keep this page in the (compacted) dirty list */
      dl->items[++w] = dl->items[r];
    }

    mdbx_tassert(txn, spillable == 0 || spilled > 0);

    /* compact the remaining tail after an early loop exit */
    while (r <= dl->length)
      dl->items[++w] = dl->items[r++];
    mdbx_tassert(txn, r - 1 - w == spilled);

    dl->sorted = dpl_setlen(dl, w);
    txn->tw.dirtyroom += spilled;
    mdbx_tassert(txn, mdbx_dirtylist_check(txn));

    if (ctx.iov_items)
      rc = mdbx_iov_write(txn, &ctx);

    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;

    mdbx_pnl_sort(txn->tw.spill_pages);
    txn->mt_flags |= MDBX_TXN_SPILLS;
    mdbx_notice("spilled %u dirty-entries, now have %u dirty-room", spilled,
                txn->tw.dirtyroom);
    mdbx_iov_done(txn, &ctx);
  } else {
    /* nothing was spillable: dump the dirty list for diagnostics */
    mdbx_tassert(txn, ctx.iov_items == 0 && rc == MDBX_SUCCESS);
    for (unsigned i = 1; i <= dl->length; ++i) {
      MDBX_page *dp = dl->items[i].ptr;
      mdbx_notice(
          "dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", i,
          dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, mdbx_dpl_age(txn, i),
          spill_prio(txn, i, reciprocal));
    }
  }

#if xMDBX_DEBUG_SPILLING == 2
  if (txn->tw.loose_count + txn->tw.dirtyroom <= need / 2 + 1)
    mdbx_error("dirty-list length: before %u, after %u, parent %i, loose %u; "
               "needed %u, spillable %u; "
               "spilled %u dirty-entries, now have %u dirty-room",
               dl->length + spilled, dl->length,
               (txn->mt_parent && txn->mt_parent->tw.dirtylist)
                   ? (int)txn->mt_parent->tw.dirtylist->length
                   : -1,
               txn->tw.loose_count, need, spillable, spilled,
               txn->tw.dirtyroom);
  mdbx_ensure(txn->mt_env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2);
#endif /* xMDBX_DEBUG_SPILLING */

done:
  return likely(txn->tw.dirtyroom + txn->tw.loose_count >
                ((need > CURSOR_STACK) ? CURSOR_STACK : need))
             ? MDBX_SUCCESS
             : MDBX_TXN_FULL;
}
8807 
/* Spill dirty pages (if required) before a cursor-driven put, so that the
 * upcoming modification cannot exhaust the dirty-room. The value passed to
 * mdbx_txn_spill() is a worst-case estimate of how many pages the put may
 * dirty. */
static int mdbx_cursor_spill(MDBX_cursor *mc, const MDBX_val *key,
                             const MDBX_val *data) {
  MDBX_txn *txn = mc->mc_txn;
  /* 1) Max b-tree height, a reasonable bound including dups' sub-tree. */
  unsigned estimate = CURSOR_STACK + 3;
  if (mc->mc_dbi > FREE_DBI) {
    /* 2) GC/FreeDB pages for any payload. */
    estimate += txn->mt_dbs[FREE_DBI].md_depth + 3;
    if (mc->mc_dbi > MAIN_DBI) {
      /* 3) Named DBs also dirty the main DB. */
      estimate += txn->mt_dbs[MAIN_DBI].md_depth + 3;
    }
  }
#if xMDBX_DEBUG_SPILLING != 2
  /* Production mode:
   * 4) double the page-chain estimation for extensive splitting,
   *    rebalance and merging;
   * 5) factor in the key+data which are to be put. */
  estimate += estimate;
  estimate += bytes2pgno(txn->mt_env, node_size(key, data)) + 1;
#else
  /* Debug mode: record the estimation so mdbx_page_dirty() can
   * cross-check the actually dirtied count against it. */
  (void)key;
  (void)data;
  txn->mt_env->debug_dirtied_est = ++estimate;
  txn->mt_env->debug_dirtied_act = 0;
#endif /* xMDBX_DEBUG_SPILLING == 2 */

  return mdbx_txn_spill(txn, mc, estimate);
}
8838 
8839 /*----------------------------------------------------------------------------*/
8840 
meta_bootid_match(const MDBX_meta * meta)8841 static __always_inline bool meta_bootid_match(const MDBX_meta *meta) {
8842   return memcmp(&meta->mm_bootid, &bootid, 16) == 0 &&
8843          (bootid.x | bootid.y) != 0;
8844 }
8845 
meta_weak_acceptable(const MDBX_env * env,const MDBX_meta * meta,const int lck_exclusive)8846 static bool meta_weak_acceptable(const MDBX_env *env, const MDBX_meta *meta,
8847                                  const int lck_exclusive) {
8848   return lck_exclusive
8849              ? /* exclusive lock */ meta_bootid_match(meta)
8850              : /* db already opened */ env->me_lck_mmap.lck &&
8851                    (env->me_lck_mmap.lck->mti_envmode.weak & MDBX_RDONLY) == 0;
8852 }
8853 
/* Address of the n-th meta-page (n in [0, NUM_METAS)) within the mapped
 * datafile, and the exclusive end of the meta-pages range. */
#define METAPAGE(env, n) page_meta(pgno2page(env, n))
#define METAPAGE_END(env) METAPAGE(env, NUM_METAS)
8856 
/* Read a meta-page's txnid, which is stored as two copies (a/b) that are
 * updated on opposite sides of a fence by the writer (see
 * mdbx_meta_update_begin/end). When `allow_volatile` the caller tolerates a
 * concurrent update and gets 0 for an inconsistent (in-flight) state;
 * otherwise both copies are expected to agree (asserted). */
static __inline txnid_t meta_txnid(const MDBX_env *env, const MDBX_meta *meta,
                                   const bool allow_volatile) {
  /* acquire-fence before reading both halves of the txnid */
  mdbx_memory_fence(mo_AcquireRelease, false);
  txnid_t a = unaligned_peek_u64(4, &meta->mm_txnid_a);
  txnid_t b = unaligned_peek_u64(4, &meta->mm_txnid_b);
  if (allow_volatile)
    /* 0 signals "update in progress, retry" to the caller */
    return (a == b) ? a : 0;
  mdbx_assert(env, a == b);
  return a;
}
8867 
mdbx_meta_txnid_stable(const MDBX_env * env,const MDBX_meta * meta)8868 static __inline txnid_t mdbx_meta_txnid_stable(const MDBX_env *env,
8869                                                const MDBX_meta *meta) {
8870   return meta_txnid(env, meta, false);
8871 }
8872 
mdbx_meta_txnid_fluid(const MDBX_env * env,const MDBX_meta * meta)8873 static __inline txnid_t mdbx_meta_txnid_fluid(const MDBX_env *env,
8874                                               const MDBX_meta *meta) {
8875   return meta_txnid(env, meta, true);
8876 }
8877 
/* Begin an update of a live meta-page: publish the new txnid into copy `a`
 * while zeroing copy `b`, so concurrent fluid readers observe a mismatch
 * and treat the meta as "in flight". The release-fence between the two
 * stores is essential — do not reorder. */
static __inline void mdbx_meta_update_begin(const MDBX_env *env,
                                            MDBX_meta *meta, txnid_t txnid) {
  /* must target an actual mapped meta-page, and txnid must advance */
  mdbx_assert(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env));
  mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_a) < txnid &&
                       unaligned_peek_u64(4, meta->mm_txnid_b) < txnid);
  (void)env;
  /* invalidate copy `b` first, then publish `a` after the fence */
  unaligned_poke_u64(4, meta->mm_txnid_b, 0);
  mdbx_memory_fence(mo_AcquireRelease, true);
  unaligned_poke_u64(4, meta->mm_txnid_a, txnid);
}
8888 
/* Finish an update started by mdbx_meta_update_begin(): stamp the current
 * boot-id and write the txnid into copy `b`, making both copies agree again
 * so fluid readers see a consistent meta. The trailing release-fence makes
 * the completed update visible — do not reorder. */
static __inline void mdbx_meta_update_end(const MDBX_env *env, MDBX_meta *meta,
                                          txnid_t txnid) {
  mdbx_assert(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env));
  /* copy `a` must already carry the new txnid, copy `b` must still be stale */
  mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_a) == txnid);
  mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_b) < txnid);
  (void)env;
  mdbx_jitter4testing(true);
  memcpy(&meta->mm_bootid, &bootid, 16);
  unaligned_poke_u64(4, meta->mm_txnid_b, txnid);
  mdbx_memory_fence(mo_AcquireRelease, true);
}
8900 
/* Stamp a txnid and the current boot-id into a meta-IMAGE that is being
 * prepared for writing. This is never applied to a live mapped meta-page
 * (asserted), hence no fences or update protocol are needed here and the
 * two txnid copies may be written in any order. */
static __inline void mdbx_meta_set_txnid(const MDBX_env *env, MDBX_meta *meta,
                                         txnid_t txnid) {
  mdbx_assert(env, !env->me_map || meta < METAPAGE(env, 0) ||
                       meta >= METAPAGE_END(env));
  (void)env;
  memcpy(&meta->mm_bootid, &bootid, 16);
  unaligned_poke_u64(4, meta->mm_txnid_b, txnid);
  unaligned_poke_u64(4, meta->mm_txnid_a, txnid);
}
8912 
mdbx_meta_sign(const MDBX_meta * meta)8913 static __inline uint64_t mdbx_meta_sign(const MDBX_meta *meta) {
8914   uint64_t sign = MDBX_DATASIGN_NONE;
8915 #if 0 /* TODO */
8916   sign = hippeus_hash64(...);
8917 #else
8918   (void)meta;
8919 #endif
8920   /* LY: newer returns MDBX_DATASIGN_NONE or MDBX_DATASIGN_WEAK */
8921   return (sign > MDBX_DATASIGN_WEAK) ? sign : ~sign;
8922 }
8923 
/* Strategy for choosing among the three meta-pages: prefer the one with the
 * most recent txnid, or prefer the most recent steady (durably synced) one. */
enum meta_choise_mode { prefer_last, prefer_steady };
8925 
/* Returns true iff meta `a` is "older than" meta `b` under the given mode.
 * In prefer_steady mode a steady meta always beats a weak one; otherwise
 * (and as tie-break within prefer_steady) the larger fluid txnid wins, with
 * equal txnids resolved in favor of the steady meta. Note the deliberate
 * switch fallthrough from prefer_steady into prefer_last. */
static __inline bool mdbx_meta_ot(const enum meta_choise_mode mode,
                                  const MDBX_env *env, const MDBX_meta *a,
                                  const MDBX_meta *b) {
  mdbx_jitter4testing(true);
  /* fluid reads: 0 means "update in flight" and simply loses comparisons */
  txnid_t txnid_a = mdbx_meta_txnid_fluid(env, a);
  txnid_t txnid_b = mdbx_meta_txnid_fluid(env, b);

  mdbx_jitter4testing(true);
  switch (mode) {
  default:
    assert(false);
    __unreachable();
    /* fall through */
    __fallthrough;
  case prefer_steady:
    /* steadiness dominates: `a` is older exactly when `b` is the steady one */
    if (META_IS_STEADY(a) != META_IS_STEADY(b))
      return META_IS_STEADY(b);
    /* fall through */
    __fallthrough;
  case prefer_last:
    mdbx_jitter4testing(true);
    if (txnid_a == txnid_b)
      return META_IS_STEADY(b);
    return txnid_a < txnid_b;
  }
}
8952 
mdbx_meta_eq(const MDBX_env * env,const MDBX_meta * a,const MDBX_meta * b)8953 static __inline bool mdbx_meta_eq(const MDBX_env *env, const MDBX_meta *a,
8954                                   const MDBX_meta *b) {
8955   mdbx_jitter4testing(true);
8956   const txnid_t txnid = mdbx_meta_txnid_fluid(env, a);
8957   if (!txnid || txnid != mdbx_meta_txnid_fluid(env, b))
8958     return false;
8959 
8960   mdbx_jitter4testing(true);
8961   if (META_IS_STEADY(a) != META_IS_STEADY(b))
8962     return false;
8963 
8964   mdbx_jitter4testing(true);
8965   return true;
8966 }
8967 
mdbx_meta_eq_mask(const MDBX_env * env)8968 static int mdbx_meta_eq_mask(const MDBX_env *env) {
8969   MDBX_meta *m0 = METAPAGE(env, 0);
8970   MDBX_meta *m1 = METAPAGE(env, 1);
8971   MDBX_meta *m2 = METAPAGE(env, 2);
8972 
8973   int rc = mdbx_meta_eq(env, m0, m1) ? 1 : 0;
8974   if (mdbx_meta_eq(env, m1, m2))
8975     rc += 2;
8976   if (mdbx_meta_eq(env, m2, m0))
8977     rc += 4;
8978   return rc;
8979 }
8980 
/* Of two distinct metas, return the one the given mode considers newer. */
static __inline MDBX_meta *mdbx_meta_recent(const enum meta_choise_mode mode,
                                            const MDBX_env *env, MDBX_meta *a,
                                            MDBX_meta *b) {
  const bool b_is_newer = mdbx_meta_ot(mode, env, a, b);
  mdbx_assert(env, !mdbx_meta_eq(env, a, b));
  if (b_is_newer)
    return b;
  return a;
}
8988 
/* Of two distinct metas, return the one the given mode considers older. */
static __inline MDBX_meta *mdbx_meta_ancient(const enum meta_choise_mode mode,
                                             const MDBX_env *env, MDBX_meta *a,
                                             MDBX_meta *b) {
  const bool a_is_older = mdbx_meta_ot(mode, env, a, b);
  mdbx_assert(env, !mdbx_meta_eq(env, a, b));
  if (a_is_older)
    return a;
  return b;
}
8996 
8997 static __inline MDBX_meta *
mdbx_meta_mostrecent(const enum meta_choise_mode mode,const MDBX_env * env)8998 mdbx_meta_mostrecent(const enum meta_choise_mode mode, const MDBX_env *env) {
8999   MDBX_meta *m0 = METAPAGE(env, 0);
9000   MDBX_meta *m1 = METAPAGE(env, 1);
9001   MDBX_meta *m2 = METAPAGE(env, 2);
9002 
9003   MDBX_meta *head = mdbx_meta_recent(mode, env, m0, m1);
9004   head = mdbx_meta_recent(mode, env, head, m2);
9005   return head;
9006 }
9007 
mdbx_meta_steady(const MDBX_env * env)9008 static MDBX_meta *mdbx_meta_steady(const MDBX_env *env) {
9009   return mdbx_meta_mostrecent(prefer_steady, env);
9010 }
9011 
mdbx_meta_head(const MDBX_env * env)9012 static MDBX_meta *mdbx_meta_head(const MDBX_env *env) {
9013   return mdbx_meta_mostrecent(prefer_last, env);
9014 }
9015 
/* Snapshot the txnid of the most recently committed transaction.
 * Lock-free: a writer may be switching the head meta or updating its txnid
 * concurrently, so re-read until both the head pointer and its txnid are
 * observed consistently twice in a row. */
static txnid_t mdbx_recent_committed_txnid(const MDBX_env *env) {
  while (true) {
    const MDBX_meta *head = mdbx_meta_head(env);
    const txnid_t recent = mdbx_meta_txnid_fluid(env, head);
    mdbx_compiler_barrier();
    if (likely(head == mdbx_meta_head(env) &&
               recent == mdbx_meta_txnid_fluid(env, head)))
      return recent;
  }
}
9026 
/* Snapshot the txnid of the most recent steady (durably synced) commit.
 * Same lock-free double-read pattern as mdbx_recent_committed_txnid(). */
static txnid_t mdbx_recent_steady_txnid(const MDBX_env *env) {
  while (true) {
    const MDBX_meta *head = mdbx_meta_steady(env);
    const txnid_t recent = mdbx_meta_txnid_fluid(env, head);
    mdbx_compiler_barrier();
    if (likely(head == mdbx_meta_steady(env) &&
               recent == mdbx_meta_txnid_fluid(env, head)))
      return recent;
  }
}
9037 
mdbx_durable_str(const MDBX_meta * const meta)9038 static const char *mdbx_durable_str(const MDBX_meta *const meta) {
9039   if (META_IS_STEADY(meta))
9040     return (unaligned_peek_u64(4, meta->mm_datasync_sign) ==
9041             mdbx_meta_sign(meta))
9042                ? "Steady"
9043                : "Tainted";
9044   return "Weak";
9045 }
9046 
9047 /*----------------------------------------------------------------------------*/
9048 
9049 /* Find oldest txnid still referenced. */
mdbx_find_oldest(const MDBX_txn * txn)9050 static txnid_t mdbx_find_oldest(const MDBX_txn *txn) {
9051   mdbx_tassert(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0);
9052   MDBX_env *env = txn->mt_env;
9053   const txnid_t edge = mdbx_recent_steady_txnid(env);
9054   mdbx_tassert(txn, edge <= txn->mt_txnid);
9055 
9056   MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
9057   if (unlikely(lck == NULL /* exclusive mode */))
9058     return atomic_store64(&lck->mti_oldest_reader, edge, mo_Relaxed);
9059 
9060   const txnid_t last_oldest =
9061       atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease);
9062   mdbx_tassert(txn, edge >= last_oldest);
9063   if (likely(last_oldest == edge))
9064     return edge;
9065 
9066   const uint32_t nothing_changed = MDBX_STRING_TETRAD("None");
9067   const uint32_t snap_readers_refresh_flag =
9068       atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease);
9069   mdbx_jitter4testing(false);
9070   if (snap_readers_refresh_flag == nothing_changed)
9071     return last_oldest;
9072 
9073   txnid_t oldest = edge;
9074   atomic_store32(&lck->mti_readers_refresh_flag, nothing_changed, mo_Relaxed);
9075   const unsigned snap_nreaders =
9076       atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
9077   for (unsigned i = 0; i < snap_nreaders; ++i) {
9078     if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) {
9079       /* mdbx_jitter4testing(true); */
9080       const txnid_t snap = safe64_read(&lck->mti_readers[i].mr_txnid);
9081       if (oldest > snap && last_oldest <= /* ignore pending updates */ snap) {
9082         oldest = snap;
9083         if (oldest == last_oldest)
9084           return oldest;
9085       }
9086     }
9087   }
9088 
9089   if (oldest != last_oldest) {
9090     mdbx_notice("update oldest %" PRIaTXN " -> %" PRIaTXN, last_oldest, oldest);
9091     mdbx_tassert(txn, oldest >= lck->mti_oldest_reader.weak);
9092     atomic_store64(&lck->mti_oldest_reader, oldest, mo_Relaxed);
9093   }
9094   return oldest;
9095 }
9096 
/* Find the largest page-count of any mvcc-snapshot still referenced by a
 * reader, i.e. the number of pages which must remain mapped/valid.
 * Returns max(largest, every live reader's snapshot_pages_used). */
__cold static pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) {
  MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
  if (likely(lck != NULL /* check for exclusive without-lck mode */)) {
    const unsigned snap_nreaders =
        atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
    for (unsigned i = 0; i < snap_nreaders; ++i) {
    retry:
      if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) {
        /* mdbx_jitter4testing(true); */
        /* Read the pair (pages_used, txnid), then re-read to verify the
         * reader slot wasn't concurrently updated in between; retry the
         * slot if the snapshot is torn. */
        const pgno_t snap_pages = atomic_load32(
            &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed);
        const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid);
        if (unlikely(
                snap_pages !=
                    atomic_load32(&lck->mti_readers[i].mr_snapshot_pages_used,
                                  mo_AcquireRelease) ||
                snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid)))
          goto retry;
        /* Count only plausible snapshots: not older than the oldest-reader
         * watermark and not newer than the current write-txn. */
        if (largest < snap_pages &&
            atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease) <=
                /* ignore pending updates */ snap_txnid &&
            snap_txnid <= env->me_txn0->mt_txnid)
          largest = snap_pages;
      }
    }
  }

  return largest;
}
9127 
/* Add a page (or a run of `npages` pages starting at `mp`) to the txn's
 * dirty list, accounting for dirty-room. If the dirty-room is exhausted,
 * first tries to reclaim one loose page; otherwise fails with
 * MDBX_TXN_FULL. On any failure the page is freed (unless WRITEMAP) and
 * the txn is marked with MDBX_TXN_ERROR. */
static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp,
                                               unsigned npages) {
#if xMDBX_DEBUG_SPILLING == 2
  /* cross-check against the estimation recorded by mdbx_cursor_spill() */
  txn->mt_env->debug_dirtied_act += 1;
  mdbx_ensure(txn->mt_env,
              txn->mt_env->debug_dirtied_act < txn->mt_env->debug_dirtied_est);
  mdbx_ensure(txn->mt_env, txn->tw.dirtyroom + txn->tw.loose_count > 0);
#endif /* xMDBX_DEBUG_SPILLING == 2 */

  int rc;
  mp->mp_txnid = txn->mt_front;
  if (unlikely(txn->tw.dirtyroom == 0)) {
    if (txn->tw.loose_count) {
      /* Purge one loose page: move its pgno to the reclaimed list, drop it
       * from the dirty list, and thereby regain one unit of dirty-room. */
      MDBX_page *loose = txn->tw.loose_pages;
      mdbx_debug("purge-and-reclaim loose page %" PRIaPGNO, loose->mp_pgno);
      rc = mdbx_pnl_insert_range(&txn->tw.reclaimed_pglist, loose->mp_pgno, 1);
      if (unlikely(rc != MDBX_SUCCESS))
        goto bailout;
      unsigned di = mdbx_dpl_search(txn, loose->mp_pgno);
      mdbx_tassert(txn, txn->tw.dirtylist->items[di].ptr == loose);
      mdbx_dpl_remove(txn, di);
      txn->tw.loose_pages = loose->mp_next;
      txn->tw.loose_count--;
      txn->tw.dirtyroom++;
      if (!(txn->mt_flags & MDBX_WRITEMAP))
        mdbx_dpage_free(txn->mt_env, loose, 1);
    } else {
      mdbx_error("Dirtyroom is depleted, DPL length %u",
                 txn->tw.dirtylist->length);
      if (!(txn->mt_flags & MDBX_WRITEMAP))
        mdbx_dpage_free(txn->mt_env, mp, npages);
      return MDBX_TXN_FULL;
    }
  }

  rc = mdbx_dpl_append(txn, mp->mp_pgno, mp, npages);
  if (unlikely(rc != MDBX_SUCCESS)) {
  bailout:
    txn->mt_flags |= MDBX_TXN_ERROR;
    return rc;
  }
  txn->tw.dirtyroom--;
  mdbx_tassert(txn, mdbx_dirtylist_check(txn));
  return MDBX_SUCCESS;
}
9174 
9175 #if !(defined(_WIN32) || defined(_WIN64))
/* Normalize the various "operation not supported / not implemented" errno
 * values to MDBX_RESULT_TRUE so callers can treat an unsupported hint
 * (e.g. madvise/fadvise) as a benign no-op; any other error is passed
 * through unchanged. The #ifdef chain copes with platforms where some of
 * these errno macros are absent (or alias each other). */
MDBX_MAYBE_UNUSED static __always_inline int ignore_enosys(int err) {
#ifdef ENOSYS
  if (err == ENOSYS)
    return MDBX_RESULT_TRUE;
#endif /* ENOSYS */
#ifdef ENOIMPL
  if (err == ENOIMPL)
    return MDBX_RESULT_TRUE;
#endif /* ENOIMPL */
#ifdef ENOTSUP
  if (err == ENOTSUP)
    return MDBX_RESULT_TRUE;
#endif /* ENOTSUP */
#ifdef ENOSUPP
  if (err == ENOSUPP)
    return MDBX_RESULT_TRUE;
#endif /* ENOSUPP */
#ifdef EOPNOTSUPP
  if (err == EOPNOTSUPP)
    return MDBX_RESULT_TRUE;
#endif /* EOPNOTSUPP */
  if (err == EAGAIN)
    return MDBX_RESULT_TRUE;
  return err;
}
9201 #endif /* defined(_WIN32) || defined(_WIN64) */
9202 
9203 #if MDBX_ENABLE_MADVISE
/* Turn on/off readahead. It's harmful when the DB is larger than RAM.
 *
 * `edge` is the page number up to which the setting applies; `enable`
 * selects normal/willneed vs random access hints; `force_whole` forces
 * re-advising the whole range instead of just the delta against the
 * previously advised edge (cached in mti_readahead_anchor: bit 0 is the
 * enable flag, the remaining bits hold the previous edge). */
__cold static int mdbx_set_readahead(MDBX_env *env, const pgno_t edge,
                                     const bool enable,
                                     const bool force_whole) {
  mdbx_assert(env, edge >= NUM_METAS && edge <= MAX_PAGENO);
  mdbx_assert(env, (enable & 1) == (enable != 0));
  /* toggle = must re-advise from offset 0: forced, enable-flag changed,
   * or no anchor recorded yet */
  const bool toggle = force_whole ||
                      ((enable ^ env->me_lck->mti_readahead_anchor) & 1) ||
                      !env->me_lck->mti_readahead_anchor;
  const pgno_t prev_edge = env->me_lck->mti_readahead_anchor >> 1;
  const size_t limit = env->me_dxb_mmap.limit;
  /* advise only the [min(prev,edge), max(prev,edge)) delta unless toggling,
   * clamped to the mapping limit */
  size_t offset =
      toggle ? 0
             : pgno_align2os_bytes(env, (prev_edge < edge) ? prev_edge : edge);
  offset = (offset < limit) ? offset : limit;

  size_t length =
      pgno_align2os_bytes(env, (prev_edge < edge) ? edge : prev_edge);
  length = (length < limit) ? length : limit;
  length -= offset;

  mdbx_assert(env, 0 <= (intptr_t)length);
  if (length == 0)
    return MDBX_SUCCESS;

  mdbx_notice("readahead %s %u..%u", enable ? "ON" : "OFF",
              bytes2pgno(env, offset), bytes2pgno(env, offset + length));

#if defined(F_RDAHEAD)
  if (toggle && unlikely(fcntl(env->me_lazy_fd, F_RDAHEAD, enable) == -1))
    return errno;
#endif /* F_RDAHEAD */

  int err;
  if (enable) {
    /* Enable readahead: advise NORMAL access, then (when toggling) hint
     * that the range will be needed soon. */
#if defined(MADV_NORMAL)
    err = madvise(env->me_map + offset, length, MADV_NORMAL)
              ? ignore_enosys(errno)
              : MDBX_SUCCESS;
    if (unlikely(MDBX_IS_ERROR(err)))
      return err;
#elif defined(POSIX_MADV_NORMAL)
    err = ignore_enosys(
        posix_madvise(env->me_map + offset, length, POSIX_MADV_NORMAL));
    if (unlikely(MDBX_IS_ERROR(err)))
      return err;
#elif defined(POSIX_FADV_NORMAL) && defined(POSIX_FADV_WILLNEED)
    err = ignore_enosys(
        posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_NORMAL));
    if (unlikely(MDBX_IS_ERROR(err)))
      return err;
#elif defined(_WIN32) || defined(_WIN64)
    /* no madvise on Windows */
#else
#warning "FIXME"
#endif
    if (toggle) {
      /* NOTE: Seems there is a bug in the Mach/Darwin/OSX kernel,
       * because MADV_WILLNEED with offset != 0 may cause SIGBUS
       * on following access to the hinted region.
       * 19.6.0 Darwin Kernel Version 19.6.0: Tue Jan 12 22:13:05 PST 2021;
       * root:xnu-6153.141.16~1/RELEASE_X86_64 x86_64 */
#if defined(F_RDADVISE)
      struct radvisory hint;
      hint.ra_offset = offset;
      hint.ra_count = length;
      (void)/* Ignore ENOTTY for DB on the ram-disk and so on */ fcntl(
          env->me_lazy_fd, F_RDADVISE, &hint);
#elif defined(MADV_WILLNEED)
      err = madvise(env->me_map + offset, length, MADV_WILLNEED)
                ? ignore_enosys(errno)
                : MDBX_SUCCESS;
      if (unlikely(MDBX_IS_ERROR(err)))
        return err;
#elif defined(POSIX_MADV_WILLNEED)
      err = ignore_enosys(
          posix_madvise(env->me_map + offset, length, POSIX_MADV_WILLNEED));
      if (unlikely(MDBX_IS_ERROR(err)))
        return err;
#elif defined(_WIN32) || defined(_WIN64)
      if (mdbx_PrefetchVirtualMemory) {
        WIN32_MEMORY_RANGE_ENTRY hint;
        hint.VirtualAddress = env->me_map + offset;
        hint.NumberOfBytes = length;
        (void)mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0);
      }
#elif defined(POSIX_FADV_WILLNEED)
      err = ignore_enosys(
          posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_WILLNEED));
      if (unlikely(MDBX_IS_ERROR(err)))
        return err;
#else
#warning "FIXME"
#endif
    }
  } else {
    /* Disable readahead: advise RANDOM access for the range. */
#if defined(MADV_RANDOM)
    err = madvise(env->me_map + offset, length, MADV_RANDOM)
              ? ignore_enosys(errno)
              : MDBX_SUCCESS;
    if (unlikely(MDBX_IS_ERROR(err)))
      return err;
#elif defined(POSIX_MADV_RANDOM)
    err = ignore_enosys(
        posix_madvise(env->me_map + offset, length, POSIX_MADV_RANDOM));
    if (unlikely(MDBX_IS_ERROR(err)))
      return err;
#elif defined(POSIX_FADV_RANDOM)
    err = ignore_enosys(
        posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_RANDOM));
    if (unlikely(MDBX_IS_ERROR(err)))
      return err;
#elif defined(_WIN32) || defined(_WIN64)
    /* no madvise on Windows */
#else
#warning "FIXME"
#endif /* MADV_RANDOM */
  }

  /* remember the new (enable, edge) anchor for delta-advising next time */
  env->me_lck->mti_readahead_anchor = (enable & 1) + (edge << 1);
  err = MDBX_SUCCESS;
  return err;
}
9327 #endif /* MDBX_ENABLE_MADVISE */
9328 
/* Resize the datafile and its mapping to `size_pgno` pages with a mapping
 * limit of `limit_pgno` pages, while `used_pgno` pages are actually in use.
 * `implicit` means the resize was not explicitly requested by the user, in
 * which case remapping to a different address is avoided when possible.
 * Handles platform-specific remap hazards: on Windows local threads are
 * suspended around the remap; elsewhere the readers table is locked when
 * the mapping may move. Returns MDBX_SUCCESS, a benign
 * MDBX_UNABLE_EXTEND_MAPSIZE/MDBX_RESULT_TRUE, or an error (MDBX_PANIC if
 * the mapping was lost). */
__cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno,
                                 const pgno_t size_pgno,
                                 const pgno_t limit_pgno, const bool implicit) {
  const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno);
  const size_t size_bytes = pgno_align2os_bytes(env, size_pgno);
  const size_t prev_size = env->me_dxb_mmap.current;
  const size_t prev_limit = env->me_dxb_mmap.limit;
#if MDBX_ENABLE_MADVISE || defined(MDBX_USE_VALGRIND)
  const void *const prev_addr = env->me_map;
#endif /* MDBX_ENABLE_MADVISE || MDBX_USE_VALGRIND */

  mdbx_verbose("resize datafile/mapping: "
               "present %" PRIuPTR " -> %" PRIuPTR ", "
               "limit %" PRIuPTR " -> %" PRIuPTR,
               prev_size, size_bytes, prev_limit, limit_bytes);

  mdbx_assert(env, limit_bytes >= size_bytes);
  mdbx_assert(env, bytes2pgno(env, size_bytes) >= size_pgno);
  mdbx_assert(env, bytes2pgno(env, limit_bytes) >= limit_pgno);

  unsigned mresize_flags =
      env->me_flags & (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_UTTERLY_NOSYNC);
#if defined(_WIN32) || defined(_WIN64)
  /* Acquire guard in exclusive mode for:
   *   - to avoid collision between read and write txns around env->me_dbgeo;
   *   - to avoid attachment of new reading threads (see mdbx_rdt_lock); */
  mdbx_srwlock_AcquireExclusive(&env->me_remap_guard);
  mdbx_handle_array_t *suspended = NULL;
  mdbx_handle_array_t array_onstack;
  int rc = MDBX_SUCCESS;
  if (limit_bytes == env->me_dxb_mmap.limit &&
      size_bytes == env->me_dxb_mmap.current &&
      size_bytes == env->me_dxb_mmap.filesize)
    goto bailout;

  if ((env->me_flags & MDBX_NOTLS) == 0) {
    /* 1) Windows allows only extending a read-write section, but not a
     *    corresponding mapped view. Therefore in other cases we must suspend
     *    the local threads for safe remap.
     * 2) At least on Windows 10 1803 the entire mapped section is unavailable
     *    for short time during NtExtendSection() or VirtualAlloc() execution.
     * 3) Under Wine runtime environment on Linux a section extending is not
     *    supported.
     *
     * THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! */
    array_onstack.limit = ARRAY_LENGTH(array_onstack.handles);
    array_onstack.count = 0;
    suspended = &array_onstack;
    rc = mdbx_suspend_threads_before_remap(env, &suspended);
    if (rc != MDBX_SUCCESS) {
      mdbx_error("failed suspend-for-remap: errcode %d", rc);
      goto bailout;
    }
    mresize_flags |= implicit ? MDBX_MRESIZE_MAY_UNMAP
                              : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE;
  }
#else  /* Windows */
  /* Acquire guard to avoid collision between read and write txns
   * around env->me_dbgeo */
  int rc = mdbx_fastmutex_acquire(&env->me_remap_guard);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (limit_bytes == env->me_dxb_mmap.limit &&
      size_bytes == env->me_dxb_mmap.current)
    goto bailout;

  MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
  if (limit_bytes != env->me_dxb_mmap.limit && !(env->me_flags & MDBX_NOTLS) &&
      lck && !implicit) {
    int err = mdbx_rdt_lock(env) /* lock readers table until remap done */;
    if (unlikely(MDBX_IS_ERROR(err))) {
      rc = err;
      goto bailout;
    }

    /* looking for readers from this process */
    const unsigned snap_nreaders =
        atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
    mdbx_assert(env, !implicit);
    mresize_flags |= MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE;
    for (unsigned i = 0; i < snap_nreaders; ++i) {
      if (lck->mti_readers[i].mr_pid.weak == env->me_pid &&
          lck->mti_readers[i].mr_tid.weak != mdbx_thread_self()) {
        /* the base address of the mapping can't be changed since
         * the other reader thread from this process exists. */
        mdbx_rdt_unlock(env);
        mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE);
        break;
      }
    }
  }
#endif /* ! Windows */

  /* flush not-yet-synced pages before the mapping may be shrunk/moved */
  if ((env->me_flags & MDBX_WRITEMAP) && env->me_lck->mti_unsynced_pages.weak) {
#if MDBX_ENABLE_PGOP_STAT
    env->me_lck->mti_pgop_stat.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    rc = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno),
                    MDBX_SYNC_NONE);
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;
  }

#if MDBX_ENABLE_MADVISE
  /* shrinking: release the tail pages back to the OS */
  if (size_bytes < prev_size) {
    mdbx_notice("resize-MADV_%s %u..%u",
                (env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED",
                size_pgno, bytes2pgno(env, prev_size));
    rc = MDBX_RESULT_TRUE;
#if defined(MADV_REMOVE)
    if (env->me_flags & MDBX_WRITEMAP)
      rc =
          madvise(env->me_map + size_bytes, prev_size - size_bytes, MADV_REMOVE)
              ? ignore_enosys(errno)
              : MDBX_SUCCESS;
#endif /* MADV_REMOVE */
#if defined(MADV_DONTNEED)
    if (rc == MDBX_RESULT_TRUE)
      rc = madvise(env->me_map + size_bytes, prev_size - size_bytes,
                   MADV_DONTNEED)
               ? ignore_enosys(errno)
               : MDBX_SUCCESS;
#elif defined(POSIX_MADV_DONTNEED)
    if (rc == MDBX_RESULT_TRUE)
      rc = ignore_enosys(posix_madvise(env->me_map + size_bytes,
                                       prev_size - size_bytes,
                                       POSIX_MADV_DONTNEED));
#elif defined(POSIX_FADV_DONTNEED)
    if (rc == MDBX_RESULT_TRUE)
      rc = ignore_enosys(posix_fadvise(env->me_lazy_fd, size_bytes,
                                       prev_size - size_bytes,
                                       POSIX_FADV_DONTNEED));
#endif /* MADV_DONTNEED */
    if (unlikely(MDBX_IS_ERROR(rc)))
      goto bailout;
    if (env->me_lck->mti_discarded_tail.weak > size_pgno)
      env->me_lck->mti_discarded_tail.weak = size_pgno;
  }
#endif /* MDBX_ENABLE_MADVISE */

  rc = mdbx_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes);

#if MDBX_ENABLE_MADVISE
  if (rc == MDBX_SUCCESS) {
    env->me_lck->mti_discarded_tail.weak = size_pgno;
    const bool readahead =
        !(env->me_flags & MDBX_NORDAHEAD) &&
        mdbx_is_readahead_reasonable(size_bytes, -(intptr_t)prev_size);
    const bool force = limit_bytes != prev_limit ||
                       env->me_dxb_mmap.address != prev_addr
#if defined(_WIN32) || defined(_WIN64)
                       || prev_size > size_bytes
#endif /* Windows */
        ;
    rc = mdbx_set_readahead(env, size_pgno, readahead, force);
  }
#endif /* MDBX_ENABLE_MADVISE */

bailout:
  if (rc == MDBX_SUCCESS) {
    mdbx_assert(env, size_bytes == env->me_dxb_mmap.current);
    mdbx_assert(env, size_bytes <= env->me_dxb_mmap.filesize);
    mdbx_assert(env, limit_bytes == env->me_dxb_mmap.limit);
#ifdef MDBX_USE_VALGRIND
    if (prev_limit != env->me_dxb_mmap.limit || prev_addr != env->me_map) {
      VALGRIND_DISCARD(env->me_valgrind_handle);
      env->me_valgrind_handle = 0;
      if (env->me_dxb_mmap.limit)
        env->me_valgrind_handle =
            VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx");
    }
#endif /* MDBX_USE_VALGRIND */
  } else {
    if (rc != MDBX_UNABLE_EXTEND_MAPSIZE && rc != MDBX_RESULT_TRUE) {
      mdbx_error("failed resize datafile/mapping: "
                 "present %" PRIuPTR " -> %" PRIuPTR ", "
                 "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d",
                 prev_size, size_bytes, prev_limit, limit_bytes, rc);
    } else {
      mdbx_warning("unable resize datafile/mapping: "
                   "present %" PRIuPTR " -> %" PRIuPTR ", "
                   "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d",
                   prev_size, size_bytes, prev_limit, limit_bytes, rc);
    }
    /* the mapping is gone — the environment is unusable */
    if (!env->me_dxb_mmap.address) {
      env->me_flags |= MDBX_FATAL_ERROR;
      if (env->me_txn)
        env->me_txn->mt_flags |= MDBX_TXN_ERROR;
      rc = MDBX_PANIC;
    }
  }

#if defined(_WIN32) || defined(_WIN64)
  int err = MDBX_SUCCESS;
  mdbx_srwlock_ReleaseExclusive(&env->me_remap_guard);
  if (suspended) {
    err = mdbx_resume_threads_after_remap(suspended);
    if (suspended != &array_onstack)
      mdbx_free(suspended);
  }
#else
  if (env->me_lck_mmap.lck &&
      (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) != 0)
    mdbx_rdt_unlock(env);
  int err = mdbx_fastmutex_release(&env->me_remap_guard);
#endif /* Windows */
  if (err != MDBX_SUCCESS) {
    mdbx_fatal("failed resume-after-remap: errcode %d", err);
    return MDBX_PANIC;
  }
  return rc;
}
9541 
mdbx_mapresize_implicit(MDBX_env * env,const pgno_t used_pgno,const pgno_t size_pgno,const pgno_t limit_pgno)9542 __cold static int mdbx_mapresize_implicit(MDBX_env *env, const pgno_t used_pgno,
9543                                           const pgno_t size_pgno,
9544                                           const pgno_t limit_pgno) {
9545   const pgno_t mapped_pgno = bytes2pgno(env, env->me_dxb_mmap.limit);
9546   mdbx_assert(env, mapped_pgno >= used_pgno);
9547   return mdbx_mapresize(
9548       env, used_pgno, size_pgno,
9549       (size_pgno > mapped_pgno)
9550           ? limit_pgno
9551           : /* The actual mapsize may be less since the geo.upper may be changed
9552                by other process. So, avoids remapping until it necessary. */
9553           mapped_pgno,
9554       true);
9555 }
9556 
/* Wipe the datasync signature of a steady meta-page whose stable txnid is
 * not newer than last_steady, i.e. demote it to a weak (unsteady) one.
 *
 * Returns MDBX_SUCCESS, or the error from mdbx_pwrite() when the signature
 * has to be rewritten through the file descriptor. */
static int mdbx_meta_unsteady(MDBX_env *env, const txnid_t last_steady,
                              MDBX_meta *const meta, mdbx_filehandle_t fd) {
  const uint64_t wipe = MDBX_DATASIGN_NONE;

  /* Nothing to do for meta-pages that are already weak or too new. */
  if (likely(!META_IS_STEADY(meta)) ||
      mdbx_meta_txnid_stable(env, meta) > last_steady)
    return MDBX_SUCCESS;

  mdbx_warning("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, last_steady,
               data_page(meta)->mp_pgno);

  if (env->me_flags & MDBX_WRITEMAP) {
    /* Writable mapping: poke the wiped signature straight into the page. */
    unaligned_poke_u64(4, meta->mm_datasync_sign, wipe);
    return MDBX_SUCCESS;
  }

  /* Read-only mapping: rewrite the signature via the file descriptor,
   * at the signature's offset within the datafile. */
  return mdbx_pwrite(fd, &wipe, sizeof(meta->mm_datasync_sign),
                     (uint8_t *)&meta->mm_datasync_sign - env->me_map);
}
9572 
/* Demote every steady meta-page not newer than last_steady to a weak one and
 * make that change visible/durable, so pages pinned by the last steady-sync
 * point can be reclaimed (used in MDBX_UTTERLY_NOSYNC mode). */
__cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) {
#if MDBX_ENABLE_PGOP_STAT
  env->me_lck->mti_pgop_stat.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
  /* Prefer the dsync descriptor when available, otherwise the lazy one. */
  const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
                                   ? env->me_dsync_fd
                                   : env->me_lazy_fd;
  /* Wipe the datasync signature on each of the three meta-pages. */
  int err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 0), fd);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 1), fd);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 2), fd);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  if (env->me_flags & MDBX_WRITEMAP) {
    /* Signatures were poked through the writable mapping: flush CPU
     * write-back and msync the meta area. */
    mdbx_flush_incoherent_cpu_writeback();
    err = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
                     MDBX_SYNC_DATA);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
  } else {
    /* Signatures were written via pwrite(). When the lazy fd was used, the
     * meta range must be explicitly pushed to storage. */
    if (fd == env->me_lazy_fd) {
#if MDBX_USE_SYNCFILERANGE
      /* Try the cheaper ranged sync first; remember when the kernel
       * lacks sync_file_range() and fall back to fsync permanently. */
      static bool syncfilerange_unavailable;
      if (!syncfilerange_unavailable &&
          sync_file_range(env->me_lazy_fd, 0, pgno2bytes(env, NUM_METAS),
                          SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER)) {
        err = errno;
        if (ignore_enosys(err) == MDBX_RESULT_TRUE)
          syncfilerange_unavailable = true;
      }
      if (syncfilerange_unavailable)
#endif /* MDBX_USE_SYNCFILERANGE */
        err = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA);
      if (unlikely(err != MDBX_SUCCESS))
        return err;
    }
    /* The mapping may be incoherent with the write just made via the fd. */
    mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS),
                               env->me_os_psize);
  }

  /* force oldest refresh */
  atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed);
  return MDBX_SUCCESS;
}
9621 
9622 /* Allocate page numbers and memory for writing.  Maintain mt_last_reclaimed,
9623  * mt_reclaimed_pglist and mt_next_pgno.  Set MDBX_TXN_ERROR on failure.
9624  *
9625  * If there are free pages available from older transactions, they
9626  * are re-used first. Otherwise allocate a new page at mt_next_pgno.
9627  * Do not modify the GC, just merge GC records into mt_reclaimed_pglist
9628  * and move mt_last_reclaimed to say which records were consumed.  Only this
9629  * function can create mt_reclaimed_pglist and move
9630  * mt_last_reclaimed/mt_next_pgno.
9631  *
9632  * [in] mc    cursor A cursor handle identifying the transaction and
9633  *            database for which we are allocating.
9634  * [in] num   the number of pages to allocate.
9635  *
9636  * Returns 0 on success, non-zero on failure.*/
9637 
#define MDBX_ALLOC_CACHE 1 /* may reuse loose pages or the reclaimed list */
#define MDBX_ALLOC_GC 2    /* may pull more records from the GC */
#define MDBX_ALLOC_NEW 4   /* may allocate new pages at the end of the file */
#define MDBX_ALLOC_SLOT 8  /* GC-slot request: no page returned on success */
#define MDBX_ALLOC_ALL (MDBX_ALLOC_CACHE | MDBX_ALLOC_GC | MDBX_ALLOC_NEW)
9643 
/* Page allocator: see the contract in the comment block above.
 * On success returns {MDBX_SUCCESS, page}; for MDBX_ALLOC_SLOT requests the
 * page pointer is NULL since only a GC slot was needed, not an actual page. */
__hot static struct page_result mdbx_page_alloc(MDBX_cursor *mc,
                                                const unsigned num, int flags) {
  struct page_result ret;
  MDBX_txn *const txn = mc->mc_txn;
  MDBX_env *const env = txn->mt_env;

  /* ~3/4 of the single-page GC-record capacity: once the reclaimed list
   * grows beyond this, coalescing of GC records is switched off. */
  const unsigned coalesce_threshold =
      env->me_maxgc_ov1page - env->me_maxgc_ov1page / 4;
  if (likely(flags & MDBX_ALLOC_GC)) {
    flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM);
    if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) > coalesce_threshold)
      flags &= ~MDBX_COALESCE;
    if (unlikely(
            /* If mc is updating the GC, then the retired-list cannot play
               catch-up with itself by growing while trying to save it. */
            (mc->mc_flags & C_RECLAIMING) ||
            /* avoid (recursive) search inside empty tree and while tree is
               updating, https://github.com/erthink/libmdbx/issues/31 */
            txn->mt_dbs[FREE_DBI].md_entries == 0 ||
            /* If our dirty list is already full, we can't touch GC */
            (txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth &&
             !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY))))
      flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE);
  }

  if (likely(num == 1 && (flags & MDBX_ALLOC_CACHE) != 0)) {
    /* If there are any loose pages, just use them */
    mdbx_assert(env, (flags & MDBX_ALLOC_SLOT) == 0);
    if (likely(txn->tw.loose_pages)) {
#if MDBX_ENABLE_REFUND
      if (txn->tw.loose_refund_wl > txn->mt_next_pgno) {
        mdbx_refund(txn);
        if (unlikely(!txn->tw.loose_pages))
          goto no_loose;
      }
#endif /* MDBX_ENABLE_REFUND */

      /* Pop the head of the loose-pages singly-linked list. */
      ret.page = txn->tw.loose_pages;
      txn->tw.loose_pages = ret.page->mp_next;
      txn->tw.loose_count--;
      mdbx_debug("db %d use loose page %" PRIaPGNO, DDBI(mc),
                 ret.page->mp_pgno);
      mdbx_tassert(txn, ret.page->mp_pgno < txn->mt_next_pgno);
      mdbx_ensure(env, ret.page->mp_pgno >= NUM_METAS);
      VALGRIND_MAKE_MEM_UNDEFINED(page_data(ret.page), page_space(txn->mt_env));
      MDBX_ASAN_UNPOISON_MEMORY_REGION(page_data(ret.page),
                                       page_space(txn->mt_env));
      ret.page->mp_txnid = txn->mt_front;
      ret.err = MDBX_SUCCESS;
      return ret;
    }
  }
#if MDBX_ENABLE_REFUND
no_loose:
#endif /* MDBX_ENABLE_REFUND */

  mdbx_tassert(txn,
               mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
                                     txn->mt_next_pgno - MDBX_ENABLE_REFUND));
  pgno_t pgno, *re_list = txn->tw.reclaimed_pglist;
  unsigned range_begin = 0, re_len = MDBX_PNL_SIZE(re_list);
  txnid_t oldest = 0, last = 0;
  /* Number of extra pages beyond the first needed for a contiguous run. */
  const unsigned wanna_range = num - 1;

  while (true) { /* hsr-kick retry loop */
    MDBX_cursor_couple recur;
    for (MDBX_cursor_op op = MDBX_FIRST;;
         op = (flags & MDBX_LIFORECLAIM) ? MDBX_PREV : MDBX_NEXT) {
      MDBX_val key, data;

      /* Seek a big enough contiguous page range.
       * Prefer pages with lower pgno. */
      mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
                                              txn->mt_next_pgno));
      if ((flags & (MDBX_COALESCE | MDBX_ALLOC_CACHE)) == MDBX_ALLOC_CACHE &&
          re_len > wanna_range) {
        mdbx_tassert(txn, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno &&
                              MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno);
        range_begin = MDBX_PNL_ASCENDING ? 1 : re_len;
        pgno = MDBX_PNL_LEAST(re_list);
        if (likely(wanna_range == 0))
          goto done;
#if MDBX_PNL_ASCENDING
        mdbx_tassert(txn, pgno == re_list[1] && range_begin == 1);
        while (true) {
          unsigned range_end = range_begin + wanna_range;
          /* A run is contiguous iff the pgno distance equals the index
           * distance within the sorted list. */
          if (re_list[range_end] - pgno == wanna_range)
            goto done;
          if (range_end == re_len)
            break;
          pgno = re_list[++range_begin];
        }
#else
        mdbx_tassert(txn, pgno == re_list[re_len] && range_begin == re_len);
        while (true) {
          if (re_list[range_begin - wanna_range] - pgno == wanna_range)
            goto done;
          if (range_begin == wanna_range)
            break;
          pgno = re_list[--range_begin];
        }
#endif /* MDBX_PNL sort-order */
      }

      if (op == MDBX_FIRST) { /* 1st iteration, setup cursor, etc */
        if (unlikely(!(flags & MDBX_ALLOC_GC)))
          break /* reclaiming is prohibited for now */;

        /* Prepare to fetch more and coalesce */
        oldest = (flags & MDBX_LIFORECLAIM)
                     ? mdbx_find_oldest(txn)
                     : atomic_load64(&env->me_lck->mti_oldest_reader,
                                     mo_AcquireRelease);
        ret.err = mdbx_cursor_init(&recur.outer, txn, FREE_DBI);
        if (unlikely(ret.err != MDBX_SUCCESS))
          goto fail;
        if (flags & MDBX_LIFORECLAIM) {
          /* Begin from oldest reader if any */
          if (oldest > MIN_TXNID) {
            last = oldest - 1;
            op = MDBX_SET_RANGE;
          }
        } else if (txn->tw.last_reclaimed) {
          /* Continue lookup from txn->tw.last_reclaimed to oldest reader */
          last = txn->tw.last_reclaimed;
          op = MDBX_SET_RANGE;
        }

        key.iov_base = &last;
        key.iov_len = sizeof(last);
      }

      if (!(flags & MDBX_LIFORECLAIM)) {
        /* Do not try fetch more if the record will be too recent */
        if (op != MDBX_FIRST && ++last >= oldest) {
          oldest = mdbx_find_oldest(txn);
          if (oldest <= last)
            break;
        }
      }

      ret.err = mdbx_cursor_get(&recur.outer, &key, NULL, op);
      if (ret.err == MDBX_NOTFOUND && (flags & MDBX_LIFORECLAIM)) {
        if (op == MDBX_SET_RANGE)
          continue;
        /* The oldest reader may have moved on; re-snapshot and retry
         * the ranged lookup from the refreshed boundary. */
        txnid_t snap = mdbx_find_oldest(txn);
        if (oldest < snap) {
          oldest = snap;
          last = oldest - 1;
          key.iov_base = &last;
          key.iov_len = sizeof(last);
          op = MDBX_SET_RANGE;
          ret.err = mdbx_cursor_get(&recur.outer, &key, NULL, op);
        }
      }
      if (unlikely(ret.err)) {
        if (ret.err == MDBX_NOTFOUND)
          break;
        goto fail;
      }

      /* Validate the GC record key: must be a txnid in the valid range. */
      if (!MDBX_DISABLE_PAGECHECKS &&
          unlikely(key.iov_len != sizeof(txnid_t))) {
        ret.err = MDBX_CORRUPTED;
        goto fail;
      }
      last = unaligned_peek_u64(4, key.iov_base);
      if (!MDBX_DISABLE_PAGECHECKS &&
          unlikely(last < MIN_TXNID || last > MAX_TXNID)) {
        ret.err = MDBX_CORRUPTED;
        goto fail;
      }
      if (oldest <= last) {
        oldest = mdbx_find_oldest(txn);
        if (oldest <= last) {
          if (flags & MDBX_LIFORECLAIM)
            continue;
          break;
        }
      }

      if (flags & MDBX_LIFORECLAIM) {
        /* skip IDs of records that already reclaimed */
        if (txn->tw.lifo_reclaimed) {
          size_t i;
          for (i = (size_t)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed); i > 0; --i)
            if (txn->tw.lifo_reclaimed[i] == last)
              break;
          if (i)
            continue;
        }
      }

      /* Reading next GC record */
      MDBX_page *const mp = recur.outer.mc_pg[recur.outer.mc_top];
      if (unlikely((ret.err = mdbx_node_read(
                        &recur.outer,
                        page_node(mp, recur.outer.mc_ki[recur.outer.mc_top]),
                        &data, pp_txnid4chk(mp, txn))) != MDBX_SUCCESS))
        goto fail;

      /* Lazily create the list of reclaimed GC-record ids for LIFO mode. */
      if ((flags & MDBX_LIFORECLAIM) && !txn->tw.lifo_reclaimed) {
        txn->tw.lifo_reclaimed = mdbx_txl_alloc();
        if (unlikely(!txn->tw.lifo_reclaimed)) {
          ret.err = MDBX_ENOMEM;
          goto fail;
        }
      }

      /* Append PNL from GC record to tw.reclaimed_pglist */
      mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0);
      pgno_t *gc_pnl = (pgno_t *)data.iov_base;
      mdbx_tassert(txn, data.iov_len >= MDBX_PNL_SIZEOF(gc_pnl));
      if (unlikely(data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) ||
                   !mdbx_pnl_check(gc_pnl, txn->mt_next_pgno))) {
        ret.err = MDBX_CORRUPTED;
        goto fail;
      }
      const unsigned gc_len = MDBX_PNL_SIZE(gc_pnl);
      if (unlikely(/* resulting list is too long */ gc_len +
                       MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) >
                   env->me_options.rp_augment_limit) &&
          (((/* not a slot-request from gc-update */
             (flags & MDBX_ALLOC_SLOT) == 0 ||
             (flags & MDBX_LIFORECLAIM) == 0 ||
             (txn->tw.lifo_reclaimed &&
              MDBX_PNL_SIZE(txn->tw.lifo_reclaimed))) &&
            /* have enough unallocated space */ pgno_add(
                txn->mt_next_pgno, num) <= txn->mt_geo.upper) ||
           gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) >=
               MDBX_PGL_LIMIT / 16 * 15)) {
        /* Stop reclaiming to avoid overflow the page list.
         * This is a rare case while search for a continuously multi-page region
         * in a large database. https://github.com/erthink/libmdbx/issues/123 */
        mdbx_debug("stop reclaiming to avoid PNL overflow: %u (current) + %u "
                   "(chunk) -> %u",
                   MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), gc_len,
                   gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist));
        flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE);
        break;
      }
      ret.err = mdbx_pnl_need(&txn->tw.reclaimed_pglist, gc_len);
      if (unlikely(ret.err != MDBX_SUCCESS))
        goto fail;
      /* mdbx_pnl_need() may reallocate the list — refresh the cached ptr. */
      re_list = txn->tw.reclaimed_pglist;

      /* Remember ID of GC record */
      if (flags & MDBX_LIFORECLAIM) {
        ret.err = mdbx_txl_append(&txn->tw.lifo_reclaimed, last);
        if (unlikely(ret.err != MDBX_SUCCESS))
          goto fail;
      }
      txn->tw.last_reclaimed = last;

      if (mdbx_log_enabled(MDBX_LOG_EXTRA)) {
        mdbx_debug_extra("PNL read txn %" PRIaTXN " root %" PRIaPGNO
                         " num %u, PNL",
                         last, txn->mt_dbs[FREE_DBI].md_root, gc_len);
        for (unsigned i = gc_len; i; i--)
          mdbx_debug_extra_print(" %" PRIaPGNO, gc_pnl[i]);
        mdbx_debug_extra_print("%s\n", ".");
      }

      /* Merge in descending sorted order */
      const unsigned prev_re_len = MDBX_PNL_SIZE(re_list);
      mdbx_pnl_xmerge(re_list, gc_pnl);
      /* re-check to avoid duplicates */
      if (!MDBX_DISABLE_PAGECHECKS &&
          unlikely(!mdbx_pnl_check(re_list, txn->mt_next_pgno))) {
        ret.err = MDBX_CORRUPTED;
        goto fail;
      }
      mdbx_tassert(txn, mdbx_dirtylist_check(txn));

      re_len = MDBX_PNL_SIZE(re_list);
      mdbx_tassert(txn, re_len == 0 || re_list[re_len] < txn->mt_next_pgno);
      if (MDBX_ENABLE_REFUND && re_len &&
          unlikely(MDBX_PNL_MOST(re_list) == txn->mt_next_pgno - 1)) {
        /* Refund suitable pages into "unallocated" space */
        mdbx_refund(txn);
        re_list = txn->tw.reclaimed_pglist;
        re_len = MDBX_PNL_SIZE(re_list);
      }

      /* Done for a kick-reclaim mode, actually no page needed */
      if (unlikely(flags & MDBX_ALLOC_SLOT)) {
        ret.err = MDBX_SUCCESS;
        ret.page = NULL;
        return ret;
      }

      /* Don't try to coalesce too much. */
      if (re_len /* current size */ > coalesce_threshold ||
          (re_len > prev_re_len && re_len - prev_re_len /* delta from prev */ >=
                                       coalesce_threshold / 2))
        flags &= ~MDBX_COALESCE;
    }

    /* Coalescing mode consumed records without trying to use them;
     * retry once in plain cache mode before giving up on the GC. */
    if (F_ISSET(flags, MDBX_COALESCE | MDBX_ALLOC_CACHE)) {
      flags -= MDBX_COALESCE;
      continue;
    }

    /* There is no suitable pages in the GC and to be able to allocate
     * we should choose one of:
     *  - make a new steady checkpoint if reclaiming was stopped by
     *    the last steady-sync, or wipe it in the MDBX_UTTERLY_NOSYNC mode;
     *  - kick lagging reader(s) if reclaiming was stopped by ones of it.
     *  - extend the database file. */

    /* Will use new pages from the map if nothing is suitable in the GC. */
    range_begin = 0;
    pgno = txn->mt_next_pgno;
    const pgno_t next = pgno_add(pgno, num);

    if (flags & MDBX_ALLOC_GC) {
      const MDBX_meta *const head = mdbx_meta_head(env);
      MDBX_meta *const steady = mdbx_meta_steady(env);
      /* does reclaiming stopped at the last steady point? */
      if (head != steady && META_IS_STEADY(steady) &&
          oldest == mdbx_meta_txnid_stable(env, steady)) {
        mdbx_debug("gc-kick-steady: head %" PRIaTXN "-%s, tail %" PRIaTXN
                   "-%s, oldest %" PRIaTXN,
                   mdbx_meta_txnid_stable(env, head), mdbx_durable_str(head),
                   mdbx_meta_txnid_stable(env, steady),
                   mdbx_durable_str(steady), oldest);
        ret.err = MDBX_RESULT_TRUE;
        const pgno_t autosync_threshold =
            atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
        const uint64_t autosync_period =
            atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
        /* wipe the last steady-point if one of:
         *  - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified
         *  - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted
         * otherwise, make a new steady-point if one of:
         *  - auto-sync threshold is specified and reached;
         *  - upper limit of database size is reached;
         *  - database is full (with the current file size)
         *       AND auto-sync threshold is NOT specified */
        if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) &&
            ((autosync_threshold | autosync_period) == 0 ||
             next >= steady->mm_geo.now)) {
          /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode
           * without any auto-sync threshold(s). */
          ret.err = mdbx_wipe_steady(env, oldest);
          mdbx_debug("gc-wipe-steady, rc %d", ret.err);
          mdbx_assert(env, steady != mdbx_meta_steady(env));
        } else if ((flags & MDBX_ALLOC_NEW) == 0 ||
                   (autosync_threshold &&
                    atomic_load32(&env->me_lck->mti_unsynced_pages,
                                  mo_Relaxed) >= autosync_threshold) ||
                   (autosync_period &&
                    mdbx_osal_monotime() -
                            atomic_load64(&env->me_lck->mti_sync_timestamp,
                                          mo_Relaxed) >=
                        autosync_period) ||
                   next >= txn->mt_geo.upper ||
                   (next >= txn->mt_end_pgno &&
                    (autosync_threshold | autosync_period) == 0)) {
          /* make steady checkpoint. */
          MDBX_meta meta = *head;
          ret.err = mdbx_sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta);
          mdbx_debug("gc-make-steady, rc %d", ret.err);
          mdbx_assert(env, steady != mdbx_meta_steady(env));
        }
        if (ret.err == MDBX_SUCCESS) {
          if (mdbx_find_oldest(txn) > oldest)
            continue;
          /* it is reasonable check/kick lagging reader(s) here,
           * since we made a new steady point or wipe the last. */
          if (oldest < txn->mt_txnid - xMDBX_TXNID_STEP &&
              mdbx_kick_longlived_readers(env, oldest) > oldest)
            continue;
        } else if (unlikely(ret.err != MDBX_RESULT_TRUE))
          goto fail;
      }
    }

    /* don't kick lagging reader(s) if is enough unallocated space
     * at the end of database file. */
    if ((flags & MDBX_ALLOC_NEW) && next <= txn->mt_end_pgno)
      goto done;
    if ((flags & MDBX_ALLOC_GC) && oldest < txn->mt_txnid - xMDBX_TXNID_STEP &&
        mdbx_kick_longlived_readers(env, oldest) > oldest)
      continue;

    ret.err = MDBX_NOTFOUND;
    if (flags & MDBX_ALLOC_NEW) {
      ret.err = MDBX_MAP_FULL;
      if (next <= txn->mt_geo.upper && txn->mt_geo.grow_pv) {
        mdbx_assert(env, next > txn->mt_end_pgno);
        /* Grow the datafile by whole grow-steps, OS-page aligned,
         * clamped to the geometry's upper bound. */
        const pgno_t grow_step = pv2pages(txn->mt_geo.grow_pv);
        pgno_t aligned = pgno_align2os_pgno(
            env, pgno_add(next, grow_step - next % grow_step));

        if (aligned > txn->mt_geo.upper)
          aligned = txn->mt_geo.upper;
        mdbx_assert(env, aligned > txn->mt_end_pgno);

        mdbx_verbose("try growth datafile to %" PRIaPGNO " pages (+%" PRIaPGNO
                     ")",
                     aligned, aligned - txn->mt_end_pgno);
        ret.err = mdbx_mapresize_implicit(env, txn->mt_next_pgno, aligned,
                                          txn->mt_geo.upper);
        if (ret.err == MDBX_SUCCESS) {
          env->me_txn->mt_end_pgno = aligned;
          goto done;
        }

        mdbx_error("unable growth datafile to %" PRIaPGNO " pages (+%" PRIaPGNO
                   "), errcode %d",
                   aligned, aligned - txn->mt_end_pgno, ret.err);
      } else {
        mdbx_debug("gc-alloc: next %u > upper %u", next, txn->mt_geo.upper);
      }
    }

  fail:
    mdbx_tassert(txn,
                 mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
                                       txn->mt_next_pgno - MDBX_ENABLE_REFUND));
    if (likely(!(flags & MDBX_ALLOC_SLOT)))
      txn->mt_flags |= MDBX_TXN_ERROR;
    mdbx_assert(env, ret.err != MDBX_SUCCESS);
    ret.page = NULL;
    return ret;
  }

done:
  /* An allocation point was chosen: either a run inside the reclaimed list
   * (range_begin != 0) or fresh pages at mt_next_pgno (range_begin == 0). */
  ret.page = NULL;
  if (unlikely(flags & MDBX_ALLOC_SLOT)) {
    ret.err = MDBX_SUCCESS;
    return ret;
  }

  mdbx_ensure(env, pgno >= NUM_METAS);
  if (env->me_flags & MDBX_WRITEMAP) {
    ret.page = pgno2page(env, pgno);
    /* LY: reset no-access flag from mdbx_page_loose() */
    VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num));
    MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num));
  } else {
    if (unlikely(!(ret.page = mdbx_page_malloc(txn, num)))) {
      ret.err = MDBX_ENOMEM;
      goto fail;
    }
  }

  if (range_begin) {
    mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0);
    mdbx_tassert(txn, pgno < txn->mt_next_pgno);
    mdbx_tassert(txn, pgno == re_list[range_begin]);
    /* Cutoff allocated pages from tw.reclaimed_pglist */
#if MDBX_PNL_ASCENDING
    for (unsigned i = range_begin + num; i <= re_len;)
      re_list[range_begin++] = re_list[i++];
    MDBX_PNL_SIZE(re_list) = re_len = range_begin - 1;
#else
    MDBX_PNL_SIZE(re_list) = re_len -= num;
    for (unsigned i = range_begin - num; i < re_len;)
      re_list[++i] = re_list[++range_begin];
#endif
    mdbx_tassert(txn,
                 mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
                                       txn->mt_next_pgno - MDBX_ENABLE_REFUND));
  } else {
    txn->mt_next_pgno = pgno + num;
    mdbx_assert(env, txn->mt_next_pgno <= txn->mt_end_pgno);
  }

  if (unlikely(env->me_flags & MDBX_PAGEPERTURB))
    memset(ret.page, -1, pgno2bytes(env, num));
  VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num));

  /* Initialize the page header; multi-page (overflow) setup is done here
   * only for assert/audit builds — the caller sets it for release builds. */
  ret.page->mp_pgno = pgno;
  ret.page->mp_leaf2_ksize = 0;
  ret.page->mp_flags = 0;
  if ((mdbx_assert_enabled() || mdbx_audit_enabled()) && num > 1) {
    ret.page->mp_pages = num;
    ret.page->mp_flags = P_OVERFLOW;
  }
  ret.err = mdbx_page_dirty(txn, ret.page, num);
  if (unlikely(ret.err != MDBX_SUCCESS))
    goto fail;

  mdbx_tassert(txn,
               mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
                                     txn->mt_next_pgno - MDBX_ENABLE_REFUND));
  return ret;
}
10134 
10135 /* Copy the used portions of a non-overflow page. */
mdbx_page_copy(MDBX_page * dst,const MDBX_page * src,size_t psize)10136 __hot static void mdbx_page_copy(MDBX_page *dst, const MDBX_page *src,
10137                                  size_t psize) {
10138   STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ);
10139   STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4);
10140   if ((src->mp_flags & (P_LEAF2 | P_OVERFLOW)) == 0) {
10141     size_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower;
10142 
10143     /* If page isn't full, just copy the used portion. Adjust
10144      * alignment so memcpy may copy words instead of bytes. */
10145     if (unused >= MDBX_CACHELINE_SIZE * 2) {
10146       lower = ceil_powerof2(lower + PAGEHDRSZ, sizeof(void *));
10147       upper = floor_powerof2(upper + PAGEHDRSZ, sizeof(void *));
10148       memcpy(dst, src, lower);
10149       dst = (void *)((char *)dst + upper);
10150       src = (void *)((char *)src + upper);
10151       psize -= upper;
10152     }
10153   }
10154   memcpy(dst, src, psize);
10155 }
10156 
10157 /* Pull a page off the txn's spill list, if present.
10158  *
10159  * If a page being referenced was spilled to disk in this txn, bring
10160  * it back and make it dirty/writable again. */
static struct page_result __must_check_result
mdbx_page_unspill(MDBX_txn *const txn, const MDBX_page *const mp) {
  mdbx_verbose("unspill page %" PRIaPGNO, mp->mp_pgno);
  mdbx_tassert(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0);
  mdbx_tassert(txn, IS_SPILLED(txn, mp));
  /* Spill-list entries store the pgno shifted left by one; the LSB is
   * reserved as a deleted-mark (see mdbx_spill_remove usage below). */
  const pgno_t spilled_pgno = mp->mp_pgno << 1;
  const MDBX_txn *scan = txn;
  struct page_result ret;
  /* Search this txn's spill list, then walk up through parent txns. */
  do {
    mdbx_tassert(txn, (scan->mt_flags & MDBX_TXN_SPILLS) != 0);
    if (!scan->tw.spill_pages)
      continue;
    const unsigned si = mdbx_pnl_exist(scan->tw.spill_pages, spilled_pgno);
    if (!si)
      continue;
    const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1;
    /* Allocate a fresh writable copy of the spilled page(s). */
    ret.page = mdbx_page_malloc(txn, npages);
    if (unlikely(!ret.page)) {
      ret.err = MDBX_ENOMEM;
      return ret;
    }
    mdbx_page_copy(ret.page, mp, pgno2bytes(txn->mt_env, npages));
    if (scan == txn) {
      /* If in current txn, this page is no longer spilled.
       * If it happens to be the last page, truncate the spill list.
       * Otherwise mark it as deleted by setting the LSB. */
      mdbx_spill_remove(txn, si, npages);
    } /* otherwise, if belonging to a parent txn, the
       * page remains spilled until child commits */

    ret.err = mdbx_page_dirty(txn, ret.page, npages);
    if (unlikely(ret.err != MDBX_SUCCESS))
      return ret;
#if MDBX_ENABLE_PGOP_STAT
    txn->mt_env->me_lck->mti_pgop_stat.unspill.weak += npages;
#endif /* MDBX_ENABLE_PGOP_STAT */
    /* Keep P_SPILLED set when the page is still spilled in a parent. */
    ret.page->mp_flags |= (scan == txn) ? 0 : P_SPILLED;
    ret.err = MDBX_SUCCESS;
    return ret;
  } while (likely((scan = scan->mt_parent) != nullptr &&
                  (scan->mt_flags & MDBX_TXN_SPILLS) != 0));
  /* A page marked spilled must be found in some spill list; reaching here
   * indicates an internal inconsistency. */
  mdbx_error("Page %" PRIaPGNO " mod-txnid %" PRIaTXN
             " not found in the spill-list(s), current txn %" PRIaTXN
             " front %" PRIaTXN ", root txn %" PRIaTXN " front %" PRIaTXN,
             mp->mp_pgno, mp->mp_txnid, txn->mt_txnid, txn->mt_front,
             txn->mt_env->me_txn0->mt_txnid, txn->mt_env->me_txn0->mt_front);
  ret.err = MDBX_PROBLEM;
  ret.page = NULL;
  return ret;
}
10211 
10212 /* Touch a page: make it dirty and re-insert into tree with updated pgno.
10213  * Set MDBX_TXN_ERROR on failure.
10214  *
10215  * [in] mc  cursor pointing to the page to be touched
10216  *
10217  * Returns 0 on success, non-zero on failure. */
__hot static int mdbx_page_touch(MDBX_cursor *mc) {
  const MDBX_page *const mp = mc->mc_pg[mc->mc_top];
  MDBX_page *np;
  MDBX_txn *txn = mc->mc_txn;
  int rc;

  if (mdbx_assert_enabled()) {
    /* Debug-only sanity checks: the owning DBI must be marked dirty,
     * the txn must be a write-txn, the target must not be an overflow
     * page, and the dirty-list must be internally consistent. */
    if (mc->mc_flags & C_SUB) {
      /* For a sub-cursor the DBI state lives on the outer cursor;
       * recover it via the enclosing MDBX_cursor_couple. */
      MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db);
      MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner);
      mdbx_tassert(txn, mc->mc_db == &couple->outer.mc_xcursor->mx_db);
      mdbx_tassert(txn, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx);
      mdbx_tassert(txn, *couple->outer.mc_dbistate & DBI_DIRTY);
    } else {
      mdbx_tassert(txn, *mc->mc_dbistate & DBI_DIRTY);
    }
    mdbx_tassert(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY);
    mdbx_tassert(txn, !IS_OVERFLOW(mp));
    mdbx_tassert(txn, mdbx_dirtylist_check(txn));
  }

  /* Already writable in this txn, or an embedded sub-page: nothing to do. */
  if (IS_MODIFIABLE(txn, mp) || IS_SUBP(mp))
    return MDBX_SUCCESS;

  if (IS_FROZEN(txn, mp)) {
    /* CoW the page */
    /* Reserve room in the retired-list first, so the xappend below
     * cannot fail after the new page has been allocated. */
    rc = mdbx_pnl_need(&txn->tw.retired_pages, 1);
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;
    const struct page_result par = mdbx_page_alloc(mc, 1, MDBX_ALLOC_ALL);
    rc = par.err;
    np = par.page;
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;

    const pgno_t pgno = np->mp_pgno;
    mdbx_debug("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, DDBI(mc),
               mp->mp_pgno, pgno);
    mdbx_tassert(txn, mp->mp_pgno != pgno);
    /* The old page number is retired; cannot fail thanks to the
     * mdbx_pnl_need() reservation above. */
    mdbx_pnl_xappend(txn->tw.retired_pages, mp->mp_pgno);
    /* Update the parent page, if any, to point to the new page */
    if (mc->mc_top) {
      MDBX_page *parent = mc->mc_pg[mc->mc_top - 1];
      MDBX_node *node = page_node(parent, mc->mc_ki[mc->mc_top - 1]);
      node_set_pgno(node, pgno);
    } else {
      /* Touched the root page: record the new root in the DB record. */
      mc->mc_db->md_root = pgno;
    }

#if MDBX_ENABLE_PGOP_STAT
    txn->mt_env->me_lck->mti_pgop_stat.cow.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    mdbx_page_copy(np, mp, txn->mt_env->me_psize);
    np->mp_pgno = pgno;
    np->mp_txnid = txn->mt_front;
  } else if (IS_SPILLED(txn, mp)) {
    /* Page was spilled to disk earlier in this txn (or a parent):
     * bring it back into the dirty set in place. */
    struct page_result pur = mdbx_page_unspill(txn, mp);
    np = pur.page;
    rc = pur.err;
    if (likely(rc == MDBX_SUCCESS)) {
      mdbx_tassert(txn, np != nullptr);
      goto done;
    }
    goto fail;
  } else {
    /* Remaining case: the page is dirty in a parent (nested) txn and
     * must be cloned into this txn; without a parent this state is
     * impossible and indicates corruption. */
    if (unlikely(!txn->mt_parent)) {
      mdbx_error("Unexpected not frozen/modifiable/spilled but shadowed %s "
                 "page %" PRIaPGNO " mod-txnid %" PRIaTXN ","
                 " without parent transaction, current txn %" PRIaTXN
                 " front %" PRIaTXN,
                 IS_BRANCH(mp) ? "branch" : "leaf", mp->mp_pgno, mp->mp_txnid,
                 mc->mc_txn->mt_txnid, mc->mc_txn->mt_front);
      rc = MDBX_PROBLEM;
      goto fail;
    }

    mdbx_debug("clone db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno);
    mdbx_tassert(txn, txn->tw.dirtylist->length <=
                          MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE);
    /* No - copy it */
    np = mdbx_page_malloc(txn, 1);
    if (unlikely(!np)) {
      rc = MDBX_ENOMEM;
      goto fail;
    }
    mdbx_page_copy(np, mp, txn->mt_env->me_psize);

    /* insert a clone of parent's dirty page, so don't touch dirtyroom */
    rc = mdbx_page_dirty(txn, np, 1);
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;

#if MDBX_ENABLE_PGOP_STAT
    txn->mt_env->me_lck->mti_pgop_stat.clone.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
  }

done:
  /* Adjust cursors pointing to mp */
  mc->mc_pg[mc->mc_top] = np;
  MDBX_cursor *m2 = txn->tw.cursors[mc->mc_dbi];
  if (mc->mc_flags & C_SUB) {
    /* mc is a sub-cursor: scan the inner cursors of the tracked list. */
    for (; m2; m2 = m2->mc_next) {
      MDBX_cursor *m3 = &m2->mc_xcursor->mx_cursor;
      if (m3->mc_snum < mc->mc_snum)
        continue;
      if (m3->mc_pg[mc->mc_top] == mp)
        m3->mc_pg[mc->mc_top] = np;
    }
  } else {
    for (; m2; m2 = m2->mc_next) {
      if (m2->mc_snum < mc->mc_snum)
        continue;
      if (m2 == mc)
        continue;
      if (m2->mc_pg[mc->mc_top] == mp) {
        m2->mc_pg[mc->mc_top] = np;
        /* Keep any initialized xcursor coherent with the new leaf. */
        if (XCURSOR_INITED(m2) && IS_LEAF(np))
          XCURSOR_REFRESH(m2, np, m2->mc_ki[mc->mc_top]);
      }
    }
  }
  return MDBX_SUCCESS;

fail:
  txn->mt_flags |= MDBX_TXN_ERROR;
  return rc;
}
10346 
/* Flush the environment to durable storage.
 *
 * [in] force     sync even when autosync thresholds/periods are not reached.
 * [in] nonblock  use try-lock semantics and skip work when nothing is pending.
 *
 * Returns MDBX_RESULT_TRUE when there was nothing to sync,
 * MDBX_RESULT_FALSE when some data was synced, or an error code. */
__cold static int mdbx_env_sync_internal(MDBX_env *env, bool force,
                                         bool nonblock) {
  unsigned flags = env->me_flags & ~MDBX_NOMETASYNC;
  if (unlikely(flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)))
    return MDBX_EACCESS;

  int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */;
  bool need_unlock = false;
  /* Fast path for nonblock mode: no unsynced pages means no data work,
   * only the NOMETASYNC meta-page case below may still apply. */
  if (nonblock &&
      atomic_load32(&env->me_lck->mti_unsynced_pages, mo_AcquireRelease) == 0)
    goto fastpath;

  /* When not called from inside the write-txn, take the write lock. */
  const bool outside_txn = (env->me_txn0->mt_owner != mdbx_thread_self());
  if (outside_txn) {
    int err = mdbx_txn_lock(env, nonblock);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
    need_unlock = true;
  }

  const MDBX_meta *head = mdbx_meta_head(env);
  pgno_t unsynced_pages =
      atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed);
  if (!META_IS_STEADY(head) || unsynced_pages) {
    const pgno_t autosync_threshold =
        atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
    const uint64_t autosync_period =
        atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
    /* Force a full steady sync when explicitly requested, or when the
     * configured page-count threshold or time period has been exceeded. */
    if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) ||
        (autosync_period &&
         mdbx_osal_monotime() -
                 atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >=
             autosync_period))
      flags &= MDBX_WRITEMAP /* clear flags for full steady sync */;

    if (outside_txn) {
      if (unsynced_pages > /* FIXME: define threshold */ 16 &&
          (flags & MDBX_SAFE_NOSYNC) == 0) {
        mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
        const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next);

#if MDBX_ENABLE_PGOP_STAT
        env->me_lck->mti_pgop_stat.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
        mdbx_txn_unlock(env);

        /* LY: pre-sync without holding lock to reduce latency for writer(s) */
        int err =
            (flags & MDBX_WRITEMAP)
                ? mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, MDBX_SYNC_DATA)
                : mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA);
        if (unlikely(err != MDBX_SUCCESS))
          return err;

        /* Re-acquire the lock dropped above for the steady sync below. */
        err = mdbx_txn_lock(env, nonblock);
        if (unlikely(err != MDBX_SUCCESS))
          return err;

        /* LY: head and unsynced_pages may be changed. */
        head = mdbx_meta_head(env);
        unsynced_pages =
            atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed);
      }
      env->me_txn0->mt_txnid = meta_txnid(env, head, false);
      mdbx_find_oldest(env->me_txn0);
      rc = MDBX_RESULT_FALSE /* means "some data was synced" */;
    }

    if (!META_IS_STEADY(head) ||
        ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) {
      mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO,
                 data_page(head)->mp_pgno, mdbx_durable_str(head),
                 unsynced_pages);
      MDBX_meta meta = *head;
      int err = mdbx_sync_locked(env, flags | MDBX_SHRINK_ALLOWED, &meta);
      if (unlikely(err != MDBX_SUCCESS)) {
        if (need_unlock)
          mdbx_txn_unlock(env);
        return err;
      }
      rc = MDBX_RESULT_FALSE /* means "some data was synced" */;
    }
  }

fastpath:
  /* LY: sync meta-pages if MDBX_NOMETASYNC enabled
   *     and someone was not synced above. */
  if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_NOMETASYNC) != 0) {
    const txnid_t head_txnid = mdbx_recent_committed_txnid(env);
    if (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) !=
        (uint32_t)head_txnid) {
#if MDBX_ENABLE_PGOP_STAT
      /* Without the lock held the wops counter is bumped via CAS
       * (64-bit atomics only); otherwise a plain increment suffices. */
      if (need_unlock)
        env->me_lck->mti_pgop_stat.wops.weak += 1;
#if MDBX_64BIT_ATOMIC
      else {
        MDBX_atomic_uint64_t *wops = &env->me_lck->mti_pgop_stat.wops;
        while (unlikely(!atomic_cas64(wops, wops->weak, wops->weak + 1)))
          atomic_yield();
      }
#else
        /* loose the env->me_lck->mti_pgop_stat.wops.weak increment */
#endif /* MDBX_64BIT_ATOMIC */
#endif /* MDBX_ENABLE_PGOP_STAT */
      rc = (flags & MDBX_WRITEMAP)
               ? mdbx_msync(&env->me_dxb_mmap, 0,
                            pgno_align2os_bytes(env, NUM_METAS),
                            MDBX_SYNC_DATA | MDBX_SYNC_IODQ)
               : mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
      if (likely(rc == MDBX_SUCCESS))
        atomic_store32(&env->me_lck->mti_meta_sync_txnid, (uint32_t)head_txnid,
                       mo_Relaxed);
    }
  }
  if (need_unlock)
    mdbx_txn_unlock(env);
  return rc;
}
10465 
check_env(const MDBX_env * env,const bool wanna_active)10466 static __inline int check_env(const MDBX_env *env, const bool wanna_active) {
10467   if (unlikely(!env))
10468     return MDBX_EINVAL;
10469 
10470   if (unlikely(env->me_signature.weak != MDBX_ME_SIGNATURE))
10471     return MDBX_EBADSIGN;
10472 
10473 #if MDBX_ENV_CHECKPID
10474   if (unlikely(env->me_pid != mdbx_getpid())) {
10475     ((MDBX_env *)env)->me_flags |= MDBX_FATAL_ERROR;
10476     return MDBX_PANIC;
10477   }
10478 #endif /* MDBX_ENV_CHECKPID */
10479 
10480   if (unlikely(env->me_flags & MDBX_FATAL_ERROR))
10481     return MDBX_PANIC;
10482 
10483   if (wanna_active) {
10484     if (unlikely((env->me_flags & MDBX_ENV_ACTIVE) == 0))
10485       return MDBX_EPERM;
10486     mdbx_assert(env, env->me_map != nullptr);
10487   }
10488 
10489   return MDBX_SUCCESS;
10490 }
10491 
mdbx_env_sync_ex(MDBX_env * env,bool force,bool nonblock)10492 __cold int mdbx_env_sync_ex(MDBX_env *env, bool force, bool nonblock) {
10493   int rc = check_env(env, true);
10494   if (unlikely(rc != MDBX_SUCCESS))
10495     return rc;
10496 
10497   return mdbx_env_sync_internal(env, force, nonblock);
10498 }
10499 
#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Legacy ABI entry points: keep exporting the old-style functions,
 * each one simply forwards to its __inline_* replacement. */
__cold int mdbx_env_sync(MDBX_env *env) { return __inline_mdbx_env_sync(env); }

__cold int mdbx_env_sync_poll(MDBX_env *env) {
  return __inline_mdbx_env_sync_poll(env);
}
#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
10507 
10508 /* Back up parent txn's cursors, then grab the originals for tracking */
/* Back up each live cursor of the parent txn into a heap copy, then
 * redirect the original cursor to the nested txn and move it onto the
 * nested txn's tracking list.
 *
 * [in] parent  the parent (outer) transaction.
 * [in] nested  the freshly-created nested transaction.
 *
 * Returns MDBX_SUCCESS or MDBX_ENOMEM if a backup allocation fails. */
static int mdbx_cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) {
  for (int i = parent->mt_numdbs; --i >= 0;) {
    nested->tw.cursors[i] = NULL;
    MDBX_cursor *mc = parent->tw.cursors[i];
    if (mc != NULL) {
      /* A cursor with an xcursor is backed up together with it in one
       * allocation; the xcursor copy lives right after the cursor. */
      size_t size = mc->mc_xcursor ? sizeof(MDBX_cursor) + sizeof(MDBX_xcursor)
                                   : sizeof(MDBX_cursor);
      /* NOTE: bk does double duty. For a non-live cursor it stays equal
       * to mc so the loop step follows the original chain; for a live
       * cursor it is re-assigned to the fresh backup (whose mc_next was
       * copied from mc), so the step still advances along the ORIGINAL
       * parent chain even though mc->mc_next is re-linked below. */
      for (MDBX_cursor *bk; mc; mc = bk->mc_next) {
        bk = mc;
        if (mc->mc_signature != MDBX_MC_LIVE)
          continue;
        bk = mdbx_malloc(size);
        if (unlikely(!bk))
          return MDBX_ENOMEM;
        *bk = *mc;
        mc->mc_backup = bk;
        /* Kill pointers into src to reduce abuse: The
         * user may not use mc until dst ends. But we need a valid
         * txn pointer here for cursor fixups to keep working. */
        mc->mc_txn = nested;
        mc->mc_db = &nested->mt_dbs[i];
        mc->mc_dbistate = &nested->mt_dbistate[i];
        MDBX_xcursor *mx = mc->mc_xcursor;
        if (mx != NULL) {
          /* Back up the xcursor into the tail of the same allocation. */
          *(MDBX_xcursor *)(bk + 1) = *mx;
          mx->mx_cursor.mc_txn = nested;
        }
        /* Push the (re-targeted) cursor onto the nested txn's list. */
        mc->mc_next = nested->tw.cursors[i];
        nested->tw.cursors[i] = mc;
      }
    }
  }
  return MDBX_SUCCESS;
}
10543 
10544 /* Close this write txn's cursors, give parent txn's cursors back to parent.
10545  *
10546  * [in] txn     the transaction handle.
10547  * [in] merge   true to keep changes to parent cursors, false to revert.
10548  *
10549  * Returns 0 on success, non-zero on failure. */
static void mdbx_cursors_eot(MDBX_txn *txn, const bool merge) {
  mdbx_tassert(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0);
  for (int i = txn->mt_numdbs; --i >= 0;) {
    MDBX_cursor *next, *mc = txn->tw.cursors[i];
    if (!mc)
      continue;
    txn->tw.cursors[i] = NULL;
    do {
      const unsigned stage = mc->mc_signature;
      MDBX_cursor *bk = mc->mc_backup;
      /* Save the chain pointer before the cursor is restored/reset. */
      next = mc->mc_next;
      /* Only live cursors, or closed-by-user cursors that still hold a
       * backup, may remain on the list at end-of-txn. */
      mdbx_ensure(txn->mt_env,
                  stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk));
      mdbx_cassert(mc, mc->mc_dbi == (unsigned)i);
      if (bk) {
        /* This cursor was shadowed from a parent txn (nested-txn case). */
        MDBX_xcursor *mx = mc->mc_xcursor;
        mdbx_cassert(mc, mx == bk->mc_xcursor);
        mdbx_tassert(txn, txn->mt_parent != NULL);
        mdbx_ensure(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE);
        if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */)
          mc->mc_signature = stage /* Promote closed state to parent txn */;
        else if (merge) {
          /* Restore pointers to parent txn */
          mc->mc_next = bk->mc_next;
          mc->mc_backup = bk->mc_backup;
          mc->mc_txn = bk->mc_txn;
          mc->mc_db = bk->mc_db;
          mc->mc_dbistate = bk->mc_dbistate;
          if (mx) {
            /* NOTE(review): mx == bk->mc_xcursor is asserted above, so
             * this copy looks defensive; kept as-is for safety. */
            if (mx != bk->mc_xcursor) {
              *bk->mc_xcursor = *mx;
              mx = bk->mc_xcursor;
            }
            mx->mx_cursor.mc_txn = bk->mc_txn;
          }
        } else {
          /* Restore from backup, i.e. rollback/abort nested txn */
          *mc = *bk;
          if (mx)
            *mx = *(MDBX_xcursor *)(bk + 1);
        }
        /* The heap backup is consumed either way; invalidate and free. */
        bk->mc_signature = 0;
        mdbx_free(bk);
      } else {
        /* Plain (non-shadowed) cursor: detach it from the ended txn. */
        mdbx_ensure(txn->mt_env, stage == MDBX_MC_LIVE);
        mc->mc_signature = MDBX_MC_READY4CLOSE /* Cursor may be reused */;
        mc->mc_flags = 0 /* reset C_UNTRACK */;
      }
    } while ((mc = next) != NULL);
  }
}
10601 
#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
/* Find largest mvcc-snapshot still referenced by this process.
 *
 * Scans the shared reader table for slots owned by the current pid and
 * returns max(largest, pages-used of each valid snapshot). Reads are
 * un-mutexed, so each slot is re-read until a consistent
 * (pages, txnid) pair is observed. */
static pgno_t mdbx_find_largest_this(MDBX_env *env, pgno_t largest) {
  MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
  if (likely(lck != NULL /* exclusive mode */)) {
    const unsigned snap_nreaders =
        atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
    for (unsigned i = 0; i < snap_nreaders; ++i) {
    retry:
      if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease) ==
          env->me_pid) {
        /* mdbx_jitter4testing(true); */
        const pgno_t snap_pages = atomic_load32(
            &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed);
        const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid);
        /* Retry when the slot changed under us between the two reads. */
        if (unlikely(
                snap_pages !=
                    atomic_load32(&lck->mti_readers[i].mr_snapshot_pages_used,
                                  mo_AcquireRelease) ||
                snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid)))
          goto retry;
        /* Count only snapshots not older than the oldest-reader mark
         * and with a valid (non-sentinel) txnid. */
        if (largest < snap_pages &&
            atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease) <=
                /* ignore pending updates */ snap_txnid &&
            snap_txnid <= MAX_TXNID)
          largest = snap_pages;
      }
    }
  }
  return largest;
}
10633 
/* Adjust valgrind/ASAN poisoning of the database mmap around txn
 * boundaries: unpoison the used range at txn start (txn != NULL),
 * and re-poison pages beyond the still-referenced area at txn end
 * (txn == NULL). */
static void mdbx_txn_valgrind(MDBX_env *env, MDBX_txn *txn) {
#if !defined(__SANITIZE_ADDRESS__)
  /* Without ASAN the poisoning is only meaningful under valgrind. */
  if (!RUNNING_ON_VALGRIND)
    return;
#endif

  if (txn) { /* transaction start */
    if (env->me_poison_edge < txn->mt_next_pgno)
      env->me_poison_edge = txn->mt_next_pgno;
    VALGRIND_MAKE_MEM_DEFINED(env->me_map, pgno2bytes(env, txn->mt_next_pgno));
    MDBX_ASAN_UNPOISON_MEMORY_REGION(env->me_map,
                                     pgno2bytes(env, txn->mt_next_pgno));
    /* don't touch more, it should be already poisoned */
  } else { /* transaction end */
    bool should_unlock = false;
    pgno_t last = MAX_PAGENO;
    /* Determine up to which page the map must stay accessible. */
    if (env->me_txn0 && env->me_txn0->mt_owner == mdbx_thread_self()) {
      /* inside write-txn */
      MDBX_meta *head = mdbx_meta_head(env);
      last = head->mm_geo.next;
    } else if (env->me_flags & MDBX_RDONLY) {
      /* read-only mode, no write-txn, no wlock mutex */
      last = NUM_METAS;
    } else if (mdbx_txn_lock(env, true) == MDBX_SUCCESS) {
      /* no write-txn */
      last = NUM_METAS;
      should_unlock = true;
    } else {
      /* write txn is running, therefore shouldn't poison any memory range */
      return;
    }

    /* Extend by the largest snapshot still used by this process. */
    last = mdbx_find_largest_this(env, last);
    const pgno_t edge = env->me_poison_edge;
    if (edge > last) {
      /* Poison the tail between the new boundary and the old edge. */
      mdbx_assert(env, last >= NUM_METAS);
      env->me_poison_edge = last;
      VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, last),
                                 pgno2bytes(env, edge - last));
      MDBX_ASAN_POISON_MEMORY_REGION(env->me_map + pgno2bytes(env, last),
                                     pgno2bytes(env, edge - last));
    }
    if (should_unlock)
      mdbx_txn_unlock(env);
  }
}
#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */
10681 
/* Result of bind_rslot(): an error code plus, on success,
 * the reader-table slot claimed for the calling thread. */
typedef struct {
  int err;
  MDBX_reader *rslot;
} bind_rslot_result;
10686 
/* Acquire a slot in the shared reader table for the given thread.
 *
 * Takes the reader-table lock, ensures the process is registered as a
 * live reader, finds (or appends) a free slot, publishes pid/tid into
 * it, and — when thread keys are in use — stashes the slot in TLS.
 *
 * [in] tid  the calling thread id (stored as 0 under MDBX_NOTLS).
 *
 * Returns {MDBX_SUCCESS, slot} or {error, nullptr}. */
static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) {
  mdbx_assert(env, env->me_lck_mmap.lck);
  mdbx_assert(env, env->me_lck->mti_magic_and_version == MDBX_LOCK_MAGIC);
  mdbx_assert(env, env->me_lck->mti_os_and_format == MDBX_LOCK_FORMAT);

  bind_rslot_result result = {mdbx_rdt_lock(env), nullptr};
  if (unlikely(MDBX_IS_ERROR(result.err)))
    return result;
  if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) {
    mdbx_rdt_unlock(env);
    result.err = MDBX_PANIC;
    return result;
  }
  if (unlikely(!env->me_map)) {
    mdbx_rdt_unlock(env);
    result.err = MDBX_EPERM;
    return result;
  }

  /* Register this process as a live reader once per pid. */
  if (unlikely(env->me_live_reader != env->me_pid)) {
    result.err = mdbx_rpid_set(env);
    if (unlikely(result.err != MDBX_SUCCESS)) {
      mdbx_rdt_unlock(env);
      return result;
    }
    env->me_live_reader = env->me_pid;
  }

  result.err = MDBX_SUCCESS;
  unsigned slot, nreaders;
  while (1) {
    /* Look for a vacant slot (mr_pid == 0) within the current table. */
    nreaders = atomic_load32(&env->me_lck->mti_numreaders, mo_Relaxed);
    for (slot = 0; slot < nreaders; slot++)
      if (atomic_load32(&env->me_lck->mti_readers[slot].mr_pid, mo_Relaxed) ==
          0)
        break;

    /* Either a free slot was found, or the table can still grow. */
    if (likely(slot < env->me_maxreaders))
      break;

    /* Table is full: try to reclaim slots of dead readers, then retry. */
    result.err = mdbx_cleanup_dead_readers(env, true, NULL);
    if (result.err != MDBX_RESULT_TRUE) {
      mdbx_rdt_unlock(env);
      result.err =
          (result.err == MDBX_SUCCESS) ? MDBX_READERS_FULL : result.err;
      return result;
    }
  }

  result.rslot = &env->me_lck->mti_readers[slot];
  /* Claim the reader slot, carefully since other code
   * uses the reader table un-mutexed: First reset the
   * slot, next publish it in lck->mti_numreaders.  After
   * that, it is safe for mdbx_env_close() to touch it.
   * When it will be closed, we can finally claim it. */
  atomic_store32(&result.rslot->mr_pid, 0, mo_Relaxed);
  safe64_reset(&result.rslot->mr_txnid, true);
  if (slot == nreaders)
    atomic_store32(&env->me_lck->mti_numreaders, ++nreaders, mo_Relaxed);
  atomic_store64(&result.rslot->mr_tid, (env->me_flags & MDBX_NOTLS) ? 0 : tid,
                 mo_Relaxed);
  atomic_store32(&result.rslot->mr_pid, env->me_pid, mo_Relaxed);
  mdbx_rdt_unlock(env);

  if (likely(env->me_flags & MDBX_ENV_TXKEY)) {
    /* Remember the slot in thread-local storage for later reuse. */
    mdbx_assert(env, env->me_live_reader == env->me_pid);
    thread_rthc_set(env->me_txkey, result.rslot);
  }
  return result;
}
10757 
mdbx_thread_register(const MDBX_env * env)10758 __cold int mdbx_thread_register(const MDBX_env *env) {
10759   int rc = check_env(env, true);
10760   if (unlikely(rc != MDBX_SUCCESS))
10761     return rc;
10762 
10763   if (unlikely(!env->me_lck_mmap.lck))
10764     return (env->me_flags & MDBX_EXCLUSIVE) ? MDBX_EINVAL : MDBX_EPERM;
10765 
10766   if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) {
10767     mdbx_assert(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS));
10768     return MDBX_EINVAL /* MDBX_NOTLS mode */;
10769   }
10770 
10771   mdbx_assert(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY |
10772                                      MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY);
10773   MDBX_reader *r = thread_rthc_get(env->me_txkey);
10774   if (unlikely(r != NULL)) {
10775     mdbx_assert(env, r->mr_pid.weak == env->me_pid);
10776     mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self());
10777     if (unlikely(r->mr_pid.weak != env->me_pid))
10778       return MDBX_BAD_RSLOT;
10779     return MDBX_RESULT_TRUE /* already registered */;
10780   }
10781 
10782   const uintptr_t tid = mdbx_thread_self();
10783   if (env->me_txn0 && unlikely(env->me_txn0->mt_owner == tid))
10784     return MDBX_TXN_OVERLAPPING;
10785   return bind_rslot((MDBX_env *)env, tid).err;
10786 }
10787 
mdbx_thread_unregister(const MDBX_env * env)10788 __cold int mdbx_thread_unregister(const MDBX_env *env) {
10789   int rc = check_env(env, true);
10790   if (unlikely(rc != MDBX_SUCCESS))
10791     return rc;
10792 
10793   if (unlikely(!env->me_lck_mmap.lck))
10794     return MDBX_RESULT_TRUE;
10795 
10796   if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) {
10797     mdbx_assert(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS));
10798     return MDBX_RESULT_TRUE /* MDBX_NOTLS mode */;
10799   }
10800 
10801   mdbx_assert(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY |
10802                                      MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY);
10803   MDBX_reader *r = thread_rthc_get(env->me_txkey);
10804   if (unlikely(r == NULL))
10805     return MDBX_RESULT_TRUE /* not registered */;
10806 
10807   mdbx_assert(env, r->mr_pid.weak == env->me_pid);
10808   mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self());
10809   if (unlikely(r->mr_pid.weak != env->me_pid ||
10810                r->mr_tid.weak != mdbx_thread_self()))
10811     return MDBX_BAD_RSLOT;
10812 
10813   if (unlikely(r->mr_txnid.weak < SAFE64_INVALID_THRESHOLD))
10814     return MDBX_BUSY /* transaction is still active */;
10815 
10816   atomic_store32(&r->mr_pid, 0, mo_Relaxed);
10817   atomic_store32(&env->me_lck->mti_readers_refresh_flag, true,
10818                  mo_AcquireRelease);
10819   thread_rthc_set(env->me_txkey, nullptr);
10820   return MDBX_SUCCESS;
10821 }
10822 
10823 /* Common code for mdbx_txn_begin() and mdbx_txn_renew(). */
mdbx_txn_renew0(MDBX_txn * txn,const unsigned flags)10824 static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) {
10825   MDBX_env *env = txn->mt_env;
10826   int rc;
10827 
10828 #if MDBX_ENV_CHECKPID
10829   if (unlikely(env->me_pid != mdbx_getpid())) {
10830     env->me_flags |= MDBX_FATAL_ERROR;
10831     return MDBX_PANIC;
10832   }
10833 #endif /* MDBX_ENV_CHECKPID */
10834 
10835   STATIC_ASSERT(sizeof(MDBX_reader) == 32);
10836 #if MDBX_LOCKING > 0
10837   STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_wlock) % MDBX_CACHELINE_SIZE == 0);
10838   STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_rlock) % MDBX_CACHELINE_SIZE == 0);
10839 #else
10840   STATIC_ASSERT(
10841       offsetof(MDBX_lockinfo, mti_oldest_reader) % MDBX_CACHELINE_SIZE == 0);
10842   STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_numreaders) % MDBX_CACHELINE_SIZE ==
10843                 0);
10844 #endif /* MDBX_LOCKING */
10845   STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_readers) % MDBX_CACHELINE_SIZE ==
10846                 0);
10847 
10848   const uintptr_t tid = mdbx_thread_self();
10849   if (flags & MDBX_TXN_RDONLY) {
10850     mdbx_assert(env, (flags & ~(MDBX_TXN_RO_BEGIN_FLAGS | MDBX_WRITEMAP)) == 0);
10851     txn->mt_flags =
10852         MDBX_TXN_RDONLY | (env->me_flags & (MDBX_NOTLS | MDBX_WRITEMAP));
10853     MDBX_reader *r = txn->to.reader;
10854     STATIC_ASSERT(sizeof(uintptr_t) <= sizeof(r->mr_tid));
10855     if (likely(env->me_flags & MDBX_ENV_TXKEY)) {
10856       mdbx_assert(env, !(env->me_flags & MDBX_NOTLS));
10857       r = thread_rthc_get(env->me_txkey);
10858       if (likely(r)) {
10859         if (unlikely(!r->mr_pid.weak) &&
10860             (mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN)) {
10861           thread_rthc_set(env->me_txkey, nullptr);
10862           r = nullptr;
10863         } else {
10864           mdbx_assert(env, r->mr_pid.weak == env->me_pid);
10865           mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self());
10866         }
10867       }
10868     } else {
10869       mdbx_assert(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS));
10870     }
10871 
10872     if (likely(r)) {
10873       if (unlikely(r->mr_pid.weak != env->me_pid ||
10874                    r->mr_txnid.weak < SAFE64_INVALID_THRESHOLD))
10875         return MDBX_BAD_RSLOT;
10876     } else if (env->me_lck_mmap.lck) {
10877       bind_rslot_result brs = bind_rslot(env, tid);
10878       if (unlikely(brs.err != MDBX_SUCCESS))
10879         return brs.err;
10880       r = brs.rslot;
10881     }
10882     txn->to.reader = r;
10883     if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) {
10884       mdbx_assert(env, txn->mt_txnid == 0);
10885       mdbx_assert(env, txn->mt_owner == 0);
10886       mdbx_assert(env, txn->mt_numdbs == 0);
10887       if (likely(r)) {
10888         mdbx_assert(env, r->mr_snapshot_pages_used.weak == 0);
10889         mdbx_assert(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD);
10890         atomic_store32(&r->mr_snapshot_pages_used, 0, mo_Relaxed);
10891       }
10892       txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED;
10893       return MDBX_SUCCESS;
10894     }
10895 
10896     /* Seek & fetch the last meta */
10897     if (likely(/* not recovery mode */ env->me_stuck_meta < 0)) {
10898       while (1) {
10899         MDBX_meta *const meta = mdbx_meta_head(env);
10900         mdbx_jitter4testing(false);
10901         const txnid_t snap = mdbx_meta_txnid_fluid(env, meta);
10902         mdbx_jitter4testing(false);
10903         if (likely(r)) {
10904           safe64_reset(&r->mr_txnid, false);
10905           atomic_store32(&r->mr_snapshot_pages_used, meta->mm_geo.next,
10906                          mo_Relaxed);
10907           atomic_store64(&r->mr_snapshot_pages_retired,
10908                          unaligned_peek_u64(4, meta->mm_pages_retired),
10909                          mo_Relaxed);
10910           safe64_write(&r->mr_txnid, snap);
10911           mdbx_jitter4testing(false);
10912           mdbx_assert(env, r->mr_pid.weak == mdbx_getpid());
10913           mdbx_assert(
10914               env, r->mr_tid.weak ==
10915                        ((env->me_flags & MDBX_NOTLS) ? 0 : mdbx_thread_self()));
10916           mdbx_assert(env, r->mr_txnid.weak == snap);
10917           atomic_store32(&env->me_lck->mti_readers_refresh_flag, true,
10918                          mo_AcquireRelease);
10919         }
10920         mdbx_jitter4testing(true);
10921 
10922         /* Snap the state from current meta-head */
10923         txn->mt_txnid = snap;
10924         txn->mt_geo = meta->mm_geo;
10925         memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db));
10926         txn->mt_canary = meta->mm_canary;
10927 
10928         /* LY: Retry on a race, ITS#7970. */
10929         if (likely(meta == mdbx_meta_head(env) &&
10930                    snap == mdbx_meta_txnid_fluid(env, meta) &&
10931                    snap >= atomic_load64(&env->me_lck->mti_oldest_reader,
10932                                          mo_AcquireRelease))) {
10933           mdbx_jitter4testing(false);
10934           break;
10935         }
10936       }
10937     } else {
10938       /* r/o recovery mode */
10939       MDBX_meta *const meta = METAPAGE(env, env->me_stuck_meta);
10940       txn->mt_txnid = mdbx_meta_txnid_stable(env, meta);
10941       txn->mt_geo = meta->mm_geo;
10942       memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db));
10943       txn->mt_canary = meta->mm_canary;
10944       if (likely(r)) {
10945         atomic_store32(&r->mr_snapshot_pages_used, meta->mm_geo.next,
10946                        mo_Relaxed);
10947         atomic_store64(&r->mr_snapshot_pages_retired,
10948                        unaligned_peek_u64(4, meta->mm_pages_retired),
10949                        mo_Relaxed);
10950         atomic_store64(&r->mr_txnid, txn->mt_txnid, mo_Relaxed);
10951         mdbx_jitter4testing(false);
10952         mdbx_assert(env, r->mr_pid.weak == mdbx_getpid());
10953         mdbx_assert(
10954             env, r->mr_tid.weak ==
10955                      ((env->me_flags & MDBX_NOTLS) ? 0 : mdbx_thread_self()));
10956         mdbx_assert(env, r->mr_txnid.weak == txn->mt_txnid);
10957         atomic_store32(&env->me_lck->mti_readers_refresh_flag, true,
10958                        mo_Relaxed);
10959       }
10960     }
10961 
10962     if (unlikely(txn->mt_txnid < MIN_TXNID || txn->mt_txnid > MAX_TXNID)) {
10963       mdbx_error("%s", "environment corrupted by died writer, must shutdown!");
10964       rc = MDBX_CORRUPTED;
10965       goto bailout;
10966     }
10967     mdbx_assert(env, txn->mt_txnid >= env->me_lck->mti_oldest_reader.weak);
10968     txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */
10969     mdbx_ensure(env, txn->mt_txnid >=
10970                          /* paranoia is appropriate here */ env->me_lck
10971                              ->mti_oldest_reader.weak);
10972     txn->mt_numdbs = env->me_numdbs;
10973   } else {
10974     mdbx_assert(env, (flags & ~(MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS |
10975                                 MDBX_WRITEMAP)) == 0);
10976     if (unlikely(txn->mt_owner == tid ||
10977                  /* not recovery mode */ env->me_stuck_meta >= 0))
10978       return MDBX_BUSY;
10979     MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
10980     if (lck && (env->me_flags & MDBX_NOTLS) == 0 &&
10981         (mdbx_runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) {
10982       const unsigned snap_nreaders =
10983           atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
10984       for (unsigned i = 0; i < snap_nreaders; ++i) {
10985         if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_Relaxed) ==
10986                 env->me_pid &&
10987             unlikely(atomic_load64(&lck->mti_readers[i].mr_tid, mo_Relaxed) ==
10988                      tid)) {
10989           const txnid_t txnid = safe64_read(&lck->mti_readers[i].mr_txnid);
10990           if (txnid >= MIN_TXNID && txnid <= MAX_TXNID)
10991             return MDBX_TXN_OVERLAPPING;
10992         }
10993       }
10994     }
10995 
10996     /* Not yet touching txn == env->me_txn0, it may be active */
10997     mdbx_jitter4testing(false);
10998     rc = mdbx_txn_lock(env, F_ISSET(flags, MDBX_TXN_TRY));
10999     if (unlikely(rc))
11000       return rc;
11001     if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) {
11002       mdbx_txn_unlock(env);
11003       return MDBX_PANIC;
11004     }
11005 #if defined(_WIN32) || defined(_WIN64)
11006     if (unlikely(!env->me_map)) {
11007       mdbx_txn_unlock(env);
11008       return MDBX_EPERM;
11009     }
11010 #endif /* Windows */
11011 
11012     mdbx_jitter4testing(false);
11013     MDBX_meta *meta = mdbx_meta_head(env);
11014     mdbx_jitter4testing(false);
11015     txn->mt_canary = meta->mm_canary;
11016     const txnid_t snap = mdbx_meta_txnid_stable(env, meta);
11017     txn->mt_txnid = safe64_txnid_next(snap);
11018     if (unlikely(txn->mt_txnid > MAX_TXNID)) {
11019       rc = MDBX_TXN_FULL;
11020       mdbx_error("txnid overflow, raise %d", rc);
11021       goto bailout;
11022     }
11023 
11024     txn->mt_flags = flags;
11025     txn->mt_child = NULL;
11026     txn->tw.loose_pages = NULL;
11027     txn->tw.loose_count = 0;
11028 #if MDBX_ENABLE_REFUND
11029     txn->tw.loose_refund_wl = 0;
11030 #endif /* MDBX_ENABLE_REFUND */
11031     MDBX_PNL_SIZE(txn->tw.retired_pages) = 0;
11032     txn->tw.spill_pages = NULL;
11033     txn->tw.spill_least_removed = 0;
11034     txn->tw.last_reclaimed = 0;
11035     if (txn->tw.lifo_reclaimed)
11036       MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) = 0;
11037     env->me_txn = txn;
11038     txn->mt_numdbs = env->me_numdbs;
11039     memcpy(txn->mt_dbiseqs, env->me_dbiseqs, txn->mt_numdbs * sizeof(unsigned));
11040     /* Copy the DB info and flags */
11041     memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db));
11042     /* Moved to here to avoid a data race in read TXNs */
11043     txn->mt_geo = meta->mm_geo;
11044 
11045     rc = mdbx_dpl_alloc(txn);
11046     if (unlikely(rc != MDBX_SUCCESS))
11047       goto bailout;
11048     txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit;
11049     txn->tw.dirtylru = MDBX_DEBUG ? ~42u : 0;
11050   }
11051 
11052   /* Setup db info */
11053   mdbx_compiler_barrier();
11054   for (unsigned i = CORE_DBS; i < txn->mt_numdbs; i++) {
11055     const unsigned db_flags = env->me_dbflags[i];
11056     txn->mt_dbs[i].md_flags = db_flags & DB_PERSISTENT_FLAGS;
11057     txn->mt_dbistate[i] =
11058         (db_flags & DB_VALID) ? DBI_VALID | DBI_USRVALID | DBI_STALE : 0;
11059   }
11060   txn->mt_dbistate[MAIN_DBI] = DBI_VALID | DBI_USRVALID;
11061   txn->mt_dbistate[FREE_DBI] = DBI_VALID;
11062   txn->mt_front =
11063       txn->mt_txnid + ((flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == 0);
11064 
11065   if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) {
11066     mdbx_warning("%s", "environment had fatal error, must shutdown!");
11067     rc = MDBX_PANIC;
11068   } else {
11069     const size_t size =
11070         pgno2bytes(env, (txn->mt_flags & MDBX_TXN_RDONLY) ? txn->mt_next_pgno
11071                                                           : txn->mt_end_pgno);
11072     if (unlikely(size > env->me_dxb_mmap.limit)) {
11073       if (txn->mt_geo.upper > MAX_PAGENO ||
11074           bytes2pgno(env, pgno2bytes(env, txn->mt_geo.upper)) !=
11075               txn->mt_geo.upper) {
11076         rc = MDBX_UNABLE_EXTEND_MAPSIZE;
11077         goto bailout;
11078       }
11079       rc = mdbx_mapresize(env, txn->mt_next_pgno, txn->mt_end_pgno,
11080                           txn->mt_geo.upper,
11081                           (txn->mt_flags & MDBX_TXN_RDONLY) ? true : false);
11082       if (rc != MDBX_SUCCESS)
11083         goto bailout;
11084     }
11085     if (txn->mt_flags & MDBX_TXN_RDONLY) {
11086 #if defined(_WIN32) || defined(_WIN64)
11087       if (((size > env->me_dbgeo.lower && env->me_dbgeo.shrink) ||
11088            (mdbx_RunningUnderWine() &&
11089             /* under Wine acquisition of remap_guard is always required,
11090              * since Wine don't support section extending,
11091              * i.e. in both cases unmap+map are required. */
11092             size < env->me_dbgeo.upper && env->me_dbgeo.grow)) &&
11093           /* avoid recursive use SRW */ (txn->mt_flags & MDBX_NOTLS) == 0) {
11094         txn->mt_flags |= MDBX_SHRINK_ALLOWED;
11095         mdbx_srwlock_AcquireShared(&env->me_remap_guard);
11096       }
11097 #endif /* Windows */
11098     } else {
11099       env->me_dxb_mmap.current = size;
11100       env->me_dxb_mmap.filesize =
11101           (env->me_dxb_mmap.filesize < size) ? size : env->me_dxb_mmap.filesize;
11102     }
11103 #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
11104     mdbx_txn_valgrind(env, txn);
11105 #endif
11106     txn->mt_owner = tid;
11107     return MDBX_SUCCESS;
11108   }
11109 bailout:
11110   mdbx_tassert(txn, rc != MDBX_SUCCESS);
11111   mdbx_txn_end(txn, MDBX_END_SLOT | MDBX_END_FAIL_BEGIN);
11112   return rc;
11113 }
11114 
check_txn(const MDBX_txn * txn,int bad_bits)11115 static __always_inline int check_txn(const MDBX_txn *txn, int bad_bits) {
11116   if (unlikely(!txn))
11117     return MDBX_EINVAL;
11118 
11119   if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE))
11120     return MDBX_EBADSIGN;
11121 
11122   if (unlikely(txn->mt_flags & bad_bits))
11123     return MDBX_BAD_TXN;
11124 
11125 #if MDBX_TXN_CHECKOWNER
11126   if ((txn->mt_flags & MDBX_NOTLS) == 0 &&
11127       unlikely(txn->mt_owner != mdbx_thread_self()))
11128     return txn->mt_owner ? MDBX_THREAD_MISMATCH : MDBX_BAD_TXN;
11129 #endif /* MDBX_TXN_CHECKOWNER */
11130 
11131   if (unlikely(!txn->mt_env->me_map))
11132     return MDBX_EPERM;
11133 
11134   return MDBX_SUCCESS;
11135 }
11136 
check_txn_rw(const MDBX_txn * txn,int bad_bits)11137 static __always_inline int check_txn_rw(const MDBX_txn *txn, int bad_bits) {
11138   int err = check_txn(txn, bad_bits);
11139   if (unlikely(err))
11140     return err;
11141 
11142   if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)))
11143     return MDBX_EACCESS;
11144 
11145   return MDBX_SUCCESS;
11146 }
11147 
/* Renew a read-only transaction: acquire a fresh snapshot for a txn that
 * was previously reset (or reset it here first if needed), then bind it
 * to the calling thread. Only MDBX_TXN_RDONLY transactions are eligible. */
int mdbx_txn_renew(MDBX_txn *txn) {
  if (unlikely(txn == NULL))
    return MDBX_EINVAL;

  if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE))
    return MDBX_EBADSIGN;

  /* Renewal is defined only for read-only transactions. */
  if (unlikely((txn->mt_flags & MDBX_TXN_RDONLY) == 0))
    return MDBX_EINVAL;

  int rc;
  /* A txn that is still owned, or not yet finished, must be reset
   * before it can pick up a new snapshot. */
  if (unlikely(txn->mt_owner != 0 || !(txn->mt_flags & MDBX_TXN_FINISHED))) {
    rc = mdbx_txn_reset(txn);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }

  rc = mdbx_txn_renew0(txn, MDBX_TXN_RDONLY);
  if (rc == MDBX_SUCCESS) {
    txn->mt_owner = mdbx_thread_self();
    mdbx_debug("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO
               "/%" PRIaPGNO,
               txn->mt_txnid, (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w',
               (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root,
               txn->mt_dbs[FREE_DBI].md_root);
  }
  return rc;
}
11176 
#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Legacy exported entry point: thin wrapper over the extended variant
 * (no user-context argument), kept for binary compatibility with older
 * clients unless legacy exports are compiled out. */
int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags,
                   MDBX_txn **ret) {
  return __inline_mdbx_txn_begin(env, parent, flags, ret);
}
#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
11183 
/* Attach an opaque user context pointer to a transaction.
 * The txn must be valid and not blocked (a parent with an active child
 * is still acceptable, hence MDBX_TXN_HAS_CHILD is excluded). */
int mdbx_txn_set_userctx(MDBX_txn *txn, void *ctx) {
  const int err = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  txn->mt_userctx = ctx;
  return MDBX_SUCCESS;
}
11192 
mdbx_txn_get_userctx(const MDBX_txn * txn)11193 void *mdbx_txn_get_userctx(const MDBX_txn *txn) {
11194   return check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD)
11195              ? nullptr
11196              : txn->mt_userctx;
11197 }
11198 
/* Begin a transaction, optionally nested inside `parent`, and return the
 * new handle via `*ret` with `context` stored as its user context.
 *
 * Three paths:
 *  - nested write txn: a fresh MDBX_txn is allocated and the parent's
 *    page-state (reclaimed/retired/loose lists, dirtyroom, cursors) is
 *    transplanted into the child;
 *  - read-only txn: a fresh MDBX_txn is allocated and handed to
 *    mdbx_txn_renew0() to acquire a snapshot;
 *  - top-level write txn: the environment's preallocated env->me_txn0
 *    is reused (the `renew` label), never malloc'ed.
 * Returns MDBX_SUCCESS or an error code; on failure nothing is leaked. */
int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags,
                      MDBX_txn **ret, void *context) {
  MDBX_txn *txn;
  unsigned size, tsize;

  if (unlikely(!ret))
    return MDBX_EINVAL;
  *ret = NULL;

  /* Flags must fit entirely within either the RW-begin set or the
   * RO-begin set; a mix of both is invalid. */
  if (unlikely((flags & ~MDBX_TXN_RW_BEGIN_FLAGS) &&
               (flags & ~MDBX_TXN_RO_BEGIN_FLAGS)))
    return MDBX_EINVAL;

  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* The RDONLY bit set in the env but absent from `flags` means the
   * caller wants a write txn in a read-only environment. */
  if (unlikely(env->me_flags & MDBX_RDONLY &
               ~flags)) /* write txn in RDONLY env */
    return MDBX_EACCESS;

  /* Inherit the writemap mode from the environment. */
  flags |= env->me_flags & MDBX_WRITEMAP;

  if (parent) {
    /* Nested transactions: Max 1 child, write txns only, no writemap */
    rc = check_txn_rw(parent,
                      MDBX_TXN_RDONLY | MDBX_WRITEMAP | MDBX_TXN_BLOCKED);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;

    if (env->me_options.spill_parent4child_denominator) {
      /* Spill dirty-pages of parent to provide dirtyroom for child txn */
      rc = mdbx_txn_spill(parent, nullptr,
                          parent->tw.dirtylist->length /
                              env->me_options.spill_parent4child_denominator);
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
    }
    mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0);

    flags |= parent->mt_flags & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS);
    /* Child txns save MDBX_pgstate and use own copy of cursors */
    /* Single allocation: txn header + per-db MDBX_db array + per-db
     * cursor-pointer array + per-db dbistate byte. */
    size = env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1);
    size += tsize = sizeof(MDBX_txn);
  } else if (flags & MDBX_TXN_RDONLY) {
    /* Refuse a read txn on a thread that already owns the write txn,
     * unless the legacy-overlap debug flag permits it. */
    if (env->me_txn0 &&
        unlikely(env->me_txn0->mt_owner == mdbx_thread_self()) &&
        (mdbx_runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0)
      return MDBX_TXN_OVERLAPPING;
    /* Read-only txns need no cursor-pointer array. */
    size = env->me_maxdbs * (sizeof(MDBX_db) + 1);
    size += tsize = sizeof(MDBX_txn);
  } else {
    /* Reuse preallocated write txn. However, do not touch it until
     * mdbx_txn_renew0() succeeds, since it currently may be active. */
    txn = env->me_txn0;
    goto renew;
  }
  if (unlikely((txn = mdbx_malloc(size)) == NULL)) {
    mdbx_debug("calloc: %s", "failed");
    return MDBX_ENOMEM;
  }
  /* Only the header needs zeroing; the trailing arrays are initialized
   * explicitly below. */
  memset(txn, 0, tsize);
  txn->mt_dbxs = env->me_dbxs; /* static */
  /* Carve the trailing arrays out of the single allocation. */
  txn->mt_dbs = (MDBX_db *)((char *)txn + tsize);
  txn->mt_dbistate = (uint8_t *)txn + size - env->me_maxdbs;
  txn->mt_flags = flags;
  txn->mt_env = env;

  if (parent) {
    mdbx_tassert(parent, mdbx_dirtylist_check(parent));
    txn->tw.cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs);
    txn->mt_dbiseqs = parent->mt_dbiseqs;
    txn->mt_geo = parent->mt_geo;
    rc = mdbx_dpl_alloc(txn);
    if (likely(rc == MDBX_SUCCESS)) {
      /* Size the child's reclaimed list to hold the parent's list plus
       * the parent's loose pages which are merged into it below. */
      const unsigned len =
          MDBX_PNL_SIZE(parent->tw.reclaimed_pglist) + parent->tw.loose_count;
      txn->tw.reclaimed_pglist =
          mdbx_pnl_alloc((len > MDBX_PNL_INITIAL) ? len : MDBX_PNL_INITIAL);
      if (unlikely(!txn->tw.reclaimed_pglist))
        rc = MDBX_ENOMEM;
    }
    if (unlikely(rc != MDBX_SUCCESS)) {
    nested_failed:
      mdbx_pnl_free(txn->tw.reclaimed_pglist);
      mdbx_dpl_free(txn);
      mdbx_free(txn);
      return rc;
    }

    /* Move loose pages to reclaimed list */
    if (parent->tw.loose_count) {
      do {
        MDBX_page *lp = parent->tw.loose_pages;
        const unsigned di = mdbx_dpl_exist(parent, lp->mp_pgno);
        mdbx_tassert(parent, di && parent->tw.dirtylist->items[di].ptr == lp);
        mdbx_tassert(parent, lp->mp_flags == P_LOOSE);
        rc =
            mdbx_pnl_insert_range(&parent->tw.reclaimed_pglist, lp->mp_pgno, 1);
        if (unlikely(rc != MDBX_SUCCESS))
          goto nested_failed;
        parent->tw.loose_pages = lp->mp_next;
        /* Remove from dirty list */
        mdbx_page_wash(parent, di, lp, 1);
      } while (parent->tw.loose_pages);
      parent->tw.loose_count = 0;
#if MDBX_ENABLE_REFUND
      parent->tw.loose_refund_wl = 0;
#endif /* MDBX_ENABLE_REFUND */
      mdbx_tassert(parent, mdbx_dirtylist_check(parent));
    }
    txn->tw.dirtyroom = parent->tw.dirtyroom;
    txn->tw.dirtylru = parent->tw.dirtylru;

    mdbx_dpl_sort(parent);
    if (parent->tw.spill_pages)
      mdbx_spill_purge(parent);

    mdbx_tassert(txn, MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) >=
                          MDBX_PNL_SIZE(parent->tw.reclaimed_pglist));
    memcpy(txn->tw.reclaimed_pglist, parent->tw.reclaimed_pglist,
           MDBX_PNL_SIZEOF(parent->tw.reclaimed_pglist));
    mdbx_assert(env, mdbx_pnl_check4assert(
                         txn->tw.reclaimed_pglist,
                         (txn->mt_next_pgno /* LY: intentional assignment here,
                                                   only for assertion */
                          = parent->mt_next_pgno) -
                             MDBX_ENABLE_REFUND));

    txn->tw.last_reclaimed = parent->tw.last_reclaimed;
    if (parent->tw.lifo_reclaimed) {
      /* The child takes the parent's list; the parent keeps only the
       * prior length, smuggled through the pointer field, so the list
       * can be restored/truncated on child commit or abort. */
      txn->tw.lifo_reclaimed = parent->tw.lifo_reclaimed;
      parent->tw.lifo_reclaimed =
          (void *)(intptr_t)MDBX_PNL_SIZE(parent->tw.lifo_reclaimed);
    }

    /* Same ownership-transfer trick for the retired-pages list. */
    txn->tw.retired_pages = parent->tw.retired_pages;
    parent->tw.retired_pages =
        (void *)(intptr_t)MDBX_PNL_SIZE(parent->tw.retired_pages);

    txn->mt_txnid = parent->mt_txnid;
    /* mt_front distinguishes the child's new pages from the parent's. */
    txn->mt_front = parent->mt_front + 1;
#if MDBX_ENABLE_REFUND
    txn->tw.loose_refund_wl = 0;
#endif /* MDBX_ENABLE_REFUND */
    txn->mt_canary = parent->mt_canary;
    parent->mt_flags |= MDBX_TXN_HAS_CHILD;
    parent->mt_child = txn;
    txn->mt_parent = parent;
    txn->mt_numdbs = parent->mt_numdbs;
    txn->mt_owner = parent->mt_owner;
    memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db));
    /* Copy parent's mt_dbistate, but clear DB_NEW */
    for (unsigned i = 0; i < txn->mt_numdbs; i++)
      txn->mt_dbistate[i] =
          parent->mt_dbistate[i] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY);
    /* Invariant: dirtyroom + dirtylist->length is constant along the
     * parent chain (equals dp_limit at the top). */
    mdbx_tassert(parent,
                 parent->tw.dirtyroom + parent->tw.dirtylist->length ==
                     (parent->mt_parent ? parent->mt_parent->tw.dirtyroom
                                        : parent->mt_env->me_options.dp_limit));
    mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
                          (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
                                          : txn->mt_env->me_options.dp_limit));
    env->me_txn = txn;
    rc = mdbx_cursor_shadow(parent, txn);
    if (mdbx_audit_enabled() && mdbx_assert_enabled()) {
      txn->mt_signature = MDBX_MT_SIGNATURE;
      mdbx_tassert(txn, mdbx_audit_ex(txn, 0, false) == 0);
    }
    if (unlikely(rc != MDBX_SUCCESS))
      mdbx_txn_end(txn, MDBX_END_FAIL_BEGINCHILD);
  } else { /* MDBX_TXN_RDONLY */
    txn->mt_dbiseqs = env->me_dbiseqs;
  renew:
    rc = mdbx_txn_renew0(txn, flags);
  }

  if (unlikely(rc != MDBX_SUCCESS)) {
    /* env->me_txn0 is preallocated and must never be freed here. */
    if (txn != env->me_txn0)
      mdbx_free(txn);
  } else {
    /* Sanity-check the resulting flags for each begin mode. */
    if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY))
      mdbx_assert(env, txn->mt_flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED));
    else if (flags & MDBX_TXN_RDONLY)
      mdbx_assert(env, (txn->mt_flags &
                        ~(MDBX_NOTLS | MDBX_TXN_RDONLY | MDBX_WRITEMAP |
                          /* Win32: SRWL flag */ MDBX_SHRINK_ALLOWED)) == 0);
    else {
      mdbx_assert(env, (txn->mt_flags & ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED |
                                          MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC |
                                          MDBX_TXN_SPILLS)) == 0);
      assert(!txn->tw.spill_pages && !txn->tw.spill_least_removed);
    }
    txn->mt_signature = MDBX_MT_SIGNATURE;
    txn->mt_userctx = context;
    *ret = txn;
    mdbx_debug("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO
               "/%" PRIaPGNO,
               txn->mt_txnid, (flags & MDBX_TXN_RDONLY) ? 'r' : 'w',
               (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root,
               txn->mt_dbs[FREE_DBI].md_root);
  }

  return rc;
}
11404 
/* Fill `info` with runtime information about `txn`.
 * For read-only txns the space numbers come from the volatile meta head,
 * re-read in a lock-free retry loop until a consistent snapshot is seen;
 * for write txns they come from the txn's own geometry and dirty state.
 * When `scan_rlt` is set, the reader-lock table is additionally scanned
 * to refine reader-lag / dirty-space estimates (more expensive). */
int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!info))
    return MDBX_EINVAL;

  MDBX_env *const env = txn->mt_env;
#if MDBX_ENV_CHECKPID
  /* A forked child must not use the parent's environment. */
  if (unlikely(env->me_pid != mdbx_getpid())) {
    env->me_flags |= MDBX_FATAL_ERROR;
    return MDBX_PANIC;
  }
#endif /* MDBX_ENV_CHECKPID */

  info->txn_id = txn->mt_txnid;
  info->txn_space_used = pgno2bytes(env, txn->mt_geo.next);

  if (txn->mt_flags & MDBX_TXN_RDONLY) {
    const MDBX_meta *head_meta;
    txnid_t head_txnid;
    uint64_t head_retired;
    /* Lock-free read of the meta head: loop until the head pointer and
     * its txnid are unchanged across the whole read. */
    do {
      /* fetch info from volatile head */
      head_meta = mdbx_meta_head(env);
      head_txnid = mdbx_meta_txnid_fluid(env, head_meta);
      head_retired = unaligned_peek_u64(4, head_meta->mm_pages_retired);
      info->txn_space_limit_soft = pgno2bytes(env, head_meta->mm_geo.now);
      info->txn_space_limit_hard = pgno2bytes(env, head_meta->mm_geo.upper);
      info->txn_space_leftover =
          pgno2bytes(env, head_meta->mm_geo.now - head_meta->mm_geo.next);
      mdbx_compiler_barrier();
    } while (unlikely(head_meta != mdbx_meta_head(env) ||
                      head_txnid != mdbx_meta_txnid_fluid(env, head_meta)));

    info->txn_reader_lag = head_txnid - info->txn_id;
    info->txn_space_dirty = info->txn_space_retired = 0;
    uint64_t reader_snapshot_pages_retired;
    /* Pages retired since this reader's snapshot approximate the space
     * held back by this reader. */
    if (txn->to.reader &&
        head_retired >
            (reader_snapshot_pages_retired = atomic_load64(
                 &txn->to.reader->mr_snapshot_pages_retired, mo_Relaxed))) {
      info->txn_space_dirty = info->txn_space_retired = pgno2bytes(
          env, (pgno_t)(head_retired - reader_snapshot_pages_retired));

      size_t retired_next_reader = 0;
      MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
      if (scan_rlt && info->txn_reader_lag > 1 && lck) {
        /* find next more recent reader */
        txnid_t next_reader = head_txnid;
        const unsigned snap_nreaders =
            atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
        for (unsigned i = 0; i < snap_nreaders; ++i) {
        retry:
          if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) {
            mdbx_jitter4testing(true);
            const txnid_t snap_txnid =
                safe64_read(&lck->mti_readers[i].mr_txnid);
            const uint64_t snap_retired =
                atomic_load64(&lck->mti_readers[i].mr_snapshot_pages_retired,
                              mo_AcquireRelease);
            /* Re-read until both fields of the slot are stable. */
            if (unlikely(snap_retired !=
                         atomic_load64(
                             &lck->mti_readers[i].mr_snapshot_pages_retired,
                             mo_Relaxed)) ||
                snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))
              goto retry;
            if (snap_txnid <= txn->mt_txnid) {
              /* An equally-old or older reader exists: nothing becomes
               * reclaimable by finishing this txn alone. */
              retired_next_reader = 0;
              break;
            }
            if (snap_txnid < next_reader) {
              next_reader = snap_txnid;
              retired_next_reader = pgno2bytes(
                  env, (pgno_t)(snap_retired -
                                atomic_load64(
                                    &txn->to.reader->mr_snapshot_pages_retired,
                                    mo_Relaxed)));
            }
          }
        }
      }
      info->txn_space_dirty = retired_next_reader;
    }
  } else {
    /* Write transaction: numbers come straight from the txn itself. */
    info->txn_space_limit_soft = pgno2bytes(env, txn->mt_geo.now);
    info->txn_space_limit_hard = pgno2bytes(env, txn->mt_geo.upper);
    /* With an active child the retired list pointer holds the stashed
     * length (see mdbx_txn_begin_ex), not a real list. */
    info->txn_space_retired = pgno2bytes(
        env, txn->mt_child ? (unsigned)(uintptr_t)txn->tw.retired_pages
                           : MDBX_PNL_SIZE(txn->tw.retired_pages));
    info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom);
    info->txn_space_dirty =
        pgno2bytes(env, txn->mt_env->me_options.dp_limit - txn->tw.dirtyroom);
    info->txn_reader_lag = INT64_MAX;
    MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
    if (scan_rlt && lck) {
      txnid_t oldest_snapshot = txn->mt_txnid;
      const unsigned snap_nreaders =
          atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
      if (snap_nreaders) {
        oldest_snapshot = mdbx_find_oldest(txn);
        if (oldest_snapshot == txn->mt_txnid - 1) {
          /* check if there is at least one reader */
          bool exists = false;
          for (unsigned i = 0; i < snap_nreaders; ++i) {
            if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_Relaxed) &&
                txn->mt_txnid > safe64_read(&lck->mti_readers[i].mr_txnid)) {
              exists = true;
              break;
            }
          }
          /* No live reader at that txnid: bump oldest by one. */
          oldest_snapshot += !exists;
        }
      }
      info->txn_reader_lag = txn->mt_txnid - oldest_snapshot;
    }
  }

  return MDBX_SUCCESS;
}
11526 
mdbx_txn_env(const MDBX_txn * txn)11527 MDBX_env *mdbx_txn_env(const MDBX_txn *txn) {
11528   if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE ||
11529                txn->mt_env->me_signature.weak != MDBX_ME_SIGNATURE))
11530     return NULL;
11531   return txn->mt_env;
11532 }
11533 
mdbx_txn_id(const MDBX_txn * txn)11534 uint64_t mdbx_txn_id(const MDBX_txn *txn) {
11535   if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE))
11536     return 0;
11537   return txn->mt_txnid;
11538 }
11539 
mdbx_txn_flags(const MDBX_txn * txn)11540 int mdbx_txn_flags(const MDBX_txn *txn) {
11541   if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE))
11542     return -1;
11543   return txn->mt_flags;
11544 }
11545 
/* Check for misused dbi handles: true when the DBI's sequence number in
 * the environment no longer matches the one captured by this txn, i.e.
 * the handle was closed and/or re-opened since the txn started. */
#define TXN_DBI_CHANGED(txn, dbi)                                              \
  ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi])
11549 
/* Import DBI slots opened by other transactions into `txn`.
 * Caller must hold env->me_dbi_lock. Slots past the txn's current count
 * are initialized; any globally-valid slot not yet user-valid in the txn
 * is marked valid-but-stale and its sequence number captured. */
static void dbi_import_locked(MDBX_txn *txn) {
  MDBX_env *const env = txn->mt_env;
  const unsigned count = env->me_numdbs;
  for (unsigned slot = CORE_DBS; slot < count; ++slot) {
    if (slot >= txn->mt_numdbs) {
      /* Slot appeared after the txn started: reset per-txn state. */
      txn->mt_dbistate[slot] = 0;
      if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0)
        txn->tw.cursors[slot] = NULL;
    }
    if ((env->me_dbflags[slot] & DB_VALID) != 0 &&
        (txn->mt_dbistate[slot] & DBI_USRVALID) == 0) {
      txn->mt_dbiseqs[slot] = env->me_dbiseqs[slot];
      txn->mt_dbs[slot].md_flags = env->me_dbflags[slot] & DB_PERSISTENT_FLAGS;
      /* DBI_STALE: the per-txn MDBX_db record must be refreshed on use. */
      txn->mt_dbistate[slot] = DBI_VALID | DBI_USRVALID | DBI_STALE;
      mdbx_tassert(txn, txn->mt_dbxs[slot].md_cmp != NULL);
      mdbx_tassert(txn, txn->mt_dbxs[slot].md_name.iov_base != NULL);
    }
  }
  txn->mt_numdbs = count;
}
11570 
11571 /* Import DBI which opened after txn started into context */
dbi_import(MDBX_txn * txn,MDBX_dbi dbi)11572 __cold static bool dbi_import(MDBX_txn *txn, MDBX_dbi dbi) {
11573   if (dbi < CORE_DBS || dbi >= txn->mt_env->me_numdbs)
11574     return false;
11575 
11576   mdbx_ensure(txn->mt_env, mdbx_fastmutex_acquire(&txn->mt_env->me_dbi_lock) ==
11577                                MDBX_SUCCESS);
11578   dbi_import_locked(txn);
11579   mdbx_ensure(txn->mt_env, mdbx_fastmutex_release(&txn->mt_env->me_dbi_lock) ==
11580                                MDBX_SUCCESS);
11581   return txn->mt_dbistate[dbi] & DBI_USRVALID;
11582 }
11583 
11584 /* Export or close DBI handles opened in this txn. */
/* Export or close DBI handles opened in this txn.
 * Only handles created inside this txn (DBI_CREAT) are affected: when
 * `keep` is set (commit) they are published into the environment,
 * otherwise (abort) their names are freed and sequence numbers bumped.
 * The env->me_dbi_lock is acquired lazily, only if there is work to do. */
static void dbi_update(MDBX_txn *txn, int keep) {
  mdbx_tassert(txn, !txn->mt_parent && txn == txn->mt_env->me_txn0);
  MDBX_dbi n = txn->mt_numdbs;
  if (n) {
    bool locked = false;
    MDBX_env *const env = txn->mt_env;

    /* Walk downward over user DBIs; loop body runs for i in [CORE_DBS, n). */
    for (unsigned i = n; --i >= CORE_DBS;) {
      if (likely((txn->mt_dbistate[i] & DBI_CREAT) == 0))
        continue;
      if (!locked) {
        mdbx_ensure(env,
                    mdbx_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS);
        locked = true;
      }
      if (env->me_numdbs <= i || txn->mt_dbiseqs[i] != env->me_dbiseqs[i])
        continue /* dbi explicitly closed and/or then re-opened by other txn */;
      if (keep) {
        /* Commit: publish the handle's flags to the environment. */
        env->me_dbflags[i] = txn->mt_dbs[i].md_flags | DB_VALID;
      } else {
        /* Abort: invalidate the slot name-first so concurrent lookups
         * fail, then bump the sequence and free the name. */
        char *ptr = env->me_dbxs[i].md_name.iov_base;
        if (ptr) {
          env->me_dbxs[i].md_name.iov_len = 0;
          mdbx_memory_fence(mo_AcquireRelease, true);
          mdbx_assert(env, env->me_dbflags[i] == 0);
          env->me_dbiseqs[i]++;
          env->me_dbxs[i].md_name.iov_base = NULL;
          mdbx_free(ptr);
        }
      }
    }

    /* Trim trailing invalid slots from the environment's DBI count. */
    n = env->me_numdbs;
    if (n > CORE_DBS && unlikely(!(env->me_dbflags[n - 1] & DB_VALID))) {
      if (!locked) {
        mdbx_ensure(env,
                    mdbx_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS);
        locked = true;
      }

      /* Re-read under the lock, then shrink past invalid slots. */
      n = env->me_numdbs;
      while (n > CORE_DBS && !(env->me_dbflags[n - 1] & DB_VALID))
        --n;
      env->me_numdbs = n;
    }

    if (unlikely(locked))
      mdbx_ensure(env,
                  mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS);
  }
}
11636 
11637 /* Filter-out pgno list from transaction's dirty-page list */
/* Filter-out pgno list from transaction's dirty-page list.
 * Removes every page listed in `pl` from txn->tw.dirtylist via a single
 * merge pass over both sorted sequences; `spilled` means the pgnos in
 * `pl` are stored shifted left by one bit (spill encoding) and must be
 * shifted right before comparison. Freed slots return their capacity to
 * txn->tw.dirtyroom. */
static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl,
                          const bool spilled) {
  if (MDBX_PNL_SIZE(pl) && txn->tw.dirtylist->length) {
    mdbx_tassert(txn, mdbx_pnl_check4assert(pl, txn->mt_next_pgno << spilled));
    MDBX_dpl *dl = mdbx_dpl_sort(txn);

    /* Scanning in ascend order */
    /* PNLs may be stored ascending or descending; normalize direction. */
    const int step = MDBX_PNL_ASCENDING ? 1 : -1;
    const int begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_SIZE(pl);
    const int end = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(pl) + 1 : 0;
    mdbx_tassert(txn, pl[begin] <= pl[end - step]);

    /* r: read cursor into the dirty list, starting at the first
     * candidate match for the smallest pgno in pl. */
    unsigned r = mdbx_dpl_search(txn, pl[begin] >> spilled);
    mdbx_tassert(txn, dl->sorted == dl->length);
    for (int i = begin; r <= dl->length;) { /* scan loop */
      assert(i != end);
      mdbx_tassert(txn, !spilled || (pl[i] & 1) == 0);
      pgno_t pl_pgno = pl[i] >> spilled;
      pgno_t dp_pgno = dl->items[r].pgno;
      if (likely(dp_pgno != pl_pgno)) {
        /* No match: advance whichever cursor lags behind. */
        const bool cmp = dp_pgno < pl_pgno;
        r += cmp;
        i += cmp ? 0 : step;
        if (likely(i != end))
          continue;
        return;
      }

      /* update loop */
      /* First match found: switch to compacting mode, with w as the
       * write cursor (entries [w, r) are being dropped). */
      unsigned w = r;
    remove_dl:
      if ((txn->mt_env->me_flags & MDBX_WRITEMAP) == 0) {
        /* Non-writemap mode owns page copies: release them. */
        MDBX_page *dp = dl->items[r].ptr;
        mdbx_dpage_free(txn->mt_env, dp, dpl_npages(dl, r));
      }
      ++r;
    next_i:
      i += step;
      if (unlikely(i == end)) {
        /* pl exhausted: keep all remaining dirty entries. */
        while (r <= dl->length)
          dl->items[w++] = dl->items[r++];
      } else {
        while (r <= dl->length) {
          assert(i != end);
          mdbx_tassert(txn, !spilled || (pl[i] & 1) == 0);
          pl_pgno = pl[i] >> spilled;
          dp_pgno = dl->items[r].pgno;
          if (dp_pgno < pl_pgno)
            dl->items[w++] = dl->items[r++];
          else if (dp_pgno > pl_pgno)
            goto next_i;
          else
            goto remove_dl;
        }
      }
      /* Finalize: shrink the list and return freed slots to dirtyroom. */
      dl->sorted = dpl_setlen(dl, w - 1);
      txn->tw.dirtyroom += r - w;
      mdbx_tassert(txn,
                   txn->tw.dirtyroom + txn->tw.dirtylist->length ==
                       (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
                                       : txn->mt_env->me_options.dp_limit));
      return;
    }
  }
}
11703 
11704 /* End a transaction, except successful commit of a nested transaction.
11705  * May be called twice for readonly txns: First reset it, then abort.
11706  * [in] txn   the transaction handle to end
11707  * [in] mode  why and how to end the transaction */
static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) {
  MDBX_env *env = txn->mt_env;
  static const char *const names[] = MDBX_END_NAMES;

#if MDBX_ENV_CHECKPID
  /* Refuse to touch an environment inherited across fork(): mutexes and
   * reader slots belong to the original process only. */
  if (unlikely(txn->mt_env->me_pid != mdbx_getpid())) {
    env->me_flags |= MDBX_FATAL_ERROR;
    return MDBX_PANIC;
  }
#endif /* MDBX_ENV_CHECKPID */

  mdbx_debug("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO
             "/%" PRIaPGNO,
             names[mode & MDBX_END_OPMASK], txn->mt_txnid,
             (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn,
             (void *)env, txn->mt_dbs[MAIN_DBI].md_root,
             txn->mt_dbs[FREE_DBI].md_root);

  mdbx_ensure(env, txn->mt_txnid >=
                       /* paranoia is appropriate here */ env->me_lck
                           ->mti_oldest_reader.weak);

  int rc = MDBX_SUCCESS;
  if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) {
    /* Read-only txn: release (or reset) the reader-table slot. */
    if (txn->to.reader) {
      MDBX_reader *slot = txn->to.reader;
      mdbx_assert(env, slot->mr_pid.weak == env->me_pid);
      if (likely(!F_ISSET(txn->mt_flags, MDBX_TXN_FINISHED))) {
        mdbx_assert(env, txn->mt_txnid == slot->mr_txnid.weak &&
                             slot->mr_txnid.weak >=
                                 env->me_lck->mti_oldest_reader.weak);
#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
        mdbx_txn_valgrind(env, nullptr);
#endif
        /* Invalidate the slot's snapshot so writers may recycle pages,
         * then poke the refresh flag so they re-scan the reader table. */
        atomic_store32(&slot->mr_snapshot_pages_used, 0, mo_Relaxed);
        safe64_reset(&slot->mr_txnid, false);
        atomic_store32(&env->me_lck->mti_readers_refresh_flag, true,
                       mo_Relaxed);
      } else {
        /* Already finished (e.g. reset before abort): slot must hold an
         * invalid txnid already. */
        mdbx_assert(env, slot->mr_pid.weak == env->me_pid);
        mdbx_assert(env, slot->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD);
      }
      if (mode & MDBX_END_SLOT) {
        /* Give the slot back entirely, unless it is owned via a
         * thread-local key (MDBX_ENV_TXKEY) and will be reused. */
        if ((env->me_flags & MDBX_ENV_TXKEY) == 0)
          atomic_store32(&slot->mr_pid, 0, mo_Relaxed);
        txn->to.reader = NULL;
      }
    }
#if defined(_WIN32) || defined(_WIN64)
    if (txn->mt_flags & MDBX_SHRINK_ALLOWED)
      mdbx_srwlock_ReleaseShared(&env->me_remap_guard);
#endif
    txn->mt_numdbs = 0; /* prevent further DBI activity */
    txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED;
    txn->mt_owner = 0;
  } else if (!F_ISSET(txn->mt_flags, MDBX_TXN_FINISHED)) {
    /* Write txn still live: tear it down (abort path, or the tail work
     * after a top-level commit). */
#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
    if (txn == env->me_txn0)
      mdbx_txn_valgrind(env, nullptr);
#endif
    if (!(mode & MDBX_END_EOTDONE)) /* !(already closed cursors) */
      mdbx_cursors_eot(txn, false);

    txn->mt_flags = MDBX_TXN_FINISHED;
    txn->mt_owner = 0;
    env->me_txn = txn->mt_parent;
    mdbx_pnl_free(txn->tw.spill_pages);
    txn->tw.spill_pages = nullptr;
    if (txn == env->me_txn0) {
      /* Top-level write txn: recycle buffers and release the writer lock. */
      mdbx_assert(env, txn->mt_parent == NULL);
      /* Export or close DBI handles created in this txn */
      dbi_update(txn, mode & MDBX_END_UPDATE);
      mdbx_pnl_shrink(&txn->tw.retired_pages);
      mdbx_pnl_shrink(&txn->tw.reclaimed_pglist);
      if (!(env->me_flags & MDBX_WRITEMAP))
        mdbx_dlist_free(txn);
      /* The writer mutex was locked in mdbx_txn_begin. */
      mdbx_txn_unlock(env);
    } else {
      /* Nested txn being aborted: hand pieces of state back to the parent. */
      mdbx_assert(env, txn->mt_parent != NULL);
      MDBX_txn *const parent = txn->mt_parent;
      mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE);
      mdbx_assert(env, parent->mt_child == txn &&
                           (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0);
      mdbx_assert(
          env, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
                                     txn->mt_next_pgno - MDBX_ENABLE_REFUND));

      if (txn->tw.lifo_reclaimed) {
        /* The parent stashed its lifo_reclaimed length (as an integer) in
         * the pointer field; truncate the list back to that length and
         * return ownership of the buffer to the parent. */
        mdbx_assert(env, MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) >=
                             (unsigned)(uintptr_t)parent->tw.lifo_reclaimed);
        MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) =
            (unsigned)(uintptr_t)parent->tw.lifo_reclaimed;
        parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed;
      }

      if (txn->tw.retired_pages) {
        /* Same trick for the retired-pages list. */
        mdbx_assert(env, MDBX_PNL_SIZE(txn->tw.retired_pages) >=
                             (unsigned)(uintptr_t)parent->tw.retired_pages);
        MDBX_PNL_SIZE(txn->tw.retired_pages) =
            (unsigned)(uintptr_t)parent->tw.retired_pages;
        parent->tw.retired_pages = txn->tw.retired_pages;
      }

      parent->mt_child = nullptr;
      parent->mt_flags &= ~MDBX_TXN_HAS_CHILD;
      parent->tw.dirtylru = txn->tw.dirtylru;
      mdbx_tassert(parent, mdbx_dirtylist_check(parent));
      mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0);
      if (!(env->me_flags & MDBX_WRITEMAP))
        mdbx_dlist_free(txn);
      mdbx_dpl_free(txn);
      mdbx_pnl_free(txn->tw.reclaimed_pglist);

      if (parent->mt_geo.upper != txn->mt_geo.upper ||
          parent->mt_geo.now != txn->mt_geo.now) {
        /* undo resize performed by child txn */
        rc = mdbx_mapresize_implicit(env, parent->mt_next_pgno,
                                     parent->mt_geo.now, parent->mt_geo.upper);
        if (rc == MDBX_RESULT_TRUE) {
          /* unable undo resize (it is regular for Windows),
           * therefore promote size changes from child to the parent txn */
          mdbx_warning("unable undo resize performed by child txn, promote to "
                       "the parent (%u->%u, %u->%u)",
                       txn->mt_geo.now, parent->mt_geo.now, txn->mt_geo.upper,
                       parent->mt_geo.upper);
          parent->mt_geo.now = txn->mt_geo.now;
          parent->mt_geo.upper = txn->mt_geo.upper;
          rc = MDBX_SUCCESS;
        } else if (unlikely(rc != MDBX_SUCCESS)) {
          mdbx_error("error %d while undo resize performed by child txn, fail "
                     "the parent",
                     rc);
          parent->mt_flags |= MDBX_TXN_ERROR;
          if (!env->me_dxb_mmap.address)
            env->me_flags |= MDBX_FATAL_ERROR;
        }
      }
    }
  }

  /* Free the txn object itself if requested; env->me_txn0 is the embedded
   * writer txn owned by the environment and is never freed here. */
  mdbx_assert(env, txn == env->me_txn0 || txn->mt_owner == 0);
  if ((mode & MDBX_END_FREE) != 0 && txn != env->me_txn0) {
    txn->mt_signature = 0;
    mdbx_free(txn);
  }

  return rc;
}
11857 
int mdbx_txn_reset(MDBX_txn *txn) {
  /* Validate the transaction handle before doing anything. */
  const int err = check_txn(txn, 0);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  /* Resetting is defined only for read-only transactions. */
  if (unlikely(!(txn->mt_flags & MDBX_TXN_RDONLY)))
    return MDBX_EINVAL;

  /* Finish the snapshot but keep the DBI-handles open, so the txn
   * object can be renewed later. */
  const int rc = mdbx_txn_end(txn, MDBX_END_RESET | MDBX_END_UPDATE);
  if (rc == MDBX_SUCCESS) {
    mdbx_tassert(txn, txn->mt_signature == MDBX_MT_SIGNATURE);
    mdbx_tassert(txn, txn->mt_owner == 0);
  }
  return rc;
}
11875 
int mdbx_txn_break(MDBX_txn *txn) {
  /* Mark the given transaction and every nested child as broken, so any
   * subsequent commit will fail.  The first iteration runs even for a
   * NULL/invalid handle so that check_txn() reports the proper error. */
  while (true) {
    const int err = check_txn(txn, 0);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
    txn->mt_flags |= MDBX_TXN_ERROR;
    /* Read-only transactions cannot have children: nothing more to mark. */
    if (txn->mt_flags & MDBX_TXN_RDONLY)
      break;
    txn = txn->mt_child;
    if (!txn)
      break;
  }
  return MDBX_SUCCESS;
}
11888 
int mdbx_txn_abort(MDBX_txn *txn) {
  const int err = check_txn(txn, 0);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  /* Common ending flags for both read-only and write transactions. */
  unsigned mode = MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE;
  if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) {
    /* LY: don't close DBI-handles */
    mode |= MDBX_END_UPDATE;
  } else {
    /* A write txn must take its nested children down first. */
    if (txn->mt_child)
      mdbx_txn_abort(txn->mt_child);
    mdbx_tassert(txn, mdbx_dirtylist_check(txn));
  }
  return mdbx_txn_end(txn, mode);
}
11905 
11906 /* Count all the pages in each DB and in the GC and make sure
11907  * it matches the actual number of pages being used. */
__cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored,
                                bool dont_filter_gc) {
  /* Pages that will be freed but are not yet stored into GC:
   * loose pages + reclaimed list + retired pages not yet written. */
  pgno_t pending = 0;
  if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) {
    pending = txn->tw.loose_count + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) +
              (MDBX_PNL_SIZE(txn->tw.retired_pages) - retired_stored);
  }

  MDBX_cursor_couple cx;
  int rc = mdbx_cursor_init(&cx.outer, txn, FREE_DBI);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* Sum page counts recorded in the GC (freeDB), optionally skipping
   * records this txn has already reclaimed. */
  pgno_t gc = 0;
  MDBX_val key, data;
  while ((rc = mdbx_cursor_get(&cx.outer, &key, &data, MDBX_NEXT)) == 0) {
    if (!dont_filter_gc) {
      if (unlikely(key.iov_len != sizeof(txnid_t)))
        return MDBX_CORRUPTED;
      txnid_t id = unaligned_peek_u64(4, key.iov_base);
      if (txn->tw.lifo_reclaimed) {
        /* LIFO mode: reclaimed ids are an explicit list. */
        for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed); ++i)
          if (id == txn->tw.lifo_reclaimed[i])
            goto skip;
      } else if (id <= txn->tw.last_reclaimed)
        goto skip;
    }

    /* First word of a GC record's PNL is its length (number of pages). */
    gc += *(pgno_t *)data.iov_base;
  skip:;
  }
  mdbx_tassert(txn, rc == MDBX_NOTFOUND);

  /* Clear the per-DBI audited marks before counting used pages. */
  for (MDBX_dbi i = FREE_DBI; i < txn->mt_numdbs; i++)
    txn->mt_dbistate[i] &= ~DBI_AUDITED;

  pgno_t used = NUM_METAS;
  for (MDBX_dbi i = FREE_DBI; i <= MAIN_DBI; i++) {
    if (!(txn->mt_dbistate[i] & DBI_VALID))
      continue;
    rc = mdbx_cursor_init(&cx.outer, txn, i);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    txn->mt_dbistate[i] |= DBI_AUDITED;
    if (txn->mt_dbs[i].md_root == P_INVALID)
      continue;
    used += txn->mt_dbs[i].md_branch_pages + txn->mt_dbs[i].md_leaf_pages +
            txn->mt_dbs[i].md_overflow_pages;

    if (i != MAIN_DBI)
      continue;
    /* Walk MAIN_DBI leaves to account every sub-database's pages too. */
    rc = mdbx_page_search(&cx.outer, NULL, MDBX_PS_FIRST);
    while (rc == MDBX_SUCCESS) {
      MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top];
      for (unsigned j = 0; j < page_numkeys(mp); j++) {
        MDBX_node *node = page_node(mp, j);
        if (node_flags(node) == F_SUBDATA) {
          if (unlikely(node_ds(node) != sizeof(MDBX_db)))
            return MDBX_CORRUPTED;
          /* Copy out the on-page MDBX_db (may be unaligned); prefer the
           * in-txn copy when an open, non-stale DBI handle matches. */
          MDBX_db db_copy, *db;
          memcpy(db = &db_copy, node_data(node), sizeof(db_copy));
          if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) {
            for (MDBX_dbi k = txn->mt_numdbs; --k > MAIN_DBI;) {
              if ((txn->mt_dbistate[k] & DBI_VALID) &&
                  /* txn->mt_dbxs[k].md_name.iov_len > 0 && */
                  node_ks(node) == txn->mt_dbxs[k].md_name.iov_len &&
                  memcmp(node_key(node), txn->mt_dbxs[k].md_name.iov_base,
                         node_ks(node)) == 0) {
                txn->mt_dbistate[k] |= DBI_AUDITED;
                if (!(txn->mt_dbistate[k] & MDBX_DBI_STALE))
                  db = txn->mt_dbs + k;
                break;
              }
            }
          }
          used +=
              db->md_branch_pages + db->md_leaf_pages + db->md_overflow_pages;
        }
      }
      rc = mdbx_cursor_sibling(&cx.outer, SIBLING_RIGHT);
    }
    mdbx_tassert(txn, rc == MDBX_NOTFOUND);
  }

  /* Account DBIs opened in this txn (or a parent) that were not found by
   * the MAIN_DBI walk above (e.g. created/dirty in this txn). */
  for (MDBX_dbi i = FREE_DBI; i < txn->mt_numdbs; i++) {
    if ((txn->mt_dbistate[i] & (DBI_VALID | DBI_AUDITED | DBI_STALE)) !=
        DBI_VALID)
      continue;
    for (MDBX_txn *t = txn; t; t = t->mt_parent)
      if (F_ISSET(t->mt_dbistate[i], DBI_DIRTY | DBI_CREAT)) {
        used += t->mt_dbs[i].md_branch_pages + t->mt_dbs[i].md_leaf_pages +
                t->mt_dbs[i].md_overflow_pages;
        txn->mt_dbistate[i] |= DBI_AUDITED;
        break;
      }
    if (!(txn->mt_dbistate[i] & DBI_AUDITED)) {
      mdbx_warning("audit %s@%" PRIaTXN
                   ": unable account dbi %d / \"%*s\", state 0x%02x",
                   txn->mt_parent ? "nested-" : "", txn->mt_txnid, i,
                   (int)txn->mt_dbxs[i].md_name.iov_len,
                   (const char *)txn->mt_dbxs[i].md_name.iov_base,
                   txn->mt_dbistate[i]);
    }
  }

  /* The invariant: every page is either pending-free, in GC, or in use. */
  if (pending + gc + used == txn->mt_next_pgno)
    return MDBX_SUCCESS;

  if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0)
    mdbx_error("audit @%" PRIaTXN ": %u(pending) = %u(loose) + "
               "%u(reclaimed) + %u(retired-pending) - %u(retired-stored)",
               txn->mt_txnid, pending, txn->tw.loose_count,
               MDBX_PNL_SIZE(txn->tw.reclaimed_pglist),
               txn->tw.retired_pages ? MDBX_PNL_SIZE(txn->tw.retired_pages) : 0,
               retired_stored);
  mdbx_error("audit @%" PRIaTXN ": %" PRIaPGNO "(pending) + %" PRIaPGNO
             "(gc) + %" PRIaPGNO "(count) = %" PRIaPGNO "(total) <> %" PRIaPGNO
             "(allocated)",
             txn->mt_txnid, pending, gc, used, pending + gc + used,
             txn->mt_next_pgno);
  return MDBX_PROBLEM;
}
12030 
backlog_size(MDBX_txn * txn)12031 static __always_inline unsigned backlog_size(MDBX_txn *txn) {
12032   return MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + txn->tw.loose_count;
12033 }
12034 
12035 /* LY: Prepare a backlog of pages to modify GC itself,
12036  * while reclaiming is prohibited. It should be enough to prevent search
12037  * in mdbx_page_alloc() during a deleting, when GC tree is unbalanced. */
static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *gc_cursor,
                             const size_t pnl_bytes) {
  /* Pages needed: a linear run for the PNL payload (overflow pages),
   * one page per GC-tree level for CoW, plus one more for rebalancing. */
  const unsigned linear4list = number_of_ovpages(txn->mt_env, pnl_bytes);
  const unsigned backlog4cow = txn->mt_dbs[FREE_DBI].md_depth;
  const unsigned backlog4rebalance = backlog4cow + 1;

  /* Fast path: the payload fits a single page and enough spare pages are
   * already on hand. */
  if (likely(linear4list == 1 &&
             backlog_size(txn) > (pnl_bytes
                                      ? backlog4rebalance
                                      : (backlog4cow + backlog4rebalance))))
    return MDBX_SUCCESS;

  mdbx_trace(">> pnl_bytes %zu, backlog %u, 4list %u, 4cow %u, 4rebalance %u",
             pnl_bytes, backlog_size(txn), linear4list, backlog4cow,
             backlog4rebalance);

  /* Spill dirty pages (if needed) as though we were about to put a record
   * of pnl_bytes, using placeholder key/data — no real write happens. */
  MDBX_val fake_key, fake_val;
  fake_key.iov_base = fake_val.iov_base = nullptr;
  fake_key.iov_len = sizeof(txnid_t);
  fake_val.iov_len = pnl_bytes;
  int err = mdbx_cursor_spill(gc_cursor, &fake_key, &fake_val);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  /* Temporarily allow allocation from GC while touching/prefetching pages;
   * C_RECLAIMING is restored below before returning. */
  gc_cursor->mc_flags &= ~C_RECLAIMING;
  err = mdbx_cursor_touch(gc_cursor);
  mdbx_trace("== after-touch, backlog %u, err %d", backlog_size(txn), err);

  if (linear4list > 1 && err == MDBX_SUCCESS) {
    /* Reserve the contiguous run required for the multi-page PNL record
     * (MDBX_ALLOC_SLOT: only stock the backlog, don't hand pages out). */
    err = mdbx_page_alloc(gc_cursor, linear4list,
                          MDBX_ALLOC_GC | MDBX_ALLOC_CACHE | MDBX_ALLOC_SLOT)
              .err;
    mdbx_trace("== after-4linear, backlog %u, err %d", backlog_size(txn), err);
  }

  /* Top the backlog up one page at a time until it covers CoW + payload. */
  while (backlog_size(txn) < backlog4cow + linear4list && err == MDBX_SUCCESS)
    err = mdbx_page_alloc(gc_cursor, 1, MDBX_ALLOC_GC | MDBX_ALLOC_SLOT).err;

  gc_cursor->mc_flags |= C_RECLAIMING;
  mdbx_trace("<< backlog %u, err %d", backlog_size(txn), err);
  /* GC exhaustion (MDBX_NOTFOUND) is not fatal: proceed with what we got. */
  return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS;
}
12080 
static __inline void clean_reserved_gc_pnl(MDBX_env *env, MDBX_val pnl) {
  /* A freshly reserved PNL is logically empty: at minimum its length word
   * must be zeroed.  Without MDBX_WRITEMAP/MDBX_NOMEMINIT the entire
   * reservation is wiped as well, so stale bytes from uninitialized
   * malloc'ed memory cannot leak into the file if the saved page-list
   * shrank after the space was reserved. */
  size_t wipe = sizeof(pgno_t);
  if ((env->me_flags & (MDBX_WRITEMAP | MDBX_NOMEMINIT)) == 0 &&
      pnl.iov_len > wipe)
    wipe = pnl.iov_len;
  memset(pnl.iov_base, 0, wipe);
}
12090 
12091 /* Cleanup reclaimed GC records, than save the retired-list as of this
12092  * transaction to GC (aka freeDB). This recursive changes the reclaimed-list
12093  * loose-list and retired-list. Keep trying until it stabilizes. */
mdbx_update_gc(MDBX_txn * txn)12094 static int mdbx_update_gc(MDBX_txn *txn) {
12095   /* txn->tw.reclaimed_pglist[] can grow and shrink during this call.
12096    * txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow.
12097    * Page numbers cannot disappear from txn->tw.retired_pages[]. */
12098   MDBX_env *const env = txn->mt_env;
12099   const bool lifo = (env->me_flags & MDBX_LIFORECLAIM) != 0;
12100   const char *dbg_prefix_mode = lifo ? "    lifo" : "    fifo";
12101   (void)dbg_prefix_mode;
12102   mdbx_trace("\n>>> @%" PRIaTXN, txn->mt_txnid);
12103 
12104   unsigned retired_stored = 0, loop = 0;
12105   MDBX_cursor_couple couple;
12106   int rc = mdbx_cursor_init(&couple.outer, txn, FREE_DBI);
12107   if (unlikely(rc != MDBX_SUCCESS))
12108     goto bailout_notracking;
12109 
12110   couple.outer.mc_flags |= C_RECLAIMING;
12111   couple.outer.mc_next = txn->tw.cursors[FREE_DBI];
12112   txn->tw.cursors[FREE_DBI] = &couple.outer;
12113 
12114 retry:
12115   ++loop;
12116 retry_noaccount:
12117   mdbx_trace("%s", " >> restart");
12118   mdbx_tassert(txn,
12119                mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
12120                                      txn->mt_next_pgno - MDBX_ENABLE_REFUND));
12121   mdbx_tassert(txn, mdbx_dirtylist_check(txn));
12122   if (unlikely(/* paranoia */ loop > ((MDBX_DEBUG > 0) ? 12 : 42))) {
12123     mdbx_error("too more loops %u, bailout", loop);
12124     rc = MDBX_PROBLEM;
12125     goto bailout;
12126   }
12127 
12128   rc = mdbx_prep_backlog(txn, &couple.outer,
12129                          MDBX_PNL_SIZEOF(txn->tw.retired_pages));
12130   if (unlikely(rc != MDBX_SUCCESS))
12131     goto bailout;
12132 
12133   unsigned settled = 0, cleaned_gc_slot = 0, reused_gc_slot = 0,
12134            filled_gc_slot = ~0u;
12135   txnid_t cleaned_gc_id = 0, gc_rid = txn->tw.last_reclaimed;
12136   while (true) {
12137     /* Come back here after each Put() in case retired-list changed */
12138     MDBX_val key, data;
12139     mdbx_trace("%s", " >> continue");
12140 
12141     mdbx_tassert(txn,
12142                  mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
12143                                        txn->mt_next_pgno - MDBX_ENABLE_REFUND));
12144     if (lifo) {
12145       if (cleaned_gc_slot < (txn->tw.lifo_reclaimed
12146                                  ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
12147                                  : 0)) {
12148         settled = 0;
12149         cleaned_gc_slot = 0;
12150         reused_gc_slot = 0;
12151         filled_gc_slot = ~0u;
12152         /* LY: cleanup reclaimed records. */
12153         do {
12154           cleaned_gc_id = txn->tw.lifo_reclaimed[++cleaned_gc_slot];
12155           mdbx_tassert(txn,
12156                        cleaned_gc_slot > 0 &&
12157                            cleaned_gc_id < env->me_lck->mti_oldest_reader.weak);
12158           key.iov_base = &cleaned_gc_id;
12159           key.iov_len = sizeof(cleaned_gc_id);
12160           rc = mdbx_cursor_get(&couple.outer, &key, NULL, MDBX_SET);
12161           if (rc == MDBX_NOTFOUND)
12162             continue;
12163           if (unlikely(rc != MDBX_SUCCESS))
12164             goto bailout;
12165           rc = mdbx_prep_backlog(txn, &couple.outer, 0);
12166           if (unlikely(rc != MDBX_SUCCESS))
12167             goto bailout;
12168           mdbx_tassert(txn,
12169                        cleaned_gc_id < env->me_lck->mti_oldest_reader.weak);
12170           mdbx_trace("%s.cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode,
12171                      cleaned_gc_slot, cleaned_gc_id);
12172           mdbx_tassert(txn, *txn->tw.cursors == &couple.outer);
12173           rc = mdbx_cursor_del(&couple.outer, 0);
12174           if (unlikely(rc != MDBX_SUCCESS))
12175             goto bailout;
12176         } while (cleaned_gc_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed));
12177         mdbx_txl_sort(txn->tw.lifo_reclaimed);
12178       }
12179     } else {
12180       /* If using records from GC which we have not yet deleted,
12181        * now delete them and any we reserved for tw.reclaimed_pglist. */
12182       while (cleaned_gc_id <= txn->tw.last_reclaimed) {
12183         rc = mdbx_cursor_first(&couple.outer, &key, NULL);
12184         if (unlikely(rc != MDBX_SUCCESS)) {
12185           if (rc == MDBX_NOTFOUND)
12186             break;
12187           goto bailout;
12188         }
12189         if (!MDBX_DISABLE_PAGECHECKS &&
12190             unlikely(key.iov_len != sizeof(txnid_t))) {
12191           rc = MDBX_CORRUPTED;
12192           goto bailout;
12193         }
12194         gc_rid = cleaned_gc_id;
12195         settled = 0;
12196         reused_gc_slot = 0;
12197         cleaned_gc_id = unaligned_peek_u64(4, key.iov_base);
12198         if (!MDBX_DISABLE_PAGECHECKS &&
12199             unlikely(cleaned_gc_id < MIN_TXNID || cleaned_gc_id > MAX_TXNID)) {
12200           rc = MDBX_CORRUPTED;
12201           goto bailout;
12202         }
12203         if (cleaned_gc_id > txn->tw.last_reclaimed)
12204           break;
12205         if (cleaned_gc_id < txn->tw.last_reclaimed) {
12206           rc = mdbx_prep_backlog(txn, &couple.outer, 0);
12207           if (unlikely(rc != MDBX_SUCCESS))
12208             goto bailout;
12209         }
12210         mdbx_tassert(txn, cleaned_gc_id <= txn->tw.last_reclaimed);
12211         mdbx_tassert(txn, cleaned_gc_id < env->me_lck->mti_oldest_reader.weak);
12212         mdbx_trace("%s.cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode,
12213                    cleaned_gc_id);
12214         mdbx_tassert(txn, *txn->tw.cursors == &couple.outer);
12215         rc = mdbx_cursor_del(&couple.outer, 0);
12216         if (unlikely(rc != MDBX_SUCCESS))
12217           goto bailout;
12218       }
12219     }
12220 
12221     mdbx_tassert(txn,
12222                  mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
12223                                        txn->mt_next_pgno - MDBX_ENABLE_REFUND));
12224     mdbx_tassert(txn, mdbx_dirtylist_check(txn));
12225     if (mdbx_audit_enabled()) {
12226       rc = mdbx_audit_ex(txn, retired_stored, false);
12227       if (unlikely(rc != MDBX_SUCCESS))
12228         goto bailout;
12229     }
12230 
12231     /* return suitable into unallocated space */
12232     if (mdbx_refund(txn)) {
12233       mdbx_tassert(
12234           txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
12235                                      txn->mt_next_pgno - MDBX_ENABLE_REFUND));
12236       if (mdbx_audit_enabled()) {
12237         rc = mdbx_audit_ex(txn, retired_stored, false);
12238         if (unlikely(rc != MDBX_SUCCESS))
12239           goto bailout;
12240       }
12241     }
12242 
12243     /* handle loose pages - put ones into the reclaimed- or retired-list */
12244     if (txn->tw.loose_pages) {
12245       /* Return loose page numbers to tw.reclaimed_pglist,
12246        * though usually none are left at this point.
12247        * The pages themselves remain in dirtylist. */
12248       if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) {
12249         if (txn->tw.loose_count > 0) {
12250           /* Put loose page numbers in tw.retired_pages,
12251            * since unable to return them to tw.reclaimed_pglist. */
12252           if (unlikely((rc = mdbx_pnl_need(&txn->tw.retired_pages,
12253                                            txn->tw.loose_count)) != 0))
12254             goto bailout;
12255           for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next)
12256             mdbx_pnl_xappend(txn->tw.retired_pages, mp->mp_pgno);
12257           mdbx_trace("%s: append %u loose-pages to retired-pages",
12258                      dbg_prefix_mode, txn->tw.loose_count);
12259         }
12260       } else {
12261         /* Room for loose pages + temp PNL with same */
12262         rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist,
12263                            2 * txn->tw.loose_count + 2);
12264         if (unlikely(rc != MDBX_SUCCESS))
12265           goto bailout;
12266         MDBX_PNL loose = txn->tw.reclaimed_pglist +
12267                          MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) -
12268                          txn->tw.loose_count - 1;
12269         unsigned count = 0;
12270         for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) {
12271           mdbx_tassert(txn, mp->mp_flags == P_LOOSE);
12272           loose[++count] = mp->mp_pgno;
12273         }
12274         mdbx_tassert(txn, count == txn->tw.loose_count);
12275         MDBX_PNL_SIZE(loose) = count;
12276         mdbx_pnl_sort(loose);
12277         mdbx_pnl_xmerge(txn->tw.reclaimed_pglist, loose);
12278         mdbx_trace("%s: append %u loose-pages to reclaimed-pages",
12279                    dbg_prefix_mode, txn->tw.loose_count);
12280       }
12281 
12282       /* filter-out list of dirty-pages from loose-pages */
12283       MDBX_dpl *const dl = txn->tw.dirtylist;
12284       unsigned w = 0;
12285       for (unsigned r = w; ++r <= dl->length;) {
12286         MDBX_page *dp = dl->items[r].ptr;
12287         mdbx_tassert(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp));
12288         mdbx_tassert(txn, dpl_endpgno(dl, r) <= txn->mt_next_pgno);
12289         if ((dp->mp_flags & P_LOOSE) == 0) {
12290           if (++w != r)
12291             dl->items[w] = dl->items[r];
12292         } else {
12293           mdbx_tassert(txn, dp->mp_flags == P_LOOSE);
12294           if ((env->me_flags & MDBX_WRITEMAP) == 0)
12295             mdbx_dpage_free(env, dp, 1);
12296         }
12297       }
12298       mdbx_trace("%s: filtered-out loose-pages from %u -> %u dirty-pages",
12299                  dbg_prefix_mode, dl->length, w);
12300       mdbx_tassert(txn, txn->tw.loose_count == dl->length - w);
12301       dpl_setlen(dl, w);
12302       dl->sorted = 0;
12303       txn->tw.dirtyroom += txn->tw.loose_count;
12304       mdbx_tassert(txn,
12305                    txn->tw.dirtyroom + txn->tw.dirtylist->length ==
12306                        (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
12307                                        : txn->mt_env->me_options.dp_limit));
12308       txn->tw.loose_pages = NULL;
12309       txn->tw.loose_count = 0;
12310 #if MDBX_ENABLE_REFUND
12311       txn->tw.loose_refund_wl = 0;
12312 #endif /* MDBX_ENABLE_REFUND */
12313     }
12314 
12315     const unsigned amount = (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist);
12316     /* handle retired-list - store ones into single gc-record */
12317     if (retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)) {
12318       if (unlikely(!retired_stored)) {
12319         /* Make sure last page of GC is touched and on retired-list */
12320         couple.outer.mc_flags &= ~C_RECLAIMING;
12321         rc = mdbx_page_search(&couple.outer, NULL,
12322                               MDBX_PS_LAST | MDBX_PS_MODIFY);
12323         couple.outer.mc_flags |= C_RECLAIMING;
12324         if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND)
12325           goto bailout;
12326       }
12327       /* Write to last page of GC */
12328       key.iov_len = sizeof(txn->mt_txnid);
12329       key.iov_base = &txn->mt_txnid;
12330       do {
12331         data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages);
12332         mdbx_prep_backlog(txn, &couple.outer, data.iov_len);
12333         rc = mdbx_cursor_put(&couple.outer, &key, &data, MDBX_RESERVE);
12334         if (unlikely(rc != MDBX_SUCCESS))
12335           goto bailout;
12336         /* Retry if tw.retired_pages[] grew during the Put() */
12337       } while (data.iov_len < MDBX_PNL_SIZEOF(txn->tw.retired_pages));
12338 
12339       retired_stored = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages);
12340       mdbx_pnl_sort(txn->tw.retired_pages);
12341       mdbx_assert(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages));
12342       memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len);
12343 
12344       mdbx_trace("%s.put-retired #%u @ %" PRIaTXN, dbg_prefix_mode,
12345                  retired_stored, txn->mt_txnid);
12346 
12347       if (mdbx_log_enabled(MDBX_LOG_EXTRA)) {
12348         unsigned i = retired_stored;
12349         mdbx_debug_extra("PNL write txn %" PRIaTXN " root %" PRIaPGNO
12350                          " num %u, PNL",
12351                          txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i);
12352         for (; i; i--)
12353           mdbx_debug_extra_print(" %" PRIaPGNO, txn->tw.retired_pages[i]);
12354         mdbx_debug_extra_print("%s\n", ".");
12355       }
12356       if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) {
12357         mdbx_trace("%s.reclaimed-list changed %u -> %u, retry", dbg_prefix_mode,
12358                    amount, (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist));
12359         goto retry_noaccount /* rare case, but avoids GC fragmentation and one
12360                                 cycle. */
12361             ;
12362       }
12363       continue;
12364     }
12365 
12366     /* handle reclaimed and lost pages - merge and store both into gc */
12367     mdbx_tassert(txn,
12368                  mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
12369                                        txn->mt_next_pgno - MDBX_ENABLE_REFUND));
12370     mdbx_tassert(txn, txn->tw.loose_count == 0);
12371 
12372     mdbx_trace("%s", " >> reserving");
12373     if (mdbx_audit_enabled()) {
12374       rc = mdbx_audit_ex(txn, retired_stored, false);
12375       if (unlikely(rc != MDBX_SUCCESS))
12376         goto bailout;
12377     }
12378     const unsigned left = amount - settled;
12379     mdbx_trace("%s: amount %u, settled %d, left %d, lifo-reclaimed-slots %u, "
12380                "reused-gc-slots %u",
12381                dbg_prefix_mode, amount, settled, (int)left,
12382                txn->tw.lifo_reclaimed
12383                    ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
12384                    : 0,
12385                reused_gc_slot);
12386     if (0 >= (int)left)
12387       break;
12388 
12389     const unsigned prefer_max_scatter = 257;
12390     txnid_t reservation_gc_id;
12391     if (lifo) {
12392       if (txn->tw.lifo_reclaimed == nullptr) {
12393         txn->tw.lifo_reclaimed = mdbx_txl_alloc();
12394         if (unlikely(!txn->tw.lifo_reclaimed)) {
12395           rc = MDBX_ENOMEM;
12396           goto bailout;
12397         }
12398       }
12399       if ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <
12400               prefer_max_scatter &&
12401           left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) -
12402                   reused_gc_slot) *
12403                      env->me_maxgc_ov1page) {
12404 
12405         /* LY: need just a txn-id for save page list. */
12406         bool need_cleanup = false;
12407         txnid_t snap_oldest;
12408       retry_rid:
12409         couple.outer.mc_flags &= ~C_RECLAIMING;
12410         do {
12411           snap_oldest = mdbx_find_oldest(txn);
12412           rc =
12413               mdbx_page_alloc(&couple.outer, 0, MDBX_ALLOC_GC | MDBX_ALLOC_SLOT)
12414                   .err;
12415           if (likely(rc == MDBX_SUCCESS)) {
12416             mdbx_trace("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode,
12417                        MDBX_PNL_LAST(txn->tw.lifo_reclaimed));
12418             need_cleanup = true;
12419           }
12420         } while (rc == MDBX_SUCCESS &&
12421                  (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <
12422                      prefer_max_scatter &&
12423                  left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) -
12424                          reused_gc_slot) *
12425                             env->me_maxgc_ov1page);
12426         couple.outer.mc_flags |= C_RECLAIMING;
12427 
12428         if (likely(rc == MDBX_SUCCESS)) {
12429           mdbx_trace("%s: got enough from GC.", dbg_prefix_mode);
12430           continue;
12431         } else if (unlikely(rc != MDBX_NOTFOUND))
12432           /* LY: some troubles... */
12433           goto bailout;
12434 
12435         if (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) {
12436           if (need_cleanup) {
12437             mdbx_txl_sort(txn->tw.lifo_reclaimed);
12438             cleaned_gc_slot = 0;
12439           }
12440           gc_rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed);
12441         } else {
12442           mdbx_tassert(txn, txn->tw.last_reclaimed == 0);
12443           if (unlikely(mdbx_find_oldest(txn) != snap_oldest))
12444             /* should retry mdbx_page_alloc(MDBX_ALLOC_GC)
12445              * if the oldest reader changes since the last attempt */
12446             goto retry_rid;
12447           /* no reclaimable GC entries,
12448            * therefore no entries with ID < mdbx_find_oldest(txn) */
12449           txn->tw.last_reclaimed = gc_rid = snap_oldest - 1;
12450           mdbx_trace("%s: none recycled yet, set rid to @%" PRIaTXN,
12451                      dbg_prefix_mode, gc_rid);
12452         }
12453 
12454         /* LY: GC is empty, will look any free txn-id in high2low order. */
12455         while (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter &&
12456                left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) -
12457                        reused_gc_slot) *
12458                           env->me_maxgc_ov1page) {
12459           if (unlikely(gc_rid < 2)) {
12460             if (unlikely(MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <=
12461                          reused_gc_slot)) {
12462               mdbx_notice("** restart: reserve depleted (reused_gc_slot %u >= "
12463                           "lifo_reclaimed %u" PRIaTXN,
12464                           reused_gc_slot,
12465                           (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed));
12466               goto retry;
12467             }
12468             break;
12469           }
12470 
12471           mdbx_tassert(txn, gc_rid >= MIN_TXNID && gc_rid <= MAX_TXNID);
12472           --gc_rid;
12473           key.iov_base = &gc_rid;
12474           key.iov_len = sizeof(gc_rid);
12475           rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_SET_KEY);
12476           if (unlikely(rc == MDBX_SUCCESS)) {
12477             mdbx_debug("%s: GC's id %" PRIaTXN
12478                        " is used, continue bottom-up search",
12479                        dbg_prefix_mode, gc_rid);
12480             ++gc_rid;
12481             rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_FIRST);
12482             if (rc == MDBX_NOTFOUND) {
12483               mdbx_debug("%s: GC is empty", dbg_prefix_mode);
12484               break;
12485             }
12486             if (unlikely(rc != MDBX_SUCCESS ||
12487                          key.iov_len != sizeof(mdbx_tid_t))) {
12488               rc = MDBX_CORRUPTED;
12489               goto bailout;
12490             }
12491             txnid_t gc_first = unaligned_peek_u64(4, key.iov_base);
12492             if (!MDBX_DISABLE_PAGECHECKS &&
12493                 unlikely(gc_first < MIN_TXNID || gc_first > MAX_TXNID)) {
12494               rc = MDBX_CORRUPTED;
12495               goto bailout;
12496             }
12497             if (gc_first < 2) {
12498               mdbx_debug("%s: no free GC's id(s) less than %" PRIaTXN,
12499                          dbg_prefix_mode, gc_rid);
12500               break;
12501             }
12502             gc_rid = gc_first - 1;
12503           }
12504 
12505           rc = mdbx_txl_append(&txn->tw.lifo_reclaimed, gc_rid);
12506           if (unlikely(rc != MDBX_SUCCESS))
12507             goto bailout;
12508 
12509           if (reused_gc_slot)
12510             /* rare case, but it is better to clear and re-create GC entries
12511              * with less fragmentation. */
12512             need_cleanup = true;
12513           else
12514             cleaned_gc_slot +=
12515                 1 /* mark cleanup is not needed for added slot. */;
12516 
12517           mdbx_trace("%s: append @%" PRIaTXN
12518                      " to lifo-reclaimed, cleaned-gc-slot = %u",
12519                      dbg_prefix_mode, gc_rid, cleaned_gc_slot);
12520         }
12521 
12522         if (need_cleanup) {
12523           cleaned_gc_slot = 0;
12524           mdbx_trace("%s: restart inner-loop to clear and re-create GC entries",
12525                      dbg_prefix_mode);
12526           continue;
12527         }
12528       }
12529 
12530       const unsigned i =
12531           (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot;
12532       mdbx_tassert(txn, i > 0 && i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed));
12533       reservation_gc_id = txn->tw.lifo_reclaimed[i];
12534       mdbx_trace("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]",
12535                  dbg_prefix_mode, reservation_gc_id, i);
12536     } else {
12537       mdbx_tassert(txn, txn->tw.lifo_reclaimed == NULL);
12538       if (unlikely(gc_rid == 0)) {
12539         gc_rid = mdbx_find_oldest(txn) - 1;
12540         rc = mdbx_cursor_get(&couple.outer, &key, NULL, MDBX_FIRST);
12541         if (rc == MDBX_SUCCESS) {
12542           if (!MDBX_DISABLE_PAGECHECKS &&
12543               unlikely(key.iov_len != sizeof(txnid_t))) {
12544             rc = MDBX_CORRUPTED;
12545             goto bailout;
12546           }
12547           txnid_t gc_first = unaligned_peek_u64(4, key.iov_base);
12548           if (!MDBX_DISABLE_PAGECHECKS &&
12549               unlikely(gc_first < MIN_TXNID || gc_first > MAX_TXNID)) {
12550             rc = MDBX_CORRUPTED;
12551             goto bailout;
12552           }
12553           if (gc_rid >= gc_first)
12554             gc_rid = gc_first - 1;
12555           if (unlikely(gc_rid == 0)) {
12556             mdbx_error("%s", "** no GC tail-space to store");
12557             goto retry;
12558           }
12559         } else if (rc != MDBX_NOTFOUND)
12560           goto bailout;
12561         txn->tw.last_reclaimed = gc_rid;
12562         cleaned_gc_id = gc_rid + 1;
12563       }
12564       reservation_gc_id = gc_rid--;
12565       mdbx_trace("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode,
12566                  reservation_gc_id);
12567     }
12568     ++reused_gc_slot;
12569 
12570     unsigned chunk = left;
12571     if (unlikely(chunk > env->me_maxgc_ov1page)) {
12572       const unsigned avail_gc_slots =
12573           txn->tw.lifo_reclaimed
12574               ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) -
12575                     reused_gc_slot + 1
12576           : (gc_rid < INT16_MAX) ? (unsigned)gc_rid
12577                                  : INT16_MAX;
12578       if (avail_gc_slots > 1) {
12579         if (chunk < env->me_maxgc_ov1page * 2)
12580           chunk /= 2;
12581         else {
12582           const unsigned threshold =
12583               env->me_maxgc_ov1page * ((avail_gc_slots < prefer_max_scatter)
12584                                            ? avail_gc_slots
12585                                            : prefer_max_scatter);
12586           if (left < threshold)
12587             chunk = env->me_maxgc_ov1page;
12588           else {
12589             const unsigned tail = left - threshold + env->me_maxgc_ov1page + 1;
12590             unsigned span = 1;
12591             unsigned avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) /
12592                                         sizeof(pgno_t)) /*- 1 + span */;
12593             if (tail > avail) {
12594               for (unsigned i = amount - span; i > 0; --i) {
12595                 if (MDBX_PNL_ASCENDING
12596                         ? (txn->tw.reclaimed_pglist[i] + span)
12597                         : (txn->tw.reclaimed_pglist[i] - span) ==
12598                               txn->tw.reclaimed_pglist[i + span]) {
12599                   span += 1;
12600                   avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) /
12601                                      sizeof(pgno_t)) -
12602                           1 + span;
12603                   if (avail >= tail)
12604                     break;
12605                 }
12606               }
12607             }
12608 
12609             chunk = (avail >= tail) ? tail - span
12610                     : (avail_gc_slots > 3 &&
12611                        reused_gc_slot < prefer_max_scatter - 3)
12612                         ? avail - span
12613                         : tail;
12614           }
12615         }
12616       }
12617     }
12618     mdbx_tassert(txn, chunk > 0);
12619 
12620     mdbx_trace("%s: gc_rid %" PRIaTXN ", reused_gc_slot %u, reservation-id "
12621                "%" PRIaTXN,
12622                dbg_prefix_mode, gc_rid, reused_gc_slot, reservation_gc_id);
12623 
12624     mdbx_trace("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk,
12625                env->me_maxgc_ov1page);
12626 
12627     mdbx_tassert(txn, reservation_gc_id < env->me_lck->mti_oldest_reader.weak);
12628     if (unlikely(
12629             reservation_gc_id < 1 ||
12630             reservation_gc_id >=
12631                 atomic_load64(&env->me_lck->mti_oldest_reader, mo_Relaxed))) {
12632       mdbx_error("** internal error (reservation_gc_id %" PRIaTXN ")",
12633                  reservation_gc_id);
12634       rc = MDBX_PROBLEM;
12635       goto bailout;
12636     }
12637 
12638     key.iov_len = sizeof(reservation_gc_id);
12639     key.iov_base = &reservation_gc_id;
12640     data.iov_len = (chunk + 1) * sizeof(pgno_t);
12641     mdbx_trace("%s.reserve: %u [%u...%u) @%" PRIaTXN, dbg_prefix_mode, chunk,
12642                settled + 1, settled + chunk + 1, reservation_gc_id);
12643     mdbx_prep_backlog(txn, &couple.outer, data.iov_len);
12644     rc = mdbx_cursor_put(&couple.outer, &key, &data,
12645                          MDBX_RESERVE | MDBX_NOOVERWRITE);
12646     mdbx_tassert(txn,
12647                  mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
12648                                        txn->mt_next_pgno - MDBX_ENABLE_REFUND));
12649     if (unlikely(rc != MDBX_SUCCESS))
12650       goto bailout;
12651 
12652     clean_reserved_gc_pnl(env, data);
12653     settled += chunk;
12654     mdbx_trace("%s.settled %u (+%u), continue", dbg_prefix_mode, settled,
12655                chunk);
12656 
12657     if (txn->tw.lifo_reclaimed &&
12658         unlikely(amount < MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) {
12659       mdbx_notice("** restart: reclaimed-list growth %u -> %u", amount,
12660                   (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist));
12661       goto retry_noaccount;
12662     }
12663 
12664     continue;
12665   }
12666 
12667   mdbx_tassert(
12668       txn,
12669       cleaned_gc_slot ==
12670           (txn->tw.lifo_reclaimed ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0));
12671 
12672   mdbx_trace("%s", " >> filling");
12673   /* Fill in the reserved records */
12674   filled_gc_slot =
12675       txn->tw.lifo_reclaimed
12676           ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot
12677           : reused_gc_slot;
12678   rc = MDBX_SUCCESS;
12679   mdbx_tassert(txn,
12680                mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
12681                                      txn->mt_next_pgno - MDBX_ENABLE_REFUND));
12682   mdbx_tassert(txn, mdbx_dirtylist_check(txn));
12683   if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) {
12684     MDBX_val key, data;
12685     key.iov_len = data.iov_len = 0; /* avoid MSVC warning */
12686     key.iov_base = data.iov_base = NULL;
12687 
12688     const unsigned amount = MDBX_PNL_SIZE(txn->tw.reclaimed_pglist);
12689     unsigned left = amount;
12690     if (txn->tw.lifo_reclaimed == nullptr) {
12691       mdbx_tassert(txn, lifo == 0);
12692       rc = mdbx_cursor_first(&couple.outer, &key, &data);
12693       if (unlikely(rc != MDBX_SUCCESS))
12694         goto bailout;
12695     } else {
12696       mdbx_tassert(txn, lifo != 0);
12697     }
12698 
12699     while (true) {
12700       txnid_t fill_gc_id;
12701       mdbx_trace("%s: left %u of %u", dbg_prefix_mode, left,
12702                  (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist));
12703       if (txn->tw.lifo_reclaimed == nullptr) {
12704         mdbx_tassert(txn, lifo == 0);
12705         fill_gc_id = unaligned_peek_u64(4, key.iov_base);
12706         if (filled_gc_slot-- == 0 || fill_gc_id > txn->tw.last_reclaimed) {
12707           mdbx_notice(
12708               "** restart: reserve depleted (filled_slot %u, fill_id %" PRIaTXN
12709               " > last_reclaimed %" PRIaTXN,
12710               filled_gc_slot, fill_gc_id, txn->tw.last_reclaimed);
12711           goto retry;
12712         }
12713       } else {
12714         mdbx_tassert(txn, lifo != 0);
12715         if (++filled_gc_slot >
12716             (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) {
12717           mdbx_notice("** restart: reserve depleted (filled_gc_slot %u > "
12718                       "lifo_reclaimed %u" PRIaTXN,
12719                       filled_gc_slot,
12720                       (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed));
12721           goto retry;
12722         }
12723         fill_gc_id = txn->tw.lifo_reclaimed[filled_gc_slot];
12724         mdbx_trace("%s.seek-reservation @%" PRIaTXN " at lifo_reclaimed[%u]",
12725                    dbg_prefix_mode, fill_gc_id, filled_gc_slot);
12726         key.iov_base = &fill_gc_id;
12727         key.iov_len = sizeof(fill_gc_id);
12728         rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_SET_KEY);
12729         if (unlikely(rc != MDBX_SUCCESS))
12730           goto bailout;
12731       }
12732       mdbx_tassert(txn, cleaned_gc_slot ==
12733                             (txn->tw.lifo_reclaimed
12734                                  ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
12735                                  : 0));
12736       mdbx_tassert(txn, fill_gc_id > 0 &&
12737                             fill_gc_id < env->me_lck->mti_oldest_reader.weak);
12738       key.iov_base = &fill_gc_id;
12739       key.iov_len = sizeof(fill_gc_id);
12740 
12741       mdbx_tassert(txn, data.iov_len >= sizeof(pgno_t) * 2);
12742       couple.outer.mc_flags |= C_GCFREEZE;
12743       unsigned chunk = (unsigned)(data.iov_len / sizeof(pgno_t)) - 1;
12744       if (unlikely(chunk > left)) {
12745         mdbx_trace("%s: chunk %u > left %u, @%" PRIaTXN, dbg_prefix_mode, chunk,
12746                    left, fill_gc_id);
12747         if ((loop < 5 && chunk - left > loop / 2) ||
12748             chunk - left > env->me_maxgc_ov1page) {
12749           data.iov_len = (left + 1) * sizeof(pgno_t);
12750           if (loop < 7)
12751             couple.outer.mc_flags &= ~C_GCFREEZE;
12752         }
12753         chunk = left;
12754       }
12755       rc = mdbx_cursor_put(&couple.outer, &key, &data,
12756                            MDBX_CURRENT | MDBX_RESERVE);
12757       couple.outer.mc_flags &= ~C_GCFREEZE;
12758       if (unlikely(rc != MDBX_SUCCESS))
12759         goto bailout;
12760       clean_reserved_gc_pnl(env, data);
12761 
12762       if (unlikely(txn->tw.loose_count ||
12763                    amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) {
12764         mdbx_notice("** restart: reclaimed-list growth (%u -> %u, loose +%u)",
12765                     amount, MDBX_PNL_SIZE(txn->tw.reclaimed_pglist),
12766                     txn->tw.loose_count);
12767         goto retry;
12768       }
12769       if (unlikely(txn->tw.lifo_reclaimed
12770                        ? cleaned_gc_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
12771                        : cleaned_gc_id < txn->tw.last_reclaimed)) {
12772         mdbx_notice("%s", "** restart: reclaimed-slots changed");
12773         goto retry;
12774       }
12775       if (unlikely(retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages))) {
12776         mdbx_tassert(txn,
12777                      retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages));
12778         mdbx_notice("** restart: retired-list growth (%u -> %u)",
12779                     retired_stored, MDBX_PNL_SIZE(txn->tw.retired_pages));
12780         goto retry;
12781       }
12782 
12783       pgno_t *dst = data.iov_base;
12784       *dst++ = chunk;
12785       pgno_t *src = MDBX_PNL_BEGIN(txn->tw.reclaimed_pglist) + left - chunk;
12786       memcpy(dst, src, chunk * sizeof(pgno_t));
12787       pgno_t *from = src, *to = src + chunk;
12788       mdbx_trace("%s.fill: %u [ %u:%" PRIaPGNO "...%u:%" PRIaPGNO
12789                  "] @%" PRIaTXN,
12790                  dbg_prefix_mode, chunk,
12791                  (unsigned)(from - txn->tw.reclaimed_pglist), from[0],
12792                  (unsigned)(to - txn->tw.reclaimed_pglist), to[-1], fill_gc_id);
12793 
12794       left -= chunk;
12795       if (mdbx_audit_enabled()) {
12796         rc = mdbx_audit_ex(txn, retired_stored + amount - left, true);
12797         if (unlikely(rc != MDBX_SUCCESS))
12798           goto bailout;
12799       }
12800       if (left == 0) {
12801         rc = MDBX_SUCCESS;
12802         break;
12803       }
12804 
12805       if (txn->tw.lifo_reclaimed == nullptr) {
12806         mdbx_tassert(txn, lifo == 0);
12807         rc = mdbx_cursor_next(&couple.outer, &key, &data, MDBX_NEXT);
12808         if (unlikely(rc != MDBX_SUCCESS))
12809           goto bailout;
12810       } else {
12811         mdbx_tassert(txn, lifo != 0);
12812       }
12813     }
12814   }
12815 
12816   mdbx_tassert(txn, rc == MDBX_SUCCESS);
12817   if (unlikely(txn->tw.loose_count != 0)) {
12818     mdbx_notice("** restart: got %u loose pages", txn->tw.loose_count);
12819     goto retry;
12820   }
12821   if (unlikely(filled_gc_slot !=
12822                (txn->tw.lifo_reclaimed
12823                     ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
12824                     : 0))) {
12825 
12826     const bool will_retry = loop < 9;
12827     mdbx_notice("** %s: reserve excess (filled-slot %u, loop %u)",
12828                 will_retry ? "restart" : "ignore", filled_gc_slot, loop);
12829     if (will_retry)
12830       goto retry;
12831   }
12832 
12833   mdbx_tassert(txn,
12834                txn->tw.lifo_reclaimed == NULL ||
12835                    cleaned_gc_slot == MDBX_PNL_SIZE(txn->tw.lifo_reclaimed));
12836 
12837 bailout:
12838   txn->tw.cursors[FREE_DBI] = couple.outer.mc_next;
12839 
12840 bailout_notracking:
12841   MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) = 0;
12842   mdbx_trace("<<< %u loops, rc = %d", loop, rc);
12843   return rc;
12844 }
12845 
/* Write out the transaction's dirty pages and compact the dirty list:
 * written entries are dropped, loose pages (and, on error, the unprocessed
 * tail) are retained. Returns MDBX_SUCCESS or an I/O error code. */
static int mdbx_txn_write(MDBX_txn *txn, struct mdbx_iov_ctx *ctx) {
  /* In WRITEMAP mode the list order doesn't matter; otherwise sort so the
   * pages are submitted in pgno order. */
  MDBX_dpl *const dl =
      (txn->mt_flags & MDBX_WRITEMAP) ? txn->tw.dirtylist : mdbx_dpl_sort(txn);
  int rc = MDBX_SUCCESS;
  unsigned keep = 0; /* count of entries retained in the compacted list */
  unsigned scan = 1; /* dirty-list items are 1-based */
  while (scan <= dl->length) {
    MDBX_page *const dp = dl->items[scan].ptr;
    if ((dp->mp_flags & P_LOOSE) == 0) {
      /* queue this entry's page run for writing */
      rc = iov_page(txn, ctx, dp, dpl_npages(dl, scan));
      if (unlikely(rc != MDBX_SUCCESS))
        break;
    } else {
      /* loose pages are not written; keep them in the list */
      dl->items[++keep] = dl->items[scan];
    }
    ++scan;
  }

  /* flush whatever is still pending in the gather buffer */
  if (ctx->iov_items)
    rc = mdbx_iov_write(txn, ctx);

  /* after an early break, preserve the not-yet-processed tail */
  while (scan <= dl->length)
    dl->items[++keep] = dl->items[scan++];

  dl->sorted = dpl_setlen(dl, keep);
  /* each dropped (written) entry releases one slot of dirty-room */
  txn->tw.dirtyroom += scan - 1 - keep;
  mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
                        (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
                                        : txn->mt_env->me_options.dp_limit));
  return rc;
}
12876 
12877 /* Check txn and dbi arguments to a function */
check_dbi(MDBX_txn * txn,MDBX_dbi dbi,unsigned validity)12878 static __always_inline bool check_dbi(MDBX_txn *txn, MDBX_dbi dbi,
12879                                       unsigned validity) {
12880   if (likely(dbi < txn->mt_numdbs))
12881     return likely((txn->mt_dbistate[dbi] & validity) &&
12882                   !TXN_DBI_CHANGED(txn, dbi) &&
12883                   (txn->mt_dbxs[dbi].md_name.iov_base || dbi < CORE_DBS));
12884 
12885   return dbi_import(txn, dbi);
12886 }
12887 
12888 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
mdbx_txn_commit(MDBX_txn * txn)12889 int mdbx_txn_commit(MDBX_txn *txn) { return __inline_mdbx_txn_commit(txn); }
12890 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
12891 
12892 /* Merge child txn into parent */
mdbx_txn_merge(MDBX_txn * const parent,MDBX_txn * const txn,const unsigned parent_retired_len)12893 static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn,
12894                                     const unsigned parent_retired_len) {
12895   MDBX_dpl *const src = mdbx_dpl_sort(txn);
12896 
12897   /* Remove refunded pages from parent's dirty list */
12898   MDBX_dpl *const dst = mdbx_dpl_sort(parent);
12899   if (MDBX_ENABLE_REFUND) {
12900     unsigned n = dst->length;
12901     while (n && dst->items[n].pgno >= parent->mt_next_pgno) {
12902       if (!(txn->mt_env->me_flags & MDBX_WRITEMAP)) {
12903         MDBX_page *dp = dst->items[n].ptr;
12904         mdbx_dpage_free(txn->mt_env, dp, dpl_npages(dst, n));
12905       }
12906       --n;
12907     }
12908     parent->tw.dirtyroom += dst->sorted - n;
12909     dst->sorted = dpl_setlen(dst, n);
12910     mdbx_tassert(parent,
12911                  parent->tw.dirtyroom + parent->tw.dirtylist->length ==
12912                      (parent->mt_parent ? parent->mt_parent->tw.dirtyroom
12913                                         : parent->mt_env->me_options.dp_limit));
12914   }
12915 
12916   /* Remove reclaimed pages from parent's dirty list */
12917   const MDBX_PNL reclaimed_list = parent->tw.reclaimed_pglist;
12918   mdbx_dpl_sift(parent, reclaimed_list, false);
12919 
12920   /* Move retired pages from parent's dirty & spilled list to reclaimed */
12921   unsigned r, w, d, s, l;
12922   for (r = w = parent_retired_len;
12923        ++r <= MDBX_PNL_SIZE(parent->tw.retired_pages);) {
12924     const pgno_t pgno = parent->tw.retired_pages[r];
12925     const unsigned di = mdbx_dpl_exist(parent, pgno);
12926     const unsigned si = (!di && unlikely(parent->tw.spill_pages))
12927                             ? mdbx_pnl_exist(parent->tw.spill_pages, pgno << 1)
12928                             : 0;
12929     unsigned npages;
12930     const char *kind;
12931     if (di) {
12932       MDBX_page *dp = dst->items[di].ptr;
12933       mdbx_tassert(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH |
12934                                              P_OVERFLOW | P_SPILLED)) == 0);
12935       npages = dpl_npages(dst, di);
12936       mdbx_page_wash(parent, di, dp, npages);
12937       kind = "dirty";
12938       l = 1;
12939       if (unlikely(npages > l)) {
12940         /* OVERFLOW-страница могла быть переиспользована по частям. Тогда
12941          * в retired-списке может быть только начало последовательности,
12942          * а остаток растащен по dirty, spilled и reclaimed спискам. Поэтому
12943          * переносим в reclaimed с проверкой на обрыв последовательности.
12944          * В любом случае, все осколки будут учтены и отфильтрованы, т.е. если
12945          * страница была разбита на части, то важно удалить dirty-элемент,
12946          * а все осколки будут учтены отдельно. */
12947 
12948         /* Список retired страниц не сортирован, но для ускорения сортировки
12949          * дополняется в соответствии с MDBX_PNL_ASCENDING */
12950 #if MDBX_PNL_ASCENDING
12951         const unsigned len = MDBX_PNL_SIZE(parent->tw.retired_pages);
12952         while (r < len && parent->tw.retired_pages[r + 1] == pgno + l) {
12953           ++r;
12954           if (++l == npages)
12955             break;
12956         }
12957 #else
12958         while (w > parent_retired_len &&
12959                parent->tw.retired_pages[w - 1] == pgno + l) {
12960           --w;
12961           if (++l == npages)
12962             break;
12963         }
12964 #endif
12965       }
12966     } else if (unlikely(si)) {
12967       l = npages = 1;
12968       mdbx_spill_remove(parent, si, 1);
12969       kind = "spilled";
12970     } else {
12971       parent->tw.retired_pages[++w] = pgno;
12972       continue;
12973     }
12974 
12975     mdbx_debug("reclaim retired parent's %u->%u %s page %" PRIaPGNO, npages, l,
12976                kind, pgno);
12977     int err = mdbx_pnl_insert_range(&parent->tw.reclaimed_pglist, pgno, l);
12978     mdbx_ensure(txn->mt_env, err == MDBX_SUCCESS);
12979   }
12980   MDBX_PNL_SIZE(parent->tw.retired_pages) = w;
12981 
12982   /* Filter-out parent spill list */
12983   if (parent->tw.spill_pages && MDBX_PNL_SIZE(parent->tw.spill_pages) > 0) {
12984     const MDBX_PNL sl = mdbx_spill_purge(parent);
12985     unsigned len = MDBX_PNL_SIZE(sl);
12986     if (len) {
12987       /* Remove refunded pages from parent's spill list */
12988       if (MDBX_ENABLE_REFUND &&
12989           MDBX_PNL_MOST(sl) >= (parent->mt_next_pgno << 1)) {
12990 #if MDBX_PNL_ASCENDING
12991         unsigned i = MDBX_PNL_SIZE(sl);
12992         assert(MDBX_PNL_MOST(sl) == MDBX_PNL_LAST(sl));
12993         do {
12994           if ((sl[i] & 1) == 0)
12995             mdbx_debug("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1);
12996           i -= 1;
12997         } while (i && sl[i] >= (parent->mt_next_pgno << 1));
12998         MDBX_PNL_SIZE(sl) = i;
12999 #else
13000         assert(MDBX_PNL_MOST(sl) == MDBX_PNL_FIRST(sl));
13001         unsigned i = 0;
13002         do {
13003           ++i;
13004           if ((sl[i] & 1) == 0)
13005             mdbx_debug("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1);
13006         } while (i < len && sl[i + 1] >= (parent->mt_next_pgno << 1));
13007         MDBX_PNL_SIZE(sl) = len -= i;
13008         memmove(sl + 1, sl + 1 + i, len * sizeof(sl[0]));
13009 #endif
13010       }
13011       mdbx_tassert(txn, mdbx_pnl_check4assert(sl, parent->mt_next_pgno << 1));
13012 
13013       /* Remove reclaimed pages from parent's spill list */
13014       s = MDBX_PNL_SIZE(sl), r = MDBX_PNL_SIZE(reclaimed_list);
13015       /* Scanning from end to begin */
13016       while (s && r) {
13017         if (sl[s] & 1) {
13018           --s;
13019           continue;
13020         }
13021         const pgno_t spilled_pgno = sl[s] >> 1;
13022         const pgno_t reclaimed_pgno = reclaimed_list[r];
13023         if (reclaimed_pgno != spilled_pgno) {
13024           const bool cmp = MDBX_PNL_ORDERED(spilled_pgno, reclaimed_pgno);
13025           s -= !cmp;
13026           r -= cmp;
13027         } else {
13028           mdbx_debug("remove reclaimed parent's spilled page %" PRIaPGNO,
13029                      reclaimed_pgno);
13030           mdbx_spill_remove(parent, s, 1);
13031           --s;
13032           --r;
13033         }
13034       }
13035 
13036       /* Remove anything in our dirty list from parent's spill list */
13037       /* Scanning spill list in descend order */
13038       const int step = MDBX_PNL_ASCENDING ? -1 : 1;
13039       s = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(sl) : 1;
13040       d = src->length;
13041       while (d && (MDBX_PNL_ASCENDING ? s > 0 : s <= MDBX_PNL_SIZE(sl))) {
13042         if (sl[s] & 1) {
13043           s += step;
13044           continue;
13045         }
13046         const pgno_t spilled_pgno = sl[s] >> 1;
13047         const pgno_t dirty_pgno_form = src->items[d].pgno;
13048         const unsigned npages = dpl_npages(src, d);
13049         const pgno_t dirty_pgno_to = dirty_pgno_form + npages;
13050         if (dirty_pgno_form > spilled_pgno) {
13051           --d;
13052           continue;
13053         }
13054         if (dirty_pgno_to <= spilled_pgno) {
13055           s += step;
13056           continue;
13057         }
13058 
13059         mdbx_debug("remove dirtied parent's spilled %u page %" PRIaPGNO, npages,
13060                    dirty_pgno_form);
13061         mdbx_spill_remove(parent, s, 1);
13062         s += step;
13063       }
13064 
13065       /* Squash deleted pagenums if we deleted any */
13066       mdbx_spill_purge(parent);
13067     }
13068   }
13069 
13070   /* Remove anything in our spill list from parent's dirty list */
13071   if (txn->tw.spill_pages) {
13072     mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.spill_pages,
13073                                             parent->mt_next_pgno << 1));
13074     mdbx_dpl_sift(parent, txn->tw.spill_pages, true);
13075     mdbx_tassert(parent,
13076                  parent->tw.dirtyroom + parent->tw.dirtylist->length ==
13077                      (parent->mt_parent ? parent->mt_parent->tw.dirtyroom
13078                                         : parent->mt_env->me_options.dp_limit));
13079   }
13080 
13081   /* Find length of merging our dirty list with parent's and release
13082    * filter-out pages */
13083   for (l = 0, d = dst->length, s = src->length; d > 0 && s > 0;) {
13084     MDBX_page *sp = src->items[s].ptr;
13085     mdbx_tassert(parent,
13086                  (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW |
13087                                    P_LOOSE | P_SPILLED)) == 0);
13088     const unsigned s_npages = dpl_npages(src, s);
13089     const pgno_t s_pgno = src->items[s].pgno;
13090 
13091     MDBX_page *dp = dst->items[d].ptr;
13092     mdbx_tassert(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH |
13093                                            P_OVERFLOW | P_SPILLED)) == 0);
13094     const unsigned d_npages = dpl_npages(dst, d);
13095     const pgno_t d_pgno = dst->items[d].pgno;
13096 
13097     if (d_pgno >= s_pgno + s_npages) {
13098       --d;
13099       ++l;
13100     } else if (d_pgno + d_npages <= s_pgno) {
13101       if (sp->mp_flags != P_LOOSE) {
13102         sp->mp_txnid = parent->mt_front;
13103         sp->mp_flags &= ~P_SPILLED;
13104       }
13105       --s;
13106       ++l;
13107     } else {
13108       dst->items[d--].ptr = nullptr;
13109       if ((txn->mt_flags & MDBX_WRITEMAP) == 0)
13110         mdbx_dpage_free(txn->mt_env, dp, d_npages);
13111     }
13112   }
13113   assert(dst->sorted == dst->length);
13114   mdbx_tassert(parent, dst->detent >= l + d + s);
13115   dst->sorted = l + d + s; /* the merged length */
13116 
13117   while (s > 0) {
13118     MDBX_page *sp = src->items[s].ptr;
13119     mdbx_tassert(parent,
13120                  (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW |
13121                                    P_LOOSE | P_SPILLED)) == 0);
13122     if (sp->mp_flags != P_LOOSE) {
13123       sp->mp_txnid = parent->mt_front;
13124       sp->mp_flags &= ~P_SPILLED;
13125     }
13126     --s;
13127   }
13128 
13129   /* Merge our dirty list into parent's, i.e. merge(dst, src) -> dst */
13130   if (dst->sorted >= dst->length) {
13131     /* from end to begin with dst extending */
13132     for (l = dst->sorted, s = src->length, d = dst->length; s > 0 && d > 0;) {
13133       if (unlikely(l <= d)) {
13134         /* squash to get a gap of free space for merge */
13135         for (r = w = 1; r <= d; ++r)
13136           if (dst->items[r].ptr) {
13137             if (w != r) {
13138               dst->items[w] = dst->items[r];
13139               dst->items[r].ptr = nullptr;
13140             }
13141             ++w;
13142           }
13143         mdbx_notice("squash to begin for extending-merge %u -> %u", d, w - 1);
13144         d = w - 1;
13145         continue;
13146       }
13147       assert(l > d);
13148       if (dst->items[d].ptr) {
13149         dst->items[l--] = (dst->items[d].pgno > src->items[s].pgno)
13150                               ? dst->items[d--]
13151                               : src->items[s--];
13152       } else
13153         --d;
13154     }
13155     if (s > 0) {
13156       assert(l == s);
13157       while (d > 0) {
13158         assert(dst->items[d].ptr == nullptr);
13159         --d;
13160       }
13161       do {
13162         assert(l > 0);
13163         dst->items[l--] = src->items[s--];
13164       } while (s > 0);
13165     } else {
13166       assert(l == d);
13167       while (l > 0) {
13168         assert(dst->items[l].ptr != nullptr);
13169         --l;
13170       }
13171     }
13172   } else {
13173     /* from begin to end with dst shrinking (a lot of new overflow pages) */
13174     for (l = s = d = 1; s <= src->length && d <= dst->length;) {
13175       if (unlikely(l >= d)) {
13176         /* squash to get a gap of free space for merge */
13177         for (r = w = dst->length; r >= d; --r)
13178           if (dst->items[r].ptr) {
13179             if (w != r) {
13180               dst->items[w] = dst->items[r];
13181               dst->items[r].ptr = nullptr;
13182             }
13183             --w;
13184           }
13185         mdbx_notice("squash to end for shrinking-merge %u -> %u", d, w + 1);
13186         d = w + 1;
13187         continue;
13188       }
13189       assert(l < d);
13190       if (dst->items[d].ptr) {
13191         dst->items[l++] = (dst->items[d].pgno < src->items[s].pgno)
13192                               ? dst->items[d++]
13193                               : src->items[s++];
13194       } else
13195         ++d;
13196     }
13197     if (s <= src->length) {
13198       assert(dst->sorted - l == src->length - s);
13199       while (d <= dst->length) {
13200         assert(dst->items[d].ptr == nullptr);
13201         --d;
13202       }
13203       do {
13204         assert(l <= dst->sorted);
13205         dst->items[l++] = src->items[s++];
13206       } while (s <= src->length);
13207     } else {
13208       assert(dst->sorted - l == dst->length - d);
13209       while (l <= dst->sorted) {
13210         assert(l <= d && d <= dst->length && dst->items[d].ptr);
13211         dst->items[l++] = dst->items[d++];
13212       }
13213     }
13214   }
13215   parent->tw.dirtyroom -= dst->sorted - dst->length;
13216   assert(parent->tw.dirtyroom <= parent->mt_env->me_options.dp_limit);
13217   dpl_setlen(dst, dst->sorted);
13218   parent->tw.dirtylru = txn->tw.dirtylru;
13219   mdbx_tassert(parent, mdbx_dirtylist_check(parent));
13220   mdbx_dpl_free(txn);
13221 
13222   if (txn->tw.spill_pages) {
13223     if (parent->tw.spill_pages) {
13224       /* Must not fail since space was preserved above. */
13225       mdbx_pnl_xmerge(parent->tw.spill_pages, txn->tw.spill_pages);
13226       mdbx_pnl_free(txn->tw.spill_pages);
13227     } else {
13228       parent->tw.spill_pages = txn->tw.spill_pages;
13229       parent->tw.spill_least_removed = txn->tw.spill_least_removed;
13230     }
13231     mdbx_tassert(parent, mdbx_dirtylist_check(parent));
13232   }
13233 
13234   parent->mt_flags &= ~MDBX_TXN_HAS_CHILD;
13235   if (parent->tw.spill_pages) {
13236     assert(mdbx_pnl_check4assert(parent->tw.spill_pages,
13237                                  parent->mt_next_pgno << 1));
13238     if (MDBX_PNL_SIZE(parent->tw.spill_pages))
13239       parent->mt_flags |= MDBX_TXN_SPILLS;
13240   }
13241 }
13242 
/* Commits the transaction and optionally reports per-stage timings.
 *
 * txn     - the transaction to commit; on failure it is aborted (see `fail:`).
 * latency - if non-NULL, receives stage durations converted via
 *           mdbx_osal_monotime_to_16dot16() (16.16 fixed-point, per the
 *           helper's name); ts_0..ts_4 bracket the stages:
 *           preparation -> gc -> write -> sync -> ending.
 *
 * Three main outcomes:
 *  - read-only or "wrote nothing" txn: finished via mdbx_txn_end() with a
 *    pure-commit end_mode;
 *  - nested txn: its dirty/spill/retired/reclaimed page lists and DB state
 *    are merged into the parent in memory, nothing is written to disk;
 *  - top-level write txn: sub-DB roots are stored into MainDB, GC is
 *    updated, dirty pages are written, and a new meta-page is published
 *    via mdbx_sync_locked().
 *
 * Returns MDBX_SUCCESS, MDBX_RESULT_TRUE (txn was in the error state and
 * has been aborted), or an error code. */
int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
  STATIC_ASSERT(MDBX_TXN_FINISHED ==
                MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR);
  const uint64_t ts_0 = latency ? mdbx_osal_monotime() : 0;
  uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0;
  uint32_t audit_duration = 0;

  /* Reject finished/blocked transactions up front. */
  int rc = check_txn(txn, MDBX_TXN_FINISHED);
  if (unlikely(rc != MDBX_SUCCESS))
    goto provide_latency;

  /* A txn marked with MDBX_TXN_ERROR cannot be committed: abort it and
   * report MDBX_RESULT_TRUE to distinguish from a plain failure. */
  if (unlikely(txn->mt_flags & MDBX_TXN_ERROR)) {
    rc = MDBX_RESULT_TRUE;
    goto fail;
  }

  MDBX_env *env = txn->mt_env;
#if MDBX_ENV_CHECKPID
  /* Guard against use of the env after fork(): a PID mismatch is fatal. */
  if (unlikely(env->me_pid != mdbx_getpid())) {
    env->me_flags |= MDBX_FATAL_ERROR;
    rc = MDBX_PANIC;
    goto provide_latency;
  }
#endif /* MDBX_ENV_CHECKPID */

  /* mdbx_txn_end() mode for a commit which writes nothing */
  unsigned end_mode =
      MDBX_END_PURE_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE;
  if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)))
    goto done;

  /* Commit any pending child first; its changes fold into this txn. */
  if (txn->mt_child) {
    rc = mdbx_txn_commit_ex(txn->mt_child, NULL);
    mdbx_tassert(txn, txn->mt_child == NULL);
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;
  }

  /* Only the env's current write transaction may be committed. */
  if (unlikely(txn != env->me_txn)) {
    mdbx_debug("%s", "attempt to commit unknown transaction");
    rc = MDBX_EINVAL;
    goto fail;
  }

  /* Nested transaction: merge everything into the parent in memory. */
  if (txn->mt_parent) {
    mdbx_tassert(txn, mdbx_audit_ex(txn, 0, false) == 0);
    mdbx_assert(env, txn != env->me_txn0);
    MDBX_txn *const parent = txn->mt_parent;
    mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE);
    mdbx_assert(env, parent->mt_child == txn &&
                         (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0);
    mdbx_assert(env, mdbx_dirtylist_check(txn));

    /* Pure nested txn (dirtied nothing, same DB set as parent): verify the
     * parent already reflects this txn's state, then finish cheaply. */
    if (txn->tw.dirtylist->length == 0 && !(txn->mt_flags & MDBX_TXN_DIRTY) &&
        parent->mt_numdbs == txn->mt_numdbs) {
      for (int i = txn->mt_numdbs; --i >= 0;) {
        mdbx_tassert(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0);
        if ((txn->mt_dbistate[i] & DBI_STALE) &&
            !(parent->mt_dbistate[i] & DBI_STALE))
          mdbx_tassert(txn, memcmp(&parent->mt_dbs[i], &txn->mt_dbs[i],
                                   sizeof(MDBX_db)) == 0);
      }

      mdbx_tassert(txn, memcmp(&parent->mt_geo, &txn->mt_geo,
                               sizeof(parent->mt_geo)) == 0);
      mdbx_tassert(txn, memcmp(&parent->mt_canary, &txn->mt_canary,
                               sizeof(parent->mt_canary)) == 0);
      mdbx_tassert(txn, !txn->tw.spill_pages ||
                            MDBX_PNL_SIZE(txn->tw.spill_pages) == 0);
      mdbx_tassert(txn, txn->tw.loose_count == 0);

      /* fast completion of pure nested transaction */
      end_mode = MDBX_END_PURE_COMMIT | MDBX_END_SLOT | MDBX_END_FREE;
      goto done;
    }

    /* Preserve space for spill list to avoid parent's state corruption
     * if allocation fails. */
    /* NOTE(review): while a child is active, parent->tw.retired_pages
     * apparently holds the parent's retired-list length stashed as a
     * pointer-sized integer (hence the double cast) — confirm against the
     * nested-txn begin code. */
    const unsigned parent_retired_len =
        (unsigned)(uintptr_t)parent->tw.retired_pages;
    mdbx_tassert(txn,
                 parent_retired_len <= MDBX_PNL_SIZE(txn->tw.retired_pages));
    const unsigned retired_delta =
        MDBX_PNL_SIZE(txn->tw.retired_pages) - parent_retired_len;
    if (retired_delta) {
      rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, retired_delta);
      if (unlikely(rc != MDBX_SUCCESS))
        goto fail;
    }

    if (txn->tw.spill_pages) {
      if (parent->tw.spill_pages) {
        /* Pre-grow parent's spill list so the later merge cannot fail. */
        rc = mdbx_pnl_need(&parent->tw.spill_pages,
                           MDBX_PNL_SIZE(txn->tw.spill_pages));
        if (unlikely(rc != MDBX_SUCCESS))
          goto fail;
      }
      mdbx_spill_purge(txn);
    }

    /* Pre-reserve room in parent's dirty-page list for the merge; after
     * this point all remaining steps are allocation-free. */
    if (unlikely(txn->tw.dirtylist->length + parent->tw.dirtylist->length >
                     parent->tw.dirtylist->detent &&
                 !mdbx_dpl_reserve(parent, txn->tw.dirtylist->length +
                                               parent->tw.dirtylist->length))) {
      rc = MDBX_ENOMEM;
      goto fail;
    }

    //-------------------------------------------------------------------------

    /* Hand over page lists wholesale; NULL-out the child's pointers so
     * mdbx_txn_end()/free below won't release them. */
    parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed;
    txn->tw.lifo_reclaimed = NULL;

    parent->tw.retired_pages = txn->tw.retired_pages;
    txn->tw.retired_pages = NULL;

    mdbx_pnl_free(parent->tw.reclaimed_pglist);
    parent->tw.reclaimed_pglist = txn->tw.reclaimed_pglist;
    txn->tw.reclaimed_pglist = NULL;
    parent->tw.last_reclaimed = txn->tw.last_reclaimed;

    parent->mt_geo = txn->mt_geo;
    parent->mt_canary = txn->mt_canary;
    parent->mt_flags |= txn->mt_flags & MDBX_TXN_DIRTY;

    /* Move loose pages to parent */
#if MDBX_ENABLE_REFUND
    parent->tw.loose_refund_wl = txn->tw.loose_refund_wl;
#endif /* MDBX_ENABLE_REFUND */
    parent->tw.loose_count = txn->tw.loose_count;
    parent->tw.loose_pages = txn->tw.loose_pages;

    /* Merge our cursors into parent's and close them */
    mdbx_cursors_eot(txn, true);
    end_mode |= MDBX_END_EOTDONE;

    /* Update parent's DBs array */
    memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db));
    parent->mt_numdbs = txn->mt_numdbs;
    parent->mt_dbistate[FREE_DBI] = txn->mt_dbistate[FREE_DBI];
    parent->mt_dbistate[MAIN_DBI] = txn->mt_dbistate[MAIN_DBI];
    for (unsigned i = CORE_DBS; i < txn->mt_numdbs; i++) {
      /* preserve parent's status */
      const uint8_t state =
          txn->mt_dbistate[i] |
          (parent->mt_dbistate[i] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY));
      mdbx_debug("db %u dbi-state %s 0x%02x -> 0x%02x", i,
                 (parent->mt_dbistate[i] != state) ? "update" : "still",
                 parent->mt_dbistate[i], state);
      parent->mt_dbistate[i] = state;
    }

    /* For a nested commit the dirty-list merge is accounted as the "gc"
     * stage of the reported latency. */
    ts_1 = latency ? mdbx_osal_monotime() : 0;
    mdbx_txn_merge(parent, txn, parent_retired_len);
    ts_2 = latency ? mdbx_osal_monotime() : 0;
    env->me_txn = parent;
    parent->mt_child = NULL;
    mdbx_tassert(parent, mdbx_dirtylist_check(parent));

#if MDBX_ENABLE_REFUND
    mdbx_refund(parent);
    if (mdbx_assert_enabled()) {
      /* Check parent's loose pages not suitable for refund */
      for (MDBX_page *lp = parent->tw.loose_pages; lp; lp = lp->mp_next)
        mdbx_tassert(parent, lp->mp_pgno < parent->tw.loose_refund_wl &&
                                 lp->mp_pgno + 1 < parent->mt_next_pgno);
      /* Check parent's reclaimed pages not suitable for refund */
      if (MDBX_PNL_SIZE(parent->tw.reclaimed_pglist))
        mdbx_tassert(parent, MDBX_PNL_MOST(parent->tw.reclaimed_pglist) + 1 <
                                 parent->mt_next_pgno);
    }
#endif /* MDBX_ENABLE_REFUND */

    /* The child txn object itself is freed here (it was heap-allocated);
     * invalidate its signature first to catch stale uses. */
    ts_4 = ts_3 = latency ? mdbx_osal_monotime() : 0;
    txn->mt_signature = 0;
    mdbx_free(txn);
    mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0);
    rc = MDBX_SUCCESS;
    goto provide_latency;
  }

  /* Top-level commit path. */
  mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
                        (txn->mt_parent ? txn->mt_parent->tw.dirtyroom
                                        : txn->mt_env->me_options.dp_limit));
  mdbx_cursors_eot(txn, false);
  end_mode |= MDBX_END_EOTDONE;

  /* Nothing was dirtied or spilled: finish without touching the disk. */
  if (txn->tw.dirtylist->length == 0 &&
      (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) {
    for (int i = txn->mt_numdbs; --i >= 0;)
      mdbx_tassert(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0);
    rc = MDBX_SUCCESS;
    goto done;
  }

  mdbx_debug("committing txn %" PRIaTXN " %p on mdbenv %p, root page %" PRIaPGNO
             "/%" PRIaPGNO,
             txn->mt_txnid, (void *)txn, (void *)env,
             txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root);

  /* Update DB root pointers */
  if (txn->mt_numdbs > CORE_DBS) {
    MDBX_cursor_couple couple;
    MDBX_val data;
    data.iov_len = sizeof(MDBX_db);

    rc = mdbx_cursor_init(&couple.outer, txn, MAIN_DBI);
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;
    for (MDBX_dbi i = CORE_DBS; i < txn->mt_numdbs; i++) {
      if (txn->mt_dbistate[i] & DBI_DIRTY) {
        /* The DBI handle must still match the one this txn opened. */
        if (unlikely(TXN_DBI_CHANGED(txn, i))) {
          rc = MDBX_BAD_DBI;
          goto fail;
        }
        MDBX_db *db = &txn->mt_dbs[i];
        mdbx_debug("update main's entry for sub-db %u, mod_txnid %" PRIaTXN
                   " -> %" PRIaTXN,
                   i, db->md_mod_txnid, txn->mt_txnid);
        db->md_mod_txnid = txn->mt_txnid;
        data.iov_base = db;
        WITH_CURSOR_TRACKING(couple.outer,
                             rc = mdbx_cursor_put(&couple.outer,
                                                  &txn->mt_dbxs[i].md_name,
                                                  &data, F_SUBDATA));
        if (unlikely(rc != MDBX_SUCCESS))
          goto fail;
      }
    }
  }

  /* GC stage: return retired/reclaimed pages to the GC (FreeDB). */
  ts_1 = latency ? mdbx_osal_monotime() : 0;
  rc = mdbx_update_gc(txn);
  if (unlikely(rc != MDBX_SUCCESS))
    goto fail;

  ts_2 = latency ? mdbx_osal_monotime() : 0;
  if (mdbx_audit_enabled()) {
    rc = mdbx_audit_ex(txn, MDBX_PNL_SIZE(txn->tw.retired_pages), true);
    /* Audit time is reported separately and excluded from the gc stage. */
    const uint64_t audit_end = mdbx_osal_monotime();
    audit_duration = mdbx_osal_monotime_to_16dot16(audit_end - ts_2);
    ts_2 = audit_end;
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;
  }

  /* Write stage: flush dirty pages to the DB file. */
  struct mdbx_iov_ctx ctx;
  mdbx_iov_init(txn, &ctx);
  rc = mdbx_txn_write(txn, &ctx);
  if (likely(rc == MDBX_SUCCESS))
    mdbx_iov_done(txn, &ctx);
  /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */
  ts_3 = latency ? mdbx_osal_monotime() : 0;

  if (likely(rc == MDBX_SUCCESS)) {

    /* Build the new meta from the current head plus this txn's state,
     * then publish it (sync stage). Note: mm_pages_retired is cumulative. */
    MDBX_meta meta, *head = mdbx_meta_head(env);
    memcpy(meta.mm_magic_and_version, head->mm_magic_and_version, 8);
    meta.mm_extra_flags = head->mm_extra_flags;
    meta.mm_validator_id = head->mm_validator_id;
    meta.mm_extra_pagehdr = head->mm_extra_pagehdr;
    unaligned_poke_u64(4, meta.mm_pages_retired,
                       unaligned_peek_u64(4, head->mm_pages_retired) +
                           MDBX_PNL_SIZE(txn->tw.retired_pages));

    meta.mm_geo = txn->mt_geo;
    meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
    meta.mm_dbs[FREE_DBI].md_mod_txnid =
        (txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)
            ? txn->mt_txnid
            : txn->mt_dbs[FREE_DBI].md_mod_txnid;
    meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
    meta.mm_dbs[MAIN_DBI].md_mod_txnid =
        (txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY)
            ? txn->mt_txnid
            : txn->mt_dbs[MAIN_DBI].md_mod_txnid;
    meta.mm_canary = txn->mt_canary;
    mdbx_meta_set_txnid(env, &meta, txn->mt_txnid);

    rc = mdbx_sync_locked(
        env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, &meta);
  }
  ts_4 = latency ? mdbx_osal_monotime() : 0;
  /* A failure after pages were written may leave the DB inconsistent:
   * poison the env so no further write txns are started. */
  if (unlikely(rc != MDBX_SUCCESS)) {
    env->me_flags |= MDBX_FATAL_ERROR;
    goto fail;
  }

  end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE;

done:
  rc = mdbx_txn_end(txn, end_mode);

provide_latency:
  /* Zero timestamps mark stages that were skipped on this path. */
  if (latency) {
    latency->audit = audit_duration;
    latency->preparation =
        ts_1 ? mdbx_osal_monotime_to_16dot16(ts_1 - ts_0) : 0;
    latency->gc =
        (ts_1 && ts_2) ? mdbx_osal_monotime_to_16dot16(ts_2 - ts_1) : 0;
    latency->write =
        (ts_2 && ts_3) ? mdbx_osal_monotime_to_16dot16(ts_3 - ts_2) : 0;
    latency->sync =
        (ts_3 && ts_4) ? mdbx_osal_monotime_to_16dot16(ts_4 - ts_3) : 0;
    const uint64_t ts_5 = mdbx_osal_monotime();
    latency->ending = ts_4 ? mdbx_osal_monotime_to_16dot16(ts_5 - ts_4) : 0;
    latency->whole = mdbx_osal_monotime_to_16dot16(ts_5 - ts_0);
  }
  return rc;

fail:
  mdbx_txn_abort(txn);
  goto provide_latency;
}
13557 
/* Validates one meta-page candidate during DB open/recovery.
 *
 * meta        - the meta to check; may be adjusted IN PLACE: out-of-range
 *               geo.lower/geo.upper/geo.now are silently clamped with a
 *               warning ("will be corrected on next commit(s)").
 * page        - the page header the meta was read from.
 * meta_number - expected page number of this meta (0..NUM_METAS-1).
 * guess_pagesize - optional in/out page-size guess; updated to
 *               meta->mm_psize once a meta with a valid size is seen.
 *
 * Returns:
 *   MDBX_SUCCESS          - meta is usable;
 *   MDBX_RESULT_TRUE      - meta should be skipped (torn/weak/bad txnid),
 *                           caller may try another one;
 *   MDBX_INVALID / MDBX_VERSION_MISMATCH / MDBX_CORRUPTED / MDBX_TOO_LARGE
 *                         - hard rejection with the specific reason. */
static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta,
                              const MDBX_page *const page,
                              const unsigned meta_number,
                              unsigned *guess_pagesize) {
  const uint64_t magic_and_version =
      unaligned_peek_u64(4, &meta->mm_magic_and_version);
  if (unlikely(magic_and_version != MDBX_DATA_MAGIC &&
               magic_and_version != MDBX_DATA_MAGIC_LEGACY_COMPAT &&
               magic_and_version != MDBX_DATA_MAGIC_LEGACY_DEVEL)) {
    mdbx_error("meta[%u] has invalid magic/version %" PRIx64, meta_number,
               magic_and_version);
    /* Wrong magic means "not an MDBX file"; right magic with a wrong
     * version byte means a format mismatch. */
    return ((magic_and_version >> 8) != MDBX_MAGIC) ? MDBX_INVALID
                                                    : MDBX_VERSION_MISMATCH;
  }

  if (unlikely(page->mp_pgno != meta_number)) {
    mdbx_error("meta[%u] has invalid pageno %" PRIaPGNO, meta_number,
               page->mp_pgno);
    return MDBX_INVALID;
  }

  if (unlikely(page->mp_flags != P_META)) {
    mdbx_error("page #%u not a meta-page", meta_number);
    return MDBX_INVALID;
  }

  /* LY: check pagesize */
  if (unlikely(!is_powerof2(meta->mm_psize) || meta->mm_psize < MIN_PAGESIZE ||
               meta->mm_psize > MAX_PAGESIZE)) {
    mdbx_warning("meta[%u] has invalid pagesize (%u), skip it", meta_number,
                 meta->mm_psize);
    /* A power-of-2 size outside the supported range suggests another
     * build/version wrote it; a non-power-of-2 is plain garbage. */
    return is_powerof2(meta->mm_psize) ? MDBX_VERSION_MISMATCH : MDBX_INVALID;
  }

  if (guess_pagesize && *guess_pagesize != meta->mm_psize) {
    *guess_pagesize = meta->mm_psize;
    mdbx_verbose("meta[%u] took pagesize %u", meta_number, meta->mm_psize);
  }

  /* The txnid is stored twice; a mismatch indicates a torn/in-progress
   * update, so this meta is skippable rather than fatal. */
  const txnid_t txnid = unaligned_peek_u64(4, &meta->mm_txnid_a);
  if (unlikely(txnid != unaligned_peek_u64(4, &meta->mm_txnid_b))) {
    mdbx_warning("meta[%u] not completely updated, skip it", meta_number);
    return MDBX_RESULT_TRUE;
  }

  /* LY: check signature as a checksum */
  if (META_IS_STEADY(meta) &&
      unlikely(unaligned_peek_u64(4, &meta->mm_datasync_sign) !=
               mdbx_meta_sign(meta))) {
    mdbx_warning("meta[%u] has invalid steady-checksum (0x%" PRIx64
                 " != 0x%" PRIx64 "), skip it",
                 meta_number, unaligned_peek_u64(4, &meta->mm_datasync_sign),
                 mdbx_meta_sign(meta));
    return MDBX_RESULT_TRUE;
  }

  mdbx_debug("checking meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO
             ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO
             " +%u -%u, txn_id %" PRIaTXN ", %s",
             page->mp_pgno, meta->mm_dbs[MAIN_DBI].md_root,
             meta->mm_dbs[FREE_DBI].md_root, meta->mm_geo.lower,
             meta->mm_geo.next, meta->mm_geo.now, meta->mm_geo.upper,
             pv2pages(meta->mm_geo.grow_pv), pv2pages(meta->mm_geo.shrink_pv),
             txnid, mdbx_durable_str(meta));

  if (unlikely(txnid < MIN_TXNID || txnid > MAX_TXNID)) {
    mdbx_warning("meta[%u] has invalid txnid %" PRIaTXN ", skip it",
                 meta_number, txnid);
    return MDBX_RESULT_TRUE;
  }

  /* LY: check min-pages value */
  if (unlikely(meta->mm_geo.lower < MIN_PAGENO ||
               meta->mm_geo.lower > MAX_PAGENO)) {
    mdbx_warning("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it",
                 meta_number, meta->mm_geo.lower);
    return MDBX_INVALID;
  }

  /* LY: check max-pages value */
  if (unlikely(meta->mm_geo.upper < MIN_PAGENO ||
               meta->mm_geo.upper > MAX_PAGENO ||
               meta->mm_geo.upper < meta->mm_geo.lower)) {
    mdbx_warning("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it",
                 meta_number, meta->mm_geo.upper);
    return MDBX_INVALID;
  }

  /* LY: check last_pgno */
  if (unlikely(meta->mm_geo.next < MIN_PAGENO ||
               meta->mm_geo.next - 1 > MAX_PAGENO)) {
    mdbx_warning("meta[%u] has invalid next-pageno (%" PRIaPGNO "), skip it",
                 meta_number, meta->mm_geo.next);
    return MDBX_CORRUPTED;
  }

  /* LY: check filesize & used_bytes */
  const uint64_t used_bytes = meta->mm_geo.next * (uint64_t)meta->mm_psize;
  if (unlikely(used_bytes > env->me_dxb_mmap.filesize)) {
    /* Here could be a race with DB-shrinking performed by other process */
    int err = mdbx_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
    if (unlikely(used_bytes > env->me_dxb_mmap.filesize)) {
      mdbx_warning("meta[%u] used-bytes (%" PRIu64 ") beyond filesize (%" PRIu64
                   "), skip it",
                   meta_number, used_bytes, env->me_dxb_mmap.filesize);
      return MDBX_CORRUPTED;
    }
  }
  if (unlikely(meta->mm_geo.next - 1 > MAX_PAGENO ||
               used_bytes > MAX_MAPSIZE)) {
    mdbx_warning("meta[%u] has too large used-space (%" PRIu64 "), skip it",
                 meta_number, used_bytes);
    return MDBX_TOO_LARGE;
  }

  /* LY: check mapsize limits */
  pgno_t geo_lower = meta->mm_geo.lower;
  uint64_t mapsize_min = geo_lower * (uint64_t)meta->mm_psize;
  STATIC_ASSERT(MAX_MAPSIZE < PTRDIFF_MAX - MAX_PAGESIZE);
  STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE);
  if (unlikely(mapsize_min < MIN_MAPSIZE || mapsize_min > MAX_MAPSIZE)) {
    /* The MAX_MAPSIZE != MAX_MAPSIZE64 case arises when this build's map
     * limit (e.g. 32-bit) is smaller than the on-disk format allows: clamp
     * geo.lower instead of rejecting, since the used space still fits. */
    if (MAX_MAPSIZE != MAX_MAPSIZE64 && mapsize_min > MAX_MAPSIZE &&
        mapsize_min <= MAX_MAPSIZE64) {
      mdbx_assert(env, meta->mm_geo.next - 1 <= MAX_PAGENO &&
                           used_bytes <= MAX_MAPSIZE);
      mdbx_warning("meta[%u] has too large min-mapsize (%" PRIu64 "), "
                   "but size of used space still acceptable (%" PRIu64 ")",
                   meta_number, mapsize_min, used_bytes);
      geo_lower = (pgno_t)(mapsize_min = MAX_MAPSIZE / meta->mm_psize);
      mdbx_warning("meta[%u] consider get-%s pageno is %" PRIaPGNO
                   " instead of wrong %" PRIaPGNO
                   ", will be corrected on next commit(s)",
                   meta_number, "lower", geo_lower, meta->mm_geo.lower);
      meta->mm_geo.lower = geo_lower;
    } else {
      mdbx_warning("meta[%u] has invalid min-mapsize (%" PRIu64 "), skip it",
                   meta_number, mapsize_min);
      return MDBX_VERSION_MISMATCH;
    }
  }

  pgno_t geo_upper = meta->mm_geo.upper;
  uint64_t mapsize_max = geo_upper * (uint64_t)meta->mm_psize;
  STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE);
  if (unlikely(mapsize_max > MAX_MAPSIZE ||
               MAX_PAGENO <
                   ceil_powerof2((size_t)mapsize_max, env->me_os_psize) /
                       (size_t)meta->mm_psize)) {
    if (mapsize_max > MAX_MAPSIZE64) {
      mdbx_warning("meta[%u] has invalid max-mapsize (%" PRIu64 "), skip it",
                   meta_number, mapsize_max);
      return MDBX_VERSION_MISMATCH;
    }
    /* allow to open large DB from a 32-bit environment */
    mdbx_assert(env, meta->mm_geo.next - 1 <= MAX_PAGENO &&
                         used_bytes <= MAX_MAPSIZE);
    mdbx_warning("meta[%u] has too large max-mapsize (%" PRIu64 "), "
                 "but size of used space still acceptable (%" PRIu64 ")",
                 meta_number, mapsize_max, used_bytes);
    geo_upper = (pgno_t)(mapsize_max = MAX_MAPSIZE / meta->mm_psize);
    mdbx_warning("meta[%u] consider get-%s pageno is %" PRIaPGNO
                 " instead of wrong %" PRIaPGNO
                 ", will be corrected on next commit(s)",
                 meta_number, "upper", geo_upper, meta->mm_geo.upper);
    meta->mm_geo.upper = geo_upper;
  }

  /* LY: check and silently put mm_geo.now into [geo.lower...geo.upper].
   *
   * Copy-with-compaction by previous version of libmdbx could produce DB-file
   * less than meta.geo.lower bound, in case actual filling is low or no data
   * at all. This is not a problem as there is no damage or loss of data.
   * Therefore it is better not to consider such situation as an error, but
   * silently correct it. */
  pgno_t geo_now = meta->mm_geo.now;
  if (geo_now < geo_lower)
    geo_now = geo_lower;
  if (geo_now > geo_upper && meta->mm_geo.next <= geo_upper)
    geo_now = geo_upper;

  if (unlikely(meta->mm_geo.next > geo_now)) {
    mdbx_warning("meta[%u] next-pageno (%" PRIaPGNO
                 ") is beyond end-pgno (%" PRIaPGNO "), skip it",
                 meta_number, meta->mm_geo.next, geo_now);
    return MDBX_CORRUPTED;
  }
  if (meta->mm_geo.now != geo_now) {
    mdbx_warning("meta[%u] consider geo-%s pageno is %" PRIaPGNO
                 " instead of wrong %" PRIaPGNO
                 ", will be corrected on next commit(s)",
                 meta_number, "now", geo_now, meta->mm_geo.now);
    meta->mm_geo.now = geo_now;
  }

  /* GC */
  /* An empty GC tree (root == P_INVALID) must have all-zero counters;
   * a non-empty one must have its root inside the used page range. */
  if (meta->mm_dbs[FREE_DBI].md_root == P_INVALID) {
    if (unlikely(meta->mm_dbs[FREE_DBI].md_branch_pages ||
                 meta->mm_dbs[FREE_DBI].md_depth ||
                 meta->mm_dbs[FREE_DBI].md_entries ||
                 meta->mm_dbs[FREE_DBI].md_leaf_pages ||
                 meta->mm_dbs[FREE_DBI].md_overflow_pages)) {
      mdbx_warning("meta[%u] has false-empty GC, skip it", meta_number);
      return MDBX_CORRUPTED;
    }
  } else if (unlikely(meta->mm_dbs[FREE_DBI].md_root >= meta->mm_geo.next)) {
    mdbx_warning("meta[%u] has invalid GC-root %" PRIaPGNO ", skip it",
                 meta_number, meta->mm_dbs[FREE_DBI].md_root);
    return MDBX_CORRUPTED;
  }

  /* MainDB */
  /* Same consistency check for the main DB tree. */
  if (meta->mm_dbs[MAIN_DBI].md_root == P_INVALID) {
    if (unlikely(meta->mm_dbs[MAIN_DBI].md_branch_pages ||
                 meta->mm_dbs[MAIN_DBI].md_depth ||
                 meta->mm_dbs[MAIN_DBI].md_entries ||
                 meta->mm_dbs[MAIN_DBI].md_leaf_pages ||
                 meta->mm_dbs[MAIN_DBI].md_overflow_pages)) {
      mdbx_warning("meta[%u] has false-empty maindb", meta_number);
      return MDBX_CORRUPTED;
    }
  } else if (unlikely(meta->mm_dbs[MAIN_DBI].md_root >= meta->mm_geo.next)) {
    mdbx_warning("meta[%u] has invalid maindb-root %" PRIaPGNO ", skip it",
                 meta_number, meta->mm_dbs[MAIN_DBI].md_root);
    return MDBX_CORRUPTED;
  }

  return MDBX_SUCCESS;
}
13788 
/* Snapshots an in-map meta-page into *dest and validates the snapshot.
 * The meta's page number is derived from its byte offset within the mapped
 * DB file, so any in-place corrections made by mdbx_validate_meta() touch
 * only the copy, never the mapped page itself. */
static int mdbx_validate_meta_copy(MDBX_env *env, const MDBX_meta *meta,
                                   MDBX_meta *dest) {
  const unsigned meta_pgno = bytes2pgno(env, (uint8_t *)meta - env->me_map);
  memcpy(dest, meta, sizeof(*dest));
  return mdbx_validate_meta(env, dest, data_page(meta), meta_pgno, nullptr);
}
13796 
13797 /* Read the environment parameters of a DB environment
13798  * before mapping it into memory. */
__cold static int mdbx_read_header(MDBX_env *env, MDBX_meta *dest,
                                   const int lck_exclusive,
                                   const mdbx_mode_t mode_bits) {
  /* Scans the meta-pages via plain pread() (the file is not mapped yet),
   * latches the preferred usable one into *dest and returns MDBX_SUCCESS.
   * Returns MDBX_CORRUPTED (or the last validation error) when no meta-page
   * is acceptable, or an I/O error code from reading. */
  int rc = mdbx_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* Start from a zeroed, weak-signed stub so any valid meta will win
   * the comparison below. */
  memset(dest, 0, sizeof(MDBX_meta));
  unaligned_poke_u64(4, dest->mm_datasync_sign, MDBX_DATASIGN_WEAK);
  rc = MDBX_CORRUPTED;

  /* Read twice all meta pages so we can find the latest one. */
  unsigned loop_limit = NUM_METAS * 2;
  /* We don't know the page size on first time. So, just guess it. */
  unsigned guess_pagesize = 0;
  for (unsigned loop_count = 0; loop_count < loop_limit; ++loop_count) {
    const unsigned meta_number = loop_count % NUM_METAS;
    /* Offset of meta[meta_number]: use the validated guess once known,
     * otherwise the OS pagesize on the first pass and the env pagesize
     * on later passes. */
    const unsigned offset = (guess_pagesize             ? guess_pagesize
                             : (loop_count > NUM_METAS) ? env->me_psize
                                                        : env->me_os_psize) *
                            meta_number;

    char buffer[MIN_PAGESIZE];
    unsigned retryleft = 42;
    while (1) {
      mdbx_trace("reading meta[%d]: offset %u, bytes %u, retry-left %u",
                 meta_number, offset, MIN_PAGESIZE, retryleft);
      int err = mdbx_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset);
      if (err != MDBX_SUCCESS) {
        /* A zero-length file at the very first read is not an error while
         * creating a new database (mode_bits != 0). */
        if (err == MDBX_ENODATA && offset == 0 && loop_count == 0 &&
            env->me_dxb_mmap.filesize == 0 &&
            mode_bits /* non-zero for DB creation */ != 0)
          mdbx_notice("read meta: empty file (%d, %s)", err,
                      mdbx_strerror(err));
        else
          mdbx_error("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err,
                     mdbx_strerror(err));
        return err;
      }

      /* Read the same region again and compare, to detect a concurrent
       * in-flight update of the meta-page by a writer. */
      char again[MIN_PAGESIZE];
      err = mdbx_pread(env->me_lazy_fd, again, MIN_PAGESIZE, offset);
      if (err != MDBX_SUCCESS) {
        mdbx_error("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err,
                   mdbx_strerror(err));
        return err;
      }

      if (memcmp(buffer, again, MIN_PAGESIZE) == 0 || --retryleft == 0)
        break;

      mdbx_verbose("meta[%u] was updated, re-read it", meta_number);
    }

    if (!retryleft) {
      /* still unstable after all retries — skip this meta-page */
      mdbx_error("meta[%u] is too volatile, skip it", meta_number);
      continue;
    }

    MDBX_page *const page = (MDBX_page *)buffer;
    MDBX_meta *const meta = page_meta(page);
    rc = mdbx_validate_meta(env, meta, page, meta_number, &guess_pagesize);
    if (rc != MDBX_SUCCESS)
      continue;

    /* Latch this meta when it is preferable over the current candidate
     * (normal mode), or when it is the explicitly requested "stuck" meta. */
    if ((env->me_stuck_meta < 0)
            ? mdbx_meta_ot(meta_bootid_match(meta) ? prefer_last
                                                   : prefer_steady,
                           env, dest, meta)
            : (meta_number == (unsigned)env->me_stuck_meta)) {
      *dest = *meta;
      if (!lck_exclusive && !META_IS_STEADY(dest))
        loop_limit += 1; /* LY: should re-read to hush race with update */
      mdbx_verbose("latch meta[%u]", meta_number);
    }
  }

  if (dest->mm_psize == 0 ||
      (env->me_stuck_meta < 0 &&
       !(META_IS_STEADY(dest) ||
         meta_weak_acceptable(env, dest, lck_exclusive)))) {
    mdbx_error("%s", "no usable meta-pages, database is corrupted");
    if (rc == MDBX_SUCCESS) {
      /* TODO: try to restore the database by fully checking b-tree structure
       * for the each meta page, if the corresponding option was given */
      return MDBX_CORRUPTED;
    }
    return rc;
  }

  return MDBX_SUCCESS;
}
13891 
mdbx_meta_model(const MDBX_env * env,MDBX_page * model,unsigned num)13892 __cold static MDBX_page *mdbx_meta_model(const MDBX_env *env, MDBX_page *model,
13893                                          unsigned num) {
13894   mdbx_ensure(env, is_powerof2(env->me_psize));
13895   mdbx_ensure(env, env->me_psize >= MIN_PAGESIZE);
13896   mdbx_ensure(env, env->me_psize <= MAX_PAGESIZE);
13897   mdbx_ensure(env, env->me_dbgeo.lower >= MIN_MAPSIZE);
13898   mdbx_ensure(env, env->me_dbgeo.upper <= MAX_MAPSIZE);
13899   mdbx_ensure(env, env->me_dbgeo.now >= env->me_dbgeo.lower);
13900   mdbx_ensure(env, env->me_dbgeo.now <= env->me_dbgeo.upper);
13901 
13902   memset(model, 0, env->me_psize);
13903   model->mp_pgno = num;
13904   model->mp_flags = P_META;
13905   MDBX_meta *const model_meta = page_meta(model);
13906   unaligned_poke_u64(4, model_meta->mm_magic_and_version, MDBX_DATA_MAGIC);
13907 
13908   model_meta->mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower);
13909   model_meta->mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper);
13910   model_meta->mm_geo.grow_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.grow));
13911   model_meta->mm_geo.shrink_pv =
13912       pages2pv(bytes2pgno(env, env->me_dbgeo.shrink));
13913   model_meta->mm_geo.now = bytes2pgno(env, env->me_dbgeo.now);
13914   model_meta->mm_geo.next = NUM_METAS;
13915 
13916   mdbx_ensure(env, model_meta->mm_geo.lower >= MIN_PAGENO);
13917   mdbx_ensure(env, model_meta->mm_geo.upper <= MAX_PAGENO);
13918   mdbx_ensure(env, model_meta->mm_geo.now >= model_meta->mm_geo.lower);
13919   mdbx_ensure(env, model_meta->mm_geo.now <= model_meta->mm_geo.upper);
13920   mdbx_ensure(env, model_meta->mm_geo.next >= MIN_PAGENO);
13921   mdbx_ensure(env, model_meta->mm_geo.next <= model_meta->mm_geo.now);
13922   mdbx_ensure(env, model_meta->mm_geo.grow_pv ==
13923                        pages2pv(pv2pages(model_meta->mm_geo.grow_pv)));
13924   mdbx_ensure(env, model_meta->mm_geo.shrink_pv ==
13925                        pages2pv(pv2pages(model_meta->mm_geo.shrink_pv)));
13926 
13927   model_meta->mm_psize = env->me_psize;
13928   model_meta->mm_dbs[FREE_DBI].md_flags = MDBX_INTEGERKEY;
13929   model_meta->mm_dbs[FREE_DBI].md_root = P_INVALID;
13930   model_meta->mm_dbs[MAIN_DBI].md_root = P_INVALID;
13931   mdbx_meta_set_txnid(env, model_meta, MIN_TXNID + num);
13932   unaligned_poke_u64(4, model_meta->mm_datasync_sign,
13933                      mdbx_meta_sign(model_meta));
13934   return (MDBX_page *)((uint8_t *)model + env->me_psize);
13935 }
13936 
13937 /* Fill in most of the zeroed meta-pages for an empty database environment.
13938  * Return pointer to recently (head) meta-page. */
mdbx_init_metas(const MDBX_env * env,void * buffer)13939 __cold static MDBX_meta *mdbx_init_metas(const MDBX_env *env, void *buffer) {
13940   MDBX_page *page0 = (MDBX_page *)buffer;
13941   MDBX_page *page1 = mdbx_meta_model(env, page0, 0);
13942   MDBX_page *page2 = mdbx_meta_model(env, page1, 1);
13943   mdbx_meta_model(env, page2, 2);
13944   mdbx_assert(env, !mdbx_meta_eq(env, page_meta(page0), page_meta(page1)));
13945   mdbx_assert(env, !mdbx_meta_eq(env, page_meta(page1), page_meta(page2)));
13946   mdbx_assert(env, !mdbx_meta_eq(env, page_meta(page2), page_meta(page0)));
13947   return page_meta(page2);
13948 }
13949 
13950 #if MDBX_ENABLE_MADVISE && !(defined(_WIN32) || defined(_WIN64))
mdbx_madvise_threshold(const MDBX_env * env,const size_t largest_bytes)13951 static size_t mdbx_madvise_threshold(const MDBX_env *env,
13952                                      const size_t largest_bytes) {
13953   /* TODO: use options */
13954   const unsigned factor = 9;
13955   const size_t threshold = (largest_bytes < (65536ul << factor))
13956                                ? 65536 /* minimal threshold */
13957                            : (largest_bytes > (MEGABYTE * 4 << factor))
13958                                ? MEGABYTE * 4 /* maximal threshold */
13959                                : largest_bytes >> factor;
13960   return bytes_align2os_bytes(env, threshold);
13961 }
13962 #endif /* MDBX_ENABLE_MADVISE */
13963 
/* Persists the `pending` meta: syncs previously written data-pages as
 * required by `flags`, picks a target meta slot, updates it either through
 * the writable mapping or via pwrite(), then shrinks the datafile if
 * allowed and possible.
 * NOTE(review): the unguarded updates of the mapped meta-pages imply this
 * runs under the writer lock — confirm against callers.
 * Returns MDBX_SUCCESS, or an error code after setting MDBX_FATAL_ERROR
 * on the environment. */
static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
                            MDBX_meta *const pending) {
  mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0);
  MDBX_meta *const meta0 = METAPAGE(env, 0);
  MDBX_meta *const meta1 = METAPAGE(env, 1);
  MDBX_meta *const meta2 = METAPAGE(env, 2);
  MDBX_meta *const head = mdbx_meta_head(env);
  int rc;

  mdbx_assert(env, mdbx_meta_eq_mask(env) == 0);
  /* `pending` must be a local copy, not one of the mapped meta-pages */
  mdbx_assert(env,
              pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS));
  mdbx_assert(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0);
  mdbx_assert(env, pending->mm_geo.next <= pending->mm_geo.now);

  if (flags & MDBX_SAFE_NOSYNC) {
    /* Check auto-sync conditions */
    const pgno_t autosync_threshold =
        atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
    const uint64_t autosync_period =
        atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
    if ((autosync_threshold &&
         atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >=
             autosync_threshold) ||
        (autosync_period &&
         mdbx_osal_monotime() -
                 atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >=
             autosync_period))
      flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */
  }

  pgno_t shrink = 0;
  if (flags & MDBX_SHRINK_ALLOWED) {
    /* LY: check conditions to discard unused pages */
    const pgno_t largest_pgno = mdbx_find_largest(
        env, (head->mm_geo.next > pending->mm_geo.next) ? head->mm_geo.next
                                                        : pending->mm_geo.next);
    mdbx_assert(env, largest_pgno >= NUM_METAS);
#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
    /* poison the tail beyond the largest used page to catch stray access */
    const pgno_t edge = env->me_poison_edge;
    if (edge > largest_pgno) {
      env->me_poison_edge = largest_pgno;
      VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, largest_pgno),
                                 pgno2bytes(env, edge - largest_pgno));
      MDBX_ASAN_POISON_MEMORY_REGION(env->me_map +
                                         pgno2bytes(env, largest_pgno),
                                     pgno2bytes(env, edge - largest_pgno));
    }
#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */
#if MDBX_ENABLE_MADVISE &&                                                     \
    (defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED))
    const size_t largest_bytes = pgno2bytes(env, largest_pgno);
    /* threshold to avoid unreasonable frequent madvise() calls */
    const size_t madvise_threshold = mdbx_madvise_threshold(env, largest_bytes);
    const size_t discard_edge_bytes = bytes_align2os_bytes(
        env, ((MDBX_RDONLY &
               (env->me_lck_mmap.lck ? env->me_lck_mmap.lck->mti_envmode.weak
                                     : env->me_flags))
                  ? largest_bytes
                  : largest_bytes + madvise_threshold));
    const pgno_t discard_edge_pgno = bytes2pgno(env, discard_edge_bytes);
    const pgno_t prev_discarded_pgno =
        atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed);
    /* only advise when the discardable tail grew by at least the threshold */
    if (prev_discarded_pgno >=
        discard_edge_pgno + bytes2pgno(env, madvise_threshold)) {
      mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", prev_discarded_pgno,
                  largest_pgno);
      atomic_store32(&env->me_lck->mti_discarded_tail, discard_edge_pgno,
                     mo_Relaxed);
      const size_t prev_discarded_bytes =
          ceil_powerof2(pgno2bytes(env, prev_discarded_pgno), env->me_os_psize);
      mdbx_ensure(env, prev_discarded_bytes > discard_edge_bytes);
#if defined(MADV_DONTNEED)
      int advise = MADV_DONTNEED;
#if defined(MADV_FREE) &&                                                      \
    0 /* MADV_FREE works for only anonymous vma at the moment */
      if ((env->me_flags & MDBX_WRITEMAP) &&
          mdbx_linux_kernel_version > 0x04050000)
        advise = MADV_FREE;
#endif /* MADV_FREE */
      int err = madvise(env->me_map + discard_edge_bytes,
                        prev_discarded_bytes - discard_edge_bytes, advise)
                    ? ignore_enosys(errno)
                    : MDBX_SUCCESS;
#else
      int err = ignore_enosys(posix_madvise(
          env->me_map + discard_edge_bytes,
          prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED));
#endif
      if (unlikely(MDBX_IS_ERROR(err)))
        return err;
    }
#endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */

    /* LY: check conditions to shrink datafile */
    const pgno_t backlog_gap = 3 + pending->mm_dbs[FREE_DBI].md_depth * 3;
    pgno_t shrink_step = 0;
    if (pending->mm_geo.shrink_pv &&
        pending->mm_geo.now - pending->mm_geo.next >
            (shrink_step = pv2pages(pending->mm_geo.shrink_pv)) + backlog_gap) {
      if (pending->mm_geo.now > largest_pgno &&
          pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) {
        pgno_t grow_step = 0;
        /* align the new boundary to the growth step (or the shrink step
         * when growth is disabled), then to the OS pagesize */
        const pgno_t aligner =
            pending->mm_geo.grow_pv
                ? (grow_step = pv2pages(pending->mm_geo.grow_pv))
                : shrink_step;
        const pgno_t with_backlog_gap = largest_pgno + backlog_gap;
        const pgno_t aligned = pgno_align2os_pgno(
            env, with_backlog_gap + aligner - with_backlog_gap % aligner);
        const pgno_t bottom =
            (aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower;
        if (pending->mm_geo.now > bottom) {
          if (META_IS_STEADY(mdbx_meta_steady(env)))
            /* force steady, but only if steady-checkpoint is present */
            flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED;
          shrink = pending->mm_geo.now - bottom;
          pending->mm_geo.now = bottom;
          /* shrinking with the same txnid as the head would be ambiguous,
           * so bump the pending txnid in that case */
          if (unlikely(mdbx_meta_txnid_stable(env, head) ==
                       unaligned_peek_u64(4, pending->mm_txnid_a))) {
            const txnid_t txnid =
                safe64_txnid_next(unaligned_peek_u64(4, pending->mm_txnid_a));
            if (unlikely(txnid > MAX_TXNID)) {
              rc = MDBX_TXN_FULL;
              mdbx_error("txnid overflow, raise %d", rc);
              goto fail;
            }
            mdbx_meta_set_txnid(env, pending, txnid);
          }
        }
      }
    }
  }

  /* LY: step#1 - sync previously written/updated data-pages */
  rc = MDBX_RESULT_FALSE /* carry steady */;
  if (atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) {
    mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
    enum mdbx_syncmode_bits mode_bits = MDBX_SYNC_NONE;
    if ((flags & MDBX_SAFE_NOSYNC) == 0) {
      mode_bits = MDBX_SYNC_DATA;
      if (pending->mm_geo.next > mdbx_meta_steady(env)->mm_geo.now)
        mode_bits |= MDBX_SYNC_SIZE;
      if (flags & MDBX_NOMETASYNC)
        mode_bits |= MDBX_SYNC_IODQ;
    }
#if MDBX_ENABLE_PGOP_STAT
    env->me_lck->mti_pgop_stat.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    if (flags & MDBX_WRITEMAP)
      rc =
          mdbx_msync(&env->me_dxb_mmap, 0,
                     pgno_align2os_bytes(env, pending->mm_geo.next), mode_bits);
    else
      rc = mdbx_fsync(env->me_lazy_fd, mode_bits);
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;
    rc = (flags & MDBX_SAFE_NOSYNC) ? MDBX_RESULT_TRUE /* carry non-steady */
                                    : MDBX_RESULT_FALSE /* carry steady */;
  }

  /* Steady or Weak */
  if (rc == MDBX_RESULT_FALSE /* carry steady */) {
    atomic_store64(&env->me_lck->mti_sync_timestamp, mdbx_osal_monotime(),
                   mo_Relaxed);
    unaligned_poke_u64(4, pending->mm_datasync_sign, mdbx_meta_sign(pending));
    atomic_store32(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed);
  } else {
    assert(rc == MDBX_RESULT_TRUE /* carry non-steady */);
    unaligned_poke_u64(4, pending->mm_datasync_sign, MDBX_DATASIGN_WEAK);
  }

  /* Choose the target meta slot: re-sign the head in-place when only the
   * datasync sign changes, otherwise take the most ancient non-head slot. */
  MDBX_meta *target = nullptr;
  if (mdbx_meta_txnid_stable(env, head) ==
      unaligned_peek_u64(4, pending->mm_txnid_a)) {
    mdbx_assert(env, memcmp(&head->mm_dbs, &pending->mm_dbs,
                            sizeof(head->mm_dbs)) == 0);
    mdbx_assert(env, memcmp(&head->mm_canary, &pending->mm_canary,
                            sizeof(head->mm_canary)) == 0);
    mdbx_assert(env, memcmp(&head->mm_geo, &pending->mm_geo,
                            sizeof(pending->mm_geo)) == 0);
    if (!META_IS_STEADY(head) && META_IS_STEADY(pending))
      target = head;
    else {
      mdbx_ensure(env, mdbx_meta_eq(env, head, pending));
      mdbx_debug("%s", "skip update meta");
      return MDBX_SUCCESS;
    }
  } else if (head == meta0)
    target = mdbx_meta_ancient(prefer_steady, env, meta1, meta2);
  else if (head == meta1)
    target = mdbx_meta_ancient(prefer_steady, env, meta0, meta2);
  else {
    mdbx_assert(env, head == meta2);
    target = mdbx_meta_ancient(prefer_steady, env, meta0, meta1);
  }

  /* LY: step#2 - update meta-page. */
  mdbx_debug(
      "writing meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO
      ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO
      " +%u -%u, txn_id %" PRIaTXN ", %s",
      data_page(target)->mp_pgno, pending->mm_dbs[MAIN_DBI].md_root,
      pending->mm_dbs[FREE_DBI].md_root, pending->mm_geo.lower,
      pending->mm_geo.next, pending->mm_geo.now, pending->mm_geo.upper,
      pv2pages(pending->mm_geo.grow_pv), pv2pages(pending->mm_geo.shrink_pv),
      unaligned_peek_u64(4, pending->mm_txnid_a), mdbx_durable_str(pending));

  mdbx_debug("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO
             "/%" PRIaPGNO,
             (meta0 == head)     ? "head"
             : (meta0 == target) ? "tail"
                                 : "stay",
             mdbx_durable_str(meta0), mdbx_meta_txnid_fluid(env, meta0),
             meta0->mm_dbs[MAIN_DBI].md_root, meta0->mm_dbs[FREE_DBI].md_root);
  mdbx_debug("meta1: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO
             "/%" PRIaPGNO,
             (meta1 == head)     ? "head"
             : (meta1 == target) ? "tail"
                                 : "stay",
             mdbx_durable_str(meta1), mdbx_meta_txnid_fluid(env, meta1),
             meta1->mm_dbs[MAIN_DBI].md_root, meta1->mm_dbs[FREE_DBI].md_root);
  mdbx_debug("meta2: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO
             "/%" PRIaPGNO,
             (meta2 == head)     ? "head"
             : (meta2 == target) ? "tail"
                                 : "stay",
             mdbx_durable_str(meta2), mdbx_meta_txnid_fluid(env, meta2),
             meta2->mm_dbs[MAIN_DBI].md_root, meta2->mm_dbs[FREE_DBI].md_root);

  mdbx_assert(env, !mdbx_meta_eq(env, pending, meta0));
  mdbx_assert(env, !mdbx_meta_eq(env, pending, meta1));
  mdbx_assert(env, !mdbx_meta_eq(env, pending, meta2));

  mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0);
  mdbx_ensure(env,
              target == head || mdbx_meta_txnid_stable(env, target) <
                                    unaligned_peek_u64(4, pending->mm_txnid_a));
#if MDBX_ENABLE_PGOP_STAT
  env->me_lck->mti_pgop_stat.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
  if (flags & MDBX_WRITEMAP) {
    mdbx_jitter4testing(true);
    if (likely(target != head)) {
      /* LY: 'invalidate' the meta. */
      mdbx_meta_update_begin(env, target,
                             unaligned_peek_u64(4, pending->mm_txnid_a));
      unaligned_poke_u64(4, target->mm_datasync_sign, MDBX_DATASIGN_WEAK);
#ifndef NDEBUG
      /* debug: provoke failure to catch a violators, but don't touch mm_psize
       * to allow readers catch actual pagesize. */
      uint8_t *provoke_begin = (uint8_t *)&target->mm_dbs[FREE_DBI].md_root;
      uint8_t *provoke_end = (uint8_t *)&target->mm_datasync_sign;
      memset(provoke_begin, 0xCC, provoke_end - provoke_begin);
      mdbx_jitter4testing(false);
#endif

      /* LY: update info */
      target->mm_geo = pending->mm_geo;
      target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI];
      target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI];
      target->mm_canary = pending->mm_canary;
      memcpy(target->mm_pages_retired, pending->mm_pages_retired, 8);
      mdbx_jitter4testing(true);

      /* LY: 'commit' the meta */
      mdbx_meta_update_end(env, target,
                           unaligned_peek_u64(4, pending->mm_txnid_b));
      mdbx_jitter4testing(true);
    } else {
      /* dangerous case (target == head), only mm_datasync_sign could
       * me updated, check assertions once again */
      mdbx_ensure(env, mdbx_meta_txnid_stable(env, head) ==
                               unaligned_peek_u64(4, pending->mm_txnid_a) &&
                           !META_IS_STEADY(head) && META_IS_STEADY(pending));
      mdbx_ensure(env, memcmp(&head->mm_geo, &pending->mm_geo,
                              sizeof(head->mm_geo)) == 0);
      mdbx_ensure(env, memcmp(&head->mm_dbs, &pending->mm_dbs,
                              sizeof(head->mm_dbs)) == 0);
      mdbx_ensure(env, memcmp(&head->mm_canary, &pending->mm_canary,
                              sizeof(head->mm_canary)) == 0);
    }
    /* publish the datasync sign last, then flush CPU write-back */
    memcpy(target->mm_datasync_sign, pending->mm_datasync_sign, 8);
    mdbx_flush_incoherent_cpu_writeback();
    mdbx_jitter4testing(true);
    /* sync meta-pages */
    rc =
        mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
                   (flags & MDBX_NOMETASYNC) ? MDBX_SYNC_NONE
                                             : MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
    if (unlikely(rc != MDBX_SUCCESS))
      goto fail;
  } else {
    /* non-WRITEMAP path: write the whole meta via pwrite(), keeping a copy
     * of the old content for the undo-on-failure path below */
    const MDBX_meta undo_meta = *target;
    const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
                                     ? env->me_dsync_fd
                                     : env->me_lazy_fd;
#if MDBX_ENABLE_PGOP_STAT
    env->me_lck->mti_pgop_stat.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    rc = mdbx_pwrite(fd, pending, sizeof(MDBX_meta),
                     (uint8_t *)target - env->me_map);
    if (unlikely(rc != MDBX_SUCCESS)) {
    undo:
      mdbx_debug("%s", "write failed, disk error?");
      /* On a failure, the pagecache still contains the new data.
       * Try write some old data back, to prevent it from being used. */
      mdbx_pwrite(fd, &undo_meta, sizeof(MDBX_meta),
                  (uint8_t *)target - env->me_map);
      goto fail;
    }
    mdbx_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize);
    /* sync meta-pages */
    if ((flags & MDBX_NOMETASYNC) == 0 && fd == env->me_lazy_fd) {
      rc = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
      if (rc != MDBX_SUCCESS)
        goto undo;
    }
  }
  /* without a meta-sync the update is not yet durable, otherwise record
   * which txnid has its meta synced */
  if (flags & MDBX_NOMETASYNC)
    env->me_lck->mti_unsynced_pages.weak += 1;
  else
    env->me_lck->mti_meta_sync_txnid.weak =
        (uint32_t)unaligned_peek_u64(4, pending->mm_txnid_a);

  /* LY: shrink datafile if needed */
  if (unlikely(shrink)) {
    mdbx_verbose("shrink to %" PRIaPGNO " pages (-%" PRIaPGNO ")",
                 pending->mm_geo.now, shrink);
    rc = mdbx_mapresize_implicit(env, pending->mm_geo.next, pending->mm_geo.now,
                                 pending->mm_geo.upper);
    if (MDBX_IS_ERROR(rc))
      goto fail;
  }

  MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
  if (likely(lck))
    /* toggle oldest refresh */
    atomic_store32(&lck->mti_readers_refresh_flag, false, mo_Relaxed);

  return MDBX_SUCCESS;

fail:
  env->me_flags |= MDBX_FATAL_ERROR;
  return rc;
}
14310 
/* Recomputes the page-merge thresholds from the configured 16.16
 * fixed-point fill-percentage and the current page payload space. */
static void recalculate_merge_threshold(MDBX_env *env) {
  const unsigned bytes = page_space(env);
  const unsigned percent16dot16 =
      env->me_options.merge_threshold_16dot16_percent;
  env->me_merge_threshold = (uint16_t)(bytes - (bytes * percent16dot16 >> 16));
  /* the GC threshold is clamped: 33% above ~29% configured, else 25% */
  const unsigned gc_share =
      (percent16dot16 > 19005) ? bytes / 3 /* 33 % */ : bytes / 4 /* 25 % */;
  env->me_merge_threshold_gc = (uint16_t)(bytes - gc_share);
}
14323 
/* Installs `pagesize` into the environment and recomputes every derived
 * limit: max GC entries per overflow page, max leaf-node size, log2 of the
 * pagesize, merge thresholds, and the dirty-page list limits. */
__cold static void mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) {
  STATIC_ASSERT(PTRDIFF_MAX > MAX_MAPSIZE);
  STATIC_ASSERT(MIN_PAGESIZE > sizeof(MDBX_page) + sizeof(MDBX_meta));
  mdbx_ensure(env, is_powerof2(pagesize));
  mdbx_ensure(env, pagesize >= MIN_PAGESIZE);
  mdbx_ensure(env, pagesize <= MAX_PAGESIZE);
  env->me_psize = (unsigned)pagesize;

  /* how many GC page-numbers fit into a single overflow page */
  STATIC_ASSERT(MAX_GC1OVPAGE(MIN_PAGESIZE) > 4);
  STATIC_ASSERT(MAX_GC1OVPAGE(MAX_PAGESIZE) < MDBX_PGL_LIMIT / 4);
  const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1;
  mdbx_ensure(env, maxgc_ov1page > 42 &&
                       maxgc_ov1page < (intptr_t)MDBX_PGL_LIMIT / 4);
  env->me_maxgc_ov1page = (unsigned)maxgc_ov1page;

  STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) > sizeof(MDBX_db) + NODESIZE + 42);
  STATIC_ASSERT(LEAF_NODE_MAX(MAX_PAGESIZE) < UINT16_MAX);
  STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) >= BRANCH_NODE_MAX(MIN_PAGESIZE));
  STATIC_ASSERT(BRANCH_NODE_MAX(MAX_PAGESIZE) > NODESIZE + 42);
  STATIC_ASSERT(BRANCH_NODE_MAX(MAX_PAGESIZE) < UINT16_MAX);
  const intptr_t branch_nodemax = BRANCH_NODE_MAX(pagesize);
  const intptr_t leaf_nodemax = LEAF_NODE_MAX(pagesize);
  mdbx_ensure(env,
              branch_nodemax > (intptr_t)(NODESIZE + 42) &&
                  branch_nodemax % 2 == 0 &&
                  leaf_nodemax > (intptr_t)(sizeof(MDBX_db) + NODESIZE + 42) &&
                  leaf_nodemax >= branch_nodemax &&
                  leaf_nodemax < (int)UINT16_MAX && leaf_nodemax % 2 == 0);
  env->me_leaf_nodemax = (unsigned)leaf_nodemax;
  env->me_psize2log = (uint8_t)log2n_powerof2(pagesize);
  mdbx_assert(env, pgno2bytes(env, 1) == pagesize);
  mdbx_assert(env, bytes2pgno(env, pagesize + pagesize) == 2);
  recalculate_merge_threshold(env);

  const pgno_t max_pgno = bytes2pgno(env, MAX_MAPSIZE);
  if (!env->me_options.flags.non_auto.dp_limit) {
    /* auto-setup dp_limit by "The42" ;-) */
    intptr_t total_ram_pages, avail_ram_pages;
    int err = mdbx_get_sysraminfo(nullptr, &total_ram_pages, &avail_ram_pages);
    if (unlikely(err != MDBX_SUCCESS))
      mdbx_error("mdbx_get_sysraminfo(), rc %d", err);
    else {
      /* base the limit on RAM, rescaled from OS pages to DB pages,
       * then clamp into [CURSOR_STACK*4, MDBX_PGL_LIMIT] */
      size_t reasonable_dpl_limit =
          (size_t)(total_ram_pages + avail_ram_pages) / 42;
      if (pagesize > env->me_os_psize)
        reasonable_dpl_limit /= pagesize / env->me_os_psize;
      else if (pagesize < env->me_os_psize)
        reasonable_dpl_limit *= env->me_os_psize / pagesize;
      reasonable_dpl_limit = (reasonable_dpl_limit < MDBX_PGL_LIMIT)
                                 ? reasonable_dpl_limit
                                 : MDBX_PGL_LIMIT;
      reasonable_dpl_limit = (reasonable_dpl_limit > CURSOR_STACK * 4)
                                 ? reasonable_dpl_limit
                                 : CURSOR_STACK * 4;
      env->me_options.dp_limit = (unsigned)reasonable_dpl_limit;
    }
  }
  /* keep dp_limit/dp_initial consistent with the addressable page range */
  if (env->me_options.dp_limit > max_pgno - NUM_METAS)
    env->me_options.dp_limit = max_pgno - NUM_METAS;
  if (env->me_options.dp_initial > env->me_options.dp_limit)
    env->me_options.dp_initial = env->me_options.dp_limit;
}
14386 
14387 static __inline MDBX_CONST_FUNCTION MDBX_lockinfo *
lckless_stub(const MDBX_env * env)14388 lckless_stub(const MDBX_env *env) {
14389   uintptr_t stub = (uintptr_t)&env->x_lckless_stub;
14390   /* align to avoid false-positive alarm from UndefinedBehaviorSanitizer */
14391   stub = (stub + MDBX_CACHELINE_SIZE - 1) & ~(MDBX_CACHELINE_SIZE - 1);
14392   return (MDBX_lockinfo *)stub;
14393 }
14394 
/* Allocates and initializes a new MDBX_env handle with default options
 * and platform synchronization primitives; returns it via *penv.
 * On any failure the partially-built env is freed, *penv is set to NULL
 * and an error code is returned. */
__cold int mdbx_env_create(MDBX_env **penv) {
  MDBX_env *env = mdbx_calloc(1, sizeof(MDBX_env));
  if (unlikely(!env))
    return MDBX_ENOMEM;

  env->me_maxreaders = DEFAULT_READERS;
  env->me_maxdbs = env->me_numdbs = CORE_DBS;
  /* no files are opened yet */
  env->me_lazy_fd = INVALID_HANDLE_VALUE;
  env->me_dsync_fd = INVALID_HANDLE_VALUE;
  env->me_lfd = INVALID_HANDLE_VALUE;
  env->me_pid = mdbx_getpid();
  env->me_stuck_meta = -1; /* -1 = no meta-page is pinned for recovery */

  /* default tunables; dp_limit/dp_initial are clamped to the page range */
  env->me_options.dp_reserve_limit = 1024;
  env->me_options.rp_augment_limit = 256 * 1024;
  env->me_options.dp_limit = 64 * 1024;
  if (env->me_options.dp_limit > MAX_PAGENO - NUM_METAS)
    env->me_options.dp_limit = MAX_PAGENO - NUM_METAS;
  env->me_options.dp_initial = MDBX_PNL_INITIAL;
  if (env->me_options.dp_initial > env->me_options.dp_limit)
    env->me_options.dp_initial = env->me_options.dp_limit;
  env->me_options.spill_max_denominator = 8;
  env->me_options.spill_min_denominator = 8;
  env->me_options.spill_parent4child_denominator = 0;
  env->me_options.dp_loose_limit = 64;
  env->me_options.merge_threshold_16dot16_percent = 65536 / 4 /* 25% */;

  int rc;
  const size_t os_psize = mdbx_syspagesize();
  if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) {
    mdbx_error("unsuitable system pagesize %" PRIuPTR, os_psize);
    rc = MDBX_INCOMPATIBLE;
    goto bailout;
  }
  env->me_os_psize = (unsigned)os_psize;
  /* initial DB pagesize follows the OS pagesize, capped at MAX_PAGESIZE */
  mdbx_setup_pagesize(env, (env->me_os_psize < MAX_PAGESIZE) ? env->me_os_psize
                                                             : MAX_PAGESIZE);

  rc = mdbx_fastmutex_init(&env->me_dbi_lock);
  if (unlikely(rc != MDBX_SUCCESS))
    goto bailout;

#if defined(_WIN32) || defined(_WIN64)
  mdbx_srwlock_Init(&env->me_remap_guard);
  InitializeCriticalSection(&env->me_windowsbug_lock);
#else
  rc = mdbx_fastmutex_init(&env->me_remap_guard);
  if (unlikely(rc != MDBX_SUCCESS)) {
    mdbx_fastmutex_destroy(&env->me_dbi_lock);
    goto bailout;
  }

#if MDBX_LOCKING > MDBX_LOCKING_SYSV
  /* pre-initialize the lockless-mode stub's writer lock */
  MDBX_lockinfo *const stub = lckless_stub(env);
  rc = mdbx_ipclock_stub(&stub->mti_wlock);
#endif /* MDBX_LOCKING */
  if (unlikely(rc != MDBX_SUCCESS)) {
    mdbx_fastmutex_destroy(&env->me_remap_guard);
    mdbx_fastmutex_destroy(&env->me_dbi_lock);
    goto bailout;
  }
#endif /* Windows */

  VALGRIND_CREATE_MEMPOOL(env, 0, 0);
  env->me_signature.weak = MDBX_ME_SIGNATURE;
  *penv = env;
  return MDBX_SUCCESS;

bailout:
  mdbx_free(env);
  *penv = nullptr;
  return rc;
}
14468 
/* Suggests a reasonable upper bound for the database size based on the
 * amount of system RAM, caching the answer in *cached_result (0 = not yet
 * computed). Falls back to MAX_MAPSIZE32 when RAM info is unavailable. */
__cold static intptr_t get_reasonable_db_maxsize(intptr_t *cached_result) {
  if (*cached_result == 0) {
    intptr_t pagesize, total_ram_pages;
    if (unlikely(mdbx_get_sysraminfo(&pagesize, &total_ram_pages, nullptr) !=
                 MDBX_SUCCESS))
      return MAX_MAPSIZE32 /* the 32-bit limit is good enough for fallback */;

    /* avoid overflow: bail out to the absolute limit on huge-RAM boxes */
    if (unlikely((size_t)total_ram_pages * 2 > MAX_MAPSIZE / (size_t)pagesize))
      return MAX_MAPSIZE;
    assert(MAX_MAPSIZE >= (size_t)(total_ram_pages * pagesize * 2));

    /* Suggesting should not be more than golden ratio of the size of RAM. */
    *cached_result = (intptr_t)((size_t)total_ram_pages * 207 >> 7) * pagesize;

    /* Round to the nearest human-readable granulation. */
    for (size_t unit = MEGABYTE; unit; unit <<= 5) {
      const size_t floor = floor_powerof2(*cached_result, unit);
      const size_t ceil = ceil_powerof2(*cached_result, unit);
      /* keep rounding error within 1/16 of the value */
      const size_t threshold = (size_t)*cached_result >> 4;
      const bool down =
          *cached_result - floor < ceil - *cached_result || ceil > MAX_MAPSIZE;
      if (threshold < (down ? *cached_result - floor : ceil - *cached_result))
        break;
      *cached_result = down ? floor : ceil;
    }
  }
  return *cached_result;
}
14497 
/* Sets and/or re-negotiates the database geometry: page size, the
 * lower/now/upper bounds of the mapping size, the growth step and the
 * shrink threshold.
 *
 * Works both before the environment is mapped (parameters are validated and
 * stashed into env->me_dbgeo for the subsequent open) and for an already
 * opened environment (the mapping is resized and a new meta is persisted).
 *
 * Parameter conventions, as implemented below:
 *  - negative value: keep the current value / use the default;
 *  - zero: the minimal allowed value;
 *  - INTPTR_MAX (or >= INT_MAX for pagesize): the maximal reasonable value.
 *
 * Returns MDBX_SUCCESS or an error code (MDBX_EINVAL, MDBX_EACCESS,
 * MDBX_PANIC, MDBX_MAP_FULL, MDBX_TOO_LARGE, MDBX_TXN_FULL, ...). */
__cold LIBMDBX_API int
mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now,
                      intptr_t size_upper, intptr_t growth_step,
                      intptr_t shrink_threshold, intptr_t pagesize) {
  int rc = check_env(env, false);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* true when the calling thread already owns the basal write transaction */
  const bool inside_txn =
      (env->me_txn0 && env->me_txn0->mt_owner == mdbx_thread_self());

#if MDBX_DEBUG
  /* in debug builds replace the "default" (negative) grow/shrink values with
   * the minimal step — presumably to exercise resizing more aggressively */
  if (growth_step < 0) {
    growth_step = 1;
    if (shrink_threshold < 0)
      shrink_threshold = 1;
  }
#endif /* MDBX_DEBUG */

  intptr_t reasonable_maxsize = 0;
  bool need_unlock = false;
  if (env->me_map) {
    /* env already mapped */
    if (unlikely(env->me_flags & MDBX_RDONLY))
      return MDBX_EACCESS;

    if (!inside_txn) {
      /* take the write-txn lock so the geometry can be changed safely */
      int err = mdbx_txn_lock(env, false);
      if (unlikely(err != MDBX_SUCCESS))
        return err;
      need_unlock = true;
    }
    MDBX_meta *head = mdbx_meta_head(env);
    if (!inside_txn) {
      env->me_txn0->mt_txnid = meta_txnid(env, head, false);
      mdbx_find_oldest(env->me_txn0);
    }

    /* get untouched params from DB */
    if (pagesize <= 0 || pagesize >= INT_MAX)
      pagesize = env->me_psize;
    if (size_lower < 0)
      size_lower = pgno2bytes(env, head->mm_geo.lower);
    if (size_now < 0)
      size_now = pgno2bytes(env, head->mm_geo.now);
    if (size_upper < 0)
      size_upper = pgno2bytes(env, head->mm_geo.upper);
    if (growth_step < 0)
      growth_step = pgno2bytes(env, pv2pages(head->mm_geo.grow_pv));
    if (shrink_threshold < 0)
      shrink_threshold = pgno2bytes(env, pv2pages(head->mm_geo.shrink_pv));

    /* the page size cannot be changed once the env is mapped */
    if (pagesize != (intptr_t)env->me_psize) {
      rc = MDBX_EINVAL;
      goto bailout;
    }
    /* the new bounds must cover all pages currently in use */
    const size_t usedbytes =
        pgno2bytes(env, mdbx_find_largest(env, head->mm_geo.next));
    if ((size_t)size_upper < usedbytes) {
      rc = MDBX_MAP_FULL;
      goto bailout;
    }
    if ((size_t)size_now < usedbytes)
      size_now = usedbytes;
  } else {
    /* env NOT yet mapped */
    if (unlikely(inside_txn))
      return MDBX_PANIC;

    /* is requested some auto-value for pagesize ? */
    if (pagesize >= INT_MAX /* maximal */)
      pagesize = MAX_PAGESIZE;
    else if (pagesize <= 0) {
      if (pagesize < 0 /* default */) {
        pagesize = env->me_os_psize;
        if ((uintptr_t)pagesize > MAX_PAGESIZE)
          pagesize = MAX_PAGESIZE;
        mdbx_assert(env, (uintptr_t)pagesize >= MIN_PAGESIZE);
      } else if (pagesize == 0 /* minimal */)
        pagesize = MIN_PAGESIZE;

      /* choose pagesize */
      intptr_t max_size = (size_now > size_lower) ? size_now : size_lower;
      max_size = (size_upper > max_size) ? size_upper : max_size;
      if (max_size < 0 /* default */)
        max_size = DEFAULT_MAPSIZE;
      else if (max_size == 0 /* minimal */)
        max_size = MIN_MAPSIZE;
      else if (max_size >= (intptr_t)MAX_MAPSIZE /* maximal */)
        max_size = get_reasonable_db_maxsize(&reasonable_maxsize);

      /* double the pagesize until the requested size fits into MAX_PAGENO */
      while (max_size > pagesize * (int64_t)MAX_PAGENO &&
             pagesize < MAX_PAGESIZE)
        pagesize <<= 1;
    }
  }

  if (pagesize < (intptr_t)MIN_PAGESIZE || pagesize > (intptr_t)MAX_PAGESIZE ||
      !is_powerof2(pagesize)) {
    rc = MDBX_EINVAL;
    goto bailout;
  }

  /* resolve auto-values for the lower bound */
  if (size_lower <= 0) {
    size_lower = MIN_MAPSIZE;
    if (MIN_MAPSIZE / pagesize < MIN_PAGENO)
      size_lower = MIN_PAGENO * pagesize;
  }
  if (size_lower >= INTPTR_MAX) {
    size_lower = get_reasonable_db_maxsize(&reasonable_maxsize);
    if ((size_t)size_lower / pagesize > MAX_PAGENO)
      size_lower = pagesize * MAX_PAGENO;
  }

  /* resolve auto-values for the current ("now") size */
  if (size_now <= 0) {
    size_now = size_lower;
    if (size_upper >= size_lower && size_now > size_upper)
      size_now = size_upper;
  }
  if (size_now >= INTPTR_MAX) {
    size_now = get_reasonable_db_maxsize(&reasonable_maxsize);
    if ((size_t)size_now / pagesize > MAX_PAGENO)
      size_now = pagesize * MAX_PAGENO;
  }

  /* resolve auto-values for the upper bound */
  if (size_upper <= 0) {
    if (size_now >= get_reasonable_db_maxsize(&reasonable_maxsize) / 2)
      size_upper = get_reasonable_db_maxsize(&reasonable_maxsize);
    else if (MAX_MAPSIZE != MAX_MAPSIZE32 &&
             (size_t)size_now >= MAX_MAPSIZE32 / 2 &&
             (size_t)size_now <= MAX_MAPSIZE32 / 4 * 3)
      size_upper = MAX_MAPSIZE32;
    else {
      size_upper = size_now + size_now;
      if ((size_t)size_upper < DEFAULT_MAPSIZE * 2)
        size_upper = DEFAULT_MAPSIZE * 2;
    }
    if ((size_t)size_upper / pagesize > MAX_PAGENO)
      size_upper = pagesize * MAX_PAGENO;
  } else if (size_upper >= INTPTR_MAX) {
    size_upper = get_reasonable_db_maxsize(&reasonable_maxsize);
    if ((size_t)size_upper / pagesize > MAX_PAGENO)
      size_upper = pagesize * MAX_PAGENO;
  }

  /* validate the resolved bounds */
  if (unlikely(size_lower < (intptr_t)MIN_MAPSIZE || size_lower > size_upper)) {
    rc = MDBX_EINVAL;
    goto bailout;
  }

  if ((uint64_t)size_lower / pagesize < MIN_PAGENO) {
    rc = MDBX_EINVAL;
    goto bailout;
  }

  if (unlikely((size_t)size_upper > MAX_MAPSIZE ||
               (uint64_t)size_upper / pagesize > MAX_PAGENO)) {
    rc = MDBX_TOO_LARGE;
    goto bailout;
  }

  /* align all sizes up to the larger of the DB page and the OS page */
  const size_t unit = (env->me_os_psize > (size_t)pagesize) ? env->me_os_psize
                                                            : (size_t)pagesize;
  size_lower = ceil_powerof2(size_lower, unit);
  size_upper = ceil_powerof2(size_upper, unit);
  size_now = ceil_powerof2(size_now, unit);

  /* LY: fit the size_upper value so that it is:
   *  - a multiple of the page size
   *  - without violating MAX_MAPSIZE and MAX_PAGENO */
  while (unlikely((size_t)size_upper > MAX_MAPSIZE ||
                  (uint64_t)size_upper / pagesize > MAX_PAGENO)) {
    if ((size_t)size_upper < unit + MIN_MAPSIZE ||
        (size_t)size_upper < (size_t)pagesize * (MIN_PAGENO + 1)) {
      /* paranoia in case of overflow with improbable values */
      rc = MDBX_EINVAL;
      goto bailout;
    }
    size_upper -= unit;
    if ((size_t)size_upper < (size_t)size_lower)
      size_lower = size_upper;
  }
  mdbx_assert(env, (size_upper - size_lower) % env->me_os_psize == 0);

  /* clamp the current size into [lower, upper] */
  if (size_now < size_lower)
    size_now = size_lower;
  if (size_now > size_upper)
    size_now = size_upper;

  if (growth_step < 0) {
    /* default growth step: ~1/42 of the whole range,
     * clamped into [64K, MAX_MAPSIZE/64] */
    growth_step = ((size_t)(size_upper - size_lower)) / 42;
    if (growth_step > size_lower && size_lower < (intptr_t)MEGABYTE)
      growth_step = size_lower;
    if (growth_step < 65536)
      growth_step = 65536;
    if ((size_t)growth_step > MAX_MAPSIZE / 64)
      growth_step = MAX_MAPSIZE / 64;
  }
  /* a non-zero shrink threshold requires a non-zero growth step */
  if (growth_step == 0 && shrink_threshold > 0)
    growth_step = 1;
  growth_step = ceil_powerof2(growth_step, unit);

  if (shrink_threshold < 0)
    shrink_threshold = growth_step + growth_step;
  shrink_threshold = ceil_powerof2(shrink_threshold, unit);

  //----------------------------------------------------------------------------

  if (!env->me_map) {
    /* save user's geo-params for future open/create */
    if (pagesize != (intptr_t)env->me_psize)
      mdbx_setup_pagesize(env, pagesize);
    env->me_dbgeo.lower = size_lower;
    env->me_dbgeo.now = size_now;
    env->me_dbgeo.upper = size_upper;
    /* round grow/shrink through the packed pages-value representation, so the
     * stored byte counts match exactly what the meta-page can persist */
    env->me_dbgeo.grow =
        pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, growth_step))));
    env->me_dbgeo.shrink =
        pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, shrink_threshold))));

    mdbx_ensure(env, env->me_dbgeo.lower >= MIN_MAPSIZE);
    mdbx_ensure(env, env->me_dbgeo.lower / (unsigned)pagesize >= MIN_PAGENO);
    mdbx_ensure(env, env->me_dbgeo.lower % (unsigned)pagesize == 0);
    mdbx_ensure(env, env->me_dbgeo.lower % env->me_os_psize == 0);

    mdbx_ensure(env, env->me_dbgeo.upper <= MAX_MAPSIZE);
    mdbx_ensure(env, env->me_dbgeo.upper / (unsigned)pagesize <= MAX_PAGENO);
    mdbx_ensure(env, env->me_dbgeo.upper % (unsigned)pagesize == 0);
    mdbx_ensure(env, env->me_dbgeo.upper % env->me_os_psize == 0);

    mdbx_ensure(env, env->me_dbgeo.now >= env->me_dbgeo.lower);
    mdbx_ensure(env, env->me_dbgeo.now <= env->me_dbgeo.upper);
    mdbx_ensure(env, env->me_dbgeo.now % (unsigned)pagesize == 0);
    mdbx_ensure(env, env->me_dbgeo.now % env->me_os_psize == 0);

    mdbx_ensure(env, env->me_dbgeo.grow % (unsigned)pagesize == 0);
    mdbx_ensure(env, env->me_dbgeo.grow % env->me_os_psize == 0);
    mdbx_ensure(env, env->me_dbgeo.shrink % (unsigned)pagesize == 0);
    mdbx_ensure(env, env->me_dbgeo.shrink % env->me_os_psize == 0);

    rc = MDBX_SUCCESS;
  } else {
    /* apply new params to opened environment */
    mdbx_ensure(env, pagesize == (intptr_t)env->me_psize);
    MDBX_meta meta;
    MDBX_meta *head = nullptr;
    const MDBX_geo *current_geo;
    if (inside_txn) {
      current_geo = &env->me_txn->mt_geo;
    } else {
      head = mdbx_meta_head(env);
      meta = *head;
      current_geo = &meta.mm_geo;
    }

    MDBX_geo new_geo;
    new_geo.lower = bytes2pgno(env, size_lower);
    new_geo.now = bytes2pgno(env, size_now);
    new_geo.upper = bytes2pgno(env, size_upper);
    new_geo.grow_pv = pages2pv(bytes2pgno(env, growth_step));
    new_geo.shrink_pv = pages2pv(bytes2pgno(env, shrink_threshold));
    new_geo.next = current_geo->next;

    mdbx_ensure(env,
                pgno_align2os_bytes(env, new_geo.lower) == (size_t)size_lower);
    mdbx_ensure(env,
                pgno_align2os_bytes(env, new_geo.upper) == (size_t)size_upper);
    mdbx_ensure(env, pgno_align2os_bytes(env, new_geo.now) == (size_t)size_now);
    mdbx_ensure(env, new_geo.grow_pv == pages2pv(pv2pages(new_geo.grow_pv)));
    mdbx_ensure(env,
                new_geo.shrink_pv == pages2pv(pv2pages(new_geo.shrink_pv)));

    mdbx_ensure(env, (size_t)size_lower >= MIN_MAPSIZE);
    mdbx_ensure(env, new_geo.lower >= MIN_PAGENO);
    mdbx_ensure(env, (size_t)size_upper <= MAX_MAPSIZE);
    mdbx_ensure(env, new_geo.upper <= MAX_PAGENO);
    mdbx_ensure(env, new_geo.now >= new_geo.next);
    mdbx_ensure(env, new_geo.upper >= new_geo.now);
    mdbx_ensure(env, new_geo.now >= new_geo.lower);

    if (memcmp(current_geo, &new_geo, sizeof(MDBX_geo)) != 0) {
#if defined(_WIN32) || defined(_WIN64)
      /* Was DB shrinking disabled before and now it will be enabled? */
      if (new_geo.lower < new_geo.upper && new_geo.shrink_pv &&
          !(current_geo->lower < current_geo->upper &&
            current_geo->shrink_pv)) {
        if (!env->me_lck_mmap.lck) {
          rc = MDBX_EPERM;
          goto bailout;
        }
        int err = mdbx_rdt_lock(env);
        if (unlikely(MDBX_IS_ERROR(err))) {
          rc = err;
          goto bailout;
        }

        /* Check if there are any reading threads that do not use the SRWL */
        const size_t CurrentTid = GetCurrentThreadId();
        const MDBX_reader *const begin = env->me_lck_mmap.lck->mti_readers;
        const MDBX_reader *const end =
            begin + atomic_load32(&env->me_lck_mmap.lck->mti_numreaders,
                                  mo_AcquireRelease);
        for (const MDBX_reader *reader = begin; reader < end; ++reader) {
          if (reader->mr_pid.weak == env->me_pid && reader->mr_tid.weak &&
              reader->mr_tid.weak != CurrentTid) {
            /* At least one thread may don't use SRWL */
            rc = MDBX_EPERM;
            break;
          }
        }

        mdbx_rdt_unlock(env);
        if (unlikely(rc != MDBX_SUCCESS))
          goto bailout;
      }
#endif

      if (new_geo.now != current_geo->now ||
          new_geo.upper != current_geo->upper) {
        /* remap the datafile to the new current/upper sizes */
        rc = mdbx_mapresize(env, current_geo->next, new_geo.now, new_geo.upper,
                            false);
        if (unlikely(rc != MDBX_SUCCESS))
          goto bailout;
        mdbx_assert(env, (head == nullptr) == inside_txn);
        if (head)
          head = /* base address could be changed */ mdbx_meta_head(env);
      }
      if (inside_txn) {
        /* the pending write-txn will persist the new geometry on commit */
        env->me_txn->mt_geo = new_geo;
        env->me_txn->mt_flags |= MDBX_TXN_DIRTY;
      } else {
        /* no write-txn: persist the new geometry via a fresh meta with the
         * next txnid */
        meta.mm_geo = new_geo;
        const txnid_t txnid =
            safe64_txnid_next(mdbx_meta_txnid_stable(env, head));
        if (unlikely(txnid > MAX_TXNID)) {
          rc = MDBX_TXN_FULL;
          mdbx_error("txnid overflow, raise %d", rc);
        } else {
          mdbx_meta_set_txnid(env, &meta, txnid);
          rc = mdbx_sync_locked(env, env->me_flags, &meta);
        }
      }

      if (likely(rc == MDBX_SUCCESS)) {
        /* store new geo to env to avoid influences */
        env->me_dbgeo.now = pgno2bytes(env, new_geo.now);
        env->me_dbgeo.lower = pgno2bytes(env, new_geo.lower);
        env->me_dbgeo.upper = pgno2bytes(env, new_geo.upper);
        env->me_dbgeo.grow = pgno2bytes(env, pv2pages(new_geo.grow_pv));
        env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(new_geo.shrink_pv));
      }
    }
  }

bailout:
  if (need_unlock)
    mdbx_txn_unlock(env);
  return rc;
}
14857 
14858 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
mdbx_env_set_mapsize(MDBX_env * env,size_t size)14859 __cold int mdbx_env_set_mapsize(MDBX_env *env, size_t size) {
14860   return __inline_mdbx_env_set_mapsize(env, size);
14861 }
14862 
mdbx_env_set_maxdbs(MDBX_env * env,MDBX_dbi dbs)14863 __cold int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) {
14864   return __inline_mdbx_env_set_maxdbs(env, dbs);
14865 }
14866 
mdbx_env_get_maxdbs(const MDBX_env * env,MDBX_dbi * dbs)14867 __cold int mdbx_env_get_maxdbs(const MDBX_env *env, MDBX_dbi *dbs) {
14868   return __inline_mdbx_env_get_maxdbs(env, dbs);
14869 }
14870 
mdbx_env_set_maxreaders(MDBX_env * env,unsigned readers)14871 __cold int mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers) {
14872   return __inline_mdbx_env_set_maxreaders(env, readers);
14873 }
14874 
mdbx_env_get_maxreaders(const MDBX_env * env,unsigned * readers)14875 __cold int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) {
14876   return __inline_mdbx_env_get_maxreaders(env, readers);
14877 }
14878 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
14879 
alloc_page_buf(MDBX_env * env)14880 __cold static int alloc_page_buf(MDBX_env *env) {
14881   return env->me_pbuf
14882              ? MDBX_SUCCESS
14883              : mdbx_memalign_alloc(env->me_os_psize, env->me_psize * NUM_METAS,
14884                                    &env->me_pbuf);
14885 }
14886 
14887 /* Further setup required for opening an MDBX environment */
mdbx_setup_dxb(MDBX_env * env,const int lck_rc,const mdbx_mode_t mode_bits)14888 __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc,
14889                                  const mdbx_mode_t mode_bits) {
14890   MDBX_meta meta;
14891   int rc = MDBX_RESULT_FALSE;
14892   int err = mdbx_read_header(env, &meta, lck_rc, mode_bits);
14893   if (unlikely(err != MDBX_SUCCESS)) {
14894     if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA ||
14895         (env->me_flags & MDBX_RDONLY) != 0 ||
14896         /* recovery mode */ env->me_stuck_meta >= 0)
14897       return err;
14898 
14899     mdbx_debug("%s", "create new database");
14900     rc = /* new database */ MDBX_RESULT_TRUE;
14901 
14902     if (!env->me_dbgeo.now) {
14903       /* set defaults if not configured */
14904       err = mdbx_env_set_geometry(env, 0, -1, DEFAULT_MAPSIZE, -1, -1, -1);
14905       if (unlikely(err != MDBX_SUCCESS))
14906         return err;
14907     }
14908 
14909     err = alloc_page_buf(env);
14910     if (unlikely(err != MDBX_SUCCESS))
14911       return err;
14912 
14913     meta = *mdbx_init_metas(env, env->me_pbuf);
14914     err = mdbx_pwrite(env->me_lazy_fd, env->me_pbuf, env->me_psize * NUM_METAS,
14915                       0);
14916     if (unlikely(err != MDBX_SUCCESS))
14917       return err;
14918 
14919     err = mdbx_ftruncate(env->me_lazy_fd,
14920                          env->me_dxb_mmap.filesize = env->me_dbgeo.now);
14921     if (unlikely(err != MDBX_SUCCESS))
14922       return err;
14923 
14924 #ifndef NDEBUG /* just for checking */
14925     err = mdbx_read_header(env, &meta, lck_rc, mode_bits);
14926     if (unlikely(err != MDBX_SUCCESS))
14927       return err;
14928 #endif
14929   }
14930 
14931   mdbx_verbose(
14932       "header: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO "/%" PRIaPGNO
14933       "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN ", %s",
14934       meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root,
14935       meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, meta.mm_geo.upper,
14936       pv2pages(meta.mm_geo.grow_pv), pv2pages(meta.mm_geo.shrink_pv),
14937       unaligned_peek_u64(4, meta.mm_txnid_a), mdbx_durable_str(&meta));
14938 
14939   mdbx_setup_pagesize(env, meta.mm_psize);
14940   const size_t used_bytes = pgno2bytes(env, meta.mm_geo.next);
14941   const size_t used_aligned2os_bytes =
14942       ceil_powerof2(used_bytes, env->me_os_psize);
14943   if ((env->me_flags & MDBX_RDONLY) /* readonly */
14944       || lck_rc != MDBX_RESULT_TRUE /* not exclusive */
14945       || /* recovery mode */ env->me_stuck_meta >= 0) {
14946     /* use present params from db */
14947     const size_t pagesize = meta.mm_psize;
14948     err = mdbx_env_set_geometry(
14949         env, meta.mm_geo.lower * pagesize, meta.mm_geo.now * pagesize,
14950         meta.mm_geo.upper * pagesize, pv2pages(meta.mm_geo.grow_pv) * pagesize,
14951         pv2pages(meta.mm_geo.shrink_pv) * pagesize, meta.mm_psize);
14952     if (unlikely(err != MDBX_SUCCESS)) {
14953       mdbx_error("%s: err %d", "could not apply preconfigured geometry from db",
14954                  err);
14955       return (err == MDBX_EINVAL) ? MDBX_INCOMPATIBLE : err;
14956     }
14957   } else if (env->me_dbgeo.now) {
14958     /* silently growth to last used page */
14959     if (env->me_dbgeo.now < used_aligned2os_bytes)
14960       env->me_dbgeo.now = used_aligned2os_bytes;
14961     if (env->me_dbgeo.upper < used_aligned2os_bytes)
14962       env->me_dbgeo.upper = used_aligned2os_bytes;
14963 
14964     /* apply preconfigured params, but only if substantial changes:
14965      *  - upper or lower limit changes
14966      *  - shrink threshold or growth step
14967      * But ignore change just a 'now/current' size. */
14968     if (bytes_align2os_bytes(env, env->me_dbgeo.upper) !=
14969             pgno2bytes(env, meta.mm_geo.upper) ||
14970         bytes_align2os_bytes(env, env->me_dbgeo.lower) !=
14971             pgno2bytes(env, meta.mm_geo.lower) ||
14972         bytes_align2os_bytes(env, env->me_dbgeo.shrink) !=
14973             pgno2bytes(env, pv2pages(meta.mm_geo.shrink_pv)) ||
14974         bytes_align2os_bytes(env, env->me_dbgeo.grow) !=
14975             pgno2bytes(env, pv2pages(meta.mm_geo.grow_pv))) {
14976 
14977       if (env->me_dbgeo.shrink && env->me_dbgeo.now > used_bytes)
14978         /* pre-shrink if enabled */
14979         env->me_dbgeo.now = used_bytes + env->me_dbgeo.shrink -
14980                             used_bytes % env->me_dbgeo.shrink;
14981 
14982       err = mdbx_env_set_geometry(env, env->me_dbgeo.lower, env->me_dbgeo.now,
14983                                   env->me_dbgeo.upper, env->me_dbgeo.grow,
14984                                   env->me_dbgeo.shrink, meta.mm_psize);
14985       if (unlikely(err != MDBX_SUCCESS)) {
14986         mdbx_error("%s: err %d", "could not apply preconfigured db-geometry",
14987                    err);
14988         return (err == MDBX_EINVAL) ? MDBX_INCOMPATIBLE : err;
14989       }
14990 
14991       /* update meta fields */
14992       meta.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now);
14993       meta.mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower);
14994       meta.mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper);
14995       meta.mm_geo.grow_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.grow));
14996       meta.mm_geo.shrink_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.shrink));
14997 
14998       mdbx_verbose("amended: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO
14999                    "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO
15000                    " +%u -%u, txn_id %" PRIaTXN ", %s",
15001                    meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root,
15002                    meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now,
15003                    meta.mm_geo.upper, pv2pages(meta.mm_geo.grow_pv),
15004                    pv2pages(meta.mm_geo.shrink_pv),
15005                    unaligned_peek_u64(4, meta.mm_txnid_a),
15006                    mdbx_durable_str(&meta));
15007     } else {
15008       /* fetch back 'now/current' size, since it was ignored during comparison
15009        * and may differ. */
15010       env->me_dbgeo.now = pgno_align2os_bytes(env, meta.mm_geo.now);
15011     }
15012     mdbx_ensure(env, meta.mm_geo.now >= meta.mm_geo.next);
15013   } else {
15014     /* geo-params are not pre-configured by user,
15015      * get current values from the meta. */
15016     env->me_dbgeo.now = pgno2bytes(env, meta.mm_geo.now);
15017     env->me_dbgeo.lower = pgno2bytes(env, meta.mm_geo.lower);
15018     env->me_dbgeo.upper = pgno2bytes(env, meta.mm_geo.upper);
15019     env->me_dbgeo.grow = pgno2bytes(env, pv2pages(meta.mm_geo.grow_pv));
15020     env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(meta.mm_geo.shrink_pv));
15021   }
15022 
15023   mdbx_ensure(env,
15024               pgno_align2os_bytes(env, meta.mm_geo.now) == env->me_dbgeo.now);
15025   mdbx_ensure(env, env->me_dbgeo.now >= used_bytes);
15026   const uint64_t filesize_before = env->me_dxb_mmap.filesize;
15027   if (unlikely(filesize_before != env->me_dbgeo.now)) {
15028     if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) {
15029       mdbx_verbose("filesize mismatch (expect %" PRIuPTR "b/%" PRIaPGNO
15030                    "p, have %" PRIu64 "b/%" PRIaPGNO "p), "
15031                    "assume other process working",
15032                    env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now),
15033                    filesize_before, bytes2pgno(env, (size_t)filesize_before));
15034     } else {
15035       mdbx_warning("filesize mismatch (expect %" PRIuSIZE "b/%" PRIaPGNO
15036                    "p, have %" PRIu64 "b/%" PRIaPGNO "p)",
15037                    env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now),
15038                    filesize_before, bytes2pgno(env, (size_t)filesize_before));
15039       if (filesize_before < used_bytes) {
15040         mdbx_error("last-page beyond end-of-file (last %" PRIaPGNO
15041                    ", have %" PRIaPGNO ")",
15042                    meta.mm_geo.next, bytes2pgno(env, (size_t)filesize_before));
15043         return MDBX_CORRUPTED;
15044       }
15045 
15046       if (env->me_flags & MDBX_RDONLY) {
15047         if (filesize_before & (env->me_os_psize - 1)) {
15048           mdbx_error("%s", "filesize should be rounded-up to system page");
15049           return MDBX_WANNA_RECOVERY;
15050         }
15051         mdbx_warning("%s", "ignore filesize mismatch in readonly-mode");
15052       } else {
15053         mdbx_verbose("will resize datafile to %" PRIuSIZE " bytes, %" PRIaPGNO
15054                      " pages",
15055                      env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now));
15056       }
15057     }
15058   }
15059 
15060   mdbx_verbose("current boot-id %" PRIx64 "-%" PRIx64 " (%savailable)",
15061                bootid.x, bootid.y, (bootid.x | bootid.y) ? "" : "not-");
15062 
15063 #if MDBX_ENABLE_MADVISE
15064   /* calculate readahead hint before mmap with zero redundant pages */
15065   const bool readahead =
15066       !(env->me_flags & MDBX_NORDAHEAD) &&
15067       mdbx_is_readahead_reasonable(used_bytes, 0) == MDBX_RESULT_TRUE;
15068 #endif /* MDBX_ENABLE_MADVISE */
15069 
15070   err = mdbx_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now,
15071                   env->me_dbgeo.upper, lck_rc ? MMAP_OPTION_TRUNCATE : 0);
15072   if (unlikely(err != MDBX_SUCCESS))
15073     return err;
15074 
15075 #if MDBX_ENABLE_MADVISE
15076 #if defined(MADV_DONTDUMP)
15077   err = madvise(env->me_map, env->me_dxb_mmap.limit, MADV_DONTDUMP)
15078             ? ignore_enosys(errno)
15079             : MDBX_SUCCESS;
15080   if (unlikely(MDBX_IS_ERROR(err)))
15081     return err;
15082 #endif /* MADV_DONTDUMP */
15083 #if defined(MADV_DODUMP)
15084   if (mdbx_runtime_flags & MDBX_DBG_DUMP) {
15085     const size_t meta_length_aligned2os = pgno_align2os_bytes(env, NUM_METAS);
15086     err = madvise(env->me_map, meta_length_aligned2os, MADV_DODUMP)
15087               ? ignore_enosys(errno)
15088               : MDBX_SUCCESS;
15089     if (unlikely(MDBX_IS_ERROR(err)))
15090       return err;
15091   }
15092 #endif /* MADV_DODUMP */
15093 #endif /* MDBX_ENABLE_MADVISE */
15094 
15095 #ifdef MDBX_USE_VALGRIND
15096   env->me_valgrind_handle =
15097       VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx");
15098 #endif /* MDBX_USE_VALGRIND */
15099 
15100   mdbx_assert(env, used_bytes >= pgno2bytes(env, NUM_METAS) &&
15101                        used_bytes <= env->me_dxb_mmap.limit);
15102 #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
15103   if (env->me_dxb_mmap.filesize > used_bytes &&
15104       env->me_dxb_mmap.filesize < env->me_dxb_mmap.limit) {
15105     VALGRIND_MAKE_MEM_NOACCESS(env->me_map + used_bytes,
15106                                env->me_dxb_mmap.filesize - used_bytes);
15107     MDBX_ASAN_POISON_MEMORY_REGION(env->me_map + used_bytes,
15108                                    env->me_dxb_mmap.filesize - used_bytes);
15109   }
15110   env->me_poison_edge =
15111       bytes2pgno(env, (env->me_dxb_mmap.filesize < env->me_dxb_mmap.limit)
15112                           ? env->me_dxb_mmap.filesize
15113                           : env->me_dxb_mmap.limit);
15114 #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */
15115 
15116   //-------------------------------- validate/rollback head & steady meta-pages
15117   if (unlikely(env->me_stuck_meta >= 0)) {
15118     /* recovery mode */
15119     MDBX_meta clone;
15120     MDBX_meta const *const target = METAPAGE(env, env->me_stuck_meta);
15121     err = mdbx_validate_meta_copy(env, target, &clone);
15122     if (unlikely(err != MDBX_SUCCESS)) {
15123       mdbx_error("target meta[%u] is corrupted",
15124                  bytes2pgno(env, (uint8_t *)data_page(target) - env->me_map));
15125       return MDBX_CORRUPTED;
15126     }
15127   } else /* not recovery mode */
15128     while (1) {
15129       const unsigned meta_clash_mask = mdbx_meta_eq_mask(env);
15130       if (unlikely(meta_clash_mask)) {
15131         mdbx_error("meta-pages are clashed: mask 0x%d", meta_clash_mask);
15132         return MDBX_CORRUPTED;
15133       }
15134 
15135       if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) {
15136         /* non-exclusive mode,
15137          * meta-pages should be validated by a first process opened the DB */
15138         MDBX_meta *const head = mdbx_meta_head(env);
15139         MDBX_meta *const steady = mdbx_meta_steady(env);
15140         const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head);
15141         const txnid_t steady_txnid = mdbx_meta_txnid_fluid(env, steady);
15142         if (head_txnid == steady_txnid)
15143           break;
15144 
15145         if (!env->me_lck_mmap.lck) {
15146           /* LY: without-lck (read-only) mode, so it is impossible that other
15147            * process made weak checkpoint. */
15148           mdbx_error("%s", "without-lck, unable recovery/rollback");
15149           return MDBX_WANNA_RECOVERY;
15150         }
15151 
15152         /* LY: assume just have a collision with other running process,
15153          *     or someone make a weak checkpoint */
15154         mdbx_verbose("%s", "assume collision or online weak checkpoint");
15155         break;
15156       }
15157       mdbx_assert(env, lck_rc == MDBX_RESULT_TRUE);
15158       /* exclusive mode */
15159 
15160       MDBX_meta clone;
15161       MDBX_meta const *const steady = mdbx_meta_steady(env);
15162       MDBX_meta const *const head = mdbx_meta_head(env);
15163       const txnid_t steady_txnid = mdbx_meta_txnid_fluid(env, steady);
15164       if (META_IS_STEADY(steady)) {
15165         err = mdbx_validate_meta_copy(env, steady, &clone);
15166         if (unlikely(err != MDBX_SUCCESS)) {
15167           mdbx_error("meta[%u] with %s txnid %" PRIaTXN
15168                      " is corrupted, %s needed",
15169                      bytes2pgno(env, (uint8_t *)steady - env->me_map), "steady",
15170                      steady_txnid, "manual recovery");
15171           return MDBX_CORRUPTED;
15172         }
15173         if (steady == head)
15174           break;
15175       }
15176 
15177       const pgno_t pgno = bytes2pgno(env, (uint8_t *)head - env->me_map);
15178       const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head);
15179       const bool head_valid =
15180           mdbx_validate_meta_copy(env, head, &clone) == MDBX_SUCCESS;
15181       mdbx_assert(env, !META_IS_STEADY(steady) || head_txnid != steady_txnid);
15182       if (unlikely(!head_valid)) {
15183         if (unlikely(!META_IS_STEADY(steady))) {
15184           mdbx_error("%s for open or automatic rollback, %s",
15185                      "there are no suitable meta-pages",
15186                      "manual recovery is required");
15187           return MDBX_CORRUPTED;
15188         }
15189         mdbx_warning("meta[%u] with last txnid %" PRIaTXN
15190                      " is corrupted, rollback needed",
15191                      pgno, head_txnid);
15192         goto purge_meta_head;
15193       }
15194 
15195       if (meta_bootid_match(head)) {
15196         if (env->me_flags & MDBX_RDONLY) {
15197           mdbx_error("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: "
15198                      "rollback NOT needed, steady-sync NEEDED%s",
15199                      "opening after an unclean shutdown", bootid.x, bootid.y,
15200                      ", but unable in read-only mode");
15201           return MDBX_WANNA_RECOVERY;
15202         }
15203         mdbx_warning("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: "
15204                      "rollback NOT needed, steady-sync NEEDED%s",
15205                      "opening after an unclean shutdown", bootid.x, bootid.y,
15206                      "");
15207         meta = clone;
15208         atomic_store32(&env->me_lck->mti_unsynced_pages, meta.mm_geo.next,
15209                        mo_Relaxed);
15210         break;
15211       }
15212       if (unlikely(!META_IS_STEADY(steady))) {
15213         mdbx_error("%s, but %s for automatic rollback: %s",
15214                    "opening after an unclean shutdown",
15215                    "there are no suitable meta-pages",
15216                    "manual recovery is required");
15217         return MDBX_CORRUPTED;
15218       }
15219       if (env->me_flags & MDBX_RDONLY) {
15220         mdbx_error("%s and rollback needed: (from head %" PRIaTXN
15221                    " to steady %" PRIaTXN ")%s",
15222                    "opening after an unclean shutdown", head_txnid,
15223                    steady_txnid, ", but unable in read-only mode");
15224         return MDBX_WANNA_RECOVERY;
15225       }
15226 
15227     purge_meta_head:
15228       mdbx_notice("%s and doing automatic rollback: "
15229                   "purge%s meta[%u] with%s txnid %" PRIaTXN,
15230                   "opening after an unclean shutdown",
15231                   head_valid ? "" : " invalid", pgno, head_valid ? " weak" : "",
15232                   head_txnid);
15233       mdbx_ensure(env, META_IS_STEADY(steady));
15234       err = mdbx_override_meta(env, pgno, 0, head_valid ? head : steady);
15235       if (err) {
15236         mdbx_error("rollback: overwrite meta[%u] with txnid %" PRIaTXN
15237                    ", error %d",
15238                    pgno, head_txnid, err);
15239         return err;
15240       }
15241       mdbx_ensure(env, 0 == mdbx_meta_txnid_fluid(env, head));
15242       mdbx_ensure(env, 0 == mdbx_meta_eq_mask(env));
15243     }
15244 
15245   if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) {
15246     //-------------------------------------------------- shrink DB & update geo
15247     const MDBX_meta *head = mdbx_meta_head(env);
15248     /* re-check size after mmap */
15249     if ((env->me_dxb_mmap.current & (env->me_os_psize - 1)) != 0 ||
15250         env->me_dxb_mmap.current < used_bytes) {
15251       mdbx_error("unacceptable/unexpected datafile size %" PRIuPTR,
15252                  env->me_dxb_mmap.current);
15253       return MDBX_PROBLEM;
15254     }
15255     if (env->me_dxb_mmap.current != env->me_dbgeo.now) {
15256       meta.mm_geo.now = bytes2pgno(env, env->me_dxb_mmap.current);
15257       mdbx_notice("need update meta-geo to filesize %" PRIuPTR
15258                   " bytes, %" PRIaPGNO " pages",
15259                   env->me_dxb_mmap.current, meta.mm_geo.now);
15260     }
15261 
15262     if (memcmp(&meta.mm_geo, &head->mm_geo, sizeof(meta.mm_geo))) {
15263       if ((env->me_flags & MDBX_RDONLY) != 0 ||
15264           /* recovery mode */ env->me_stuck_meta >= 0) {
15265         mdbx_warning(
15266             "skipped update meta.geo in %s mode: from l%" PRIaPGNO
15267             "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u, to l%" PRIaPGNO
15268             "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u",
15269             (env->me_stuck_meta < 0) ? "read-only" : "recovery",
15270             head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper,
15271             pv2pages(head->mm_geo.shrink_pv), pv2pages(head->mm_geo.grow_pv),
15272             meta.mm_geo.lower, meta.mm_geo.now, meta.mm_geo.upper,
15273             pv2pages(meta.mm_geo.shrink_pv), pv2pages(meta.mm_geo.grow_pv));
15274       } else {
15275         const txnid_t txnid = mdbx_meta_txnid_stable(env, head);
15276         const txnid_t next_txnid = safe64_txnid_next(txnid);
15277         if (unlikely(txnid > MAX_TXNID)) {
15278           mdbx_error("txnid overflow, raise %d", MDBX_TXN_FULL);
15279           return MDBX_TXN_FULL;
15280         }
15281         mdbx_notice("updating meta.geo: "
15282                     "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO
15283                     "/s%u-g%u (txn#%" PRIaTXN "), "
15284                     "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO
15285                     "/s%u-g%u (txn#%" PRIaTXN ")",
15286                     head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper,
15287                     pv2pages(head->mm_geo.shrink_pv),
15288                     pv2pages(head->mm_geo.grow_pv), txnid, meta.mm_geo.lower,
15289                     meta.mm_geo.now, meta.mm_geo.upper,
15290                     pv2pages(meta.mm_geo.shrink_pv),
15291                     pv2pages(meta.mm_geo.grow_pv), next_txnid);
15292 
15293         mdbx_ensure(env, mdbx_meta_eq(env, &meta, head));
15294         mdbx_meta_set_txnid(env, &meta, next_txnid);
15295         err = mdbx_sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &meta);
15296         if (err) {
15297           mdbx_error("error %d, while updating meta.geo: "
15298                      "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO
15299                      "/s%u-g%u (txn#%" PRIaTXN "), "
15300                      "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO
15301                      "/s%u-g%u (txn#%" PRIaTXN ")",
15302                      err, head->mm_geo.lower, head->mm_geo.now,
15303                      head->mm_geo.upper, pv2pages(head->mm_geo.shrink_pv),
15304                      pv2pages(head->mm_geo.grow_pv), txnid, meta.mm_geo.lower,
15305                      meta.mm_geo.now, meta.mm_geo.upper,
15306                      pv2pages(meta.mm_geo.shrink_pv),
15307                      pv2pages(meta.mm_geo.grow_pv), next_txnid);
15308           return err;
15309         }
15310       }
15311     }
15312 
15313     atomic_store32(&env->me_lck->mti_discarded_tail,
15314                    bytes2pgno(env, used_aligned2os_bytes), mo_Relaxed);
15315 
15316     if ((env->me_flags & MDBX_RDONLY) == 0 && env->me_stuck_meta < 0) {
15317       for (int n = 0; n < NUM_METAS; ++n) {
15318         MDBX_meta *const pmeta = METAPAGE(env, n);
15319         if (unlikely(unaligned_peek_u64(4, &pmeta->mm_magic_and_version) !=
15320                      MDBX_DATA_MAGIC)) {
15321           const txnid_t txnid = mdbx_meta_txnid_fluid(env, pmeta);
15322           mdbx_notice("%s %s"
15323                       "meta[%u], txnid %" PRIaTXN,
15324                       "updating db-format signature for",
15325                       META_IS_STEADY(pmeta) ? "stead-" : "weak-", n, txnid);
15326           err = mdbx_override_meta(env, n, txnid, pmeta);
15327           if (unlikely(err != MDBX_SUCCESS) &&
15328               /* Just ignore the MDBX_PROBLEM error, since here it is
15329                * returned only in case of the attempt to upgrade an obsolete
15330                * meta-page that is invalid for current state of a DB,
15331                * e.g. after shrinking DB file */
15332               err != MDBX_PROBLEM) {
15333             mdbx_error("%s meta[%u], txnid %" PRIaTXN ", error %d",
15334                        "updating db-format signature for", n, txnid, err);
15335             return err;
15336           }
15337         }
15338       }
15339     }
15340   } /* lck exclusive, lck_rc == MDBX_RESULT_TRUE */
15341 
15342   //---------------------------------------------------- setup madvise/readahead
15343 #if MDBX_ENABLE_MADVISE
15344   if (used_aligned2os_bytes < env->me_dxb_mmap.current) {
15345 #if defined(MADV_REMOVE)
15346     if (lck_rc && (env->me_flags & MDBX_WRITEMAP) != 0 &&
15347         /* not recovery mode */ env->me_stuck_meta < 0) {
15348       mdbx_notice("open-MADV_%s %u..%u", "REMOVE (deallocate file space)",
15349                   env->me_lck->mti_discarded_tail.weak,
15350                   bytes2pgno(env, env->me_dxb_mmap.current));
15351       err =
15352           madvise(env->me_map + used_aligned2os_bytes,
15353                   env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_REMOVE)
15354               ? ignore_enosys(errno)
15355               : MDBX_SUCCESS;
15356       if (unlikely(MDBX_IS_ERROR(err)))
15357         return err;
15358     }
15359 #endif /* MADV_REMOVE */
15360 #if defined(MADV_DONTNEED)
15361     mdbx_notice("open-MADV_%s %u..%u", "DONTNEED",
15362                 env->me_lck->mti_discarded_tail.weak,
15363                 bytes2pgno(env, env->me_dxb_mmap.current));
15364     err =
15365         madvise(env->me_map + used_aligned2os_bytes,
15366                 env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_DONTNEED)
15367             ? ignore_enosys(errno)
15368             : MDBX_SUCCESS;
15369     if (unlikely(MDBX_IS_ERROR(err)))
15370       return err;
15371 #elif defined(POSIX_MADV_DONTNEED)
15372     err = ignore_enosys(posix_madvise(
15373         env->me_map + used_aligned2os_bytes,
15374         env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_MADV_DONTNEED));
15375     if (unlikely(MDBX_IS_ERROR(err)))
15376       return err;
15377 #elif defined(POSIX_FADV_DONTNEED)
15378     err = ignore_enosys(posix_fadvise(
15379         env->me_lazy_fd, used_aligned2os_bytes,
15380         env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_FADV_DONTNEED));
15381     if (unlikely(MDBX_IS_ERROR(err)))
15382       return err;
15383 #endif /* MADV_DONTNEED */
15384   }
15385 
15386   err = mdbx_set_readahead(env, bytes2pgno(env, used_bytes), readahead, true);
15387   if (unlikely(err != MDBX_SUCCESS))
15388     return err;
15389 #endif /* MDBX_ENABLE_MADVISE */
15390 
15391   return rc;
15392 }
15393 
15394 /******************************************************************************/
15395 
/* Open and/or initialize the lock region for the environment.
 *
 * Returns MDBX_RESULT_TRUE when this process seized the lock exclusively
 * (shared state was (re)initialized here), a non-error value for
 * cooperative mode, or an MDBX_* error code on failure. */
__cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
                                 mdbx_mode_t mode) {
  mdbx_assert(env, env->me_lazy_fd != INVALID_HANDLE_VALUE);
  mdbx_assert(env, env->me_lfd == INVALID_HANDLE_VALUE);

  int err = mdbx_openfile(MDBX_OPEN_LCK, env, lck_pathname, &env->me_lfd, mode);
  if (err != MDBX_SUCCESS) {
    /* Failure to open the lck-file is tolerated only for exclusive mode
     * (the file is absent) or for read-only access (e.g. read-only FS). */
    if (!(err == MDBX_ENOFILE && (env->me_flags & MDBX_EXCLUSIVE)) &&
        !((err == MDBX_EROFS || err == MDBX_EACCESS || err == MDBX_EPERM) &&
          (env->me_flags & MDBX_RDONLY)))
      return err;

    /* ensure the file system is read-only */
    err = mdbx_check_fs_rdonly(env->me_lazy_fd, lck_pathname, err);
    if (err != MDBX_SUCCESS &&
        /* ignore ERROR_NOT_SUPPORTED for exclusive mode */
        !(err == MDBX_ENOSYS && (env->me_flags & MDBX_EXCLUSIVE)))
      return err;

    /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
    /* beginning of a locked section ---------------------------------------- */
    lcklist_lock();
    mdbx_assert(env, env->me_lcklist_next == nullptr);
    env->me_lfd = INVALID_HANDLE_VALUE;
    const int rc = mdbx_lck_seize(env);
    if (MDBX_IS_ERROR(rc)) {
      /* Calling lcklist_detach_locked() is required to restore POSIX-filelock
       * and this job will be done by mdbx_env_close0(). */
      lcklist_unlock();
      return rc;
    }
    /* insert into inprocess lck-list */
    env->me_lcklist_next = inprocess_lcklist_head;
    inprocess_lcklist_head = env;
    lcklist_unlock();
    /* end of a locked section ---------------------------------------------- */

    env->me_lck = lckless_stub(env);
    env->me_maxreaders = UINT_MAX;
    mdbx_debug("lck-setup:%s%s%s", " lck-less",
               (env->me_flags & MDBX_RDONLY) ? " readonly" : "",
               (rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative");
    return rc;
  }

  /* beginning of a locked section ------------------------------------------ */
  lcklist_lock();
  mdbx_assert(env, env->me_lcklist_next == nullptr);

  /* Try to get exclusive lock. If we succeed, then
   * nobody is using the lock region and we should initialize it. */
  err = mdbx_lck_seize(env);
  if (MDBX_IS_ERROR(err)) {
  bailout:
    /* Calling lcklist_detach_locked() is required to restore POSIX-filelock
     * and this job will be done by mdbx_env_close0(). */
    lcklist_unlock();
    return err;
  }

  MDBX_env *inprocess_neighbor = nullptr;
  if (err == MDBX_RESULT_TRUE) {
    /* Got the exclusive lock: check whether another MDBX_env within this
     * same process already uses the DB (multi-open detection). */
    err = uniq_check(&env->me_lck_mmap, &inprocess_neighbor);
    if (MDBX_IS_ERROR(err))
      goto bailout;
    if (inprocess_neighbor &&
        ((mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 ||
         (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) != 0)) {
      err = MDBX_BUSY;
      goto bailout;
    }
  }
  const int lck_seize_rc = err;

  mdbx_debug("lck-setup:%s%s%s", " with-lck",
             (env->me_flags & MDBX_RDONLY) ? " readonly" : "",
             (lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive"
                                                : " cooperative");

  uint64_t size = 0;
  err = mdbx_filesize(env->me_lfd, &size);
  if (unlikely(err != MDBX_SUCCESS))
    goto bailout;

  if (lck_seize_rc == MDBX_RESULT_TRUE) {
    /* Exclusive mode: compute the lck-file size for the configured
     * reader limit, rounded up to the OS page size. */
    size = ceil_powerof2(env->me_maxreaders * sizeof(MDBX_reader) +
                             sizeof(MDBX_lockinfo),
                         env->me_os_psize);
    mdbx_jitter4testing(false);
  } else {
    /* Cooperative mode: accept the existing file, but sanity-check
     * its size before mapping. */
    if (env->me_flags & MDBX_EXCLUSIVE) {
      err = MDBX_BUSY;
      goto bailout;
    }
    if (size > INT_MAX || (size & (env->me_os_psize - 1)) != 0 ||
        size < env->me_os_psize) {
      mdbx_error("lck-file has invalid size %" PRIu64 " bytes", size);
      err = MDBX_PROBLEM;
      goto bailout;
    }
  }

  /* Derive the reader-slot capacity from the (possibly pre-existing) file
   * size, clamped to the compile-time limit. */
  const size_t maxreaders =
      ((size_t)size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader);
  if (maxreaders < 4) {
    mdbx_error("lck-size too small (up to %" PRIuPTR " readers)", maxreaders);
    err = MDBX_PROBLEM;
    goto bailout;
  }
  env->me_maxreaders = (maxreaders <= MDBX_READERS_LIMIT)
                           ? (unsigned)maxreaders
                           : (unsigned)MDBX_READERS_LIMIT;

  err = mdbx_mmap((env->me_flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP,
                  &env->me_lck_mmap, (size_t)size, (size_t)size,
                  lck_seize_rc ? MMAP_OPTION_TRUNCATE | MMAP_OPTION_SEMAPHORE
                               : MMAP_OPTION_SEMAPHORE);
  if (unlikely(err != MDBX_SUCCESS))
    goto bailout;

#if MDBX_ENABLE_MADVISE
#ifdef MADV_DODUMP
  err = madvise(env->me_lck_mmap.lck, size, MADV_DODUMP) ? ignore_enosys(errno)
                                                         : MDBX_SUCCESS;
  if (unlikely(MDBX_IS_ERROR(err)))
    goto bailout;
#endif /* MADV_DODUMP */

#ifdef MADV_WILLNEED
  err = madvise(env->me_lck_mmap.lck, size, MADV_WILLNEED)
            ? ignore_enosys(errno)
            : MDBX_SUCCESS;
  if (unlikely(MDBX_IS_ERROR(err)))
    goto bailout;
#elif defined(POSIX_MADV_WILLNEED)
  err = ignore_enosys(
      posix_madvise(env->me_lck_mmap.lck, size, POSIX_MADV_WILLNEED));
  if (unlikely(MDBX_IS_ERROR(err)))
    goto bailout;
#endif /* MADV_WILLNEED */
#endif /* MDBX_ENABLE_MADVISE */

  struct MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
  if (lck_seize_rc == MDBX_RESULT_TRUE) {
    /* LY: exclusive mode, check and reset lck content */
    memset(lck, 0, (size_t)size);
    mdbx_jitter4testing(false);
    lck->mti_magic_and_version = MDBX_LOCK_MAGIC;
    lck->mti_os_and_format = MDBX_LOCK_FORMAT;
#if MDBX_ENABLE_PGOP_STAT
    lck->mti_pgop_stat.wops.weak = 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    /* Persist the freshly initialized content before other processes can
     * map and trust it. */
    err = mdbx_msync(&env->me_lck_mmap, 0, (size_t)size, MDBX_SYNC_NONE);
    if (unlikely(err != MDBX_SUCCESS)) {
      mdbx_error("initial-%s for lck-file failed", "msync");
      goto bailout;
    }
    err = mdbx_fsync(env->me_lck_mmap.fd, MDBX_SYNC_SIZE);
    if (unlikely(err != MDBX_SUCCESS)) {
      mdbx_error("initial-%s for lck-file failed", "fsync");
      goto bailout;
    }
  } else {
    /* Cooperative mode: validate the header written by another process. */
    if (lck->mti_magic_and_version != MDBX_LOCK_MAGIC) {
      mdbx_error("%s", "lock region has invalid magic/version");
      err = ((lck->mti_magic_and_version >> 8) != MDBX_MAGIC)
                ? MDBX_INVALID
                : MDBX_VERSION_MISMATCH;
      goto bailout;
    }
    if (lck->mti_os_and_format != MDBX_LOCK_FORMAT) {
      mdbx_error("lock region has os/format 0x%" PRIx32 ", expected 0x%" PRIx32,
                 lck->mti_os_and_format, MDBX_LOCK_FORMAT);
      err = MDBX_VERSION_MISMATCH;
      goto bailout;
    }
  }

  err = mdbx_lck_init(env, inprocess_neighbor, lck_seize_rc);
  if (MDBX_IS_ERROR(err))
    goto bailout;

  mdbx_ensure(env, env->me_lcklist_next == nullptr);
  /* insert into inprocess lck-list */
  env->me_lcklist_next = inprocess_lcklist_head;
  inprocess_lcklist_head = env;
  lcklist_unlock();
  /* end of a locked section ------------------------------------------------ */

  mdbx_assert(env, !MDBX_IS_ERROR(lck_seize_rc));
  env->me_lck = lck;
  return lck_seize_rc;
}
15590 
mdbx_is_readahead_reasonable(size_t volume,intptr_t redundancy)15591 __cold int mdbx_is_readahead_reasonable(size_t volume, intptr_t redundancy) {
15592   if (volume <= 1024 * 1024 * 4ul)
15593     return MDBX_RESULT_TRUE;
15594 
15595   intptr_t pagesize, total_ram_pages;
15596   int err = mdbx_get_sysraminfo(&pagesize, &total_ram_pages, nullptr);
15597   if (unlikely(err != MDBX_SUCCESS))
15598     return err;
15599 
15600   const int log2page = log2n_powerof2(pagesize);
15601   const intptr_t volume_pages = (volume + pagesize - 1) >> log2page;
15602   const intptr_t redundancy_pages =
15603       (redundancy < 0) ? -(intptr_t)((-redundancy + pagesize - 1) >> log2page)
15604                        : (intptr_t)(redundancy + pagesize - 1) >> log2page;
15605   if (volume_pages >= total_ram_pages ||
15606       volume_pages + redundancy_pages >= total_ram_pages)
15607     return MDBX_RESULT_FALSE;
15608 
15609   intptr_t avail_ram_pages;
15610   err = mdbx_get_sysraminfo(nullptr, nullptr, &avail_ram_pages);
15611   if (unlikely(err != MDBX_SUCCESS))
15612     return err;
15613 
15614   return (volume_pages + redundancy_pages >= avail_ram_pages)
15615              ? MDBX_RESULT_FALSE
15616              : MDBX_RESULT_TRUE;
15617 }
15618 
15619 /* Merge sync flags */
merge_sync_flags(const uint32_t a,const uint32_t b)15620 static uint32_t merge_sync_flags(const uint32_t a, const uint32_t b) {
15621   uint32_t r = a | b;
15622 
15623   /* avoid false MDBX_UTTERLY_NOSYNC */
15624   if (F_ISSET(r, MDBX_UTTERLY_NOSYNC) && !F_ISSET(a, MDBX_UTTERLY_NOSYNC) &&
15625       !F_ISSET(b, MDBX_UTTERLY_NOSYNC))
15626     r = (r - MDBX_UTTERLY_NOSYNC) | MDBX_SAFE_NOSYNC;
15627 
15628   /* convert MDBX_DEPRECATED_MAPASYNC to MDBX_SAFE_NOSYNC */
15629   if ((r & (MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC)) ==
15630           (MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC) &&
15631       !F_ISSET(r, MDBX_UTTERLY_NOSYNC))
15632     r = (r - MDBX_DEPRECATED_MAPASYNC) | MDBX_SAFE_NOSYNC;
15633 
15634   /* force MDBX_NOMETASYNC if MDBX_SAFE_NOSYNC enabled */
15635   if (r & MDBX_SAFE_NOSYNC)
15636     r |= MDBX_NOMETASYNC;
15637 
15638   assert(!(F_ISSET(r, MDBX_UTTERLY_NOSYNC) &&
15639            !F_ISSET(a, MDBX_UTTERLY_NOSYNC) &&
15640            !F_ISSET(b, MDBX_UTTERLY_NOSYNC)));
15641   return r;
15642 }
15643 
/* Overwrites meta-page `target` with a freshly built meta carrying `txnid`,
 * optionally inheriting geometry/sub-DBs/canary from the donor `shape`.
 * Used by the recovery/rollback paths; the new meta is flushed durably
 * before returning. Returns MDBX_SUCCESS or an error code. */
__cold static int __must_check_result mdbx_override_meta(
    MDBX_env *env, unsigned target, txnid_t txnid, const MDBX_meta *shape) {
  int rc = alloc_page_buf(env);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  /* Build a model of the target meta-page in the scratch buffer. */
  MDBX_page *const page = env->me_pbuf;
  mdbx_meta_model(env, page, target);
  MDBX_meta *const model = page_meta(page);
  mdbx_meta_set_txnid(env, model, txnid);
  if (shape) {
    /* Inherit the essential payload from the donor meta. */
    model->mm_extra_flags = shape->mm_extra_flags;
    model->mm_validator_id = shape->mm_validator_id;
    model->mm_extra_pagehdr = shape->mm_extra_pagehdr;
    memcpy(&model->mm_geo, &shape->mm_geo, sizeof(model->mm_geo));
    memcpy(&model->mm_dbs, &shape->mm_dbs, sizeof(model->mm_dbs));
    memcpy(&model->mm_canary, &shape->mm_canary, sizeof(model->mm_canary));
    memcpy(&model->mm_pages_retired, &shape->mm_pages_retired,
           sizeof(model->mm_pages_retired));
  }
  /* Seal the model with a steady datasync signature, then re-validate it
   * before touching the file: a bad model must not be written out. */
  unaligned_poke_u64(4, model->mm_datasync_sign, mdbx_meta_sign(model));
  rc = mdbx_validate_meta(env, model, page, target, nullptr);
  if (unlikely(MDBX_IS_ERROR(rc)))
    return MDBX_PROBLEM;

#if MDBX_ENABLE_PGOP_STAT
  env->me_lck->mti_pgop_stat.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
  if (env->me_flags & MDBX_WRITEMAP) {
    /* Write-map path: flush data pages up to mm_geo.next first, then
     * publish the new meta via the update_begin/update_end protocol
     * (the in-between copy is marked with a WEAK signature). */
    rc = mdbx_msync(&env->me_dxb_mmap, 0,
                    pgno_align2os_bytes(env, model->mm_geo.next),
                    MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    MDBX_meta *live = METAPAGE(env, target);
    mdbx_meta_update_begin(env, live, unaligned_peek_u64(4, model->mm_txnid_a));
    mdbx_flush_incoherent_cpu_writeback();
    mdbx_meta_update_begin(env, model,
                           unaligned_peek_u64(4, model->mm_txnid_a));
    unaligned_poke_u64(4, model->mm_datasync_sign, MDBX_DATASIGN_WEAK);
    memcpy((void *)data_page(live), page, env->me_psize);
    mdbx_meta_update_end(env, live, unaligned_peek_u64(4, model->mm_txnid_b));
    mdbx_flush_incoherent_cpu_writeback();
    rc = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, target),
                    MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
  } else {
    /* File-descriptor path: prefer the dsync fd (presumably opened with
     * O_DSYNC — note no explicit fsync on that branch; TODO confirm),
     * otherwise write via the lazy fd and fsync explicitly. */
    const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
                                     ? env->me_dsync_fd
                                     : env->me_lazy_fd;
    rc = mdbx_pwrite(fd, page, env->me_psize, pgno2bytes(env, target));
    if (rc == MDBX_SUCCESS && fd == env->me_lazy_fd)
      rc = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
  }
  /* Discard any stale mmap view of the meta-pages. */
  mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS),
                             env->me_os_psize);
  return rc;
}
15700 
mdbx_env_turn_for_recovery(MDBX_env * env,unsigned target)15701 __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target) {
15702   if (unlikely(target >= NUM_METAS))
15703     return MDBX_EINVAL;
15704   int rc = check_env(env, true);
15705   if (unlikely(rc != MDBX_SUCCESS))
15706     return rc;
15707 
15708   if (unlikely((env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) !=
15709                MDBX_EXCLUSIVE))
15710     return MDBX_EPERM;
15711 
15712   const MDBX_meta *target_meta = METAPAGE(env, target);
15713   txnid_t new_txnid =
15714       safe64_txnid_next(mdbx_meta_txnid_stable(env, target_meta));
15715   for (unsigned n = 0; n < NUM_METAS; ++n) {
15716     MDBX_page *page = pgno2page(env, n);
15717     MDBX_meta meta = *page_meta(page);
15718     if (n == target)
15719       continue;
15720     if (mdbx_validate_meta(env, &meta, page, n, nullptr) != MDBX_SUCCESS) {
15721       int err = mdbx_override_meta(env, n, 0, nullptr);
15722       if (unlikely(err != MDBX_SUCCESS))
15723         return err;
15724     } else {
15725       txnid_t txnid = mdbx_meta_txnid_stable(env, &meta);
15726       if (new_txnid <= txnid)
15727         new_txnid = safe64_txnid_next(txnid);
15728     }
15729   }
15730 
15731   if (unlikely(new_txnid > MAX_TXNID)) {
15732     mdbx_error("txnid overflow, raise %d", MDBX_TXN_FULL);
15733     return MDBX_TXN_FULL;
15734   }
15735   return mdbx_override_meta(env, target, new_txnid, target_meta);
15736 }
15737 
mdbx_env_open_for_recovery(MDBX_env * env,const char * pathname,unsigned target_meta,bool writeable)15738 __cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname,
15739                                       unsigned target_meta, bool writeable) {
15740   if (unlikely(target_meta >= NUM_METAS))
15741     return MDBX_EINVAL;
15742   int rc = check_env(env, false);
15743   if (unlikely(rc != MDBX_SUCCESS))
15744     return rc;
15745   if (unlikely(env->me_map))
15746     return MDBX_EPERM;
15747 
15748   env->me_stuck_meta = (int8_t)target_meta;
15749   return mdbx_env_open(
15750       env, pathname, writeable ? MDBX_EXCLUSIVE : MDBX_EXCLUSIVE | MDBX_RDONLY,
15751       0);
15752 }
15753 
/* Result of mdbx_handle_env_pathname(): the resolved lck/dxb file paths. */
typedef struct {
  void *buffer_for_free; /* single heap allocation holding both paths;
                          * the caller releases it with free() */
  char *lck, *dxb;       /* lockfile and datafile paths (point into the
                          * buffer above) */
  size_t ent_len;        /* length of the base pathname, without the
                          * datafile-name suffix if one was stripped */
} MDBX_handle_env_pathname;
15759 
/* Resolves the user-supplied environment pathname into the pair of
 * lockfile/datafile paths, auto-creating the directory when permitted
 * (mode != 0 and not MDBX_RDONLY) and normalizing the MDBX_NOSUBDIR flag
 * to match what actually exists on disk. On success ctx->lck and ctx->dxb
 * point into a single allocation owned by ctx->buffer_for_free. */
__cold static int mdbx_handle_env_pathname(MDBX_handle_env_pathname *ctx,
                                           const char *pathname,
                                           MDBX_env_flags_t *flags,
                                           const mdbx_mode_t mode) {
  int rc;
  memset(ctx, 0, sizeof(*ctx));
  if (unlikely(!pathname))
    return MDBX_EINVAL;

#if defined(_WIN32) || defined(_WIN64)
  /* Convert to wide chars for the Win32 *W APIs; this also rejects empty,
   * oversized, or unconvertible paths (mbstowcs returns (size_t)-1). */
  const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX);
  if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX)
    return ERROR_INVALID_NAME;
  wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t));
  if (wlen != mbstowcs(pathnameW, pathname, wlen + 1))
    return ERROR_INVALID_NAME;

  const DWORD dwAttrib = GetFileAttributesW(pathnameW);
  if (dwAttrib == INVALID_FILE_ATTRIBUTES) {
    /* Path does not exist (or is inaccessible). */
    rc = GetLastError();
    if (rc != MDBX_ENOFILE)
      return rc;
    if (mode == 0 || (*flags & MDBX_RDONLY) != 0)
      /* can't open existing */
      return rc;

    /* auto-create directory if requested */
    if ((*flags & MDBX_NOSUBDIR) == 0 &&
        !CreateDirectoryW(pathnameW, nullptr)) {
      rc = GetLastError();
      if (rc != ERROR_ALREADY_EXISTS)
        return rc;
    }
  } else {
    /* ignore passed MDBX_NOSUBDIR flag and set it automatically */
    *flags |= MDBX_NOSUBDIR;
    if (dwAttrib & FILE_ATTRIBUTE_DIRECTORY)
      *flags -= MDBX_NOSUBDIR;
  }
#else
  struct stat st;
  if (stat(pathname, &st)) {
    /* Path does not exist (or is inaccessible). */
    rc = errno;
    if (rc != MDBX_ENOFILE)
      return rc;
    if (mode == 0 || (*flags & MDBX_RDONLY) != 0)
      /* can't open existing */
      return rc;

    /* auto-create directory if requested */
    const mdbx_mode_t dir_mode =
        (/* inherit read/write permissions for group and others */ mode &
         (S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) |
        /* always add read/write/search for owner */ S_IRWXU |
        ((mode & S_IRGRP) ? /* +search if readable by group */ S_IXGRP : 0) |
        ((mode & S_IROTH) ? /* +search if readable by others */ S_IXOTH : 0);
    if ((*flags & MDBX_NOSUBDIR) == 0 && mkdir(pathname, dir_mode)) {
      rc = errno;
      if (rc != EEXIST)
        return rc;
    }
  } else {
    /* ignore passed MDBX_NOSUBDIR flag and set it automatically */
    *flags |= MDBX_NOSUBDIR;
    if (S_ISDIR(st.st_mode))
      *flags -= MDBX_NOSUBDIR;
  }
#endif

  static const char dxb_name[] = MDBX_DATANAME;
  static const size_t dxb_name_len = sizeof(dxb_name) - 1;
  static const char lck_name[] = MDBX_LOCKNAME;
  static const char lock_suffix[] = MDBX_LOCK_SUFFIX;

  /* If the path already ends with the standard datafile name, strip it and
   * fall back to subdir-style naming relative to the remaining prefix. */
  ctx->ent_len = strlen(pathname);
  if ((*flags & MDBX_NOSUBDIR) && ctx->ent_len >= dxb_name_len &&
      !memcmp(dxb_name, pathname + ctx->ent_len - dxb_name_len, dxb_name_len)) {
    *flags -= MDBX_NOSUBDIR;
    ctx->ent_len -= dxb_name_len;
  }

  /* One allocation sized to hold both resulting NUL-terminated paths. */
  const size_t bytes_needed =
      ctx->ent_len * 2 + ((*flags & MDBX_NOSUBDIR)
                              ? sizeof(lock_suffix) + 1
                              : sizeof(lck_name) + sizeof(dxb_name));
  ctx->buffer_for_free = mdbx_malloc(bytes_needed);
  if (!ctx->buffer_for_free)
    return MDBX_ENOMEM;

  ctx->lck = ctx->buffer_for_free;
  if (*flags & MDBX_NOSUBDIR) {
    /* The path is the datafile itself; lockfile is path + lock-suffix. */
    ctx->dxb = ctx->lck + ctx->ent_len + sizeof(lock_suffix);
    sprintf(ctx->lck, "%s%s", pathname, lock_suffix);
    strcpy(ctx->dxb, pathname);
  } else {
    /* The path is a directory prefix; append the standard names. */
    ctx->dxb = ctx->lck + ctx->ent_len + sizeof(lck_name);
    sprintf(ctx->lck, "%.*s%s", (int)ctx->ent_len, pathname, lck_name);
    sprintf(ctx->dxb, "%.*s%s", (int)ctx->ent_len, pathname, dxb_name);
  }

  return MDBX_SUCCESS;
}
15862 
mdbx_env_delete(const char * pathname,MDBX_env_delete_mode_t mode)15863 __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) {
15864   switch (mode) {
15865   default:
15866     return MDBX_EINVAL;
15867   case MDBX_ENV_JUST_DELETE:
15868   case MDBX_ENV_ENSURE_UNUSED:
15869   case MDBX_ENV_WAIT_FOR_UNUSED:
15870     break;
15871   }
15872 
15873 #ifdef __e2k__ /* https://bugs.mcst.ru/bugzilla/show_bug.cgi?id=6011 */
15874   MDBX_env *const dummy_env = alloca(sizeof(MDBX_env));
15875 #else
15876   MDBX_env dummy_env_silo, *const dummy_env = &dummy_env_silo;
15877 #endif
15878   memset(dummy_env, 0, sizeof(*dummy_env));
15879   dummy_env->me_flags =
15880       (mode == MDBX_ENV_ENSURE_UNUSED) ? MDBX_EXCLUSIVE : MDBX_ENV_DEFAULTS;
15881   dummy_env->me_os_psize = (unsigned)mdbx_syspagesize();
15882   dummy_env->me_psize = (unsigned)mdbx_default_pagesize();
15883   dummy_env->me_pathname = (char *)pathname;
15884 
15885   MDBX_handle_env_pathname env_pathname;
15886   STATIC_ASSERT(sizeof(dummy_env->me_flags) == sizeof(MDBX_env_flags_t));
15887   int rc = MDBX_RESULT_TRUE,
15888       err = mdbx_handle_env_pathname(
15889           &env_pathname, pathname, (MDBX_env_flags_t *)&dummy_env->me_flags, 0);
15890   if (likely(err == MDBX_SUCCESS)) {
15891     mdbx_filehandle_t clk_handle = INVALID_HANDLE_VALUE,
15892                       dxb_handle = INVALID_HANDLE_VALUE;
15893     if (mode > MDBX_ENV_JUST_DELETE) {
15894       err = mdbx_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.dxb,
15895                           &dxb_handle, 0);
15896       err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err;
15897       if (err == MDBX_SUCCESS) {
15898         err = mdbx_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.lck,
15899                             &clk_handle, 0);
15900         err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err;
15901       }
15902       if (err == MDBX_SUCCESS && clk_handle != INVALID_HANDLE_VALUE)
15903         err = mdbx_lockfile(clk_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED);
15904       if (err == MDBX_SUCCESS && dxb_handle != INVALID_HANDLE_VALUE)
15905         err = mdbx_lockfile(dxb_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED);
15906     }
15907 
15908     if (err == MDBX_SUCCESS) {
15909       err = mdbx_removefile(env_pathname.dxb);
15910       if (err == MDBX_SUCCESS)
15911         rc = MDBX_SUCCESS;
15912       else if (err == MDBX_ENOFILE)
15913         err = MDBX_SUCCESS;
15914     }
15915 
15916     if (err == MDBX_SUCCESS) {
15917       err = mdbx_removefile(env_pathname.lck);
15918       if (err == MDBX_SUCCESS)
15919         rc = MDBX_SUCCESS;
15920       else if (err == MDBX_ENOFILE)
15921         err = MDBX_SUCCESS;
15922     }
15923 
15924     if (err == MDBX_SUCCESS && !(dummy_env->me_flags & MDBX_NOSUBDIR)) {
15925       err = mdbx_removedirectory(pathname);
15926       if (err == MDBX_SUCCESS)
15927         rc = MDBX_SUCCESS;
15928       else if (err == MDBX_ENOFILE)
15929         err = MDBX_SUCCESS;
15930     }
15931 
15932     if (dxb_handle != INVALID_HANDLE_VALUE)
15933       mdbx_closefile(dxb_handle);
15934     if (clk_handle != INVALID_HANDLE_VALUE)
15935       mdbx_closefile(clk_handle);
15936   } else if (err == MDBX_ENOFILE)
15937     err = MDBX_SUCCESS;
15938 
15939   mdbx_free(env_pathname.buffer_for_free);
15940   return (err == MDBX_SUCCESS) ? rc : err;
15941 }
15942 
mdbx_env_open(MDBX_env * env,const char * pathname,MDBX_env_flags_t flags,mdbx_mode_t mode)15943 __cold int mdbx_env_open(MDBX_env *env, const char *pathname,
15944                          MDBX_env_flags_t flags, mdbx_mode_t mode) {
15945   int rc = check_env(env, false);
15946   if (unlikely(rc != MDBX_SUCCESS))
15947     return rc;
15948 
15949   if (unlikely(flags & ~ENV_USABLE_FLAGS))
15950     return MDBX_EINVAL;
15951 
15952   if (flags & MDBX_RDONLY)
15953     mode = 0;
15954 
15955   if (env->me_lazy_fd != INVALID_HANDLE_VALUE ||
15956       (env->me_flags & MDBX_ENV_ACTIVE) != 0 || env->me_map)
15957     return MDBX_EPERM;
15958 
15959   /* pickup previously mdbx_env_set_flags(),
15960    * but avoid MDBX_UTTERLY_NOSYNC by disjunction */
15961   const uint32_t saved_me_flags = env->me_flags;
15962   flags = merge_sync_flags(flags, env->me_flags);
15963 
15964   MDBX_handle_env_pathname env_pathname;
15965   rc = mdbx_handle_env_pathname(&env_pathname, pathname, &flags, mode);
15966   if (unlikely(rc != MDBX_SUCCESS))
15967     goto bailout;
15968 
15969   if (flags & MDBX_RDONLY) {
15970     /* LY: silently ignore irrelevant flags when
15971      * we're only getting read access */
15972     flags &= ~(MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC | MDBX_SAFE_NOSYNC |
15973                MDBX_NOMETASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM |
15974                MDBX_NOMEMINIT | MDBX_ACCEDE);
15975   } else {
15976 #if MDBX_MMAP_INCOHERENT_FILE_WRITE
15977     /* Temporary `workaround` for OpenBSD kernel's flaw.
15978      * See https://github.com/erthink/libmdbx/issues/67 */
15979     if ((flags & MDBX_WRITEMAP) == 0) {
15980       if (flags & MDBX_ACCEDE)
15981         flags |= MDBX_WRITEMAP;
15982       else {
15983         mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__,
15984                        "System (i.e. OpenBSD) requires MDBX_WRITEMAP because "
15985                        "of an internal flaw(s) in a file/buffer/page cache.\n");
15986         rc = 42 /* ENOPROTOOPT */;
15987         goto bailout;
15988       }
15989     }
15990 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */
15991   }
15992 
15993   env->me_flags = (flags & ~MDBX_FATAL_ERROR) | MDBX_ENV_ACTIVE;
15994   env->me_pathname = mdbx_calloc(env_pathname.ent_len + 1, 1);
15995   env->me_dbxs = mdbx_calloc(env->me_maxdbs, sizeof(MDBX_dbx));
15996   env->me_dbflags = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0]));
15997   env->me_dbiseqs = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbiseqs[0]));
15998   if (!(env->me_dbxs && env->me_pathname && env->me_dbflags &&
15999         env->me_dbiseqs)) {
16000     rc = MDBX_ENOMEM;
16001     goto bailout;
16002   }
16003   memcpy(env->me_pathname, env_pathname.dxb, env_pathname.ent_len);
16004   env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */
16005   env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast;
16006 
16007   rc = mdbx_openfile(F_ISSET(flags, MDBX_RDONLY) ? MDBX_OPEN_DXB_READ
16008                                                  : MDBX_OPEN_DXB_LAZY,
16009                      env, env_pathname.dxb, &env->me_lazy_fd, mode);
16010   if (rc != MDBX_SUCCESS)
16011     goto bailout;
16012 
16013   mdbx_assert(env, env->me_dsync_fd == INVALID_HANDLE_VALUE);
16014   if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) {
16015     rc = mdbx_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb,
16016                        &env->me_dsync_fd, 0);
16017     mdbx_ensure(env, (rc != MDBX_SUCCESS) ==
16018                          (env->me_dsync_fd == INVALID_HANDLE_VALUE));
16019   }
16020 
16021 #if MDBX_LOCKING == MDBX_LOCKING_SYSV
16022   env->me_sysv_ipc.key = ftok(env_pathname.dxb, 42);
16023   if (env->me_sysv_ipc.key == -1) {
16024     rc = errno;
16025     goto bailout;
16026   }
16027 #endif /* MDBX_LOCKING */
16028 
16029 #if !(defined(_WIN32) || defined(_WIN64))
16030   if (mode == 0) {
16031     /* pickup mode for lck-file */
16032     struct stat st;
16033     if (fstat(env->me_lazy_fd, &st)) {
16034       rc = errno;
16035       goto bailout;
16036     }
16037     mode = st.st_mode;
16038   }
16039   mode = (/* inherit read permissions for group and others */ mode &
16040           (S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) |
16041          /* always add read/write/search for owner */ S_IRUSR | S_IWUSR |
16042          ((mode & S_IRGRP) ? /* +write if readable by group */ S_IWGRP : 0) |
16043          ((mode & S_IROTH) ? /* +write if readable by others */ S_IWOTH : 0);
16044 #endif /* !Windows */
16045   const int lck_rc = mdbx_setup_lck(env, env_pathname.lck, mode);
16046   if (MDBX_IS_ERROR(lck_rc)) {
16047     rc = lck_rc;
16048     goto bailout;
16049   }
16050 
16051   /* Set the position in files outside of the data to avoid corruption
16052    * due to erroneous use of file descriptors in the application code. */
16053   mdbx_fseek(env->me_lfd, UINT64_C(1) << 63);
16054   mdbx_fseek(env->me_lazy_fd, UINT64_C(1) << 63);
16055   if (env->me_dsync_fd != INVALID_HANDLE_VALUE)
16056     mdbx_fseek(env->me_dsync_fd, UINT64_C(1) << 63);
16057 
16058   const MDBX_env_flags_t rigorous_flags =
16059       MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC;
16060   const MDBX_env_flags_t mode_flags = rigorous_flags | MDBX_NOMETASYNC |
16061                                       MDBX_LIFORECLAIM | MDBX_COALESCE |
16062                                       MDBX_NORDAHEAD;
16063 
16064   MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
16065   if (lck && lck_rc != MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) {
16066     while (atomic_load32(&lck->mti_envmode, mo_AcquireRelease) == MDBX_RDONLY) {
16067       if (atomic_cas32(&lck->mti_envmode, MDBX_RDONLY,
16068                        env->me_flags & mode_flags)) {
16069         /* The case:
16070          *  - let's assume that for some reason the DB file is smaller
16071          *    than it should be according to the geometry,
16072          *    but not smaller than the last page used;
16073          *  - the first process that opens the database (lc_rc = true)
16074          *    does this in readonly mode and therefore cannot bring
16075          *    the file size back to normal;
16076          *  - some next process (lc_rc = false) opens the DB in read-write
16077          *    mode and now is here.
16078          *
16079          * FIXME: Should we re-check and set the size of DB-file right here? */
16080         break;
16081       }
16082       atomic_yield();
16083     }
16084 
16085     if (env->me_flags & MDBX_ACCEDE) {
16086       /* pickup current mode-flags, including MDBX_LIFORECLAIM |
16087        * MDBX_COALESCE | MDBX_NORDAHEAD */
16088       const unsigned diff =
16089           (lck->mti_envmode.weak ^ env->me_flags) & mode_flags;
16090       mdbx_notice("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->me_flags,
16091                   env->me_flags ^ diff);
16092       env->me_flags ^= diff;
16093     }
16094 
16095     if ((lck->mti_envmode.weak ^ env->me_flags) & rigorous_flags) {
16096       mdbx_error("%s", "current mode/flags incompatible with requested");
16097       rc = MDBX_INCOMPATIBLE;
16098       goto bailout;
16099     }
16100   }
16101 
16102   const int dxb_rc = mdbx_setup_dxb(env, lck_rc, mode);
16103   if (MDBX_IS_ERROR(dxb_rc)) {
16104     rc = dxb_rc;
16105     goto bailout;
16106   }
16107 
16108   if (unlikely(/* recovery mode */ env->me_stuck_meta >= 0) &&
16109       (lck_rc != /* exclusive */ MDBX_RESULT_TRUE ||
16110        (flags & MDBX_EXCLUSIVE) == 0)) {
16111     mdbx_error("%s", "recovery requires exclusive mode");
16112     rc = MDBX_BUSY;
16113     goto bailout;
16114   }
16115 
16116   mdbx_debug("opened dbenv %p", (void *)env);
16117   if (lck) {
16118     if (lck_rc == MDBX_RESULT_TRUE) {
16119       lck->mti_envmode.weak = env->me_flags & (mode_flags | MDBX_RDONLY);
16120       rc = mdbx_lck_downgrade(env);
16121       mdbx_debug("lck-downgrade-%s: rc %i",
16122                  (env->me_flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc);
16123       if (rc != MDBX_SUCCESS)
16124         goto bailout;
16125     } else {
16126       rc = mdbx_cleanup_dead_readers(env, false, NULL);
16127       if (MDBX_IS_ERROR(rc))
16128         goto bailout;
16129     }
16130 
16131     if ((env->me_flags & MDBX_NOTLS) == 0) {
16132       rc = mdbx_rthc_alloc(&env->me_txkey, &lck->mti_readers[0],
16133                            &lck->mti_readers[env->me_maxreaders]);
16134       if (unlikely(rc != MDBX_SUCCESS))
16135         goto bailout;
16136       env->me_flags |= MDBX_ENV_TXKEY;
16137     }
16138   }
16139 
16140   if ((flags & MDBX_RDONLY) == 0) {
16141     const size_t tsize = sizeof(MDBX_txn),
16142                  size = tsize + env->me_maxdbs *
16143                                     (sizeof(MDBX_db) + sizeof(MDBX_cursor *) +
16144                                      sizeof(unsigned) + 1);
16145     rc = alloc_page_buf(env);
16146     if (rc == MDBX_SUCCESS) {
16147       memset(env->me_pbuf, -1, env->me_psize * 2);
16148       MDBX_txn *txn = mdbx_calloc(1, size);
16149       if (txn) {
16150         txn->mt_dbs = (MDBX_db *)((char *)txn + tsize);
16151         txn->tw.cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs);
16152         txn->mt_dbiseqs = (unsigned *)(txn->tw.cursors + env->me_maxdbs);
16153         txn->mt_dbistate = (uint8_t *)(txn->mt_dbiseqs + env->me_maxdbs);
16154         txn->mt_env = env;
16155         txn->mt_dbxs = env->me_dbxs;
16156         txn->mt_flags = MDBX_TXN_FINISHED;
16157         env->me_txn0 = txn;
16158         txn->tw.retired_pages = mdbx_pnl_alloc(MDBX_PNL_INITIAL);
16159         txn->tw.reclaimed_pglist = mdbx_pnl_alloc(MDBX_PNL_INITIAL);
16160         if (unlikely(!txn->tw.retired_pages || !txn->tw.reclaimed_pglist))
16161           rc = MDBX_ENOMEM;
16162       } else
16163         rc = MDBX_ENOMEM;
16164     }
16165   }
16166 
16167 #if MDBX_DEBUG
16168   if (rc == MDBX_SUCCESS) {
16169     MDBX_meta *meta = mdbx_meta_head(env);
16170     MDBX_db *db = &meta->mm_dbs[MAIN_DBI];
16171 
16172     mdbx_debug("opened database version %u, pagesize %u",
16173                (uint8_t)unaligned_peek_u64(4, meta->mm_magic_and_version),
16174                env->me_psize);
16175     mdbx_debug("using meta page %" PRIaPGNO ", txn %" PRIaTXN,
16176                data_page(meta)->mp_pgno, mdbx_meta_txnid_fluid(env, meta));
16177     mdbx_debug("depth: %u", db->md_depth);
16178     mdbx_debug("entries: %" PRIu64, db->md_entries);
16179     mdbx_debug("branch pages: %" PRIaPGNO, db->md_branch_pages);
16180     mdbx_debug("leaf pages: %" PRIaPGNO, db->md_leaf_pages);
16181     mdbx_debug("overflow pages: %" PRIaPGNO, db->md_overflow_pages);
16182     mdbx_debug("root: %" PRIaPGNO, db->md_root);
16183     mdbx_debug("schema_altered: %" PRIaTXN, db->md_mod_txnid);
16184   }
16185 #endif
16186 
16187 bailout:
16188   if (rc != MDBX_SUCCESS) {
16189     rc = mdbx_env_close0(env) ? MDBX_PANIC : rc;
16190     env->me_flags =
16191         saved_me_flags | ((rc != MDBX_PANIC) ? 0 : MDBX_FATAL_ERROR);
16192   } else {
16193 #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
16194     mdbx_txn_valgrind(env, nullptr);
16195 #endif
16196   }
16197   mdbx_free(env_pathname.buffer_for_free);
16198   return rc;
16199 }
16200 
16201 /* Destroy resources from mdbx_env_open(), clear our readers & DBIs */
mdbx_env_close0(MDBX_env * env)16202 __cold static int mdbx_env_close0(MDBX_env *env) {
16203   env->me_stuck_meta = -1;
16204   if (!(env->me_flags & MDBX_ENV_ACTIVE)) {
16205     mdbx_ensure(env, env->me_lcklist_next == nullptr);
16206     return MDBX_SUCCESS;
16207   }
16208 
16209   env->me_flags &= ~MDBX_ENV_ACTIVE;
16210   env->me_lck = nullptr;
16211   if (env->me_flags & MDBX_ENV_TXKEY)
16212     mdbx_rthc_remove(env->me_txkey);
16213 
16214   lcklist_lock();
16215   const int rc = lcklist_detach_locked(env);
16216   lcklist_unlock();
16217 
16218   if (env->me_map) {
16219     mdbx_munmap(&env->me_dxb_mmap);
16220 #ifdef MDBX_USE_VALGRIND
16221     VALGRIND_DISCARD(env->me_valgrind_handle);
16222     env->me_valgrind_handle = -1;
16223 #endif
16224   }
16225 
16226   if (env->me_dsync_fd != INVALID_HANDLE_VALUE) {
16227     (void)mdbx_closefile(env->me_dsync_fd);
16228     env->me_dsync_fd = INVALID_HANDLE_VALUE;
16229   }
16230 
16231   if (env->me_lazy_fd != INVALID_HANDLE_VALUE) {
16232     (void)mdbx_closefile(env->me_lazy_fd);
16233     env->me_lazy_fd = INVALID_HANDLE_VALUE;
16234   }
16235 
16236   if (env->me_lck_mmap.lck)
16237     mdbx_munmap(&env->me_lck_mmap);
16238 
16239   if (env->me_lfd != INVALID_HANDLE_VALUE) {
16240     (void)mdbx_closefile(env->me_lfd);
16241     env->me_lfd = INVALID_HANDLE_VALUE;
16242   }
16243 
16244   if (env->me_dbxs) {
16245     for (unsigned i = env->me_numdbs; --i >= CORE_DBS;)
16246       mdbx_free(env->me_dbxs[i].md_name.iov_base);
16247     mdbx_free(env->me_dbxs);
16248   }
16249   mdbx_memalign_free(env->me_pbuf);
16250   mdbx_free(env->me_dbiseqs);
16251   mdbx_free(env->me_dbflags);
16252   mdbx_free(env->me_pathname);
16253   if (env->me_txn0) {
16254     mdbx_dpl_free(env->me_txn0);
16255     mdbx_txl_free(env->me_txn0->tw.lifo_reclaimed);
16256     mdbx_pnl_free(env->me_txn0->tw.retired_pages);
16257     mdbx_pnl_free(env->me_txn0->tw.spill_pages);
16258     mdbx_pnl_free(env->me_txn0->tw.reclaimed_pglist);
16259     mdbx_free(env->me_txn0);
16260   }
16261   env->me_flags = 0;
16262   return rc;
16263 }
16264 
mdbx_env_close_ex(MDBX_env * env,bool dont_sync)16265 __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) {
16266   MDBX_page *dp;
16267   int rc = MDBX_SUCCESS;
16268 
16269   if (unlikely(!env))
16270     return MDBX_EINVAL;
16271 
16272   if (unlikely(env->me_signature.weak != MDBX_ME_SIGNATURE))
16273     return MDBX_EBADSIGN;
16274 
16275 #if MDBX_ENV_CHECKPID || !(defined(_WIN32) || defined(_WIN64))
16276   /* Check the PID even if MDBX_ENV_CHECKPID=0 on non-Windows
16277    * platforms (i.e. where fork() is available).
16278    * This is required to legitimize a call after fork()
16279    * from a child process, that should be allowed to free resources. */
16280   if (unlikely(env->me_pid != mdbx_getpid()))
16281     env->me_flags |= MDBX_FATAL_ERROR;
16282 #endif /* MDBX_ENV_CHECKPID */
16283 
16284   if (env->me_map && (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0 &&
16285       env->me_txn0) {
16286     if (env->me_txn0->mt_owner && env->me_txn0->mt_owner != mdbx_thread_self())
16287       return MDBX_BUSY;
16288   } else
16289     dont_sync = true;
16290 
16291   if (!atomic_cas32(&env->me_signature, MDBX_ME_SIGNATURE, 0))
16292     return MDBX_EBADSIGN;
16293 
16294   if (!dont_sync) {
16295 #if defined(_WIN32) || defined(_WIN64)
16296     /* On windows, without blocking is impossible to determine whether another
16297      * process is running a writing transaction or not.
16298      * Because in the "owner died" condition kernel don't release
16299      * file lock immediately. */
16300     rc = mdbx_env_sync_internal(env, true, false);
16301     rc = (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;
16302 #else
16303     struct stat st;
16304     if (unlikely(fstat(env->me_lazy_fd, &st)))
16305       rc = errno;
16306     else if (st.st_nlink > 0 /* don't sync deleted files */) {
16307       rc = mdbx_env_sync_internal(env, true, true);
16308       rc = (rc == MDBX_BUSY || rc == EAGAIN || rc == EACCES || rc == EBUSY ||
16309             rc == EWOULDBLOCK || rc == MDBX_RESULT_TRUE)
16310                ? MDBX_SUCCESS
16311                : rc;
16312     }
16313 #endif
16314   }
16315 
16316   mdbx_assert(env, env->me_signature.weak == 0);
16317   rc = mdbx_env_close0(env) ? MDBX_PANIC : rc;
16318   mdbx_ensure(env, mdbx_fastmutex_destroy(&env->me_dbi_lock) == MDBX_SUCCESS);
16319 #if defined(_WIN32) || defined(_WIN64)
16320   /* me_remap_guard don't have destructor (Slim Reader/Writer Lock) */
16321   DeleteCriticalSection(&env->me_windowsbug_lock);
16322 #else
16323   mdbx_ensure(env,
16324               mdbx_fastmutex_destroy(&env->me_remap_guard) == MDBX_SUCCESS);
16325 #endif /* Windows */
16326 
16327 #if MDBX_LOCKING > MDBX_LOCKING_SYSV
16328   MDBX_lockinfo *const stub = lckless_stub(env);
16329   mdbx_ensure(env, mdbx_ipclock_destroy(&stub->mti_wlock) == 0);
16330 #endif /* MDBX_LOCKING */
16331 
16332   while ((dp = env->me_dp_reserve) != NULL) {
16333     MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize);
16334     VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
16335     env->me_dp_reserve = dp->mp_next;
16336     mdbx_free(dp);
16337   }
16338   VALGRIND_DESTROY_MEMPOOL(env);
16339   mdbx_ensure(env, env->me_lcklist_next == nullptr);
16340   env->me_pid = 0;
16341   mdbx_free(env);
16342 
16343   return rc;
16344 }
16345 
16346 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
mdbx_env_close(MDBX_env * env)16347 __cold int mdbx_env_close(MDBX_env *env) {
16348   return __inline_mdbx_env_close(env);
16349 }
16350 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
16351 
16352 /* Compare two items pointing at aligned unsigned int's. */
cmp_int_align4(const MDBX_val * a,const MDBX_val * b)16353 static int __hot cmp_int_align4(const MDBX_val *a, const MDBX_val *b) {
16354   mdbx_assert(NULL, a->iov_len == b->iov_len);
16355   switch (a->iov_len) {
16356   case 4:
16357     return CMP2INT(unaligned_peek_u32(4, a->iov_base),
16358                    unaligned_peek_u32(4, b->iov_base));
16359   case 8:
16360     return CMP2INT(unaligned_peek_u64(4, a->iov_base),
16361                    unaligned_peek_u64(4, b->iov_base));
16362   default:
16363     mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__,
16364                      __LINE__);
16365     return 0;
16366   }
16367 }
16368 
16369 /* Compare two items pointing at 2-byte aligned unsigned int's. */
cmp_int_align2(const MDBX_val * a,const MDBX_val * b)16370 static int __hot cmp_int_align2(const MDBX_val *a, const MDBX_val *b) {
16371   mdbx_assert(NULL, a->iov_len == b->iov_len);
16372   switch (a->iov_len) {
16373   case 4:
16374     return CMP2INT(unaligned_peek_u32(2, a->iov_base),
16375                    unaligned_peek_u32(2, b->iov_base));
16376   case 8:
16377     return CMP2INT(unaligned_peek_u64(2, a->iov_base),
16378                    unaligned_peek_u64(2, b->iov_base));
16379   default:
16380     mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__,
16381                      __LINE__);
16382     return 0;
16383   }
16384 }
16385 
16386 /* Compare two items pointing at unsigned values with unknown alignment.
16387  *
16388  * This is also set as MDBX_INTEGERDUP|MDBX_DUPFIXED's MDBX_dbx.md_dcmp. */
cmp_int_unaligned(const MDBX_val * a,const MDBX_val * b)16389 static int __hot cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) {
16390   mdbx_assert(NULL, a->iov_len == b->iov_len);
16391   switch (a->iov_len) {
16392   case 4:
16393     return CMP2INT(unaligned_peek_u32(1, a->iov_base),
16394                    unaligned_peek_u32(1, b->iov_base));
16395   case 8:
16396     return CMP2INT(unaligned_peek_u64(1, a->iov_base),
16397                    unaligned_peek_u64(1, b->iov_base));
16398   default:
16399     mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__,
16400                      __LINE__);
16401     return 0;
16402   }
16403 }
16404 
16405 /* Compare two items lexically */
cmp_lexical(const MDBX_val * a,const MDBX_val * b)16406 static int __hot cmp_lexical(const MDBX_val *a, const MDBX_val *b) {
16407   if (a->iov_len == b->iov_len)
16408     return a->iov_len ? memcmp(a->iov_base, b->iov_base, a->iov_len) : 0;
16409 
16410   const int diff_len = (a->iov_len < b->iov_len) ? -1 : 1;
16411   const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len;
16412   int diff_data = shortest ? memcmp(a->iov_base, b->iov_base, shortest) : 0;
16413   return likely(diff_data) ? diff_data : diff_len;
16414 }
16415 
16416 /* Compare two items in reverse byte order */
cmp_reverse(const MDBX_val * a,const MDBX_val * b)16417 static int __hot cmp_reverse(const MDBX_val *a, const MDBX_val *b) {
16418   const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len;
16419   if (likely(shortest)) {
16420     const uint8_t *pa = (const uint8_t *)a->iov_base + a->iov_len;
16421     const uint8_t *pb = (const uint8_t *)b->iov_base + b->iov_len;
16422     const uint8_t *const end = pa - shortest;
16423     do {
16424       int diff = *--pa - *--pb;
16425       if (likely(diff))
16426         return diff;
16427     } while (pa != end);
16428   }
16429   return CMP2INT(a->iov_len, b->iov_len);
16430 }
16431 
16432 /* Fast non-lexically comparator */
cmp_lenfast(const MDBX_val * a,const MDBX_val * b)16433 static int __hot cmp_lenfast(const MDBX_val *a, const MDBX_val *b) {
16434   int diff = CMP2INT(a->iov_len, b->iov_len);
16435   return likely(diff || a->iov_len == 0)
16436              ? diff
16437              : memcmp(a->iov_base, b->iov_base, a->iov_len);
16438 }
16439 
unsure_equal(MDBX_cmp_func cmp,const MDBX_val * a,const MDBX_val * b)16440 static bool unsure_equal(MDBX_cmp_func cmp, const MDBX_val *a,
16441                          const MDBX_val *b) {
16442   /* checking for the use of a known good comparator
16443    * or/otherwise for a full byte-to-byte match */
16444   return cmp == cmp_lenfast || cmp == cmp_lexical || cmp == cmp_reverse ||
16445          cmp == cmp_int_unaligned || cmp_lenfast(a, b) == 0;
16446 }
16447 
16448 /* Search for key within a page, using binary search.
16449  * Returns the smallest entry larger or equal to the key.
16450  * Updates the cursor index with the index of the found entry.
16451  * If no entry larger or equal to the key is found, returns NULL. */
mdbx_node_search(MDBX_cursor * mc,const MDBX_val * key)16452 static struct node_result __hot mdbx_node_search(MDBX_cursor *mc,
16453                                                  const MDBX_val *key) {
16454   MDBX_page *mp = mc->mc_pg[mc->mc_top];
16455   const int nkeys = page_numkeys(mp);
16456   DKBUF_DEBUG;
16457 
16458   mdbx_debug("searching %u keys in %s %spage %" PRIaPGNO, nkeys,
16459              IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
16460              mp->mp_pgno);
16461 
16462   struct node_result ret;
16463   ret.exact = false;
16464   STATIC_ASSERT(P_BRANCH == 1);
16465   int low = mp->mp_flags & P_BRANCH;
16466   int high = nkeys - 1;
16467   if (unlikely(high < low)) {
16468     mc->mc_ki[mc->mc_top] = 0;
16469     ret.node = NULL;
16470     return ret;
16471   }
16472 
16473   int cr = 0, i = 0;
16474   MDBX_cmp_func *cmp = mc->mc_dbx->md_cmp;
16475   MDBX_val nodekey;
16476   if (unlikely(IS_LEAF2(mp))) {
16477     mdbx_cassert(mc, mp->mp_leaf2_ksize == mc->mc_db->md_xsize);
16478     nodekey.iov_len = mp->mp_leaf2_ksize;
16479     do {
16480       i = (low + high) >> 1;
16481       nodekey.iov_base = page_leaf2key(mp, i, nodekey.iov_len);
16482       mdbx_cassert(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >=
16483                            (char *)nodekey.iov_base + nodekey.iov_len);
16484       cr = cmp(key, &nodekey);
16485       mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey),
16486                  cr);
16487       if (unlikely(cr == 0)) {
16488         ret.exact = true;
16489         break;
16490       }
16491       low = (cr < 0) ? low : i + 1;
16492       high = (cr < 0) ? i - 1 : high;
16493     } while (likely(low <= high));
16494 
16495     /* Found entry is less than the key. */
16496     /* Skip to get the smallest entry larger than key. */
16497     i += cr > 0;
16498 
16499     /* store the key index */
16500     mc->mc_ki[mc->mc_top] = (indx_t)i;
16501     ret.node = (i < nkeys)
16502                    ? /* fake for LEAF2 */ (MDBX_node *)(intptr_t)-1
16503                    : /* There is no entry larger or equal to the key. */ NULL;
16504     return ret;
16505   }
16506 
16507   if (IS_BRANCH(mp) && cmp == cmp_int_align2)
16508     /* Branch pages have no data, so if using integer keys,
16509      * alignment is guaranteed. Use faster cmp_int_align4(). */
16510     cmp = cmp_int_align4;
16511 
16512   MDBX_node *node;
16513   do {
16514     i = (low + high) >> 1;
16515 
16516     node = page_node(mp, i);
16517     nodekey.iov_len = node_ks(node);
16518     nodekey.iov_base = node_key(node);
16519     mdbx_cassert(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >=
16520                          (char *)nodekey.iov_base + nodekey.iov_len);
16521 
16522     cr = cmp(key, &nodekey);
16523     if (IS_LEAF(mp))
16524       mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey),
16525                  cr);
16526     else
16527       mdbx_debug("found branch index %u [%s -> %" PRIaPGNO "], rc = %i", i,
16528                  DKEY_DEBUG(&nodekey), node_pgno(node), cr);
16529     if (unlikely(cr == 0)) {
16530       ret.exact = true;
16531       break;
16532     }
16533     low = (cr < 0) ? low : i + 1;
16534     high = (cr < 0) ? i - 1 : high;
16535   } while (likely(low <= high));
16536 
16537   /* Found entry is less than the key. */
16538   /* Skip to get the smallest entry larger than key. */
16539   i += cr > 0;
16540 
16541   /* store the key index */
16542   mc->mc_ki[mc->mc_top] = (indx_t)i;
16543   ret.node = (i < nkeys)
16544                  ? page_node(mp, i)
16545                  : /* There is no entry larger or equal to the key. */ NULL;
16546   return ret;
16547 }
16548 
16549 /* Pop a page off the top of the cursor's stack. */
mdbx_cursor_pop(MDBX_cursor * mc)16550 static __inline void mdbx_cursor_pop(MDBX_cursor *mc) {
16551   if (mc->mc_snum) {
16552     mdbx_debug("popped page %" PRIaPGNO " off db %d cursor %p",
16553                mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc);
16554     if (--mc->mc_snum) {
16555       mc->mc_top--;
16556     } else {
16557       mc->mc_flags &= ~C_INITIALIZED;
16558     }
16559   }
16560 }
16561 
16562 /* Push a page onto the top of the cursor's stack.
16563  * Set MDBX_TXN_ERROR on failure. */
mdbx_cursor_push(MDBX_cursor * mc,MDBX_page * mp)16564 static __inline int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) {
16565   mdbx_debug("pushing page %" PRIaPGNO " on db %d cursor %p", mp->mp_pgno,
16566              DDBI(mc), (void *)mc);
16567 
16568   if (unlikely(mc->mc_snum >= CURSOR_STACK)) {
16569     mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
16570     return MDBX_CURSOR_FULL;
16571   }
16572 
16573   mdbx_cassert(mc, mc->mc_snum < UINT16_MAX);
16574   mc->mc_top = mc->mc_snum++;
16575   mc->mc_pg[mc->mc_top] = mp;
16576   mc->mc_ki[mc->mc_top] = 0;
16577 
16578   return MDBX_SUCCESS;
16579 }
16580 
/* Resolves page number pgno to a page pointer for the given cursor's txn:
 * first checks the txn chain's spilled and dirty lists (for a non-WRITEMAP
 * write txn), otherwise reads the page straight from the mapped file, then
 * validates the page header against `front` (the expected txnid boundary).
 * On any failure the txn is marked with MDBX_TXN_ERROR. */
__hot static struct page_result
mdbx_page_get_ex(MDBX_cursor *const mc, const pgno_t pgno,
                 /* TODO: use parent-page ptr */ txnid_t front) {
  struct page_result ret;
  MDBX_txn *const txn = mc->mc_txn;
  mdbx_tassert(txn, front <= txn->mt_front);
  if (unlikely(pgno >= txn->mt_next_pgno)) {
    mdbx_error("page #%" PRIaPGNO " beyond next-pgno", pgno);
  notfound:
    ret.page = nullptr;
    ret.err = MDBX_PAGE_NOTFOUND;
  bailout:
    /* Any resolution failure poisons the transaction. */
    mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
    return ret;
  }

  MDBX_env *const env = txn->mt_env;
  mdbx_assert(env, ((txn->mt_flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
  /* Only a write txn without WRITEMAP keeps private dirty/spilled copies;
   * read-only and WRITEMAP txns always read through the mapping. */
  if (unlikely((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0)) {
    const MDBX_txn *spiller = txn;
    do {
      /* Spilled pages were dirtied in this txn and flushed
       * because the dirty list got full. Bring this page
       * back in from the map (but don't unspill it here,
       * leave that unless page_touch happens again). */
      if (unlikely(spiller->mt_flags & MDBX_TXN_SPILLS) &&
          spiller->tw.spill_pages &&
          mdbx_pnl_exist(spiller->tw.spill_pages, pgno << 1)) {
        goto spilled;
      }

      /* Look for a private dirty copy in this txn (or an ancestor). */
      const unsigned i = mdbx_dpl_search(spiller, pgno);
      assert((int)i > 0);
      if (spiller->tw.dirtylist->items[i].pgno == pgno) {
        ret.page = spiller->tw.dirtylist->items[i].ptr;
        /* Refresh the LRU stamp so the page is not spilled soon. */
        spiller->tw.dirtylist->items[i].lru = txn->tw.dirtylru++;
        goto dirty;
      }

      spiller = spiller->mt_parent;
    } while (spiller != NULL);
  }

spilled:
  /* Fall back to the page as it lies in the memory-mapped file. */
  ret.page = pgno2page(env, pgno);

dirty:
  if (unlikely(ret.page->mp_pgno != pgno)) {
    bad_page(ret.page,
             "mismatch actual pgno (%" PRIaPGNO ") != expected (%" PRIaPGNO
             ")\n",
             ret.page->mp_pgno, pgno);
    goto notfound;
  }

#if !MDBX_DISABLE_PAGECHECKS
  if (unlikely(ret.page->mp_flags & P_ILL_BITS)) {
    ret.err =
        bad_page(ret.page, "invalid page's flags (%u)\n", ret.page->mp_flags);
    goto bailout;
  }

  /* The page must not have been written by a txn newer than `front`
   * (with an allowance checked against the txn's own boundaries). */
  if (unlikely(ret.page->mp_txnid > front) &&
      unlikely(ret.page->mp_txnid > txn->mt_front || front < txn->mt_txnid)) {
    ret.err = bad_page(
        ret.page,
        "invalid page txnid (%" PRIaTXN ") for %s' txnid (%" PRIaTXN ")\n",
        ret.page->mp_txnid,
        (front == txn->mt_front && front != txn->mt_txnid) ? "front-txn"
                                                           : "parent-page",
        front);
    goto bailout;
  }

  /* Sanity-check the free-space bounds (except on overflow pages, which
   * don't use lower/upper). */
  if (unlikely((ret.page->mp_upper < ret.page->mp_lower ||
                ((ret.page->mp_lower | ret.page->mp_upper) & 1) ||
                PAGEHDRSZ + ret.page->mp_upper > env->me_psize) &&
               !IS_OVERFLOW(ret.page))) {
    ret.err =
        bad_page(ret.page, "invalid page lower(%u)/upper(%u) with limit (%u)\n",
                 ret.page->mp_lower, ret.page->mp_upper, page_space(env));
    goto bailout;
  }
#endif /* !MDBX_DISABLE_PAGECHECKS */

  ret.err = MDBX_SUCCESS;
  if (mdbx_audit_enabled())
    ret.err = mdbx_page_check(mc, ret.page, C_UPDATING);
  return ret;
}
16671 
/* Finish mdbx_page_search() / mdbx_page_search_lowest().
 * The cursor is at the root page, set up the rest of it.
 *
 * Descends from the current page through branch pages until a leaf is
 * reached, pushing each visited page onto the cursor's stack.
 *
 * [in,out] mc   cursor positioned at the root page.
 * [in] key      target key, or NULL for MDBX_PS_FIRST/MDBX_PS_LAST descent.
 * [in] flags    MDBX_PS_* bits controlling descent and page touching.
 *
 * Returns 0 on success, non-zero on failure. */
__hot static int mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key,
                                       int flags) {
  MDBX_page *mp = mc->mc_pg[mc->mc_top];
  int rc;
  DKBUF_DEBUG;

  while (IS_BRANCH(mp)) {
    MDBX_node *node;
    int i;

    mdbx_debug("branch page %" PRIaPGNO " has %u keys", mp->mp_pgno,
               page_numkeys(mp));
    /* Don't assert on branch pages in the GC. We can get here
     * while in the process of rebalancing a GC branch page; we must
     * let that proceed. ITS#8336 */
    mdbx_cassert(mc, !mc->mc_dbi || page_numkeys(mp) > 1);
    mdbx_debug("found index 0 to page %" PRIaPGNO, node_pgno(page_node(mp, 0)));

    if (flags & (MDBX_PS_FIRST | MDBX_PS_LAST)) {
      /* Edge descent: take the first (or last) entry of each branch. */
      i = 0;
      if (flags & MDBX_PS_LAST) {
        i = page_numkeys(mp) - 1;
        /* if already init'd, see if we're already in right place */
        if (mc->mc_flags & C_INITIALIZED) {
          if (mc->mc_ki[mc->mc_top] == i) {
            /* Reuse the already-loaded child below the current top. */
            mc->mc_top = mc->mc_snum++;
            mp = mc->mc_pg[mc->mc_top];
            goto ready;
          }
        }
      }
    } else {
      /* Keyed descent: search this branch page for the separator. */
      const struct node_result nsr = mdbx_node_search(mc, key);
      if (nsr.node)
        i = mc->mc_ki[mc->mc_top] + nsr.exact - 1;
      else
        i = page_numkeys(mp) - 1;
      mdbx_debug("following index %u for key [%s]", i, DKEY_DEBUG(key));
    }

    mdbx_cassert(mc, i >= 0 && i < (int)page_numkeys(mp));
    node = page_node(mp, i);

    /* Note: pp_txnid4chk(mp, ...) is evaluated on the parent page before
     * mdbx_page_get overwrites `mp` with the child. */
    if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp,
                                     pp_txnid4chk(mp, mc->mc_txn))) != 0))
      return rc;

    mc->mc_ki[mc->mc_top] = (indx_t)i;
    if (unlikely(rc = mdbx_cursor_push(mc, mp)))
      return rc;

  ready:
    if (flags & MDBX_PS_MODIFY) {
      /* Copy-on-write the page so the caller may modify it. */
      if (unlikely((rc = mdbx_page_touch(mc)) != 0))
        return rc;
      mp = mc->mc_pg[mc->mc_top];
    }
  }

#if !MDBX_DISABLE_PAGECHECKS
  if (unlikely(!IS_LEAF(mp))) {
    mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
    return bad_page(mp, "index points to a page with 0x%02x flags\n",
                    mp->mp_flags);
  }
#endif /* !MDBX_DISABLE_PAGECHECKS */

  mdbx_debug("found leaf page %" PRIaPGNO " for key [%s]", mp->mp_pgno,
             DKEY_DEBUG(key));
  mc->mc_flags |= C_INITIALIZED;
  mc->mc_flags &= ~C_EOF;

  return MDBX_SUCCESS;
}
16748 
/* Derive the runtime parameters of a (sub)database from its persistent
 * MDBX_db record: install default comparators if none were set, and
 * compute the minimum/maximum key and value lengths for `pagesize`.
 *
 * Returns MDBX_SUCCESS, or MDBX_CORRUPTED when a stored fixed-duplicate
 * size (md_xsize) contradicts the computed value-length bounds. */
static int mdbx_setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db,
                          const unsigned pagesize) {
  /* Install default comparators when the application didn't supply any. */
  if (unlikely(!dbx->md_cmp)) {
    dbx->md_cmp = get_default_keycmp(db->md_flags);
    dbx->md_dcmp = get_default_datacmp(db->md_flags);
  }

  /* Key-length bounds: integer keys require at least 4 bytes,
   * otherwise empty keys are permitted. */
  if (db->md_flags & MDBX_INTEGERKEY)
    dbx->md_klen_min = 4 /* sizeof(uint32_t) */;
  else
    dbx->md_klen_min = 0;
  dbx->md_klen_max = keysize_max(pagesize, db->md_flags);
  assert(dbx->md_klen_max != (unsigned)-1);

  /* Value-length bounds depend on the dupsort flavour. */
  if (db->md_flags & MDBX_INTEGERDUP)
    dbx->md_vlen_min = 4 /* sizeof(uint32_t) */;
  else if (db->md_flags & MDBX_DUPFIXED)
    dbx->md_vlen_min = 1;
  else
    dbx->md_vlen_min = 0;
  dbx->md_vlen_max = valsize_max(pagesize, db->md_flags);
  assert(dbx->md_vlen_max != (unsigned)-1);

  /* For fixed-size duplicates a stored md_xsize pins both bounds,
   * after validating it against the computed limits. */
  if ((db->md_flags & (MDBX_DUPFIXED | MDBX_INTEGERDUP)) != 0 && db->md_xsize) {
    if (!MDBX_DISABLE_PAGECHECKS && unlikely(db->md_xsize < dbx->md_vlen_min ||
                                             db->md_xsize > dbx->md_vlen_max)) {
      mdbx_error("db.md_xsize (%u) <> min/max value-length (%zu/%zu)",
                 db->md_xsize, dbx->md_vlen_min, dbx->md_vlen_max);
      return MDBX_CORRUPTED;
    }
    dbx->md_vlen_min = dbx->md_vlen_max = db->md_xsize;
  }
  return MDBX_SUCCESS;
}
16778 
/* Refresh a stale named-subDB descriptor: re-read the MDBX_db record for
 * `dbi` from the MAIN_DBI table of `txn`, validate it, and re-derive the
 * key/value size limits via mdbx_setup_dbx(). Clears DBI_STALE on success.
 *
 * Returns MDBX_SUCCESS; MDBX_BAD_DBI when the entry is missing or the DBI
 * was superseded; MDBX_INCOMPATIBLE when the record is not a named DB or
 * its persistent flags mismatch; MDBX_CORRUPTED on inconsistency. */
static int mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) {
  MDBX_cursor_couple couple;
  if (unlikely(TXN_DBI_CHANGED(txn, dbi)))
    return MDBX_BAD_DBI;
  int rc = mdbx_cursor_init(&couple.outer, txn, MAIN_DBI);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* Locate the record keyed by the subDB's name in the main DB. */
  MDBX_dbx *const dbx = &txn->mt_dbxs[dbi];
  rc = mdbx_page_search(&couple.outer, &dbx->md_name, 0);
  if (unlikely(rc != MDBX_SUCCESS))
    return (rc == MDBX_NOTFOUND) ? MDBX_BAD_DBI : rc;

  MDBX_val data;
  struct node_result nsr = mdbx_node_search(&couple.outer, &dbx->md_name);
  if (unlikely(!nsr.exact))
    return MDBX_BAD_DBI;
  if (unlikely((node_flags(nsr.node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA))
    return MDBX_INCOMPATIBLE; /* not a named DB */

  const txnid_t pp_txnid =
      pp_txnid4chk(couple.outer.mc_pg[couple.outer.mc_top], txn);
  rc = mdbx_node_read(&couple.outer, nsr.node, &data, pp_txnid);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* A named-DB record must be exactly an MDBX_db descriptor. */
  if (unlikely(data.iov_len != sizeof(MDBX_db)))
    return MDBX_INCOMPATIBLE; /* not a named DB */

  uint16_t md_flags = UNALIGNED_PEEK_16(data.iov_base, MDBX_db, md_flags);
  /* The txn may not know this DBI, or another process may
   * have dropped and recreated the DB with other flags. */
  MDBX_db *const db = &txn->mt_dbs[dbi];
  if (unlikely((db->md_flags & DB_PERSISTENT_FLAGS) != md_flags))
    return MDBX_INCOMPATIBLE;

  memcpy(db, data.iov_base, sizeof(MDBX_db));
#if !MDBX_DISABLE_PAGECHECKS
  mdbx_tassert(txn, txn->mt_front >= pp_txnid);
  /* A descriptor claiming to be newer than the page holding it
   * indicates corruption. */
  if (unlikely(db->md_mod_txnid > pp_txnid)) {
    mdbx_error("db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")",
               db->md_mod_txnid, pp_txnid);
    return MDBX_CORRUPTED;
  }
#endif /* !MDBX_DISABLE_PAGECHECKS */
  rc = mdbx_setup_dbx(dbx, db, txn->mt_env->me_psize);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  txn->mt_dbistate[dbi] &= ~DBI_STALE;
  return MDBX_SUCCESS;
}
16831 
16832 /* Search for the lowest key under the current branch page.
16833  * This just bypasses a numkeys check in the current page
16834  * before calling mdbx_page_search_root(), because the callers
16835  * are all in situations where the current page is known to
16836  * be underfilled. */
mdbx_page_search_lowest(MDBX_cursor * mc)16837 __hot static int mdbx_page_search_lowest(MDBX_cursor *mc) {
16838   MDBX_page *mp = mc->mc_pg[mc->mc_top];
16839   mdbx_cassert(mc, IS_BRANCH(mp));
16840   MDBX_node *node = page_node(mp, 0);
16841   int rc;
16842 
16843   if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp,
16844                                    pp_txnid4chk(mp, mc->mc_txn))) != 0))
16845     return rc;
16846 
16847   mc->mc_ki[mc->mc_top] = 0;
16848   if (unlikely(rc = mdbx_cursor_push(mc, mp)))
16849     return rc;
16850   return mdbx_page_search_root(mc, NULL, MDBX_PS_FIRST);
16851 }
16852 
/* Search for the page a given key should be in.
 * Push it and its parent pages on the cursor stack.
 *
 * [in,out] mc  the cursor for this operation.
 * [in] key     the key to search for, or NULL for first/last page.
 * [in] flags   If MDBX_PS_MODIFY is set, visited pages in the DB
 *              are touched (updated with new page numbers).
 *              If MDBX_PS_FIRST or MDBX_PS_LAST is set, find first or last
 * leaf.
 *              This is used by mdbx_cursor_first() and mdbx_cursor_last().
 *              If MDBX_PS_ROOTONLY set, just fetch root node, no further
 *              lookups.
 *
 * Returns 0 on success, non-zero on failure. */
__hot static int mdbx_page_search(MDBX_cursor *mc, const MDBX_val *key,
                                  int flags) {
  int rc;
  pgno_t root;

  /* Make sure the txn is still viable, then find the root from
   * the txn's db table and set it as the root of the cursor's stack. */
  if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) {
    mdbx_debug("%s", "transaction has failed, must abort");
    return MDBX_BAD_TXN;
  }

  /* Make sure we're using an up-to-date root */
  if (unlikely(*mc->mc_dbistate & DBI_STALE)) {
    rc = mdbx_fetch_sdb(mc->mc_txn, mc->mc_dbi);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }
  root = mc->mc_db->md_root;

  if (unlikely(root == P_INVALID)) { /* Tree is empty. */
    mdbx_debug("%s", "tree is empty");
    return MDBX_NOTFOUND;
  }

  mdbx_cassert(mc, root >= NUM_METAS);
  if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) {
    /* Choose the txnid bound used to validate the root page. */
    txnid_t pp_txnid = mc->mc_db->md_mod_txnid;
    pp_txnid = /* mc->mc_db->md_mod_txnid maybe zero in a legacy DB */ pp_txnid
                   ? pp_txnid
                   : mc->mc_txn->mt_txnid;
    /* If this DB was dirtied somewhere in the txn chain, its pages may
     * carry that txn's front txnid — widen the bound accordingly. */
    MDBX_txn *scan = mc->mc_txn;
    do
      if ((scan->mt_flags & MDBX_TXN_DIRTY) &&
          (mc->mc_dbi == MAIN_DBI ||
           (scan->mt_dbistate[mc->mc_dbi] & DBI_DIRTY))) {
        pp_txnid = scan->mt_front;
        break;
      }
    while (unlikely((scan = scan->mt_parent) != nullptr));
    if (unlikely((rc = mdbx_page_get(mc, root, &mc->mc_pg[0], pp_txnid)) != 0))
      return rc;
  }

  /* (Re)start the cursor stack at the root. */
  mc->mc_snum = 1;
  mc->mc_top = 0;

  mdbx_debug("db %d root page %" PRIaPGNO " has flags 0x%X", DDBI(mc), root,
             mc->mc_pg[0]->mp_flags);

  if (flags & MDBX_PS_MODIFY) {
    /* Mark the DBI dirty and copy-on-write the root before descending. */
    if (!(*mc->mc_dbistate & DBI_DIRTY) && unlikely(rc = mdbx_touch_dbi(mc)))
      return rc;
    if (unlikely(rc = mdbx_page_touch(mc)))
      return rc;
  }

  if (flags & MDBX_PS_ROOTONLY)
    return MDBX_SUCCESS;

  return mdbx_page_search_root(mc, key, flags);
}
16929 
16930 /* Return the data associated with a given node.
16931  *
16932  * [in] mc      The cursor for this operation.
16933  * [in] leaf    The node being read.
16934  * [out] data   Updated to point to the node's data.
16935  *
16936  * Returns 0 on success, non-zero on failure. */
mdbx_node_read(MDBX_cursor * mc,MDBX_node * node,MDBX_val * data,const txnid_t front)16937 static __always_inline int mdbx_node_read(MDBX_cursor *mc, MDBX_node *node,
16938                                           MDBX_val *data, const txnid_t front) {
16939   data->iov_len = node_ds(node);
16940   data->iov_base = node_data(node);
16941   if (unlikely(F_ISSET(node_flags(node), F_BIGDATA))) {
16942     /* Read overflow data. */
16943     MDBX_page *omp; /* overflow page */
16944     int rc = mdbx_page_get(mc, node_largedata_pgno(node), &omp, front);
16945     if (unlikely((rc != MDBX_SUCCESS))) {
16946       mdbx_debug("read overflow page %" PRIaPGNO " failed",
16947                  node_largedata_pgno(node));
16948       return rc;
16949     }
16950     data->iov_base = page_data(omp);
16951   }
16952   return MDBX_SUCCESS;
16953 }
16954 
/* Public lookup: fetch the data for `key` in database `dbi` of `txn`.
 * Validates arguments, then delegates to a throwaway stack cursor with
 * MDBX_SET semantics (the key itself is not written back). */
int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data) {
  DKBUF_DEBUG;
  mdbx_debug("===> get db %u key [%s]", dbi, DKEY_DEBUG(key));

  int err = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  if (unlikely(!key || !data))
    return MDBX_EINVAL;

  if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
    return MDBX_BAD_DBI;

  /* A local cursor is enough: no state needs to outlive this call. */
  MDBX_cursor_couple couple;
  err = mdbx_cursor_init(&couple.outer, txn, dbi);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  return mdbx_cursor_set(&couple.outer, (MDBX_val *)key, data, MDBX_SET).err;
}
16976 
/* Public lookup: position at the first key equal to or greater than `key`
 * (MDBX_SET_LOWERBOUND semantics); `key` and `data` are updated to the
 * found pair. Validates arguments, then delegates to a stack cursor. */
int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key,
                            MDBX_val *data) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!key || !data))
    return MDBX_EINVAL;

  if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
    return MDBX_BAD_DBI;

  /* NOTE(review): a second explicit `txn->mt_flags & MDBX_TXN_BLOCKED`
   * check was removed here as redundant — check_txn(txn, MDBX_TXN_BLOCKED)
   * above already rejects blocked transactions, exactly as the sibling
   * mdbx_get()/mdbx_get_ex() entry points rely on. */

  MDBX_cursor_couple cx;
  rc = mdbx_cursor_init(&cx.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  return mdbx_cursor_get(&cx.outer, key, data, MDBX_SET_LOWERBOUND);
}
16999 
/* Public lookup with MDBX_SET_KEY semantics (the found key is written back
 * to `key`), optionally reporting the number of duplicate values stored
 * under that key via `values_count`. */
int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data,
                size_t *values_count) {
  DKBUF_DEBUG;
  mdbx_debug("===> get db %u key [%s]", dbi, DKEY_DEBUG(key));

  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!key || !data))
    return MDBX_EINVAL;

  if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
    return MDBX_BAD_DBI;

  MDBX_cursor_couple cx;
  rc = mdbx_cursor_init(&cx.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  rc = mdbx_cursor_set(&cx.outer, key, data, MDBX_SET_KEY).err;
  if (unlikely(rc != MDBX_SUCCESS)) {
    /* Report zero duplicates for a missing key, then pass the error up. */
    if (rc == MDBX_NOTFOUND && values_count)
      *values_count = 0;
    return rc;
  }

  if (values_count) {
    *values_count = 1;
    if (cx.outer.mc_xcursor != NULL) {
      MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top],
                                  cx.outer.mc_ki[cx.outer.mc_top]);
      if (F_ISSET(node_flags(node), F_DUPDATA)) {
        // coverity[uninit_use : FALSE]
        mdbx_tassert(txn, cx.outer.mc_xcursor == &cx.inner &&
                              (cx.inner.mx_cursor.mc_flags & C_INITIALIZED));
        // coverity[uninit_use : FALSE]
        /* Clamp the stored entry count to what size_t can represent. */
        *values_count =
            (sizeof(*values_count) >= sizeof(cx.inner.mx_db.md_entries) ||
             cx.inner.mx_db.md_entries <= PTRDIFF_MAX)
                ? (size_t)cx.inner.mx_db.md_entries
                : PTRDIFF_MAX;
      }
    }
  }
  return MDBX_SUCCESS;
}
17047 
17048 /* Find a sibling for a page.
17049  * Replaces the page at the top of the cursor's stack with the specified
17050  * sibling, if one exists.
17051  *
17052  * [in] mc    The cursor for this operation.
17053  * [in] dir   SIBLING_LEFT or SIBLING_RIGHT.
17054  *
17055  * Returns 0 on success, non-zero on failure. */
mdbx_cursor_sibling(MDBX_cursor * mc,int dir)17056 static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) {
17057   int rc;
17058   MDBX_node *node;
17059   MDBX_page *mp;
17060   assert(dir == SIBLING_LEFT || dir == SIBLING_RIGHT);
17061 
17062   if (unlikely(mc->mc_snum < 2))
17063     return MDBX_NOTFOUND; /* root has no siblings */
17064 
17065   mdbx_cursor_pop(mc);
17066   mdbx_debug("parent page is page %" PRIaPGNO ", index %u",
17067              mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]);
17068 
17069   if ((dir == SIBLING_RIGHT)
17070           ? (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mc->mc_pg[mc->mc_top]))
17071           : (mc->mc_ki[mc->mc_top] == 0)) {
17072     mdbx_debug("no more keys aside, moving to next %s sibling",
17073                dir ? "right" : "left");
17074     if (unlikely((rc = mdbx_cursor_sibling(mc, dir)) != MDBX_SUCCESS)) {
17075       /* undo cursor_pop before returning */
17076       mc->mc_top++;
17077       mc->mc_snum++;
17078       return rc;
17079     }
17080   } else {
17081     assert((dir - 1) == -1 || (dir - 1) == 1);
17082     mc->mc_ki[mc->mc_top] += (indx_t)(dir - 1);
17083     mdbx_debug("just moving to %s index key %u",
17084                (dir == SIBLING_RIGHT) ? "right" : "left",
17085                mc->mc_ki[mc->mc_top]);
17086   }
17087   mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top]));
17088 
17089   node = page_node(mp = mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
17090   if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp,
17091                                    pp_txnid4chk(mp, mc->mc_txn))) != 0)) {
17092     /* mc will be inconsistent if caller does mc_snum++ as above */
17093     mc->mc_flags &= ~(C_INITIALIZED | C_EOF);
17094     return rc;
17095   }
17096 
17097   rc = mdbx_cursor_push(mc, mp);
17098   if (unlikely(rc != MDBX_SUCCESS))
17099     return rc;
17100 
17101   mc->mc_ki[mc->mc_top] =
17102       (indx_t)((dir == SIBLING_LEFT) ? page_numkeys(mp) - 1 : 0);
17103   return MDBX_SUCCESS;
17104 }
17105 
/* Move the cursor to the next data item.
 *
 * [in,out] mc  the cursor to advance.
 * [out] key    if non-NULL, receives the key at the new position.
 * [out] data   if non-NULL, receives the data at the new position.
 * [in] op      MDBX_NEXT-family operation controlling duplicate traversal
 *              (MDBX_NEXT and MDBX_NEXT_DUP are handled specially below). */
static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
                            MDBX_cursor_op op) {
  MDBX_page *mp;
  MDBX_node *node;
  int rc;

  /* After a delete there is no current duplicate to advance from. */
  if (unlikely(mc->mc_flags & C_DEL) && op == MDBX_NEXT_DUP)
    return MDBX_NOTFOUND;

  /* An uninitialized cursor starts from the first item. */
  if (unlikely(!(mc->mc_flags & C_INITIALIZED)))
    return mdbx_cursor_first(mc, key, data);

  mp = mc->mc_pg[mc->mc_top];
  if (unlikely(mc->mc_flags & C_EOF)) {
    /* Already past the end, unless items were appended meanwhile. */
    if (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mp))
      return (mc->mc_flags & C_SUB) ? MDBX_NOTFOUND : MDBX_ENODATA;
    mc->mc_flags ^= C_EOF;
  }

  if (mc->mc_db->md_flags & MDBX_DUPSORT) {
    node = page_node(mp, mc->mc_ki[mc->mc_top]);
    if (F_ISSET(node_flags(node), F_DUPDATA)) {
      if (op == MDBX_NEXT || op == MDBX_NEXT_DUP) {
        /* First try to advance within the current key's duplicates. */
        rc =
            mdbx_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_NEXT);
        if (op != MDBX_NEXT || rc != MDBX_NOTFOUND) {
          if (likely(rc == MDBX_SUCCESS))
            get_key_optional(node, key);
          return rc;
        }
        /* MDBX_NEXT exhausted the duplicates: fall through to the
         * next key below. */
      }
    } else {
      /* Current node carries no duplicates: reset the sub-cursor. */
      mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF);
      if (op == MDBX_NEXT_DUP)
        return MDBX_NOTFOUND;
    }
  }

  mdbx_debug("cursor_next: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno,
             (void *)mc);
  if (mc->mc_flags & C_DEL) {
    /* The slot of the deleted item already refers to the next one. */
    mc->mc_flags ^= C_DEL;
    goto skip;
  }

  int ki = mc->mc_ki[mc->mc_top];
  mc->mc_ki[mc->mc_top] = (indx_t)++ki;
  const int numkeys = page_numkeys(mp);
  if (unlikely(ki >= numkeys)) {
    /* Ran off this leaf: move to the right sibling page. */
    mdbx_debug("%s", "=====> move to next sibling page");
    mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1);
    if (unlikely((rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT)) !=
                 MDBX_SUCCESS)) {
      mc->mc_flags |= C_EOF;
      return rc;
    }
    mp = mc->mc_pg[mc->mc_top];
    mdbx_debug("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno,
               mc->mc_ki[mc->mc_top]);
  }

skip:
  mdbx_debug("==> cursor points to page %" PRIaPGNO
             " with %u keys, key index %u",
             mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]);

  if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mp)))
    return MDBX_CORRUPTED;

  if (IS_LEAF2(mp)) {
    /* LEAF2 pages hold fixed-size keys only; valid just for sub-cursors. */
    if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) {
      mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor",
                 mp->mp_pgno);
      return MDBX_CORRUPTED;
    } else if (likely(key)) {
      key->iov_len = mc->mc_db->md_xsize;
      key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len);
    }
    return MDBX_SUCCESS;
  }

  node = page_node(mp, mc->mc_ki[mc->mc_top]);
  if (F_ISSET(node_flags(node), F_DUPDATA)) {
    /* Entered a new duplicate set: position its sub-cursor at the start. */
    rc = mdbx_xcursor_init1(mc, node, mp);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  } else if (likely(data)) {
    if (unlikely((rc = mdbx_node_read(mc, node, data,
                                      pp_txnid4chk(mp, mc->mc_txn))) !=
                 MDBX_SUCCESS))
      return rc;
  }

  get_key_optional(node, key);
  return MDBX_SUCCESS;
}
17206 
/* Move the cursor to the previous data item.
 *
 * [in,out] mc  the cursor to step back.
 * [out] key    if non-NULL, receives the key at the new position.
 * [out] data   if non-NULL, receives the data at the new position.
 * [in] op      MDBX_PREV-family operation controlling duplicate traversal
 *              (MDBX_PREV and MDBX_PREV_DUP are handled specially below). */
static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
                            MDBX_cursor_op op) {
  MDBX_page *mp;
  MDBX_node *node;
  int rc;

  /* After a delete there is no current duplicate to step back from. */
  if (unlikely(mc->mc_flags & C_DEL) && op == MDBX_PREV_DUP)
    return MDBX_NOTFOUND;

  /* An uninitialized cursor starts from just past the last item. */
  if (unlikely(!(mc->mc_flags & C_INITIALIZED))) {
    rc = mdbx_cursor_last(mc, key, data);
    if (unlikely(rc))
      return rc;
    mc->mc_ki[mc->mc_top]++;
  }

  mp = mc->mc_pg[mc->mc_top];
  if ((mc->mc_db->md_flags & MDBX_DUPSORT) &&
      mc->mc_ki[mc->mc_top] < page_numkeys(mp)) {
    node = page_node(mp, mc->mc_ki[mc->mc_top]);
    if (F_ISSET(node_flags(node), F_DUPDATA)) {
      if (op == MDBX_PREV || op == MDBX_PREV_DUP) {
        /* First try to step back within the current key's duplicates. */
        rc =
            mdbx_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV);
        if (op != MDBX_PREV || rc != MDBX_NOTFOUND) {
          if (likely(rc == MDBX_SUCCESS)) {
            get_key_optional(node, key);
            mc->mc_flags &= ~C_EOF;
          }
          return rc;
        }
        /* MDBX_PREV exhausted the duplicates: fall through to the
         * previous key below. */
      }
    } else {
      /* Current node carries no duplicates: reset the sub-cursor. */
      mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF);
      if (op == MDBX_PREV_DUP)
        return MDBX_NOTFOUND;
    }
  }

  mdbx_debug("cursor_prev: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno,
             (void *)mc);

  mc->mc_flags &= ~(C_EOF | C_DEL);

  int ki = mc->mc_ki[mc->mc_top];
  mc->mc_ki[mc->mc_top] = (indx_t)--ki;
  if (unlikely(ki < 0)) {
    /* Ran off the front of this leaf: move to the left sibling page. */
    mc->mc_ki[mc->mc_top] = 0;
    mdbx_debug("%s", "=====> move to prev sibling page");
    if ((rc = mdbx_cursor_sibling(mc, SIBLING_LEFT)) != MDBX_SUCCESS)
      return rc;
    mp = mc->mc_pg[mc->mc_top];
    mdbx_debug("prev page is %" PRIaPGNO ", key index %u", mp->mp_pgno,
               mc->mc_ki[mc->mc_top]);
  }
  mdbx_debug("==> cursor points to page %" PRIaPGNO
             " with %u keys, key index %u",
             mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]);

  if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mp)))
    return MDBX_CORRUPTED;

  if (IS_LEAF2(mp)) {
    /* LEAF2 pages hold fixed-size keys only; valid just for sub-cursors. */
    if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) {
      mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor",
                 mp->mp_pgno);
      return MDBX_CORRUPTED;
    } else if (likely(key)) {
      key->iov_len = mc->mc_db->md_xsize;
      key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len);
    }
    return MDBX_SUCCESS;
  }

  node = page_node(mp, mc->mc_ki[mc->mc_top]);

  if (F_ISSET(node_flags(node), F_DUPDATA)) {
    /* Entered a new duplicate set: position its sub-cursor at the end. */
    rc = mdbx_xcursor_init1(mc, node, mp);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  } else if (likely(data)) {
    if (unlikely((rc = mdbx_node_read(mc, node, data,
                                      pp_txnid4chk(mp, mc->mc_txn))) !=
                 MDBX_SUCCESS))
      return rc;
  }

  get_key_optional(node, key);
  return MDBX_SUCCESS;
}
17301 
17302 /* Set the cursor on a specific data item. */
mdbx_cursor_set(MDBX_cursor * mc,MDBX_val * key,MDBX_val * data,MDBX_cursor_op op)17303 static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key,
17304                                                 MDBX_val *data,
17305                                                 MDBX_cursor_op op) {
17306   MDBX_page *mp;
17307   MDBX_node *node = NULL;
17308   DKBUF_DEBUG;
17309 
17310   struct cursor_set_result ret;
17311   ret.exact = false;
17312   if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min ||
17313                key->iov_len > mc->mc_dbx->md_klen_max)) {
17314     mdbx_cassert(mc, !"Invalid key-size");
17315     ret.err = MDBX_BAD_VALSIZE;
17316     return ret;
17317   }
17318 
17319   MDBX_val aligned_key = *key;
17320   uint64_t aligned_keybytes;
17321   if (mc->mc_db->md_flags & MDBX_INTEGERKEY) {
17322     switch (aligned_key.iov_len) {
17323     default:
17324       mdbx_cassert(mc, !"key-size is invalid for MDBX_INTEGERKEY");
17325       ret.err = MDBX_BAD_VALSIZE;
17326       return ret;
17327     case 4:
17328       if (unlikely(3 & (uintptr_t)aligned_key.iov_base))
17329         /* copy instead of return error to avoid break compatibility */
17330         aligned_key.iov_base =
17331             memcpy(&aligned_keybytes, aligned_key.iov_base, 4);
17332       break;
17333     case 8:
17334       if (unlikely(7 & (uintptr_t)aligned_key.iov_base))
17335         /* copy instead of return error to avoid break compatibility */
17336         aligned_key.iov_base =
17337             memcpy(&aligned_keybytes, aligned_key.iov_base, 8);
17338       break;
17339     }
17340   }
17341 
17342   if (mc->mc_xcursor)
17343     mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF);
17344 
17345   /* See if we're already on the right page */
17346   if (mc->mc_flags & C_INITIALIZED) {
17347     MDBX_val nodekey;
17348 
17349     mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
17350     mp = mc->mc_pg[mc->mc_top];
17351     if (unlikely(!page_numkeys(mp))) {
17352       mc->mc_ki[mc->mc_top] = 0;
17353       mc->mc_flags |= C_EOF;
17354       ret.err = MDBX_NOTFOUND;
17355       return ret;
17356     }
17357     if (IS_LEAF2(mp)) {
17358       nodekey.iov_len = mc->mc_db->md_xsize;
17359       nodekey.iov_base = page_leaf2key(mp, 0, nodekey.iov_len);
17360     } else {
17361       node = page_node(mp, 0);
17362       get_key(node, &nodekey);
17363     }
17364     int cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey);
17365     if (unlikely(cmp == 0)) {
17366       /* Probably happens rarely, but first node on the page
17367        * was the one we wanted. */
17368       mc->mc_ki[mc->mc_top] = 0;
17369       ret.exact = true;
17370       mdbx_cassert(mc, mc->mc_ki[mc->mc_top] <
17371                                page_numkeys(mc->mc_pg[mc->mc_top]) ||
17372                            (mc->mc_flags & C_EOF));
17373       goto got_node;
17374     }
17375     if (cmp > 0) {
17376       const unsigned nkeys = page_numkeys(mp);
17377       if (nkeys > 1) {
17378         if (IS_LEAF2(mp)) {
17379           nodekey.iov_base = page_leaf2key(mp, nkeys - 1, nodekey.iov_len);
17380         } else {
17381           node = page_node(mp, nkeys - 1);
17382           get_key(node, &nodekey);
17383         }
17384         cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey);
17385         if (cmp == 0) {
17386           /* last node was the one we wanted */
17387           mdbx_cassert(mc, nkeys >= 1 && nkeys <= UINT16_MAX + 1);
17388           mc->mc_ki[mc->mc_top] = (indx_t)(nkeys - 1);
17389           ret.exact = true;
17390           mdbx_cassert(mc, mc->mc_ki[mc->mc_top] <
17391                                    page_numkeys(mc->mc_pg[mc->mc_top]) ||
17392                                (mc->mc_flags & C_EOF));
17393           goto got_node;
17394         }
17395         if (cmp < 0) {
17396           if (mc->mc_ki[mc->mc_top] < page_numkeys(mp)) {
17397             /* This is definitely the right page, skip search_page */
17398             if (IS_LEAF2(mp)) {
17399               nodekey.iov_base =
17400                   page_leaf2key(mp, mc->mc_ki[mc->mc_top], nodekey.iov_len);
17401             } else {
17402               node = page_node(mp, mc->mc_ki[mc->mc_top]);
17403               get_key(node, &nodekey);
17404             }
17405             cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey);
17406             if (cmp == 0) {
17407               /* current node was the one we wanted */
17408               ret.exact = true;
17409               mdbx_cassert(mc, mc->mc_ki[mc->mc_top] <
17410                                        page_numkeys(mc->mc_pg[mc->mc_top]) ||
17411                                    (mc->mc_flags & C_EOF));
17412               goto got_node;
17413             }
17414           }
17415           mc->mc_flags &= ~C_EOF;
17416           goto search_node;
17417         }
17418       }
17419       /* If any parents have right-sibs, search.
17420        * Otherwise, there's nothing further. */
17421       unsigned i;
17422       for (i = 0; i < mc->mc_top; i++)
17423         if (mc->mc_ki[i] < page_numkeys(mc->mc_pg[i]) - 1)
17424           break;
17425       if (i == mc->mc_top) {
17426         /* There are no other pages */
17427         mdbx_cassert(mc, nkeys <= UINT16_MAX);
17428         mc->mc_ki[mc->mc_top] = (uint16_t)nkeys;
17429         mc->mc_flags |= C_EOF;
17430         ret.err = MDBX_NOTFOUND;
17431         return ret;
17432       }
17433     }
17434     if (!mc->mc_top) {
17435       /* There are no other pages */
17436       mc->mc_ki[mc->mc_top] = 0;
17437       if (op == MDBX_SET_RANGE)
17438         goto got_node;
17439 
17440       mdbx_cassert(mc, mc->mc_ki[mc->mc_top] <
17441                                page_numkeys(mc->mc_pg[mc->mc_top]) ||
17442                            (mc->mc_flags & C_EOF));
17443       ret.err = MDBX_NOTFOUND;
17444       return ret;
17445     }
17446   } else {
17447     mc->mc_pg[0] = 0;
17448   }
17449 
17450   ret.err = mdbx_page_search(mc, &aligned_key, 0);
17451   if (unlikely(ret.err != MDBX_SUCCESS))
17452     return ret;
17453 
17454   mp = mc->mc_pg[mc->mc_top];
17455   mdbx_cassert(mc, IS_LEAF(mp));
17456 
17457 search_node:;
17458   struct node_result nsr = mdbx_node_search(mc, &aligned_key);
17459   node = nsr.node;
17460   ret.exact = nsr.exact;
17461   if (!ret.exact) {
17462     if (op != MDBX_SET_RANGE) {
17463       /* MDBX_SET specified and not an exact match. */
17464       if (unlikely(mc->mc_ki[mc->mc_top] >=
17465                    page_numkeys(mc->mc_pg[mc->mc_top])))
17466         mc->mc_flags |= C_EOF;
17467       ret.err = MDBX_NOTFOUND;
17468       return ret;
17469     }
17470 
17471     if (node == NULL) {
17472       mdbx_debug("%s", "===> inexact leaf not found, goto sibling");
17473       ret.err = mdbx_cursor_sibling(mc, SIBLING_RIGHT);
17474       if (unlikely(ret.err != MDBX_SUCCESS)) {
17475         mc->mc_flags |= C_EOF;
17476         return ret; /* no entries matched */
17477       }
17478       mp = mc->mc_pg[mc->mc_top];
17479       mdbx_cassert(mc, IS_LEAF(mp));
17480       if (!IS_LEAF2(mp))
17481         node = page_node(mp, 0);
17482     }
17483   }
17484   mdbx_cassert(mc,
17485                mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) ||
17486                    (mc->mc_flags & C_EOF));
17487 
17488 got_node:
17489   mc->mc_flags |= C_INITIALIZED;
17490   mc->mc_flags &= ~C_EOF;
17491 
17492   if (IS_LEAF2(mp)) {
17493     if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) {
17494       mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor",
17495                  mp->mp_pgno);
17496       ret.err = MDBX_CORRUPTED;
17497     } else {
17498       if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) {
17499         key->iov_len = mc->mc_db->md_xsize;
17500         key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len);
17501       }
17502       ret.err = MDBX_SUCCESS;
17503     }
17504     return ret;
17505   }
17506 
17507   if (F_ISSET(node_flags(node), F_DUPDATA)) {
17508     ret.err = mdbx_xcursor_init1(mc, node, mp);
17509     if (unlikely(ret.err != MDBX_SUCCESS))
17510       return ret;
17511     if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) {
17512       ret.err = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
17513       if (unlikely(ret.err != MDBX_SUCCESS))
17514         return ret;
17515     } else {
17516       ret = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL,
17517                             MDBX_SET_RANGE);
17518       if (unlikely(ret.err != MDBX_SUCCESS))
17519         return ret;
17520       if (op == MDBX_GET_BOTH && !ret.exact) {
17521         ret.err = MDBX_NOTFOUND;
17522         return ret;
17523       }
17524     }
17525   } else if (likely(data)) {
17526     if (op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE) {
17527       if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min ||
17528                    data->iov_len > mc->mc_dbx->md_vlen_max)) {
17529         mdbx_cassert(mc, !"Invalid data-size");
17530         ret.err = MDBX_BAD_VALSIZE;
17531         return ret;
17532       }
17533       MDBX_val aligned_data = *data;
17534       uint64_t aligned_databytes;
17535       if (mc->mc_db->md_flags & MDBX_INTEGERDUP) {
17536         switch (aligned_data.iov_len) {
17537         default:
17538           mdbx_cassert(mc, !"data-size is invalid for MDBX_INTEGERDUP");
17539           ret.err = MDBX_BAD_VALSIZE;
17540           return ret;
17541         case 4:
17542           if (unlikely(3 & (uintptr_t)aligned_data.iov_base))
17543             /* copy instead of return error to avoid break compatibility */
17544             aligned_data.iov_base =
17545                 memcpy(&aligned_databytes, aligned_data.iov_base, 4);
17546           break;
17547         case 8:
17548           if (unlikely(7 & (uintptr_t)aligned_data.iov_base))
17549             /* copy instead of return error to avoid break compatibility */
17550             aligned_data.iov_base =
17551                 memcpy(&aligned_databytes, aligned_data.iov_base, 8);
17552           break;
17553         }
17554       }
17555       MDBX_val actual_data;
17556       ret.err = mdbx_node_read(mc, node, &actual_data,
17557                                pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn));
17558       if (unlikely(ret.err != MDBX_SUCCESS))
17559         return ret;
17560       const int cmp = mc->mc_dbx->md_dcmp(&aligned_data, &actual_data);
17561       if (cmp) {
17562         mdbx_cassert(mc, mc->mc_ki[mc->mc_top] <
17563                                  page_numkeys(mc->mc_pg[mc->mc_top]) ||
17564                              (mc->mc_flags & C_EOF));
17565         if (op != MDBX_GET_BOTH_RANGE || cmp > 0) {
17566           ret.err = MDBX_NOTFOUND;
17567           return ret;
17568         }
17569       }
17570       *data = actual_data;
17571     } else {
17572       ret.err = mdbx_node_read(mc, node, data,
17573                                pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn));
17574       if (unlikely(ret.err != MDBX_SUCCESS))
17575         return ret;
17576     }
17577   }
17578 
17579   /* The key already matches in all other cases */
17580   if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY)
17581     get_key_optional(node, key);
17582 
17583   mdbx_debug("==> cursor placed on key [%s], data [%s]", DKEY_DEBUG(key),
17584              DVAL_DEBUG(data));
17585   ret.err = MDBX_SUCCESS;
17586   return ret;
17587 }
17588 
17589 /* Move the cursor to the first item in the database. */
mdbx_cursor_first(MDBX_cursor * mc,MDBX_val * key,MDBX_val * data)17590 static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) {
17591   int rc;
17592 
17593   if (mc->mc_xcursor)
17594     mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF);
17595 
17596   if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
17597     rc = mdbx_page_search(mc, NULL, MDBX_PS_FIRST);
17598     if (unlikely(rc != MDBX_SUCCESS))
17599       return rc;
17600   }
17601 
17602   if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mc->mc_pg[mc->mc_top])))
17603     return MDBX_CORRUPTED;
17604 
17605   mc->mc_flags |= C_INITIALIZED;
17606   mc->mc_flags &= ~C_EOF;
17607   mc->mc_ki[mc->mc_top] = 0;
17608 
17609   if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
17610     if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) {
17611       mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor",
17612                  mc->mc_pg[mc->mc_top]->mp_pgno);
17613       return MDBX_CORRUPTED;
17614     } else if (likely(key)) {
17615       key->iov_len = mc->mc_db->md_xsize;
17616       key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], 0, key->iov_len);
17617     }
17618     return MDBX_SUCCESS;
17619   }
17620 
17621   MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], 0);
17622   if (F_ISSET(node_flags(node), F_DUPDATA)) {
17623     rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]);
17624     if (unlikely(rc != MDBX_SUCCESS))
17625       return rc;
17626     rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
17627     if (unlikely(rc))
17628       return rc;
17629   } else if (likely(data)) {
17630     if (unlikely((rc = mdbx_node_read(
17631                       mc, node, data,
17632                       pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) !=
17633                  MDBX_SUCCESS))
17634       return rc;
17635   }
17636 
17637   get_key_optional(node, key);
17638   return MDBX_SUCCESS;
17639 }
17640 
17641 /* Move the cursor to the last item in the database. */
mdbx_cursor_last(MDBX_cursor * mc,MDBX_val * key,MDBX_val * data)17642 static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) {
17643   int rc;
17644 
17645   if (mc->mc_xcursor)
17646     mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF);
17647 
17648   if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
17649     rc = mdbx_page_search(mc, NULL, MDBX_PS_LAST);
17650     if (unlikely(rc != MDBX_SUCCESS))
17651       return rc;
17652   }
17653 
17654   if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mc->mc_pg[mc->mc_top])))
17655     return MDBX_CORRUPTED;
17656 
17657   mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mc->mc_pg[mc->mc_top]) - 1;
17658   mc->mc_flags |= C_INITIALIZED | C_EOF;
17659 
17660   if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
17661     if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) {
17662       mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor",
17663                  mc->mc_pg[mc->mc_top]->mp_pgno);
17664       return MDBX_CORRUPTED;
17665     } else if (likely(key)) {
17666       key->iov_len = mc->mc_db->md_xsize;
17667       key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top],
17668                                     mc->mc_ki[mc->mc_top], key->iov_len);
17669     }
17670     return MDBX_SUCCESS;
17671   }
17672 
17673   MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
17674   if (F_ISSET(node_flags(node), F_DUPDATA)) {
17675     rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]);
17676     if (unlikely(rc != MDBX_SUCCESS))
17677       return rc;
17678     rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
17679     if (unlikely(rc))
17680       return rc;
17681   } else if (likely(data)) {
17682     if (unlikely((rc = mdbx_node_read(
17683                       mc, node, data,
17684                       pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) !=
17685                  MDBX_SUCCESS))
17686       return rc;
17687   }
17688 
17689   get_key_optional(node, key);
17690   return MDBX_SUCCESS;
17691 }
17692 
/* Retrieve by cursor: public dispatcher for all MDBX_cursor_op operations.
 *
 * [in]     mc    The cursor (validated for liveness and txn state first).
 * [in,out] key   Key to look up and/or buffer for the returned key.
 * [in,out] data  Data to match and/or buffer for the returned data.
 * [in]     op    The operation to perform.
 *
 * Returns MDBX_SUCCESS, MDBX_NOTFOUND, MDBX_RESULT_TRUE (inexact
 * SET_LOWERBOUND), or an error code. */
int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
                    MDBX_cursor_op op) {
  if (unlikely(mc == NULL))
    return MDBX_EINVAL;

  if (unlikely(mc->mc_signature != MDBX_MC_LIVE))
    return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
                                                     : MDBX_EBADSIGN;

  int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  int (*mfunc)(MDBX_cursor * mc, MDBX_val * key, MDBX_val * data);
  switch (op) {
  case MDBX_GET_CURRENT: {
    if (unlikely(!(mc->mc_flags & C_INITIALIZED)))
      return MDBX_ENODATA;
    MDBX_page *mp = mc->mc_pg[mc->mc_top];
    const unsigned nkeys = page_numkeys(mp);
    if (mc->mc_ki[mc->mc_top] >= nkeys) {
      /* Cursor points past the last item: report EOF once, then ENODATA. */
      mdbx_cassert(mc, nkeys <= UINT16_MAX);
      if (mc->mc_flags & C_EOF)
        return MDBX_ENODATA;
      mc->mc_ki[mc->mc_top] = (uint16_t)nkeys;
      mc->mc_flags |= C_EOF;
      return MDBX_NOTFOUND;
    }
    mdbx_cassert(mc, nkeys > 0);

    rc = MDBX_SUCCESS;
    if (IS_LEAF2(mp)) {
      if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) {
        mdbx_error("unexpected LEAF2-page %" PRIaPGNO " for non-dupsort cursor",
                   mp->mp_pgno);
        return MDBX_CORRUPTED;
      }
      key->iov_len = mc->mc_db->md_xsize;
      key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len);
    } else {
      MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]);
      get_key_optional(node, key);
      if (data) {
        if (F_ISSET(node_flags(node), F_DUPDATA)) {
          /* Dupsort node: the data comes from the sub-cursor. */
          if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) {
            rc = mdbx_xcursor_init1(mc, node, mp);
            if (unlikely(rc != MDBX_SUCCESS))
              return rc;
            rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
            if (unlikely(rc))
              return rc;
          } else {
            rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL,
                                 MDBX_GET_CURRENT);
            if (unlikely(rc))
              return rc;
          }
        } else {
          rc = mdbx_node_read(mc, node, data, pp_txnid4chk(mp, mc->mc_txn));
          if (unlikely(rc))
            return rc;
        }
      }
    }
    break;
  }
  case MDBX_GET_BOTH:
  case MDBX_GET_BOTH_RANGE:
    if (unlikely(data == NULL))
      return MDBX_EINVAL;
    if (unlikely(mc->mc_xcursor == NULL))
      return MDBX_INCOMPATIBLE;
    /* fall through */
    __fallthrough;
  case MDBX_SET:
  case MDBX_SET_KEY:
  case MDBX_SET_RANGE:
    if (unlikely(key == NULL))
      return MDBX_EINVAL;
    rc = mdbx_cursor_set(mc, key, data, op).err;
    if (mc->mc_flags & C_INITIALIZED) {
      mdbx_cassert(mc, mc->mc_snum > 0 && mc->mc_top < mc->mc_snum);
      mdbx_cassert(mc, mc->mc_ki[mc->mc_top] <
                               page_numkeys(mc->mc_pg[mc->mc_top]) ||
                           (mc->mc_flags & C_EOF));
    }
    break;
  case MDBX_GET_MULTIPLE:
    if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED)))
      return MDBX_EINVAL;
    if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED)))
      return MDBX_INCOMPATIBLE;
    rc = MDBX_SUCCESS;
    if ((mc->mc_xcursor->mx_cursor.mc_flags & (C_INITIALIZED | C_EOF)) !=
        C_INITIALIZED)
      break;
    goto fetchm;
  case MDBX_NEXT_MULTIPLE:
    if (unlikely(data == NULL))
      return MDBX_EINVAL;
    if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED)))
      return MDBX_INCOMPATIBLE;
    rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_DUP);
    if (rc == MDBX_SUCCESS) {
      if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
        MDBX_cursor *mx;
      fetchm:
        /* Return the whole page of fixed-size duplicates at once. */
        mx = &mc->mc_xcursor->mx_cursor;
        data->iov_len =
            page_numkeys(mx->mc_pg[mx->mc_top]) * mx->mc_db->md_xsize;
        data->iov_base = page_data(mx->mc_pg[mx->mc_top]);
        mx->mc_ki[mx->mc_top] = (indx_t)page_numkeys(mx->mc_pg[mx->mc_top]) - 1;
      } else {
        rc = MDBX_NOTFOUND;
      }
    }
    break;
  case MDBX_PREV_MULTIPLE:
    if (data == NULL)
      return MDBX_EINVAL;
    if (!(mc->mc_db->md_flags & MDBX_DUPFIXED))
      return MDBX_INCOMPATIBLE;
    rc = MDBX_SUCCESS;
    if (!(mc->mc_flags & C_INITIALIZED))
      rc = mdbx_cursor_last(mc, key, data);
    if (rc == MDBX_SUCCESS) {
      MDBX_cursor *mx = &mc->mc_xcursor->mx_cursor;
      if (mx->mc_flags & C_INITIALIZED) {
        rc = mdbx_cursor_sibling(mx, SIBLING_LEFT);
        if (rc == MDBX_SUCCESS)
          goto fetchm;
      } else {
        rc = MDBX_NOTFOUND;
      }
    }
    break;
  case MDBX_NEXT:
  case MDBX_NEXT_DUP:
  case MDBX_NEXT_NODUP:
    rc = mdbx_cursor_next(mc, key, data, op);
    break;
  case MDBX_PREV:
  case MDBX_PREV_DUP:
  case MDBX_PREV_NODUP:
    rc = mdbx_cursor_prev(mc, key, data, op);
    break;
  case MDBX_FIRST:
    rc = mdbx_cursor_first(mc, key, data);
    break;
  case MDBX_FIRST_DUP:
    mfunc = mdbx_cursor_first;
  mmove:
    if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED)))
      return MDBX_EINVAL;
    if (unlikely(mc->mc_xcursor == NULL))
      return MDBX_INCOMPATIBLE;
    if (mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top])) {
      mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mc->mc_pg[mc->mc_top]);
      mc->mc_flags |= C_EOF;
      return MDBX_NOTFOUND;
    }
    {
      MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
      if (!F_ISSET(node_flags(node), F_DUPDATA)) {
        /* Plain node has a single value, no sub-cursor move needed. */
        get_key_optional(node, key);
        rc = mdbx_node_read(mc, node, data,
                            pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn));
        break;
      }
    }
    if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)))
      return MDBX_EINVAL;
    rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL);
    break;
  case MDBX_LAST:
    rc = mdbx_cursor_last(mc, key, data);
    break;
  case MDBX_LAST_DUP:
    mfunc = mdbx_cursor_last;
    goto mmove;
  case MDBX_SET_LOWERBOUND: {
    if (unlikely(key == NULL || data == NULL))
      return MDBX_EINVAL;
    MDBX_val save_data = *data;
    struct cursor_set_result csr =
        mdbx_cursor_set(mc, key, data, MDBX_SET_RANGE);
    rc = csr.err;
    if (rc == MDBX_SUCCESS && csr.exact && mc->mc_xcursor) {
      mc->mc_flags &= ~C_DEL;
      if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
        *data = save_data;
        csr = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL,
                              MDBX_SET_RANGE);
        rc = csr.err;
        if (rc == MDBX_NOTFOUND) {
          mdbx_cassert(mc, !csr.exact);
          rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_NODUP);
        }
      } else {
        int cmp = mc->mc_dbx->md_dcmp(&save_data, data);
        csr.exact = (cmp == 0);
        if (cmp > 0)
          rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_NODUP);
      }
    }
    /* An inexact lower-bound hit is signalled with MDBX_RESULT_TRUE. */
    if (rc == MDBX_SUCCESS && !csr.exact)
      rc = MDBX_RESULT_TRUE;
    break;
  }
  default:
    mdbx_debug("unhandled/unimplemented cursor operation %u", op);
    return MDBX_EINVAL;
  }

  mc->mc_flags &= ~C_DEL;
  return rc;
}
17910 
/* Mark the cursor's DB handle and transaction as dirty; for a named DB
 * also touch its record inside the MAIN_DBI so it can be updated. */
static int mdbx_touch_dbi(MDBX_cursor *mc) {
  mdbx_cassert(mc, (*mc->mc_dbistate & DBI_DIRTY) == 0);
  *mc->mc_dbistate |= DBI_DIRTY;
  mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY;

  /* Core DBs have no record of their own inside MAIN_DBI. */
  if (mc->mc_dbi < CORE_DBS)
    return MDBX_SUCCESS;

  mdbx_cassert(mc, (mc->mc_flags & C_RECLAIMING) == 0);
  /* Touch DB record of named DB */
  MDBX_cursor_couple cx;
  int err = mdbx_cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  mc->mc_txn->mt_dbistate[MAIN_DBI] |= DBI_DIRTY;
  err = mdbx_page_search(&cx.outer, &mc->mc_dbx->md_name, MDBX_PS_MODIFY);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  return MDBX_SUCCESS;
}
17929 
17930 /* Touch all the pages in the cursor stack. Set mc_top.
17931  * Makes sure all the pages are writable, before attempting a write operation.
17932  * [in] mc The cursor to operate on. */
mdbx_cursor_touch(MDBX_cursor * mc)17933 static int mdbx_cursor_touch(MDBX_cursor *mc) {
17934   int rc = MDBX_SUCCESS;
17935   if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) {
17936     rc = mdbx_touch_dbi(mc);
17937     if (unlikely(rc != MDBX_SUCCESS))
17938       return rc;
17939   }
17940   if (likely(mc->mc_snum)) {
17941     mc->mc_top = 0;
17942     do {
17943       rc = mdbx_page_touch(mc);
17944     } while (!rc && ++(mc->mc_top) < mc->mc_snum);
17945     mc->mc_top = mc->mc_snum - 1;
17946   }
17947   return rc;
17948 }
17949 
mdbx_cursor_put(MDBX_cursor * mc,const MDBX_val * key,MDBX_val * data,unsigned flags)17950 int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data,
17951                     unsigned flags) {
17952   MDBX_env *env;
17953   MDBX_page *sub_root = NULL;
17954   MDBX_val xdata, *rdata, dkey, olddata;
17955   MDBX_db nested_dupdb;
17956   int err;
17957   DKBUF_DEBUG;
17958 
17959   if (unlikely(mc == NULL || key == NULL || data == NULL))
17960     return MDBX_EINVAL;
17961 
17962   if (unlikely(mc->mc_signature != MDBX_MC_LIVE))
17963     return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
17964                                                      : MDBX_EBADSIGN;
17965 
17966   int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED);
17967   if (unlikely(rc != MDBX_SUCCESS))
17968     return rc;
17969 
17970   if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)))
17971     return MDBX_BAD_DBI;
17972 
17973   mdbx_cassert(mc, cursor_is_tracked(mc));
17974   env = mc->mc_txn->mt_env;
17975 
17976   /* Check this first so counter will always be zero on any early failures. */
17977   size_t mcount = 0, dcount = 0;
17978   if (unlikely(flags & MDBX_MULTIPLE)) {
17979     if (unlikely(flags & MDBX_RESERVE))
17980       return MDBX_EINVAL;
17981     if (unlikely(!F_ISSET(mc->mc_db->md_flags, MDBX_DUPFIXED)))
17982       return MDBX_INCOMPATIBLE;
17983     dcount = data[1].iov_len;
17984     if (unlikely(dcount < 2 || data->iov_len == 0))
17985       return MDBX_BAD_VALSIZE;
17986     if (unlikely(mc->mc_db->md_xsize != data->iov_len) && mc->mc_db->md_xsize)
17987       return MDBX_BAD_VALSIZE;
17988     if (unlikely(dcount > MAX_MAPSIZE / 2 /
17989                               (BRANCH_NODE_MAX(MAX_PAGESIZE) - NODESIZE))) {
17990       /* checking for multiplication overflow */
17991       if (unlikely(dcount > MAX_MAPSIZE / 2 / data->iov_len))
17992         return MDBX_TOO_LARGE;
17993     }
17994     data[1].iov_len = 0 /* reset done item counter */;
17995   }
17996 
17997   if (flags & MDBX_RESERVE) {
17998     if (unlikely(mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_REVERSEDUP |
17999                                         MDBX_INTEGERDUP | MDBX_DUPFIXED)))
18000       return MDBX_INCOMPATIBLE;
18001     data->iov_base = nullptr;
18002   }
18003 
18004   const unsigned nospill = flags & MDBX_NOSPILL;
18005   flags -= nospill;
18006 
18007   if (unlikely(mc->mc_txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED)))
18008     return (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS
18009                                                     : MDBX_BAD_TXN;
18010 
18011   uint64_t aligned_keybytes, aligned_databytes;
18012   MDBX_val aligned_key, aligned_data;
18013   if (likely((mc->mc_flags & C_SUB) == 0)) {
18014     if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min ||
18015                  key->iov_len > mc->mc_dbx->md_klen_max)) {
18016       mdbx_cassert(mc, !"Invalid key-size");
18017       return MDBX_BAD_VALSIZE;
18018     }
18019     if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min ||
18020                  data->iov_len > mc->mc_dbx->md_vlen_max)) {
18021       mdbx_cassert(mc, !"Invalid data-size");
18022       return MDBX_BAD_VALSIZE;
18023     }
18024 
18025     if (mc->mc_db->md_flags & MDBX_INTEGERKEY) {
18026       switch (key->iov_len) {
18027       default:
18028         mdbx_cassert(mc, !"key-size is invalid for MDBX_INTEGERKEY");
18029         return MDBX_BAD_VALSIZE;
18030       case 4:
18031         if (unlikely(3 & (uintptr_t)key->iov_base)) {
18032           /* copy instead of return error to avoid break compatibility */
18033           aligned_key.iov_base =
18034               memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 4);
18035           key = &aligned_key;
18036         }
18037         break;
18038       case 8:
18039         if (unlikely(7 & (uintptr_t)key->iov_base)) {
18040           /* copy instead of return error to avoid break compatibility */
18041           aligned_key.iov_base =
18042               memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 8);
18043           key = &aligned_key;
18044         }
18045         break;
18046       }
18047     }
18048     if (mc->mc_db->md_flags & MDBX_INTEGERDUP) {
18049       switch (data->iov_len) {
18050       default:
18051         mdbx_cassert(mc, !"data-size is invalid for MDBX_INTEGERKEY");
18052         return MDBX_BAD_VALSIZE;
18053       case 4:
18054         if (unlikely(3 & (uintptr_t)data->iov_base)) {
18055           if (unlikely(flags & MDBX_MULTIPLE))
18056             return MDBX_BAD_VALSIZE;
18057           /* copy instead of return error to avoid break compatibility */
18058           aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base,
18059                                          aligned_data.iov_len = 4);
18060           data = &aligned_data;
18061         }
18062         break;
18063       case 8:
18064         if (unlikely(7 & (uintptr_t)data->iov_base)) {
18065           if (unlikely(flags & MDBX_MULTIPLE))
18066             return MDBX_BAD_VALSIZE;
18067           /* copy instead of return error to avoid break compatibility */
18068           aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base,
18069                                          aligned_data.iov_len = 8);
18070           data = &aligned_data;
18071         }
18072         break;
18073       }
18074     }
18075   }
18076 
18077   mdbx_debug(
18078       "==> put db %d key [%s], size %" PRIuPTR ", data [%s] size %" PRIuPTR,
18079       DDBI(mc), DKEY_DEBUG(key), key->iov_len,
18080       DVAL_DEBUG((flags & MDBX_RESERVE) ? nullptr : data), data->iov_len);
18081 
18082   int dupdata_flag = 0;
18083   if ((flags & MDBX_CURRENT) != 0 && (mc->mc_flags & C_SUB) == 0) {
18084     if (unlikely(flags & (MDBX_APPEND | MDBX_NOOVERWRITE)))
18085       return MDBX_EINVAL;
18086     /* Опция MDBX_CURRENT означает, что запрошено обновление текущей записи,
18087      * на которой сейчас стоит курсор. Проверяем что переданный ключ совпадает
18088      * со значением в текущей позиции курсора.
18089      * Здесь проще вызвать mdbx_cursor_get(), так как для обслуживания таблиц
18090      * с MDBX_DUPSORT также требуется текущий размер данных. */
18091     MDBX_val current_key, current_data;
18092     rc = mdbx_cursor_get(mc, &current_key, &current_data, MDBX_GET_CURRENT);
18093     if (unlikely(rc != MDBX_SUCCESS))
18094       return rc;
18095     if (mc->mc_dbx->md_cmp(key, &current_key) != 0)
18096       return MDBX_EKEYMISMATCH;
18097 
18098     if (unlikely((flags & MDBX_MULTIPLE)))
18099       goto drop_current;
18100 
18101     if (F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)) {
18102       MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
18103       if (F_ISSET(node_flags(node), F_DUPDATA)) {
18104         mdbx_cassert(mc,
18105                      mc->mc_xcursor != NULL &&
18106                          (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED));
18107         /* Если за ключом более одного значения, либо если размер данных
18108          * отличается, то вместо обновления требуется удаление и
18109          * последующая вставка. */
18110         if (mc->mc_xcursor->mx_db.md_entries > 1 ||
18111             current_data.iov_len != data->iov_len) {
18112         drop_current:
18113           rc = mdbx_cursor_del(mc, flags & MDBX_ALLDUPS);
18114           if (unlikely(rc != MDBX_SUCCESS))
18115             return rc;
18116           flags -= MDBX_CURRENT;
18117           goto skip_check_samedata;
18118         }
18119       } else if (unlikely(node_size(key, data) > env->me_leaf_nodemax)) {
18120         rc = mdbx_cursor_del(mc, 0);
18121         if (unlikely(rc != MDBX_SUCCESS))
18122           return rc;
18123         flags -= MDBX_CURRENT;
18124         goto skip_check_samedata;
18125       }
18126     }
18127     if (!(flags & MDBX_RESERVE) &&
18128         unlikely(cmp_lenfast(&current_data, data) == 0))
18129       return MDBX_SUCCESS /* the same data, nothing to update */;
18130   skip_check_samedata:;
18131   }
18132 
18133   if (mc->mc_db->md_root == P_INVALID) {
18134     /* new database, cursor has nothing to point to */
18135     mc->mc_snum = 0;
18136     mc->mc_top = 0;
18137     mc->mc_flags &= ~C_INITIALIZED;
18138     rc = MDBX_NO_ROOT;
18139   } else if ((flags & MDBX_CURRENT) == 0) {
18140     bool exact = false;
18141     if ((flags & MDBX_APPEND) && mc->mc_db->md_entries > 0) {
18142       rc = mdbx_cursor_last(mc, &dkey, &olddata);
18143       if (likely(rc == MDBX_SUCCESS)) {
18144         rc = mc->mc_dbx->md_cmp(key, &dkey);
18145         if (likely(rc > 0)) {
18146           mc->mc_ki[mc->mc_top]++; /* step forward for appending */
18147           rc = MDBX_NOTFOUND;
18148         } else {
18149           if (unlikely(rc != MDBX_SUCCESS || !(flags & MDBX_APPENDDUP)))
18150             /* new-key < last-key
18151              * or new-key == last-key without MDBX_APPENDDUP */
18152             return MDBX_EKEYMISMATCH;
18153           exact = true;
18154         }
18155       }
18156     } else {
18157       struct cursor_set_result csr =
18158           mdbx_cursor_set(mc, (MDBX_val *)key, &olddata, MDBX_SET);
18159       rc = csr.err;
18160       exact = csr.exact;
18161     }
18162     if (likely(rc == MDBX_SUCCESS)) {
18163       if (exact) {
18164         if (unlikely(flags & MDBX_NOOVERWRITE)) {
18165           mdbx_debug("duplicate key [%s]", DKEY_DEBUG(key));
18166           *data = olddata;
18167           return MDBX_KEYEXIST;
18168         }
18169         if (unlikely(mc->mc_flags & C_SUB)) {
18170           /* nested subtree of DUPSORT-database with the same key,
18171            * nothing to update */
18172           mdbx_assert(env, data->iov_len == 0 && olddata.iov_len == 0);
18173           return MDBX_SUCCESS;
18174         }
18175         if (unlikely(flags & MDBX_ALLDUPS) && mc->mc_xcursor &&
18176             (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
18177           rc = mdbx_cursor_del(mc, MDBX_ALLDUPS);
18178           if (unlikely(rc != MDBX_SUCCESS))
18179             return rc;
18180           flags -= MDBX_ALLDUPS;
18181           rc = MDBX_NOTFOUND;
18182           exact = false;
18183         } else /* checking for early exit without dirtying pages */
18184           if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE)) &&
18185               unlikely(mc->mc_dbx->md_dcmp(data, &olddata) == 0)) {
18186             if (!mc->mc_xcursor)
18187               /* the same data, nothing to update */
18188               return MDBX_SUCCESS;
18189             if (flags & MDBX_NODUPDATA)
18190               return MDBX_KEYEXIST;
18191             if (flags & MDBX_APPENDDUP)
18192               return MDBX_EKEYMISMATCH;
18193             if (likely(unsure_equal(mc->mc_dbx->md_dcmp, data, &olddata)))
18194               /* data is match exactly byte-to-byte, nothing to update */
18195               return MDBX_SUCCESS;
18196             else {
18197               /* The data has differences, but the user-provided comparator
18198                * considers them equal. So continue update since called without.
18199                * Continue to update since was called without MDBX_NODUPDATA. */
18200             }
18201           }
18202       }
18203     } else if (unlikely(rc != MDBX_NOTFOUND))
18204       return rc;
18205   }
18206 
18207   mc->mc_flags &= ~C_DEL;
18208 
18209   /* Cursor is positioned, check for room in the dirty list */
18210   if (!nospill) {
18211     rdata = data;
18212     if (unlikely(flags & MDBX_MULTIPLE)) {
18213       rdata = &xdata;
18214       xdata.iov_len = data->iov_len * dcount;
18215     }
18216     if (unlikely(err = mdbx_cursor_spill(mc, key, rdata)))
18217       return err;
18218   }
18219 
18220   if (unlikely(rc == MDBX_NO_ROOT)) {
18221     /* new database, write a root leaf page */
18222     mdbx_debug("%s", "allocating new root leaf page");
18223     if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) {
18224       err = mdbx_touch_dbi(mc);
18225       if (unlikely(err != MDBX_SUCCESS))
18226         return err;
18227     }
18228     struct page_result npr = mdbx_page_new(mc, P_LEAF, 1);
18229     if (unlikely(npr.err != MDBX_SUCCESS))
18230       return npr.err;
18231     npr.err = mdbx_cursor_push(mc, npr.page);
18232     if (unlikely(npr.err != MDBX_SUCCESS))
18233       return npr.err;
18234     mc->mc_db->md_root = npr.page->mp_pgno;
18235     mc->mc_db->md_depth++;
18236     if (mc->mc_db->md_flags & MDBX_INTEGERKEY) {
18237       assert(key->iov_len >= mc->mc_dbx->md_klen_min &&
18238              key->iov_len <= mc->mc_dbx->md_klen_max);
18239       mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = key->iov_len;
18240     }
18241     if (mc->mc_db->md_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) {
18242       assert(data->iov_len >= mc->mc_dbx->md_vlen_min &&
18243              data->iov_len <= mc->mc_dbx->md_vlen_max);
18244       assert(mc->mc_xcursor != NULL);
18245       mc->mc_db->md_xsize = mc->mc_xcursor->mx_db.md_xsize =
18246           (unsigned)(mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max =
18247                          mc->mc_xcursor->mx_dbx.md_klen_min =
18248                              mc->mc_xcursor->mx_dbx.md_klen_max =
18249                                  data->iov_len);
18250     }
18251     if ((mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_DUPFIXED)) == MDBX_DUPFIXED)
18252       npr.page->mp_flags |= P_LEAF2;
18253     mc->mc_flags |= C_INITIALIZED;
18254   } else {
18255     /* make sure all cursor pages are writable */
18256     err = mdbx_cursor_touch(mc);
18257     if (unlikely(err))
18258       return err;
18259   }
18260 
18261   bool insert_key, insert_data, do_sub = false;
18262   insert_key = insert_data = (rc != MDBX_SUCCESS);
18263   uint16_t fp_flags = P_LEAF;
18264   MDBX_page *fp = env->me_pbuf;
18265   fp->mp_txnid = mc->mc_txn->mt_front;
18266   if (insert_key) {
18267     /* The key does not exist */
18268     mdbx_debug("inserting key at index %i", mc->mc_ki[mc->mc_top]);
18269     if ((mc->mc_db->md_flags & MDBX_DUPSORT) &&
18270         node_size(key, data) > env->me_leaf_nodemax) {
18271       /* Too big for a node, insert in sub-DB.  Set up an empty
18272        * "old sub-page" for prep_subDB to expand to a full page. */
18273       fp->mp_leaf2_ksize =
18274           (mc->mc_db->md_flags & MDBX_DUPFIXED) ? (uint16_t)data->iov_len : 0;
18275       fp->mp_lower = fp->mp_upper = 0;
18276       olddata.iov_len = PAGEHDRSZ;
18277       goto prep_subDB;
18278     }
18279   } else {
18280     /* there's only a key anyway, so this is a no-op */
18281     if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
18282       char *ptr;
18283       unsigned ksize = mc->mc_db->md_xsize;
18284       if (unlikely(key->iov_len != ksize))
18285         return MDBX_BAD_VALSIZE;
18286       ptr = page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
18287       memcpy(ptr, key->iov_base, ksize);
18288     fix_parent:
18289       /* if overwriting slot 0 of leaf, need to
18290        * update branch key if there is a parent page */
18291       if (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
18292         unsigned dtop = 1;
18293         mc->mc_top--;
18294         /* slot 0 is always an empty key, find real slot */
18295         while (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
18296           mc->mc_top--;
18297           dtop++;
18298         }
18299         err = MDBX_SUCCESS;
18300         if (mc->mc_ki[mc->mc_top])
18301           err = mdbx_update_key(mc, key);
18302         mdbx_cassert(mc, mc->mc_top + dtop < UINT16_MAX);
18303         mc->mc_top += (uint16_t)dtop;
18304         if (unlikely(err != MDBX_SUCCESS))
18305           return err;
18306       }
18307 
18308       if (mdbx_audit_enabled()) {
18309         err = mdbx_cursor_check(mc, 0);
18310         if (unlikely(err != MDBX_SUCCESS))
18311           return err;
18312       }
18313       return MDBX_SUCCESS;
18314     }
18315 
18316   more:;
18317     if (mdbx_audit_enabled()) {
18318       err = mdbx_cursor_check(mc, 0);
18319       if (unlikely(err != MDBX_SUCCESS))
18320         return err;
18321     }
18322     MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
18323 
18324     /* Large/Overflow page overwrites need special handling */
18325     if (unlikely(F_ISSET(node_flags(node), F_BIGDATA))) {
18326       int dpages = (node_size(key, data) > env->me_leaf_nodemax)
18327                        ? number_of_ovpages(env, data->iov_len)
18328                        : 0;
18329 
18330       const pgno_t pgno = node_largedata_pgno(node);
18331       struct page_result pgr = mdbx_page_get_ex(
18332           mc, pgno, pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn));
18333       if (unlikely(pgr.err != MDBX_SUCCESS))
18334         return pgr.err;
18335       if (unlikely(!IS_OVERFLOW(pgr.page)))
18336         return MDBX_CORRUPTED;
18337 
18338       /* Is the ov page from this txn (or a parent) and big enough? */
18339       int ovpages = pgr.page->mp_pages;
18340       if (!IS_FROZEN(mc->mc_txn, pgr.page) &&
18341           (unlikely(mc->mc_flags & C_GCFREEZE)
18342                ? (ovpages >= dpages)
18343                : (ovpages ==
18344                   /* LY: add configurable threshold to keep reserve space */
18345                   dpages))) {
18346         /* yes, overwrite it. */
18347         if (!IS_MODIFIABLE(mc->mc_txn, pgr.page)) {
18348           if (IS_SPILLED(mc->mc_txn, pgr.page)) {
18349             pgr = /* TODO: avoid search and get txn & spill-index from
18350                      page_result */
18351                 mdbx_page_unspill(mc->mc_txn, pgr.page);
18352             if (unlikely(pgr.err))
18353               return pgr.err;
18354           } else {
18355             if (unlikely(!mc->mc_txn->mt_parent)) {
18356               mdbx_error(
18357                   "Unexpected not frozen/modifiable/spilled but shadowed %s "
18358                   "page %" PRIaPGNO " mod-txnid %" PRIaTXN ","
18359                   " without parent transaction, current txn %" PRIaTXN
18360                   " front %" PRIaTXN,
18361                   "overflow/large", pgno, pgr.page->mp_txnid,
18362                   mc->mc_txn->mt_txnid, mc->mc_txn->mt_front);
18363               return MDBX_PROBLEM;
18364             }
18365 
18366             /* It is writable only in a parent txn */
18367             MDBX_page *np = mdbx_page_malloc(mc->mc_txn, ovpages);
18368             if (unlikely(!np))
18369               return MDBX_ENOMEM;
18370 
18371             memcpy(np, pgr.page, PAGEHDRSZ); /* Copy header of page */
18372             err = mdbx_page_dirty(mc->mc_txn, pgr.page = np, ovpages);
18373             if (unlikely(err != MDBX_SUCCESS))
18374               return err;
18375 
18376 #if MDBX_ENABLE_PGOP_STAT
18377             mc->mc_txn->mt_env->me_lck->mti_pgop_stat.clone.weak += ovpages;
18378 #endif /* MDBX_ENABLE_PGOP_STAT */
18379             mdbx_cassert(mc, mdbx_dirtylist_check(mc->mc_txn));
18380           }
18381         }
18382         node_set_ds(node, data->iov_len);
18383         if (F_ISSET(flags, MDBX_RESERVE))
18384           data->iov_base = page_data(pgr.page);
18385         else
18386           memcpy(page_data(pgr.page), data->iov_base, data->iov_len);
18387 
18388         if (mdbx_audit_enabled()) {
18389           err = mdbx_cursor_check(mc, 0);
18390           if (unlikely(err != MDBX_SUCCESS))
18391             return err;
18392         }
18393         return MDBX_SUCCESS;
18394       }
18395 
18396       if ((err = mdbx_page_retire(mc, pgr.page)) != MDBX_SUCCESS)
18397         return err;
18398     } else {
18399       olddata.iov_len = node_ds(node);
18400       olddata.iov_base = node_data(node);
18401       mdbx_cassert(mc, (char *)olddata.iov_base + olddata.iov_len <=
18402                            (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize);
18403 
18404       /* DB has dups? */
18405       if (F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)) {
18406         /* Prepare (sub-)page/sub-DB to accept the new item, if needed.
18407          * fp: old sub-page or a header faking it.
18408          * mp: new (sub-)page.  offset: growth in page size.
18409          * xdata: node data with new page or DB. */
18410         unsigned i;
18411         size_t offset = 0;
18412         MDBX_page *mp = fp = xdata.iov_base = env->me_pbuf;
18413         mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
18414 
18415         /* Was a single item before, must convert now */
18416         if (!F_ISSET(node_flags(node), F_DUPDATA)) {
18417 
18418           /* does data match? */
18419           const int cmp = mc->mc_dbx->md_dcmp(data, &olddata);
18420           if ((flags & MDBX_APPENDDUP) && unlikely(cmp <= 0))
18421             return MDBX_EKEYMISMATCH;
18422           if (cmp == 0) {
18423             if (flags & MDBX_NODUPDATA)
18424               return MDBX_KEYEXIST;
18425             if (likely(unsure_equal(mc->mc_dbx->md_dcmp, data, &olddata))) {
18426               /* data is match exactly byte-to-byte, nothing to update */
18427               if (unlikely(flags & MDBX_MULTIPLE)) {
18428                 rc = MDBX_SUCCESS;
18429                 goto continue_multiple;
18430               }
18431               return MDBX_SUCCESS;
18432             } else {
18433               /* The data has differences, but the user-provided comparator
18434                * considers them equal. So continue update since called without.
18435                * Continue to update since was called without MDBX_NODUPDATA. */
18436             }
18437             mdbx_cassert(mc, node_size(key, data) <= env->me_leaf_nodemax);
18438             goto current;
18439           }
18440 
18441           /* Just overwrite the current item */
18442           if (flags & MDBX_CURRENT) {
18443             mdbx_cassert(mc, node_size(key, data) <= env->me_leaf_nodemax);
18444             goto current;
18445           }
18446 
18447           /* Back up original data item */
18448           memcpy(dkey.iov_base = fp + 1, olddata.iov_base,
18449                  dkey.iov_len = olddata.iov_len);
18450           dupdata_flag = 1;
18451 
18452           /* Make sub-page header for the dup items, with dummy body */
18453           fp->mp_flags = P_LEAF | P_SUBP;
18454           fp->mp_lower = 0;
18455           xdata.iov_len = PAGEHDRSZ + dkey.iov_len + data->iov_len;
18456           if (mc->mc_db->md_flags & MDBX_DUPFIXED) {
18457             fp->mp_flags |= P_LEAF2;
18458             fp->mp_leaf2_ksize = (uint16_t)data->iov_len;
18459             xdata.iov_len += 2 * data->iov_len; /* leave space for 2 more */
18460             mdbx_cassert(mc, xdata.iov_len <= env->me_psize);
18461           } else {
18462             xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) +
18463                              (dkey.iov_len & 1) + (data->iov_len & 1);
18464             mdbx_cassert(mc, xdata.iov_len <= env->me_psize);
18465           }
18466           fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEHDRSZ);
18467           olddata.iov_len = xdata.iov_len; /* pretend olddata is fp */
18468         } else if (node_flags(node) & F_SUBDATA) {
18469           /* Data is on sub-DB, just store it */
18470           flags |= F_DUPDATA | F_SUBDATA;
18471           goto put_sub;
18472         } else {
18473           /* Data is on sub-page */
18474           fp = olddata.iov_base;
18475           switch (flags) {
18476           default:
18477             if (!(mc->mc_db->md_flags & MDBX_DUPFIXED)) {
18478               offset = node_size(data, nullptr) + sizeof(indx_t);
18479               break;
18480             }
18481             offset = fp->mp_leaf2_ksize;
18482             if (page_room(fp) < offset) {
18483               offset *= 4; /* space for 4 more */
18484               break;
18485             }
18486             /* FALLTHRU: Big enough MDBX_DUPFIXED sub-page */
18487             __fallthrough;
18488           case MDBX_CURRENT | MDBX_NODUPDATA:
18489           case MDBX_CURRENT:
18490             fp->mp_txnid = mc->mc_txn->mt_front;
18491             fp->mp_pgno = mp->mp_pgno;
18492             mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
18493             flags |= F_DUPDATA;
18494             goto put_sub;
18495           }
18496           xdata.iov_len = olddata.iov_len + offset;
18497         }
18498 
18499         fp_flags = fp->mp_flags;
18500         if (node_size_len(node_ks(node), xdata.iov_len) >
18501             env->me_leaf_nodemax) {
18502           /* Too big for a sub-page, convert to sub-DB */
18503           fp_flags &= ~P_SUBP;
18504         prep_subDB:
18505           nested_dupdb.md_xsize = 0;
18506           nested_dupdb.md_flags = flags_db2sub(mc->mc_db->md_flags);
18507           if (mc->mc_db->md_flags & MDBX_DUPFIXED) {
18508             fp_flags |= P_LEAF2;
18509             nested_dupdb.md_xsize = fp->mp_leaf2_ksize;
18510           }
18511           nested_dupdb.md_depth = 1;
18512           nested_dupdb.md_branch_pages = 0;
18513           nested_dupdb.md_leaf_pages = 1;
18514           nested_dupdb.md_overflow_pages = 0;
18515           nested_dupdb.md_entries = page_numkeys(fp);
18516           xdata.iov_len = sizeof(nested_dupdb);
18517           xdata.iov_base = &nested_dupdb;
18518           const struct page_result par = mdbx_page_alloc(mc, 1, MDBX_ALLOC_ALL);
18519           mp = par.page;
18520           if (unlikely(par.err != MDBX_SUCCESS))
18521             return par.err;
18522           mc->mc_db->md_leaf_pages += 1;
18523           mdbx_cassert(mc, env->me_psize > olddata.iov_len);
18524           offset = env->me_psize - (unsigned)olddata.iov_len;
18525           flags |= F_DUPDATA | F_SUBDATA;
18526           nested_dupdb.md_root = mp->mp_pgno;
18527           nested_dupdb.md_seq = 0;
18528           nested_dupdb.md_mod_txnid = mc->mc_txn->mt_txnid;
18529           sub_root = mp;
18530         }
18531         if (mp != fp) {
18532           mp->mp_flags = fp_flags;
18533           mp->mp_txnid = mc->mc_txn->mt_front;
18534           mp->mp_leaf2_ksize = fp->mp_leaf2_ksize;
18535           mp->mp_lower = fp->mp_lower;
18536           mdbx_cassert(mc, fp->mp_upper + offset <= UINT16_MAX);
18537           mp->mp_upper = (indx_t)(fp->mp_upper + offset);
18538           if (unlikely(fp_flags & P_LEAF2)) {
18539             memcpy(page_data(mp), page_data(fp),
18540                    page_numkeys(fp) * fp->mp_leaf2_ksize);
18541           } else {
18542             memcpy((char *)mp + mp->mp_upper + PAGEHDRSZ,
18543                    (char *)fp + fp->mp_upper + PAGEHDRSZ,
18544                    olddata.iov_len - fp->mp_upper - PAGEHDRSZ);
18545             memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs),
18546                    page_numkeys(fp) * sizeof(mp->mp_ptrs[0]));
18547             for (i = 0; i < page_numkeys(fp); i++) {
18548               mdbx_cassert(mc, mp->mp_ptrs[i] + offset <= UINT16_MAX);
18549               mp->mp_ptrs[i] += (indx_t)offset;
18550             }
18551           }
18552         }
18553 
18554         rdata = &xdata;
18555         flags |= F_DUPDATA;
18556         do_sub = true;
18557         if (!insert_key)
18558           mdbx_node_del(mc, 0);
18559         goto new_sub;
18560       }
18561 
18562       /* MDBX passes F_SUBDATA in 'flags' to write a DB record */
18563       if (unlikely((node_flags(node) ^ flags) & F_SUBDATA))
18564         return MDBX_INCOMPATIBLE;
18565 
18566     current:
18567       if (data->iov_len == olddata.iov_len) {
18568         mdbx_cassert(mc, EVEN(key->iov_len) == EVEN(node_ks(node)));
18569         /* same size, just replace it. Note that we could
18570          * also reuse this node if the new data is smaller,
18571          * but instead we opt to shrink the node in that case. */
18572         if (F_ISSET(flags, MDBX_RESERVE))
18573           data->iov_base = olddata.iov_base;
18574         else if (!(mc->mc_flags & C_SUB))
18575           memcpy(olddata.iov_base, data->iov_base, data->iov_len);
18576         else {
18577           mdbx_cassert(mc, page_numkeys(mc->mc_pg[mc->mc_top]) == 1);
18578           mdbx_cassert(mc, PAGETYPE(mc->mc_pg[mc->mc_top]) == P_LEAF);
18579           mdbx_cassert(mc, node_ds(node) == 0);
18580           mdbx_cassert(mc, node_flags(node) == 0);
18581           mdbx_cassert(mc, key->iov_len < UINT16_MAX);
18582           node_set_ks(node, key->iov_len);
18583           memcpy(node_key(node), key->iov_base, key->iov_len);
18584           mdbx_cassert(mc, (char *)node_key(node) + node_ds(node) <
18585                                (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize);
18586           goto fix_parent;
18587         }
18588 
18589         if (mdbx_audit_enabled()) {
18590           err = mdbx_cursor_check(mc, 0);
18591           if (unlikely(err != MDBX_SUCCESS))
18592             return err;
18593         }
18594         return MDBX_SUCCESS;
18595       }
18596     }
18597     mdbx_node_del(mc, 0);
18598   }
18599 
18600   rdata = data;
18601 
18602 new_sub:;
18603   unsigned nflags = flags & NODE_ADD_FLAGS;
18604   size_t nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->iov_len
18605                                                  : leaf_size(env, key, rdata);
18606   if (page_room(mc->mc_pg[mc->mc_top]) < nsize) {
18607     if (!insert_key)
18608       nflags |= MDBX_SPLIT_REPLACE;
18609     rc = mdbx_page_split(mc, key, rdata, P_INVALID, nflags);
18610     if (rc == MDBX_SUCCESS && mdbx_audit_enabled())
18611       rc = mdbx_cursor_check(mc, 0);
18612   } else {
18613     /* There is room already in this leaf page. */
18614     if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
18615       mdbx_cassert(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0 &&
18616                            rdata->iov_len == 0);
18617       rc = mdbx_node_add_leaf2(mc, mc->mc_ki[mc->mc_top], key);
18618     } else
18619       rc = mdbx_node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, rdata, nflags);
18620     if (likely(rc == 0)) {
18621       /* Adjust other cursors pointing to mp */
18622       const MDBX_dbi dbi = mc->mc_dbi;
18623       const unsigned i = mc->mc_top;
18624       MDBX_page *const mp = mc->mc_pg[i];
18625       for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[dbi]; m2;
18626            m2 = m2->mc_next) {
18627         MDBX_cursor *m3 =
18628             (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
18629         if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp)
18630           continue;
18631         if (m3->mc_ki[i] >= mc->mc_ki[i])
18632           m3->mc_ki[i] += insert_key;
18633         if (XCURSOR_INITED(m3))
18634           XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]);
18635       }
18636     }
18637   }
18638 
18639   if (likely(rc == MDBX_SUCCESS)) {
18640     /* Now store the actual data in the child DB. Note that we're
18641      * storing the user data in the keys field, so there are strict
18642      * size limits on dupdata. The actual data fields of the child
18643      * DB are all zero size. */
18644     if (do_sub) {
18645       int xflags;
18646       size_t ecount;
18647     put_sub:
18648       xdata.iov_len = 0;
18649       xdata.iov_base = nullptr;
18650       MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
18651 #define SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE 1
18652       STATIC_ASSERT(
18653           (MDBX_NODUPDATA >> SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE) ==
18654           MDBX_NOOVERWRITE);
18655       xflags = MDBX_CURRENT | MDBX_NOSPILL |
18656                ((flags & MDBX_NODUPDATA) >>
18657                 SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE);
18658       if ((flags & MDBX_CURRENT) == 0) {
18659         xflags -= MDBX_CURRENT;
18660         err = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]);
18661         if (unlikely(err != MDBX_SUCCESS))
18662           return err;
18663       }
18664       if (sub_root)
18665         mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root;
18666       /* converted, write the original data first */
18667       if (dupdata_flag) {
18668         rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags);
18669         if (unlikely(rc))
18670           goto bad_sub;
18671         /* we've done our job */
18672         dkey.iov_len = 0;
18673       }
18674       if (!(node_flags(node) & F_SUBDATA) || sub_root) {
18675         /* Adjust other cursors pointing to mp */
18676         MDBX_cursor *m2;
18677         MDBX_xcursor *mx = mc->mc_xcursor;
18678         unsigned i = mc->mc_top;
18679         MDBX_page *mp = mc->mc_pg[i];
18680         const int nkeys = page_numkeys(mp);
18681 
18682         for (m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) {
18683           if (m2 == mc || m2->mc_snum < mc->mc_snum)
18684             continue;
18685           if (!(m2->mc_flags & C_INITIALIZED))
18686             continue;
18687           if (m2->mc_pg[i] == mp) {
18688             if (m2->mc_ki[i] == mc->mc_ki[i]) {
18689               err = mdbx_xcursor_init2(m2, mx, dupdata_flag);
18690               if (unlikely(err != MDBX_SUCCESS))
18691                 return err;
18692             } else if (!insert_key && m2->mc_ki[i] < nkeys) {
18693               XCURSOR_REFRESH(m2, mp, m2->mc_ki[i]);
18694             }
18695           }
18696         }
18697       }
18698       mdbx_cassert(mc, mc->mc_xcursor->mx_db.md_entries < PTRDIFF_MAX);
18699       ecount = (size_t)mc->mc_xcursor->mx_db.md_entries;
18700 #define SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND 1
18701       STATIC_ASSERT((MDBX_APPENDDUP >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND) ==
18702                     MDBX_APPEND);
18703       xflags |= (flags & MDBX_APPENDDUP) >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND;
18704       rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
18705       if (flags & F_SUBDATA) {
18706         void *db = node_data(node);
18707         mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid;
18708         memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db));
18709       }
18710       insert_data = (ecount != (size_t)mc->mc_xcursor->mx_db.md_entries);
18711     }
18712     /* Increment count unless we just replaced an existing item. */
18713     if (insert_data)
18714       mc->mc_db->md_entries++;
18715     if (insert_key) {
18716       /* Invalidate txn if we created an empty sub-DB */
18717       if (unlikely(rc))
18718         goto bad_sub;
18719       /* If we succeeded and the key didn't exist before,
18720        * make sure the cursor is marked valid. */
18721       mc->mc_flags |= C_INITIALIZED;
18722     }
18723     if (unlikely(flags & MDBX_MULTIPLE)) {
18724       if (likely(rc == MDBX_SUCCESS)) {
18725       continue_multiple:
18726         mcount++;
18727         /* let caller know how many succeeded, if any */
18728         data[1].iov_len = mcount;
18729         if (mcount < dcount) {
18730           data[0].iov_base = (char *)data[0].iov_base + data[0].iov_len;
18731           insert_key = insert_data = false;
18732           goto more;
18733         }
18734       }
18735     }
18736     if (rc == MDBX_SUCCESS && mdbx_audit_enabled())
18737       rc = mdbx_cursor_check(mc, 0);
18738     return rc;
18739   bad_sub:
18740     if (unlikely(rc == MDBX_KEYEXIST)) {
18741       /* should not happen, we deleted that item */
18742       mdbx_error("Unexpected %i error while put to nested dupsort's hive", rc);
18743       rc = MDBX_PROBLEM;
18744     }
18745   }
18746   mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
18747   return rc;
18748 }
18749 
/* Delete the key/data item the cursor currently points at.
 *
 * [in] mc    an initialized and positioned cursor.
 * [in] flags MDBX_ALLDUPS (or legacy MDBX_NODUPDATA) to remove all
 *            duplicates of the current key at once; MDBX_NOSPILL to skip
 *            the dirty-page spill check; F_SUBDATA is passed internally
 *            when deleting a sub-DB record.
 *
 * Returns MDBX_SUCCESS or an error code; unrecoverable failures also set
 * MDBX_TXN_ERROR on the transaction. */
int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) {
  if (unlikely(!mc))
    return MDBX_EINVAL;

  if (unlikely(mc->mc_signature != MDBX_MC_LIVE))
    return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
                                                     : MDBX_EBADSIGN;

  int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)))
    return MDBX_BAD_DBI;

  if (unlikely(!(mc->mc_flags & C_INITIALIZED)))
    return MDBX_ENODATA;

  if (unlikely(mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top])))
    return MDBX_NOTFOUND;

  /* Make room in the dirty list, unless the caller asked to skip it. */
  if (likely((flags & MDBX_NOSPILL) == 0) &&
      unlikely(rc = mdbx_cursor_spill(mc, NULL, NULL)))
    return rc;

  /* Make sure all pages on the cursor's stack are writable. */
  rc = mdbx_cursor_touch(mc);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  MDBX_page *mp = mc->mc_pg[mc->mc_top];
  if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mp)))
    return MDBX_CORRUPTED;
  if (IS_LEAF2(mp)) {
    /* LEAF2 pages (fixed-size keys, no data) only occur inside nested
     * dupsort trees, i.e. for C_SUB cursors. */
    if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) {
      /* note: fixed missing space between the page number and the text */
      mdbx_error("unexpected LEAF2-page %" PRIaPGNO " for non-dupsort cursor",
                 mp->mp_pgno);
      return MDBX_CORRUPTED;
    }
    goto del_key;
  }

  MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]);
  if (F_ISSET(node_flags(node), F_DUPDATA)) {
    if (flags & (MDBX_ALLDUPS | /* for compatibility */ MDBX_NODUPDATA)) {
      /* mdbx_cursor_del0() will subtract the final entry */
      mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1;
      mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
    } else {
      /* Delete a single duplicate via the nested (xcursor) tree. */
      if (!F_ISSET(node_flags(node), F_SUBDATA))
        mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node);
      rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, MDBX_NOSPILL);
      if (unlikely(rc))
        return rc;
      /* If sub-DB still has entries, we're done */
      if (mc->mc_xcursor->mx_db.md_entries) {
        if (node_flags(node) & F_SUBDATA) {
          /* update subDB info */
          void *db = node_data(node);
          mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid;
          memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db));
        } else {
          MDBX_cursor *m2;
          /* shrink fake page */
          mdbx_node_shrink(mp, mc->mc_ki[mc->mc_top]);
          node = page_node(mp, mc->mc_ki[mc->mc_top]);
          mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node);
          /* fix other sub-DB cursors pointed at fake pages on this page */
          for (m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) {
            if (m2 == mc || m2->mc_snum < mc->mc_snum)
              continue;
            if (!(m2->mc_flags & C_INITIALIZED))
              continue;
            if (m2->mc_pg[mc->mc_top] == mp) {
              MDBX_node *inner = node;
              if (m2->mc_ki[mc->mc_top] >= page_numkeys(mp))
                continue;
              if (m2->mc_ki[mc->mc_top] != mc->mc_ki[mc->mc_top]) {
                inner = page_node(mp, m2->mc_ki[mc->mc_top]);
                if (node_flags(inner) & F_SUBDATA)
                  continue;
              }
              m2->mc_xcursor->mx_cursor.mc_pg[0] = node_data(inner);
            }
          }
        }
        mc->mc_db->md_entries--;
        mdbx_cassert(mc, mc->mc_db->md_entries > 0 && mc->mc_db->md_depth > 0 &&
                             mc->mc_db->md_root != P_INVALID);
        return rc;
      } else {
        mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
      }
      /* otherwise fall thru and delete the sub-DB */
    }

    if (node_flags(node) & F_SUBDATA) {
      /* add all the child DB's pages to the free list */
      rc = mdbx_drop_tree(&mc->mc_xcursor->mx_cursor, false);
      if (unlikely(rc))
        goto fail;
    }
  }
  /* MDBX passes F_SUBDATA in 'flags' to delete a DB record */
  else if (unlikely((node_flags(node) ^ flags) & F_SUBDATA))
    return MDBX_INCOMPATIBLE;

  /* add overflow pages to free list */
  if (F_ISSET(node_flags(node), F_BIGDATA)) {
    MDBX_page *omp;
    if (unlikely((rc = mdbx_page_get(mc, node_largedata_pgno(node), &omp,
                                     pp_txnid4chk(mp, mc->mc_txn))) ||
                 (rc = mdbx_page_retire(mc, omp))))
      goto fail;
  }

del_key:
  return mdbx_cursor_del0(mc);

fail:
  mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
  return rc;
}
18872 
/* Allocate and initialize new pages for a database.
 * Set MDBX_TXN_ERROR on failure.
 *
 * [in] mc     a cursor on the database being added to.
 * [in] flags  flags defining what type of page is being allocated
 *             (P_BRANCH, P_LEAF or P_OVERFLOW).
 * [in] npages the number of pages to allocate. This is usually 1,
 *             unless allocating overflow pages for a large record.
 *
 * Returns a page_result: on success .err is MDBX_SUCCESS and .page points
 * to the freshly allocated page; otherwise .err carries the error code. */
static struct page_result mdbx_page_new(MDBX_cursor *mc, const unsigned flags,
                                        const unsigned npages) {
  struct page_result ret = mdbx_page_alloc(mc, npages, MDBX_ALLOC_ALL);
  if (unlikely(ret.err != MDBX_SUCCESS))
    return ret;

  mdbx_debug("db %u allocated new page %" PRIaPGNO ", num %u", mc->mc_dbi,
             ret.page->mp_pgno, npages);
  /* Stamp the page with the requested type and the transaction's front id. */
  ret.page->mp_flags = (uint16_t)flags;
  ret.page->mp_txnid = mc->mc_txn->mt_front;
  mdbx_cassert(mc, *mc->mc_dbistate & DBI_DIRTY);
  mdbx_cassert(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY);
#if MDBX_ENABLE_PGOP_STAT
  mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += npages;
#endif /* MDBX_ENABLE_PGOP_STAT */

  if (likely((flags & P_OVERFLOW) == 0)) {
    /* Branch/leaf page: starts empty (lower = 0, upper = usable space),
     * then bump the matching per-tree page counter. */
    STATIC_ASSERT(P_BRANCH == 1);
    const bool is_branch = flags & P_BRANCH;
    ret.page->mp_lower = 0;
    ret.page->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ);
    mc->mc_db->md_branch_pages += is_branch;
    mc->mc_db->md_leaf_pages += 1 - is_branch;
    if (unlikely(mc->mc_flags & C_SUB)) {
      /* Nested (dupsort) tree: account the page in the outer DB too. */
      MDBX_db *outer = mdbx_outer_db(mc);
      outer->md_branch_pages += is_branch;
      outer->md_leaf_pages += 1 - is_branch;
    }
  } else {
    /* Overflow/large page: record the chain length in the page header. */
    mc->mc_db->md_overflow_pages += npages;
    ret.page->mp_pages = npages;
    mdbx_cassert(mc, !(mc->mc_flags & C_SUB));
  }

  return ret;
}
18919 
mdbx_node_add_leaf2(MDBX_cursor * mc,unsigned indx,const MDBX_val * key)18920 static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc,
18921                                                    unsigned indx,
18922                                                    const MDBX_val *key) {
18923   MDBX_page *mp = mc->mc_pg[mc->mc_top];
18924   DKBUF_DEBUG;
18925   mdbx_debug("add to leaf2-%spage %" PRIaPGNO " index %i, "
18926              " key size %" PRIuPTR " [%s]",
18927              IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx,
18928              key ? key->iov_len : 0, DKEY_DEBUG(key));
18929 
18930   mdbx_cassert(mc, key);
18931   mdbx_cassert(mc, PAGETYPE(mp) == (P_LEAF | P_LEAF2));
18932   const unsigned ksize = mc->mc_db->md_xsize;
18933   mdbx_cassert(mc, ksize == key->iov_len);
18934   const unsigned nkeys = page_numkeys(mp);
18935 
18936   /* Just using these for counting */
18937   const intptr_t lower = mp->mp_lower + sizeof(indx_t);
18938   const intptr_t upper = mp->mp_upper - (ksize - sizeof(indx_t));
18939   if (unlikely(lower > upper)) {
18940     mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
18941     return MDBX_PAGE_FULL;
18942   }
18943   mp->mp_lower = (indx_t)lower;
18944   mp->mp_upper = (indx_t)upper;
18945 
18946   char *const ptr = page_leaf2key(mp, indx, ksize);
18947   mdbx_cassert(mc, nkeys >= indx);
18948   const unsigned diff = nkeys - indx;
18949   if (likely(diff > 0))
18950     /* Move higher keys up one slot. */
18951     memmove(ptr + ksize, ptr, diff * ksize);
18952   /* insert new key */
18953   memcpy(ptr, key->iov_base, ksize);
18954   return MDBX_SUCCESS;
18955 }
18956 
mdbx_node_add_branch(MDBX_cursor * mc,unsigned indx,const MDBX_val * key,pgno_t pgno)18957 static int __must_check_result mdbx_node_add_branch(MDBX_cursor *mc,
18958                                                     unsigned indx,
18959                                                     const MDBX_val *key,
18960                                                     pgno_t pgno) {
18961   MDBX_page *mp = mc->mc_pg[mc->mc_top];
18962   DKBUF_DEBUG;
18963   mdbx_debug("add to branch-%spage %" PRIaPGNO " index %i, node-pgno %" PRIaPGNO
18964              " key size %" PRIuPTR " [%s]",
18965              IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, pgno,
18966              key ? key->iov_len : 0, DKEY_DEBUG(key));
18967 
18968   mdbx_cassert(mc, PAGETYPE(mp) == P_BRANCH);
18969   STATIC_ASSERT(NODESIZE % 2 == 0);
18970 
18971   /* Move higher pointers up one slot. */
18972   const unsigned nkeys = page_numkeys(mp);
18973   mdbx_cassert(mc, nkeys >= indx);
18974   for (unsigned i = nkeys; i > indx; --i)
18975     mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];
18976 
18977   /* Adjust free space offsets. */
18978   const size_t branch_bytes = branch_size(mc->mc_txn->mt_env, key);
18979   const intptr_t lower = mp->mp_lower + sizeof(indx_t);
18980   const intptr_t upper = mp->mp_upper - (branch_bytes - sizeof(indx_t));
18981   if (unlikely(lower > upper)) {
18982     mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
18983     return MDBX_PAGE_FULL;
18984   }
18985   mp->mp_lower = (indx_t)lower;
18986   mp->mp_ptrs[indx] = mp->mp_upper = (indx_t)upper;
18987 
18988   /* Write the node data. */
18989   MDBX_node *node = page_node(mp, indx);
18990   node_set_pgno(node, pgno);
18991   node_set_flags(node, 0);
18992   UNALIGNED_POKE_8(node, MDBX_node, mn_extra, 0);
18993   node_set_ks(node, 0);
18994   if (likely(key != NULL)) {
18995     node_set_ks(node, key->iov_len);
18996     memcpy(node_key(node), key->iov_base, key->iov_len);
18997   }
18998   return MDBX_SUCCESS;
18999 }
19000 
/* Add a node (key/data pair) to a leaf page at position 'indx'.
 *
 * Data too large for an inline node is spilled to freshly allocated
 * overflow page(s); then only the overflow pgno is stored inline and the
 * node is flagged F_BIGDATA.
 *
 * [in] mc        cursor whose top page receives the node.
 * [in] indx      slot index at which to insert.
 * [in] key       the key to store (must not be NULL).
 * [in,out] data  the data to store; with MDBX_RESERVE, data->iov_base is
 *                set to the destination inside the page (or overflow page)
 *                for the caller to fill in later.
 * [in] flags     node flags (F_BIGDATA, F_DUPDATA, F_SUBDATA, MDBX_RESERVE).
 *
 * Returns MDBX_SUCCESS, MDBX_PAGE_FULL (also sets MDBX_TXN_ERROR),
 * MDBX_PROBLEM, or an allocation error. */
static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc,
                                                  unsigned indx,
                                                  const MDBX_val *key,
                                                  MDBX_val *data,
                                                  unsigned flags) {
  MDBX_page *mp = mc->mc_pg[mc->mc_top];
  DKBUF_DEBUG;
  mdbx_debug("add to leaf-%spage %" PRIaPGNO " index %i, data size %" PRIuPTR
             " key size %" PRIuPTR " [%s]",
             IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx,
             data ? data->iov_len : 0, key ? key->iov_len : 0, DKEY_DEBUG(key));
  mdbx_cassert(mc, key != NULL && data != NULL);
  mdbx_cassert(mc, PAGETYPE(mp) == P_LEAF);
  mdbx_cassert(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data));
  MDBX_page *largepage = NULL;

  /* Decide where the data goes and how much in-page space is needed. */
  size_t node_bytes;
  if (unlikely(flags & F_BIGDATA)) {
    /* Data already on overflow page. */
    STATIC_ASSERT(sizeof(pgno_t) % 2 == 0);
    node_bytes =
        node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t);
  } else if (unlikely(node_size(key, data) >
                      mc->mc_txn->mt_env->me_leaf_nodemax)) {
    /* Put data on overflow page. */
    if (unlikely(mc->mc_db->md_flags & MDBX_DUPSORT)) {
      /* Big data is incompatible with dupsort: dup items must fit inline. */
      mdbx_error("Unexpected target %s flags 0x%x for large data-item",
                 "dupsort-db", mc->mc_db->md_flags);
      return MDBX_PROBLEM;
    }
    if (unlikely(flags & (F_DUPDATA | F_SUBDATA))) {
      mdbx_error("Unexpected target %s flags 0x%x for large data-item", "node",
                 flags);
      return MDBX_PROBLEM;
    }
    const pgno_t ovpages = number_of_ovpages(mc->mc_txn->mt_env, data->iov_len);
    const struct page_result npr = mdbx_page_new(mc, P_OVERFLOW, ovpages);
    if (unlikely(npr.err != MDBX_SUCCESS))
      return npr.err;
    largepage = npr.page;
    mdbx_debug("allocated %u overflow page(s) %" PRIaPGNO "for %" PRIuPTR
               " data bytes",
               largepage->mp_pages, largepage->mp_pgno, data->iov_len);
    flags |= F_BIGDATA;
    node_bytes =
        node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t);
  } else {
    /* Small enough: key and data live inline in the node. */
    node_bytes = node_size(key, data) + sizeof(indx_t);
  }
  mdbx_cassert(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data));

  /* Move higher pointers up one slot. */
  const unsigned nkeys = page_numkeys(mp);
  mdbx_cassert(mc, nkeys >= indx);
  for (unsigned i = nkeys; i > indx; --i)
    mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];

  /* Adjust free space offsets. */
  const intptr_t lower = mp->mp_lower + sizeof(indx_t);
  const intptr_t upper = mp->mp_upper - (node_bytes - sizeof(indx_t));
  if (unlikely(lower > upper)) {
    mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
    return MDBX_PAGE_FULL;
  }
  mp->mp_lower = (indx_t)lower;
  mp->mp_ptrs[indx] = mp->mp_upper = (indx_t)upper;

  /* Write the node data. */
  MDBX_node *node = page_node(mp, indx);
  node_set_ks(node, key->iov_len);
  node_set_flags(node, (uint8_t)flags);
  UNALIGNED_POKE_8(node, MDBX_node, mn_extra, 0);
  node_set_ds(node, data->iov_len);
  memcpy(node_key(node), key->iov_base, key->iov_len);

  void *nodedata = node_data(node);
  if (likely(largepage == NULL)) {
    /* Inline (or pre-existing overflow) data. */
    if (unlikely(flags & F_BIGDATA))
      /* Caller supplied the overflow pgno inside data->iov_base. */
      memcpy(nodedata, data->iov_base, sizeof(pgno_t));
    else if (unlikely(flags & MDBX_RESERVE))
      /* Hand the in-page destination back to the caller. */
      data->iov_base = nodedata;
    else if (likely(nodedata != data->iov_base &&
                    data->iov_len /* to avoid UBSAN traps*/ != 0))
      memcpy(nodedata, data->iov_base, data->iov_len);
  } else {
    /* Freshly allocated overflow page: store its pgno inline, data
     * itself goes onto the overflow page. */
    poke_pgno(nodedata, largepage->mp_pgno);
    nodedata = page_data(largepage);
    if (unlikely(flags & MDBX_RESERVE))
      data->iov_base = nodedata;
    else if (likely(nodedata != data->iov_base &&
                    data->iov_len /* to avoid UBSAN traps*/ != 0))
      memcpy(nodedata, data->iov_base, data->iov_len);
  }
  return MDBX_SUCCESS;
}
19096 
/* Delete the specified node from a page.
 * [in] mc Cursor pointing to the node to delete.
 * [in] ksize The size of a node. Only used if the page is
 * part of a MDBX_DUPFIXED database. */
static void mdbx_node_del(MDBX_cursor *mc, size_t ksize) {
  MDBX_page *mp = mc->mc_pg[mc->mc_top];
  int indx = mc->mc_ki[mc->mc_top];
  int i, j, nkeys, ptr;
  MDBX_node *node;
  char *base;

  mdbx_debug("delete node %u on %s page %" PRIaPGNO, indx,
             IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno);
  nkeys = page_numkeys(mp);
  mdbx_cassert(mc, indx < nkeys);

  if (IS_LEAF2(mp)) {
    /* LEAF2 pages hold fixed-size keys back-to-back without node
     * headers: close the gap and return the freed space. */
    mdbx_cassert(mc, ksize >= sizeof(indx_t));
    unsigned diff = nkeys - 1 - indx;
    base = page_leaf2key(mp, indx, ksize);
    if (diff)
      memmove(base, base + ksize, diff * ksize);
    mdbx_cassert(mc, mp->mp_lower >= sizeof(indx_t));
    mp->mp_lower -= sizeof(indx_t);
    mdbx_cassert(mc,
                 (size_t)UINT16_MAX - mp->mp_upper >= ksize - sizeof(indx_t));
    mp->mp_upper += (indx_t)(ksize - sizeof(indx_t));
    return;
  }

  node = page_node(mp, indx);
  /* On a branch page only node 0 may carry an empty key. */
  mdbx_cassert(mc, !IS_BRANCH(mp) || indx || node_ks(node) == 0);
  /* Size of the node being removed: header + key, plus (for leaves)
   * either the inline data or the overflow pgno; rounded up to even. */
  size_t sz = NODESIZE + node_ks(node);
  if (IS_LEAF(mp)) {
    if (F_ISSET(node_flags(node), F_BIGDATA))
      sz += sizeof(pgno_t);
    else
      sz += node_ds(node);
  }
  sz = EVEN(sz);

  /* Compact the pointer array; nodes stored below the removed one will
   * slide up by sz bytes, so bump their offsets accordingly. */
  ptr = mp->mp_ptrs[indx];
  for (i = j = 0; i < nkeys; i++) {
    if (i != indx) {
      mp->mp_ptrs[j] = mp->mp_ptrs[i];
      if (mp->mp_ptrs[i] < ptr) {
        mdbx_cassert(mc, (size_t)UINT16_MAX - mp->mp_ptrs[j] >= sz);
        mp->mp_ptrs[j] += (indx_t)sz;
      }
      j++;
    }
  }

  /* Slide the node heap up over the removed node's bytes. */
  base = (char *)mp + mp->mp_upper + PAGEHDRSZ;
  memmove(base + sz, base, ptr - mp->mp_upper);

  mdbx_cassert(mc, mp->mp_lower >= sizeof(indx_t));
  mp->mp_lower -= sizeof(indx_t);
  mdbx_cassert(mc, (size_t)UINT16_MAX - mp->mp_upper >= sz);
  mp->mp_upper += (indx_t)sz;

#if MDBX_DEBUG > 0
  if (mdbx_audit_enabled()) {
    int page_check_err = mdbx_page_check(mc, mp, C_UPDATING);
    mdbx_cassert(mc, page_check_err == MDBX_SUCCESS);
  }
#endif
}
19165 
19166 /* Compact the main page after deleting a node on a subpage.
19167  * [in] mp The main page to operate on.
19168  * [in] indx The index of the subpage on the main page. */
mdbx_node_shrink(MDBX_page * mp,unsigned indx)19169 static void mdbx_node_shrink(MDBX_page *mp, unsigned indx) {
19170   MDBX_node *node;
19171   MDBX_page *sp, *xp;
19172   char *base;
19173   size_t nsize, delta, len, ptr;
19174   int i;
19175 
19176   node = page_node(mp, indx);
19177   sp = (MDBX_page *)node_data(node);
19178   delta = page_room(sp);
19179   assert(delta > 0);
19180 
19181   /* Prepare to shift upward, set len = length(subpage part to shift) */
19182   if (IS_LEAF2(sp)) {
19183     delta &= /* do not make the node uneven-sized */ ~(size_t)1;
19184     if (unlikely(delta) == 0)
19185       return;
19186     nsize = node_ds(node) - delta;
19187     assert(nsize % 1 == 0);
19188     len = nsize;
19189   } else {
19190     xp = (MDBX_page *)((char *)sp + delta); /* destination subpage */
19191     for (i = page_numkeys(sp); --i >= 0;) {
19192       assert(sp->mp_ptrs[i] >= delta);
19193       xp->mp_ptrs[i] = (indx_t)(sp->mp_ptrs[i] - delta);
19194     }
19195     nsize = node_ds(node) - delta;
19196     len = PAGEHDRSZ;
19197   }
19198   sp->mp_upper = sp->mp_lower;
19199   sp->mp_pgno = mp->mp_pgno;
19200   node_set_ds(node, nsize);
19201 
19202   /* Shift <lower nodes...initial part of subpage> upward */
19203   base = (char *)mp + mp->mp_upper + PAGEHDRSZ;
19204   memmove(base + delta, base, (char *)sp + len - base);
19205 
19206   ptr = mp->mp_ptrs[indx];
19207   for (i = page_numkeys(mp); --i >= 0;) {
19208     if (mp->mp_ptrs[i] <= ptr) {
19209       assert((size_t)UINT16_MAX - mp->mp_ptrs[i] >= delta);
19210       mp->mp_ptrs[i] += (indx_t)delta;
19211     }
19212   }
19213   assert((size_t)UINT16_MAX - mp->mp_upper >= delta);
19214   mp->mp_upper += (indx_t)delta;
19215 }
19216 
19217 /* Initial setup of a sorted-dups cursor.
19218  *
19219  * Sorted duplicates are implemented as a sub-database for the given key.
19220  * The duplicate data items are actually keys of the sub-database.
19221  * Operations on the duplicate data items are performed using a sub-cursor
19222  * initialized when the sub-database is first accessed. This function does
19223  * the preliminary setup of the sub-cursor, filling in the fields that
19224  * depend only on the parent DB.
19225  *
19226  * [in] mc The main cursor whose sorted-dups cursor is to be initialized. */
mdbx_xcursor_init0(MDBX_cursor * mc)19227 static int mdbx_xcursor_init0(MDBX_cursor *mc) {
19228   MDBX_xcursor *mx = mc->mc_xcursor;
19229   if (!MDBX_DISABLE_PAGECHECKS && unlikely(mx == nullptr)) {
19230     mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)",
19231                mc->mc_dbi);
19232     return MDBX_CORRUPTED;
19233   }
19234 
19235   mx->mx_cursor.mc_xcursor = NULL;
19236   mx->mx_cursor.mc_next = NULL;
19237   mx->mx_cursor.mc_txn = mc->mc_txn;
19238   mx->mx_cursor.mc_db = &mx->mx_db;
19239   mx->mx_cursor.mc_dbx = &mx->mx_dbx;
19240   mx->mx_cursor.mc_dbi = mc->mc_dbi;
19241   mx->mx_cursor.mc_dbistate = mc->mc_dbistate;
19242   mx->mx_cursor.mc_snum = 0;
19243   mx->mx_cursor.mc_top = 0;
19244   mx->mx_cursor.mc_flags = C_SUB | (mc->mc_flags & (C_COPYING | C_SKIPORD));
19245   mx->mx_dbx.md_name.iov_len = 0;
19246   mx->mx_dbx.md_name.iov_base = NULL;
19247   mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp;
19248   mx->mx_dbx.md_dcmp = NULL;
19249   mx->mx_dbx.md_klen_min = INT_MAX;
19250   mx->mx_dbx.md_vlen_min = mx->mx_dbx.md_klen_max = mx->mx_dbx.md_vlen_max = 0;
19251   return MDBX_SUCCESS;
19252 }
19253 
/* Final setup of a sorted-dups cursor.
 * Sets up the fields that depend on the data from the main cursor.
 * [in] mc The main cursor whose sorted-dups cursor is to be initialized.
 * [in] node The data containing the MDBX_db record for the sorted-dup database.
 */
static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node,
                              const MDBX_page *mp) {
  MDBX_xcursor *mx = mc->mc_xcursor;
  if (!MDBX_DISABLE_PAGECHECKS && unlikely(mx == nullptr)) {
    mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)",
               mc->mc_dbi);
    return MDBX_CORRUPTED;
  }

  const uint8_t flags = node_flags(node);
  switch (flags) {
  default:
    /* Any combination other than the two handled below is corruption. */
    mdbx_error("invalid node flags %u", flags);
    return MDBX_CORRUPTED;
  case F_DUPDATA | F_SUBDATA:
    /* Dups form a full nested database; node data is an MDBX_db record. */
    if (!MDBX_DISABLE_PAGECHECKS &&
        unlikely(node_ds(node) != sizeof(MDBX_db))) {
      mdbx_error("invalid nested-db record size %zu", node_ds(node));
      return MDBX_CORRUPTED;
    }
    memcpy(&mx->mx_db, node_data(node), sizeof(MDBX_db));
    /* The nested-db cannot have been modified later than its host page. */
    const txnid_t pp_txnid = mp->mp_txnid;
    if (!MDBX_DISABLE_PAGECHECKS &&
        unlikely(mx->mx_db.md_mod_txnid > pp_txnid)) {
      mdbx_error("nested-db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN
                 ")",
                 mx->mx_db.md_mod_txnid, pp_txnid);
      return MDBX_CORRUPTED;
    }
    /* Not positioned yet: first access descends from mx_db.md_root. */
    mx->mx_cursor.mc_pg[0] = 0;
    mx->mx_cursor.mc_snum = 0;
    mx->mx_cursor.mc_top = 0;
    mx->mx_cursor.mc_flags = C_SUB | (mc->mc_flags & (C_COPYING | C_SKIPORD));
    break;
  case F_DUPDATA:
    /* Dups fit in a sub-page embedded directly in the node data. */
    if (!MDBX_DISABLE_PAGECHECKS && unlikely(node_ds(node) <= PAGEHDRSZ)) {
      mdbx_error("invalid nested-page size %zu", node_ds(node));
      return MDBX_CORRUPTED;
    }
    MDBX_page *fp = node_data(node);
    /* Synthesize a one-leaf-page db description over the sub-page and
     * position the sub-cursor on it. */
    mx->mx_db.md_depth = 1;
    mx->mx_db.md_branch_pages = 0;
    mx->mx_db.md_leaf_pages = 1;
    mx->mx_db.md_overflow_pages = 0;
    mx->mx_db.md_entries = page_numkeys(fp);
    mx->mx_db.md_root = fp->mp_pgno;
    mx->mx_db.md_mod_txnid = mp->mp_txnid;
    mx->mx_cursor.mc_snum = 1;
    mx->mx_cursor.mc_top = 0;
    mx->mx_cursor.mc_flags =
        C_INITIALIZED | C_SUB | (mc->mc_flags & (C_COPYING | C_SKIPORD));
    mx->mx_cursor.mc_pg[0] = fp;
    mx->mx_cursor.mc_ki[0] = 0;
    mx->mx_db.md_flags = flags_db2sub(mc->mc_db->md_flags);
    mx->mx_db.md_xsize =
        (mc->mc_db->md_flags & MDBX_DUPFIXED) ? fp->mp_leaf2_ksize : 0;
    break;
  }

  if (unlikely(mx->mx_db.md_xsize != mc->mc_db->md_xsize)) {
    /* First time the fixed dup-size is seen: validate it and latch it
     * into the parent db/dbx, which must have had md_xsize == 0. */
    if (!MDBX_DISABLE_PAGECHECKS && unlikely(mc->mc_db->md_xsize != 0)) {
      mdbx_error("cursor mismatched nested-db md_xsize %u",
                 mc->mc_db->md_xsize);
      return MDBX_CORRUPTED;
    }
    if (!MDBX_DISABLE_PAGECHECKS &&
        unlikely((mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) {
      mdbx_error("mismatched nested-db md_flags %u", mc->mc_db->md_flags);
      return MDBX_CORRUPTED;
    }
    if (!MDBX_DISABLE_PAGECHECKS &&
        unlikely(mx->mx_db.md_xsize < mc->mc_dbx->md_vlen_min ||
                 mx->mx_db.md_xsize > mc->mc_dbx->md_vlen_max)) {
      mdbx_error("mismatched nested-db.md_xsize (%u) <> min/max value-length "
                 "(%zu/%zu)",
                 mx->mx_db.md_xsize, mc->mc_dbx->md_vlen_min,
                 mc->mc_dbx->md_vlen_max);
      return MDBX_CORRUPTED;
    }
    mc->mc_db->md_xsize = mx->mx_db.md_xsize;
    mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max = mx->mx_db.md_xsize;
  }
  /* Parent's value-length bounds become the sub-db's key-length bounds. */
  mx->mx_dbx.md_klen_min = mc->mc_dbx->md_vlen_min;
  mx->mx_dbx.md_klen_max = mc->mc_dbx->md_vlen_max;

  mdbx_debug("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi,
             mx->mx_db.md_root);
  return MDBX_SUCCESS;
}
19348 
19349 /* Fixup a sorted-dups cursor due to underlying update.
19350  * Sets up some fields that depend on the data from the main cursor.
19351  * Almost the same as init1, but skips initialization steps if the
19352  * xcursor had already been used.
19353  * [in] mc The main cursor whose sorted-dups cursor is to be fixed up.
19354  * [in] src_mx The xcursor of an up-to-date cursor.
19355  * [in] new_dupdata True if converting from a non-F_DUPDATA item. */
mdbx_xcursor_init2(MDBX_cursor * mc,MDBX_xcursor * src_mx,bool new_dupdata)19356 static int mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx,
19357                               bool new_dupdata) {
19358   MDBX_xcursor *mx = mc->mc_xcursor;
19359   if (!MDBX_DISABLE_PAGECHECKS && unlikely(mx == nullptr)) {
19360     mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)",
19361                mc->mc_dbi);
19362     return MDBX_CORRUPTED;
19363   }
19364 
19365   if (new_dupdata) {
19366     mx->mx_cursor.mc_snum = 1;
19367     mx->mx_cursor.mc_top = 0;
19368     mx->mx_cursor.mc_flags |= C_INITIALIZED;
19369     mx->mx_cursor.mc_ki[0] = 0;
19370   }
19371 
19372   mx->mx_dbx.md_klen_min = src_mx->mx_dbx.md_klen_min;
19373   mx->mx_dbx.md_klen_max = src_mx->mx_dbx.md_klen_max;
19374   mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp;
19375   mx->mx_db = src_mx->mx_db;
19376   mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0];
19377   if (mx->mx_cursor.mc_flags & C_INITIALIZED) {
19378     mdbx_debug("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi,
19379                mx->mx_db.md_root);
19380   }
19381   return MDBX_SUCCESS;
19382 }
19383 
/* Initialize a cursor-couple over the given db/dbx/dbistate triple,
 * refreshing stale db info and wiring up the inner (dupsort) sub-cursor
 * when the database carries MDBX_DUPSORT. */
static __inline int mdbx_couple_init(MDBX_cursor_couple *couple,
                                     const MDBX_dbi dbi, MDBX_txn *const txn,
                                     MDBX_db *const db, MDBX_dbx *const dbx,
                                     uint8_t *const dbstate) {
  MDBX_cursor *const mc = &couple->outer;
  mc->mc_signature = MDBX_MC_LIVE;
  mc->mc_next = NULL;
  mc->mc_backup = NULL;
  mc->mc_dbi = dbi;
  mc->mc_txn = txn;
  mc->mc_db = db;
  mc->mc_dbx = dbx;
  mc->mc_dbistate = dbstate;
  mc->mc_snum = 0;
  mc->mc_top = 0;
  mc->mc_pg[0] = 0;
  mc->mc_flags = 0;
  mc->mc_ki[0] = 0;
  mc->mc_xcursor = NULL;

  int rc = MDBX_SUCCESS;
  if (unlikely(*mc->mc_dbistate & DBI_STALE)) {
    /* Re-fetch the root; "not found" just means an empty db. */
    rc = mdbx_page_search(mc, NULL, MDBX_PS_ROOTONLY);
    rc = (rc != MDBX_NOTFOUND) ? rc : MDBX_SUCCESS;
  } else if (unlikely(mc->mc_dbx->md_klen_max == 0)) {
    /* Key/value length bounds were never computed for this dbx. */
    rc = mdbx_setup_dbx(mc->mc_dbx, mc->mc_db, txn->mt_env->me_psize);
  }

  if (mc->mc_db->md_flags & MDBX_DUPSORT) {
    couple->inner.mx_cursor.mc_signature = MDBX_MC_LIVE;
    mc->mc_xcursor = &couple->inner;
    rc = mdbx_xcursor_init0(mc);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    couple->inner.mx_dbx.md_klen_min = mc->mc_dbx->md_vlen_min;
    couple->inner.mx_dbx.md_klen_max = mc->mc_dbx->md_vlen_max;
  }
  return rc;
}
19423 
19424 /* Initialize a cursor for a given transaction and database. */
mdbx_cursor_init(MDBX_cursor * mc,MDBX_txn * txn,MDBX_dbi dbi)19425 static int mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi) {
19426   STATIC_ASSERT(offsetof(MDBX_cursor_couple, outer) == 0);
19427   if (unlikely(TXN_DBI_CHANGED(txn, dbi)))
19428     return MDBX_BAD_DBI;
19429 
19430   return mdbx_couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn,
19431                           &txn->mt_dbs[dbi], &txn->mt_dbxs[dbi],
19432                           &txn->mt_dbistate[dbi]);
19433 }
19434 
mdbx_cursor_create(void * context)19435 MDBX_cursor *mdbx_cursor_create(void *context) {
19436   MDBX_cursor_couple *couple = mdbx_calloc(1, sizeof(MDBX_cursor_couple));
19437   if (unlikely(!couple))
19438     return nullptr;
19439 
19440   couple->outer.mc_signature = MDBX_MC_READY4CLOSE;
19441   couple->outer.mc_dbi = UINT_MAX;
19442   couple->mc_userctx = context;
19443   return &couple->outer;
19444 }
19445 
/* Attach an opaque user context to the cursor's couple. */
int mdbx_cursor_set_userctx(MDBX_cursor *mc, void *ctx) {
  if (unlikely(!mc))
    return MDBX_EINVAL;
  if (unlikely(mc->mc_signature != MDBX_MC_LIVE &&
               mc->mc_signature != MDBX_MC_READY4CLOSE))
    return MDBX_EBADSIGN;
  container_of(mc, MDBX_cursor_couple, outer)->mc_userctx = ctx;
  return MDBX_SUCCESS;
}
19458 
mdbx_cursor_get_userctx(const MDBX_cursor * mc)19459 void *mdbx_cursor_get_userctx(const MDBX_cursor *mc) {
19460   if (unlikely(!mc))
19461     return nullptr;
19462 
19463   if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE &&
19464                mc->mc_signature != MDBX_MC_LIVE))
19465     return nullptr;
19466 
19467   MDBX_cursor_couple *couple = container_of(mc, MDBX_cursor_couple, outer);
19468   return couple->mc_userctx;
19469 }
19470 
/* Bind the cursor 'mc' to the given transaction and DBI, detaching it
 * from any previously bound write-transaction first. A cursor that was
 * shadowed into a parent transaction (mc_backup set) may only be
 * re-bound to the very same txn/dbi. Returns MDBX_SUCCESS or an error. */
int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) {
  if (unlikely(!mc))
    return MDBX_EINVAL;

  if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE &&
               mc->mc_signature != MDBX_MC_LIVE))
    return MDBX_EBADSIGN;

  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!check_dbi(txn, dbi, DBI_VALID)))
    return MDBX_BAD_DBI;

  /* The GC/freelist db is only readable through cursors in a read-only
   * transaction. */
  if (unlikely(dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)))
    return MDBX_EACCESS;

  if (unlikely(mc->mc_backup)) /* Cursor from parent transaction */ {
    mdbx_cassert(mc, mc->mc_signature == MDBX_MC_LIVE);
    if (unlikely(mc->mc_dbi != dbi ||
                 /* paranoia */ mc->mc_signature != MDBX_MC_LIVE ||
                 mc->mc_txn != txn))
      return MDBX_EINVAL;

    assert(mc->mc_db == &txn->mt_dbs[dbi]);
    assert(mc->mc_dbx == &txn->mt_dbxs[dbi]);
    assert(mc->mc_dbi == dbi);
    assert(mc->mc_dbistate == &txn->mt_dbistate[dbi]);
    return likely(mc->mc_dbi == dbi &&
                  /* paranoia */ mc->mc_signature == MDBX_MC_LIVE &&
                  mc->mc_txn == txn)
               ? MDBX_SUCCESS
               : MDBX_EINVAL /* Disallow change DBI in nested transactions */;
  }

  if (mc->mc_signature == MDBX_MC_LIVE) {
    /* The cursor is still bound somewhere: validate its old txn and
     * unlink it from that txn's tracking list before re-binding. */
    if (unlikely(!mc->mc_txn ||
                 mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE)) {
      mdbx_error("Wrong cursor's transaction %p 0x%x",
                 __Wpedantic_format_voidptr(mc->mc_txn),
                 mc->mc_txn ? mc->mc_txn->mt_signature : 0);
      return MDBX_PROBLEM;
    }
    if (mc->mc_flags & C_UNTRACK) {
      /* Remove from the old write-txn's singly-linked cursor list. */
      mdbx_cassert(mc, !(mc->mc_txn->mt_flags & MDBX_TXN_RDONLY));
      MDBX_cursor **prev = &mc->mc_txn->tw.cursors[mc->mc_dbi];
      while (*prev && *prev != mc)
        prev = &(*prev)->mc_next;
      mdbx_cassert(mc, *prev == mc);
      *prev = mc->mc_next;
    }
    /* Reset to the unbound state. */
    mc->mc_signature = MDBX_MC_READY4CLOSE;
    mc->mc_flags = 0;
    mc->mc_dbi = UINT_MAX;
    mc->mc_next = NULL;
    mc->mc_db = NULL;
    mc->mc_dbx = NULL;
    mc->mc_dbistate = NULL;
  }
  mdbx_cassert(mc, !(mc->mc_flags & C_UNTRACK));

  rc = mdbx_cursor_init(mc, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (!(txn->mt_flags & MDBX_TXN_RDONLY)) {
    /* Track the cursor in the write-txn so updates can fix it up. */
    mc->mc_next = txn->tw.cursors[dbi];
    txn->tw.cursors[dbi] = mc;
    mc->mc_flags |= C_UNTRACK;
  }

  return MDBX_SUCCESS;
}
19545 
/* Allocate a fresh cursor and bind it to txn/dbi; on success the new
 * cursor is returned via *ret, otherwise *ret is NULL. */
int mdbx_cursor_open(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cursor **ret) {
  if (unlikely(!ret))
    return MDBX_EINVAL;
  *ret = NULL;

  MDBX_cursor *const cursor = mdbx_cursor_create(nullptr);
  if (unlikely(!cursor))
    return MDBX_ENOMEM;

  const int rc = mdbx_cursor_bind(txn, cursor, dbi);
  if (likely(rc == MDBX_SUCCESS)) {
    *ret = cursor;
    return MDBX_SUCCESS;
  }

  /* Binding failed: dispose of the never-published cursor. */
  mdbx_cursor_close(cursor);
  return rc;
}
19564 
/* Re-bind an existing cursor to a (new) transaction, keeping its DBI. */
int mdbx_cursor_renew(MDBX_txn *txn, MDBX_cursor *mc) {
  if (unlikely(!mc))
    return MDBX_EINVAL;
  return mdbx_cursor_bind(txn, mc, mc->mc_dbi);
}
19568 
/* Clone the position and state of cursor 'src' into 'dest'.
 * First binds 'dest' to src's txn/dbi, then copies the page stack; the
 * 'goto again' pass repeats the copy for the nested dupsort sub-cursor
 * when one is present. */
int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest) {
  if (unlikely(!src))
    return MDBX_EINVAL;
  if (unlikely(src->mc_signature != MDBX_MC_LIVE))
    return (src->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
                                                      : MDBX_EBADSIGN;

  int rc = mdbx_cursor_bind(src->mc_txn, dest, src->mc_dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  assert(dest->mc_db == src->mc_db);
  assert(dest->mc_dbi == src->mc_dbi);
  assert(dest->mc_dbx == src->mc_dbx);
  assert(dest->mc_dbistate == src->mc_dbistate);
again:
  /* Copy every flag except C_UNTRACK, which reflects dest's own
   * membership in the txn's cursor-tracking list. */
  assert(dest->mc_txn == src->mc_txn);
  dest->mc_flags ^= (dest->mc_flags ^ src->mc_flags) & ~C_UNTRACK;
  dest->mc_top = src->mc_top;
  dest->mc_snum = src->mc_snum;
  for (unsigned i = 0; i < src->mc_snum; ++i) {
    dest->mc_ki[i] = src->mc_ki[i];
    dest->mc_pg[i] = src->mc_pg[i];
  }

  if (src->mc_xcursor) {
    /* Copy the nested-db record/dbx, then loop once more with src/dest
     * pointing at the sub-cursors (which have no xcursor themselves). */
    dest->mc_xcursor->mx_db = src->mc_xcursor->mx_db;
    dest->mc_xcursor->mx_dbx = src->mc_xcursor->mx_dbx;
    src = &src->mc_xcursor->mx_cursor;
    dest = &dest->mc_xcursor->mx_cursor;
    goto again;
  }

  return MDBX_SUCCESS;
}
19604 
/* Close (and usually free) a cursor.
 * A cursor shadowed into a parent transaction (mc_backup set) is only
 * marked for disposal at end-of-txn instead of being freed here. */
void mdbx_cursor_close(MDBX_cursor *mc) {
  if (likely(mc)) {
    mdbx_ensure(NULL, mc->mc_signature == MDBX_MC_LIVE ||
                          mc->mc_signature == MDBX_MC_READY4CLOSE);
    MDBX_txn *const txn = mc->mc_txn;
    if (!mc->mc_backup) {
      mc->mc_txn = NULL;
      /* Remove from txn, if tracked.
       * A read-only txn (!C_UNTRACK) may have been freed already,
       * so do not peek inside it.  Only write txns track cursors. */
      if (mc->mc_flags & C_UNTRACK) {
        mdbx_ensure(txn->mt_env, check_txn_rw(txn, 0) == MDBX_SUCCESS);
        MDBX_cursor **prev = &txn->tw.cursors[mc->mc_dbi];
        while (*prev && *prev != mc)
          prev = &(*prev)->mc_next;
        mdbx_tassert(txn, *prev == mc);
        *prev = mc->mc_next;
      }
      /* Poison the signature and self-link before freeing, so a
       * use-after-free is more likely to be caught. */
      mc->mc_signature = 0;
      mc->mc_next = mc;
      mdbx_free(mc);
    } else {
      /* Cursor closed before nested txn ends */
      mdbx_tassert(txn, mc->mc_signature == MDBX_MC_LIVE);
      mdbx_ensure(txn->mt_env, check_txn_rw(txn, 0) == MDBX_SUCCESS);
      mc->mc_signature = MDBX_MC_WAIT4EOT;
    }
  }
}
19634 
mdbx_cursor_txn(const MDBX_cursor * mc)19635 MDBX_txn *mdbx_cursor_txn(const MDBX_cursor *mc) {
19636   if (unlikely(!mc || mc->mc_signature != MDBX_MC_LIVE))
19637     return NULL;
19638   MDBX_txn *txn = mc->mc_txn;
19639   if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE))
19640     return NULL;
19641   if (unlikely(txn->mt_flags & MDBX_TXN_FINISHED))
19642     return NULL;
19643   return txn;
19644 }
19645 
mdbx_cursor_dbi(const MDBX_cursor * mc)19646 MDBX_dbi mdbx_cursor_dbi(const MDBX_cursor *mc) {
19647   if (unlikely(!mc || mc->mc_signature != MDBX_MC_LIVE))
19648     return UINT_MAX;
19649   return mc->mc_dbi;
19650 }
19651 
19652 /* Return the count of duplicate data items for the current key */
mdbx_cursor_count(const MDBX_cursor * mc,size_t * countp)19653 int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) {
19654   if (unlikely(mc == NULL))
19655     return MDBX_EINVAL;
19656 
19657   if (unlikely(mc->mc_signature != MDBX_MC_LIVE))
19658     return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
19659                                                      : MDBX_EBADSIGN;
19660 
19661   int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED);
19662   if (unlikely(rc != MDBX_SUCCESS))
19663     return rc;
19664 
19665   if (unlikely(countp == NULL || !(mc->mc_flags & C_INITIALIZED)))
19666     return MDBX_EINVAL;
19667 
19668   if (!mc->mc_snum) {
19669     *countp = 0;
19670     return MDBX_NOTFOUND;
19671   }
19672 
19673   MDBX_page *mp = mc->mc_pg[mc->mc_top];
19674   if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= page_numkeys(mp)) {
19675     *countp = 0;
19676     return MDBX_NOTFOUND;
19677   }
19678 
19679   *countp = 1;
19680   if (mc->mc_xcursor != NULL) {
19681     MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]);
19682     if (F_ISSET(node_flags(node), F_DUPDATA)) {
19683       mdbx_cassert(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags &
19684                                           C_INITIALIZED));
19685       *countp = unlikely(mc->mc_xcursor->mx_db.md_entries > PTRDIFF_MAX)
19686                     ? PTRDIFF_MAX
19687                     : (size_t)mc->mc_xcursor->mx_db.md_entries;
19688     }
19689   }
19690   return MDBX_SUCCESS;
19691 }
19692 
/* Replace the key for a branch node with a new key.
 * Set MDBX_TXN_ERROR on failure.
 * [in] mc Cursor pointing to the node to operate on.
 * [in] key The new key to use.
 * Returns 0 on success, non-zero on failure.
 *
 * When the new key's (2-byte aligned) size differs from the old one, the
 * node heap at the upper end of the page is shifted in place and all node
 * offsets at or below the target node are adjusted accordingly. If the page
 * lacks room for a larger key, the node is deleted and re-inserted via a
 * page split instead. */
static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) {
  MDBX_page *mp;
  MDBX_node *node;
  char *base;
  size_t len;
  int delta, ksize, oksize;
  int ptr, i, nkeys, indx;
  DKBUF_DEBUG;

  mdbx_cassert(mc, cursor_is_tracked(mc));
  indx = mc->mc_ki[mc->mc_top];
  mp = mc->mc_pg[mc->mc_top];
  node = page_node(mp, indx);
  ptr = mp->mp_ptrs[indx];
#if MDBX_DEBUG
  MDBX_val k2;
  k2.iov_base = node_key(node);
  k2.iov_len = node_ks(node);
  mdbx_debug("update key %u (offset %u) [%s] to [%s] on page %" PRIaPGNO, indx,
             ptr, DVAL_DEBUG(&k2), DKEY_DEBUG(key), mp->mp_pgno);
#endif /* MDBX_DEBUG */

  /* Sizes must be 2-byte aligned. */
  ksize = EVEN(key->iov_len);
  oksize = EVEN(node_ks(node));
  delta = ksize - oksize;

  /* Shift node contents if EVEN(key length) changed. */
  if (delta) {
    if (delta > (int)page_room(mp)) {
      /* not enough space left, do a delete and split */
      mdbx_debug("Not enough room, delta = %d, splitting...", delta);
      pgno_t pgno = node_pgno(node);
      mdbx_node_del(mc, 0);
      int rc = mdbx_page_split(mc, key, NULL, pgno, MDBX_SPLIT_REPLACE);
      if (rc == MDBX_SUCCESS && mdbx_audit_enabled())
        rc = mdbx_cursor_check(mc, C_UPDATING);
      return rc;
    }

    /* Nodes are stored top-down from mp_upper: every node whose offset is
     * at or below the target's offset moves by -delta to make (or reclaim)
     * room for the resized key. */
    nkeys = page_numkeys(mp);
    for (i = 0; i < nkeys; i++) {
      if (mp->mp_ptrs[i] <= ptr) {
        mdbx_cassert(mc, mp->mp_ptrs[i] >= delta);
        mp->mp_ptrs[i] -= (indx_t)delta;
      }
    }

    /* Move the affected slice of the node heap in one memmove; the regions
     * may overlap, hence memmove rather than memcpy. */
    base = (char *)mp + mp->mp_upper + PAGEHDRSZ;
    len = ptr - mp->mp_upper + NODESIZE;
    memmove(base - delta, base, len);
    mdbx_cassert(mc, mp->mp_upper >= delta);
    mp->mp_upper -= (indx_t)delta;

    /* The node moved; re-derive its pointer from the updated offset. */
    node = page_node(mp, indx);
  }

  /* But even if no shift was needed, update ksize */
  node_set_ks(node, key->iov_len);

  if (likely(key->iov_len /* to avoid UBSAN traps*/ != 0))
    memcpy(node_key(node), key->iov_base, key->iov_len);
  return MDBX_SUCCESS;
}
19762 
19763 /* Move a node from csrc to cdst. */
mdbx_node_move(MDBX_cursor * csrc,MDBX_cursor * cdst,bool fromleft)19764 static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) {
19765   int rc;
19766   DKBUF_DEBUG;
19767 
19768   MDBX_page *psrc = csrc->mc_pg[csrc->mc_top];
19769   MDBX_page *pdst = cdst->mc_pg[cdst->mc_top];
19770   mdbx_cassert(csrc, PAGETYPE(psrc) == PAGETYPE(pdst));
19771   mdbx_cassert(csrc, csrc->mc_dbi == cdst->mc_dbi);
19772   mdbx_cassert(csrc, csrc->mc_top == cdst->mc_top);
19773   if (unlikely(PAGETYPE(psrc) != PAGETYPE(pdst))) {
19774   bailout:
19775     mdbx_error("Wrong or mismatch pages's types (src %d, dst %d) to move node",
19776                PAGETYPE(psrc), PAGETYPE(pdst));
19777     csrc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
19778     return MDBX_PROBLEM;
19779   }
19780 
19781   MDBX_val key4move;
19782   switch (PAGETYPE(psrc)) {
19783   case P_BRANCH: {
19784     const MDBX_node *srcnode = page_node(psrc, csrc->mc_ki[csrc->mc_top]);
19785     mdbx_cassert(csrc, node_flags(srcnode) == 0);
19786     const pgno_t srcpg = node_pgno(srcnode);
19787     key4move.iov_len = node_ks(srcnode);
19788     key4move.iov_base = node_key(srcnode);
19789 
19790     if (csrc->mc_ki[csrc->mc_top] == 0) {
19791       const unsigned snum = csrc->mc_snum;
19792       mdbx_cassert(csrc, snum > 0);
19793       /* must find the lowest key below src */
19794       rc = mdbx_page_search_lowest(csrc);
19795       MDBX_page *lowest_page = csrc->mc_pg[csrc->mc_top];
19796       if (unlikely(rc))
19797         return rc;
19798       mdbx_cassert(csrc, IS_LEAF(lowest_page));
19799       if (unlikely(!IS_LEAF(lowest_page)))
19800         goto bailout;
19801       if (IS_LEAF2(lowest_page)) {
19802         key4move.iov_len = csrc->mc_db->md_xsize;
19803         key4move.iov_base = page_leaf2key(lowest_page, 0, key4move.iov_len);
19804       } else {
19805         const MDBX_node *lowest_node = page_node(lowest_page, 0);
19806         key4move.iov_len = node_ks(lowest_node);
19807         key4move.iov_base = node_key(lowest_node);
19808       }
19809 
19810       /* restore cursor after mdbx_page_search_lowest() */
19811       csrc->mc_snum = snum;
19812       csrc->mc_top = snum - 1;
19813       csrc->mc_ki[csrc->mc_top] = 0;
19814 
19815       /* paranoia */
19816       mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]);
19817       mdbx_cassert(csrc, IS_BRANCH(psrc));
19818       if (unlikely(!IS_BRANCH(psrc)))
19819         goto bailout;
19820     }
19821 
19822     if (cdst->mc_ki[cdst->mc_top] == 0) {
19823       const unsigned snum = cdst->mc_snum;
19824       mdbx_cassert(csrc, snum > 0);
19825       MDBX_cursor mn;
19826       cursor_copy(cdst, &mn);
19827       /* must find the lowest key below dst */
19828       rc = mdbx_page_search_lowest(&mn);
19829       if (unlikely(rc))
19830         return rc;
19831       MDBX_page *const lowest_page = mn.mc_pg[mn.mc_top];
19832       mdbx_cassert(cdst, IS_LEAF(lowest_page));
19833       if (unlikely(!IS_LEAF(lowest_page)))
19834         goto bailout;
19835       MDBX_val key;
19836       if (IS_LEAF2(lowest_page)) {
19837         key.iov_len = mn.mc_db->md_xsize;
19838         key.iov_base = page_leaf2key(lowest_page, 0, key.iov_len);
19839       } else {
19840         MDBX_node *lowest_node = page_node(lowest_page, 0);
19841         key.iov_len = node_ks(lowest_node);
19842         key.iov_base = node_key(lowest_node);
19843       }
19844 
19845       /* restore cursor after mdbx_page_search_lowest() */
19846       mn.mc_snum = snum;
19847       mn.mc_top = snum - 1;
19848       mn.mc_ki[mn.mc_top] = 0;
19849 
19850       const intptr_t delta =
19851           EVEN(key.iov_len) - EVEN(node_ks(page_node(mn.mc_pg[mn.mc_top], 0)));
19852       const intptr_t needed =
19853           branch_size(cdst->mc_txn->mt_env, &key4move) + delta;
19854       const intptr_t have = page_room(pdst);
19855       if (unlikely(needed > have))
19856         return MDBX_RESULT_TRUE;
19857 
19858       if (unlikely((rc = mdbx_page_touch(csrc)) ||
19859                    (rc = mdbx_page_touch(cdst))))
19860         return rc;
19861       psrc = csrc->mc_pg[csrc->mc_top];
19862       pdst = cdst->mc_pg[cdst->mc_top];
19863 
19864       WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key));
19865       if (unlikely(rc))
19866         return rc;
19867     } else {
19868       const size_t needed = branch_size(cdst->mc_txn->mt_env, &key4move);
19869       const size_t have = page_room(pdst);
19870       if (unlikely(needed > have))
19871         return MDBX_RESULT_TRUE;
19872 
19873       if (unlikely((rc = mdbx_page_touch(csrc)) ||
19874                    (rc = mdbx_page_touch(cdst))))
19875         return rc;
19876       psrc = csrc->mc_pg[csrc->mc_top];
19877       pdst = cdst->mc_pg[cdst->mc_top];
19878     }
19879 
19880     mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO
19881                " to node %u on page %" PRIaPGNO,
19882                "branch", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move),
19883                psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno);
19884     /* Add the node to the destination page. */
19885     rc =
19886         mdbx_node_add_branch(cdst, cdst->mc_ki[cdst->mc_top], &key4move, srcpg);
19887   } break;
19888 
19889   case P_LEAF: {
19890     /* Mark src and dst as dirty. */
19891     if (unlikely((rc = mdbx_page_touch(csrc)) || (rc = mdbx_page_touch(cdst))))
19892       return rc;
19893     psrc = csrc->mc_pg[csrc->mc_top];
19894     pdst = cdst->mc_pg[cdst->mc_top];
19895     const MDBX_node *srcnode = page_node(psrc, csrc->mc_ki[csrc->mc_top]);
19896     MDBX_val data;
19897     data.iov_len = node_ds(srcnode);
19898     data.iov_base = node_data(srcnode);
19899     key4move.iov_len = node_ks(srcnode);
19900     key4move.iov_base = node_key(srcnode);
19901     mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO
19902                " to node %u on page %" PRIaPGNO,
19903                "leaf", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move),
19904                psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno);
19905     /* Add the node to the destination page. */
19906     rc = mdbx_node_add_leaf(cdst, cdst->mc_ki[cdst->mc_top], &key4move, &data,
19907                             node_flags(srcnode));
19908   } break;
19909 
19910   case P_LEAF | P_LEAF2: {
19911     /* Mark src and dst as dirty. */
19912     if (unlikely((rc = mdbx_page_touch(csrc)) || (rc = mdbx_page_touch(cdst))))
19913       return rc;
19914     psrc = csrc->mc_pg[csrc->mc_top];
19915     pdst = cdst->mc_pg[cdst->mc_top];
19916     key4move.iov_len = csrc->mc_db->md_xsize;
19917     key4move.iov_base =
19918         page_leaf2key(psrc, csrc->mc_ki[csrc->mc_top], key4move.iov_len);
19919     mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO
19920                " to node %u on page %" PRIaPGNO,
19921                "leaf2", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move),
19922                psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno);
19923     /* Add the node to the destination page. */
19924     rc = mdbx_node_add_leaf2(cdst, cdst->mc_ki[cdst->mc_top], &key4move);
19925   } break;
19926 
19927   default:
19928     goto bailout;
19929   }
19930 
19931   if (unlikely(rc != MDBX_SUCCESS))
19932     return rc;
19933 
19934   /* Delete the node from the source page. */
19935   mdbx_node_del(csrc, key4move.iov_len);
19936 
19937   mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]);
19938   mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]);
19939   mdbx_cassert(csrc, PAGETYPE(psrc) == PAGETYPE(pdst));
19940 
19941   {
19942     /* Adjust other cursors pointing to mp */
19943     MDBX_cursor *m2, *m3;
19944     const MDBX_dbi dbi = csrc->mc_dbi;
19945     mdbx_cassert(csrc, csrc->mc_top == cdst->mc_top);
19946     if (fromleft) {
19947       /* If we're adding on the left, bump others up */
19948       for (m2 = csrc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) {
19949         m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
19950         if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
19951           continue;
19952         if (m3 != cdst && m3->mc_pg[csrc->mc_top] == pdst &&
19953             m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) {
19954           m3->mc_ki[csrc->mc_top]++;
19955         }
19956         if (m3 != csrc && m3->mc_pg[csrc->mc_top] == psrc &&
19957             m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) {
19958           m3->mc_pg[csrc->mc_top] = pdst;
19959           m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
19960           mdbx_cassert(csrc, csrc->mc_top > 0);
19961           m3->mc_ki[csrc->mc_top - 1]++;
19962         }
19963         if (XCURSOR_INITED(m3) && IS_LEAF(psrc))
19964           XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]);
19965       }
19966     } else {
19967       /* Adding on the right, bump others down */
19968       for (m2 = csrc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) {
19969         m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
19970         if (m3 == csrc)
19971           continue;
19972         if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
19973           continue;
19974         if (m3->mc_pg[csrc->mc_top] == psrc) {
19975           if (!m3->mc_ki[csrc->mc_top]) {
19976             m3->mc_pg[csrc->mc_top] = pdst;
19977             m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
19978             mdbx_cassert(csrc, csrc->mc_top > 0);
19979             m3->mc_ki[csrc->mc_top - 1]--;
19980           } else {
19981             m3->mc_ki[csrc->mc_top]--;
19982           }
19983           if (XCURSOR_INITED(m3) && IS_LEAF(psrc))
19984             XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top],
19985                             m3->mc_ki[csrc->mc_top]);
19986         }
19987       }
19988     }
19989   }
19990 
19991   /* Update the parent separators. */
19992   if (csrc->mc_ki[csrc->mc_top] == 0) {
19993     mdbx_cassert(csrc, csrc->mc_top > 0);
19994     if (csrc->mc_ki[csrc->mc_top - 1] != 0) {
19995       MDBX_val key;
19996       if (IS_LEAF2(psrc)) {
19997         key.iov_len = psrc->mp_leaf2_ksize;
19998         key.iov_base = page_leaf2key(psrc, 0, key.iov_len);
19999       } else {
20000         MDBX_node *srcnode = page_node(psrc, 0);
20001         key.iov_len = node_ks(srcnode);
20002         key.iov_base = node_key(srcnode);
20003       }
20004       mdbx_debug("update separator for source page %" PRIaPGNO " to [%s]",
20005                  psrc->mp_pgno, DKEY_DEBUG(&key));
20006       MDBX_cursor mn;
20007       cursor_copy(csrc, &mn);
20008       mdbx_cassert(csrc, mn.mc_snum > 0);
20009       mn.mc_snum--;
20010       mn.mc_top--;
20011       /* We want mdbx_rebalance to find mn when doing fixups */
20012       WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key));
20013       if (unlikely(rc != MDBX_SUCCESS))
20014         return rc;
20015     }
20016     if (IS_BRANCH(psrc)) {
20017       const MDBX_val nullkey = {0, 0};
20018       const indx_t ix = csrc->mc_ki[csrc->mc_top];
20019       csrc->mc_ki[csrc->mc_top] = 0;
20020       rc = mdbx_update_key(csrc, &nullkey);
20021       csrc->mc_ki[csrc->mc_top] = ix;
20022       mdbx_cassert(csrc, rc == MDBX_SUCCESS);
20023     }
20024   }
20025 
20026   if (cdst->mc_ki[cdst->mc_top] == 0) {
20027     mdbx_cassert(cdst, cdst->mc_top > 0);
20028     if (cdst->mc_ki[cdst->mc_top - 1] != 0) {
20029       MDBX_val key;
20030       if (IS_LEAF2(pdst)) {
20031         key.iov_len = pdst->mp_leaf2_ksize;
20032         key.iov_base = page_leaf2key(pdst, 0, key.iov_len);
20033       } else {
20034         MDBX_node *srcnode = page_node(pdst, 0);
20035         key.iov_len = node_ks(srcnode);
20036         key.iov_base = node_key(srcnode);
20037       }
20038       mdbx_debug("update separator for destination page %" PRIaPGNO " to [%s]",
20039                  pdst->mp_pgno, DKEY_DEBUG(&key));
20040       MDBX_cursor mn;
20041       cursor_copy(cdst, &mn);
20042       mdbx_cassert(cdst, mn.mc_snum > 0);
20043       mn.mc_snum--;
20044       mn.mc_top--;
20045       /* We want mdbx_rebalance to find mn when doing fixups */
20046       WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key));
20047       if (unlikely(rc != MDBX_SUCCESS))
20048         return rc;
20049     }
20050     if (IS_BRANCH(pdst)) {
20051       const MDBX_val nullkey = {0, 0};
20052       const indx_t ix = cdst->mc_ki[cdst->mc_top];
20053       cdst->mc_ki[cdst->mc_top] = 0;
20054       rc = mdbx_update_key(cdst, &nullkey);
20055       cdst->mc_ki[cdst->mc_top] = ix;
20056       mdbx_cassert(cdst, rc == MDBX_SUCCESS);
20057     }
20058   }
20059 
20060   return MDBX_SUCCESS;
20061 }
20062 
/* Merge one page into another.
 *
 * The nodes from the page pointed to by csrc will be copied to the page
 * pointed to by cdst and then the csrc page will be freed.
 *
 * [in] csrc Cursor pointing to the source page.
 * [in] cdst Cursor pointing to the destination page.
 *
 * Returns 0 on success, non-zero on failure (MDBX_RESULT_TRUE when the
 * destination lacks room; MDBX_CURSOR_FULL when the destination cursor's
 * stack cannot be restored after the recursive rebalance). */
static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) {
  MDBX_val key;
  int rc;

  mdbx_cassert(csrc, csrc != cdst);
  mdbx_cassert(csrc, cursor_is_tracked(csrc));
  mdbx_cassert(cdst, cursor_is_tracked(cdst));
  const MDBX_page *const psrc = csrc->mc_pg[csrc->mc_top];
  MDBX_page *pdst = cdst->mc_pg[cdst->mc_top];
  mdbx_debug("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->mp_pgno,
             pdst->mp_pgno);

  mdbx_cassert(csrc, PAGETYPE(psrc) == PAGETYPE(pdst));
  mdbx_cassert(csrc,
               csrc->mc_dbi == cdst->mc_dbi && csrc->mc_db == cdst->mc_db);
  mdbx_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */
  mdbx_cassert(cdst, cdst->mc_snum > 1);
  mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth ||
                         IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1]));
  mdbx_cassert(csrc, csrc->mc_snum < csrc->mc_db->md_depth ||
                         IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1]));
  mdbx_cassert(cdst, page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc));
  const int pagetype = PAGETYPE(psrc);

  /* Move all nodes from src to dst */
  const unsigned dst_nkeys = page_numkeys(pdst);
  const unsigned src_nkeys = page_numkeys(psrc);
  mdbx_cassert(cdst, dst_nkeys + src_nkeys >= (IS_LEAF(psrc) ? 1u : 2u));
  if (likely(src_nkeys)) {
    unsigned j = dst_nkeys;
    if (unlikely(pagetype & P_LEAF2)) {
      /* LEAF2: fixed-size keys laid out back-to-back; append them one by
       * one, stepping the base pointer by the key size. */
      /* Mark dst as dirty. */
      if (unlikely(rc = mdbx_page_touch(cdst)))
        return rc;

      key.iov_len = csrc->mc_db->md_xsize;
      key.iov_base = page_data(psrc);
      unsigned i = 0;
      do {
        rc = mdbx_node_add_leaf2(cdst, j++, &key);
        if (unlikely(rc != MDBX_SUCCESS))
          return rc;
        key.iov_base = (char *)key.iov_base + key.iov_len;
      } while (++i != src_nkeys);
    } else {
      MDBX_node *srcnode = page_node(psrc, 0);
      key.iov_len = node_ks(srcnode);
      key.iov_base = node_key(srcnode);
      if (pagetype & P_BRANCH) {
        MDBX_cursor mn;
        cursor_copy(csrc, &mn);
        /* must find the lowest key below src */
        rc = mdbx_page_search_lowest(&mn);
        if (unlikely(rc))
          return rc;

        const MDBX_page *mp = mn.mc_pg[mn.mc_top];
        if (likely(!IS_LEAF2(mp))) {
          mdbx_cassert(&mn, IS_LEAF(mp));
          const MDBX_node *lowest = page_node(mp, 0);
          key.iov_len = node_ks(lowest);
          key.iov_base = node_key(lowest);
        } else {
          mdbx_cassert(&mn, mn.mc_top > csrc->mc_top);
          key.iov_len = mp->mp_leaf2_ksize;
          key.iov_base = page_leaf2key(mp, mn.mc_ki[mn.mc_top], key.iov_len);
        }
        mdbx_cassert(&mn, key.iov_len >= csrc->mc_dbx->md_klen_min);
        mdbx_cassert(&mn, key.iov_len <= csrc->mc_dbx->md_klen_max);

        /* Re-check room with the real lowest key substituted for the
         * (empty) slot-0 branch key, which may be larger. */
        const size_t dst_room = page_room(pdst);
        const size_t src_used = page_used(cdst->mc_txn->mt_env, psrc);
        const size_t space_needed = src_used - node_ks(srcnode) + key.iov_len;
        if (unlikely(space_needed > dst_room))
          return MDBX_RESULT_TRUE;
      }

      /* Mark dst as dirty. */
      if (unlikely(rc = mdbx_page_touch(cdst)))
        return rc;

      unsigned i = 0;
      while (true) {
        if (pagetype & P_LEAF) {
          MDBX_val data;
          data.iov_len = node_ds(srcnode);
          data.iov_base = node_data(srcnode);
          rc = mdbx_node_add_leaf(cdst, j++, &key, &data, node_flags(srcnode));
        } else {
          mdbx_cassert(csrc, node_flags(srcnode) == 0);
          rc = mdbx_node_add_branch(cdst, j++, &key, node_pgno(srcnode));
        }
        if (unlikely(rc != MDBX_SUCCESS))
          return rc;

        if (++i == src_nkeys)
          break;
        srcnode = page_node(psrc, i);
        key.iov_len = node_ks(srcnode);
        key.iov_base = node_key(srcnode);
      }
    }

    pdst = cdst->mc_pg[cdst->mc_top];
    mdbx_debug("dst page %" PRIaPGNO " now has %u keys (%.1f%% filled)",
               pdst->mp_pgno, page_numkeys(pdst),
               page_fill(cdst->mc_txn->mt_env, pdst));

    mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]);
    mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]);
  }

  /* Unlink the src page from parent and add to free list. */
  csrc->mc_top--;
  mdbx_node_del(csrc, 0);
  if (csrc->mc_ki[csrc->mc_top] == 0) {
    /* The parent's leftmost key must stay empty after the deletion. */
    const MDBX_val nullkey = {0, 0};
    rc = mdbx_update_key(csrc, &nullkey);
    if (unlikely(rc)) {
      csrc->mc_top++;
      return rc;
    }
  }
  csrc->mc_top++;

  mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]);
  mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]);

  {
    /* Adjust other cursors pointing to mp */
    MDBX_cursor *m2, *m3;
    const MDBX_dbi dbi = csrc->mc_dbi;
    const unsigned top = csrc->mc_top;

    for (m2 = csrc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) {
      m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
      if (m3 == csrc || top >= m3->mc_snum)
        continue;
      if (m3->mc_pg[top] == psrc) {
        /* Cursor was on the merged page: retarget to dst, with its index
         * shifted past dst's pre-merge keys. */
        m3->mc_pg[top] = pdst;
        mdbx_cassert(m3, dst_nkeys + m3->mc_ki[top] <= UINT16_MAX);
        m3->mc_ki[top] += (indx_t)dst_nkeys;
        m3->mc_ki[top - 1] = cdst->mc_ki[top - 1];
      } else if (m3->mc_pg[top - 1] == csrc->mc_pg[top - 1] &&
                 m3->mc_ki[top - 1] > csrc->mc_ki[top - 1]) {
        /* Sibling entries after src shifted left by one in the parent. */
        m3->mc_ki[top - 1]--;
      }
      if (XCURSOR_INITED(m3) && IS_LEAF(psrc))
        XCURSOR_REFRESH(m3, m3->mc_pg[top], m3->mc_ki[top]);
    }
  }

  /* If not operating on GC, allow this page to be reused
   * in this txn. Otherwise just add to free list. */
  rc = mdbx_page_retire(csrc, (MDBX_page *)psrc);
  if (unlikely(rc))
    return rc;

  mdbx_cassert(cdst, cdst->mc_db->md_entries > 0);
  mdbx_cassert(cdst, cdst->mc_snum <= cdst->mc_db->md_depth);
  mdbx_cassert(cdst, cdst->mc_top > 0);
  mdbx_cassert(cdst, cdst->mc_snum == cdst->mc_top + 1);
  /* Snapshot the pre-rebalance top so the cursor stack can be validated
   * and possibly restored after the recursive rebalance below. */
  MDBX_page *const top_page = cdst->mc_pg[cdst->mc_top];
  const indx_t top_indx = cdst->mc_ki[cdst->mc_top];
  const unsigned save_snum = cdst->mc_snum;
  const uint16_t save_depth = cdst->mc_db->md_depth;
  mdbx_cursor_pop(cdst);
  rc = mdbx_rebalance(cdst);
  if (unlikely(rc))
    return rc;

  mdbx_cassert(cdst, cdst->mc_db->md_entries > 0);
  mdbx_cassert(cdst, cdst->mc_snum <= cdst->mc_db->md_depth);
  mdbx_cassert(cdst, cdst->mc_snum == cdst->mc_top + 1);

#if MDBX_ENABLE_PGOP_STAT
  cdst->mc_txn->mt_env->me_lck->mti_pgop_stat.merge.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */

  if (IS_LEAF(cdst->mc_pg[cdst->mc_top])) {
    /* LY: don't touch cursor if top-page is a LEAF */
    mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) ||
                           PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype);
    return MDBX_SUCCESS;
  }

  mdbx_cassert(cdst, page_numkeys(top_page) == dst_nkeys + src_nkeys);

  if (unlikely(pagetype != PAGETYPE(top_page))) {
    /* LY: LEAF-page becomes BRANCH, unable restore cursor's stack */
    goto bailout;
  }

  if (top_page == cdst->mc_pg[cdst->mc_top]) {
    /* LY: don't touch cursor if prev top-page already on the top */
    mdbx_cassert(cdst, cdst->mc_ki[cdst->mc_top] == top_indx);
    mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) ||
                           PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype);
    return MDBX_SUCCESS;
  }

  /* Rebalance may have collapsed tree levels; translate the saved stack
   * position into the new depth. */
  const int new_snum = save_snum - save_depth + cdst->mc_db->md_depth;
  if (unlikely(new_snum < 1 || new_snum > cdst->mc_db->md_depth)) {
    /* LY: out of range, unable restore cursor's stack */
    goto bailout;
  }

  if (top_page == cdst->mc_pg[new_snum - 1]) {
    mdbx_cassert(cdst, cdst->mc_ki[new_snum - 1] == top_indx);
    /* LY: restore cursor stack */
    cdst->mc_snum = (uint16_t)new_snum;
    cdst->mc_top = (uint16_t)new_snum - 1;
    mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth ||
                           IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1]));
    mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) ||
                           PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype);
    return MDBX_SUCCESS;
  }

  /* A bit-inverted pointer/index serves as a recognizable "poisoned"
   * stub for stack slots invalidated during rebalance. */
  MDBX_page *const stub_page = (MDBX_page *)(~(uintptr_t)top_page);
  const indx_t stub_indx = top_indx;
  if (save_depth > cdst->mc_db->md_depth &&
      ((cdst->mc_pg[save_snum - 1] == top_page &&
        cdst->mc_ki[save_snum - 1] == top_indx) ||
       (cdst->mc_pg[save_snum - 1] == stub_page &&
        cdst->mc_ki[save_snum - 1] == stub_indx))) {
    /* LY: restore cursor stack */
    cdst->mc_pg[new_snum - 1] = top_page;
    cdst->mc_ki[new_snum - 1] = top_indx;
    cdst->mc_pg[new_snum] = (MDBX_page *)(~(uintptr_t)cdst->mc_pg[new_snum]);
    cdst->mc_ki[new_snum] = ~cdst->mc_ki[new_snum];
    cdst->mc_snum = (uint16_t)new_snum;
    cdst->mc_top = (uint16_t)new_snum - 1;
    mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth ||
                           IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1]));
    mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) ||
                           PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype);
    return MDBX_SUCCESS;
  }

bailout:
  /* LY: unable restore cursor's stack */
  cdst->mc_flags &= ~C_INITIALIZED;
  return MDBX_CURSOR_FULL;
}
20317 
/* Restore cdst's position (page stack, indices, flags) from csrc.
 * Both cursors must already be bound to the same txn and subDB. */
static void cursor_restore(const MDBX_cursor *csrc, MDBX_cursor *cdst) {
  mdbx_cassert(cdst, cdst->mc_dbi == csrc->mc_dbi);
  mdbx_cassert(cdst, cdst->mc_txn == csrc->mc_txn);
  mdbx_cassert(cdst, cdst->mc_db == csrc->mc_db);
  mdbx_cassert(cdst, cdst->mc_dbx == csrc->mc_dbx);
  mdbx_cassert(cdst, cdst->mc_dbistate == csrc->mc_dbistate);
  cdst->mc_snum = csrc->mc_snum;
  cdst->mc_top = csrc->mc_top;
  cdst->mc_flags = csrc->mc_flags;

  /* Copy only the occupied portion of the page/index stack. */
  unsigned level = csrc->mc_snum;
  while (level > 0) {
    --level;
    cdst->mc_pg[level] = csrc->mc_pg[level];
    cdst->mc_ki[level] = csrc->mc_ki[level];
  }
}
20333 
20334 /* Copy the contents of a cursor.
20335  * [in] csrc The cursor to copy from.
20336  * [out] cdst The cursor to copy to. */
cursor_copy(const MDBX_cursor * csrc,MDBX_cursor * cdst)20337 static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) {
20338   mdbx_cassert(csrc, csrc->mc_txn->mt_txnid >=
20339                          csrc->mc_txn->mt_env->me_lck->mti_oldest_reader.weak);
20340   cdst->mc_dbi = csrc->mc_dbi;
20341   cdst->mc_next = NULL;
20342   cdst->mc_backup = NULL;
20343   cdst->mc_xcursor = NULL;
20344   cdst->mc_txn = csrc->mc_txn;
20345   cdst->mc_db = csrc->mc_db;
20346   cdst->mc_dbx = csrc->mc_dbx;
20347   cdst->mc_dbistate = csrc->mc_dbistate;
20348   cursor_restore(csrc, cdst);
20349 }
20350 
20351 /* Rebalance the tree after a delete operation.
20352  * [in] mc Cursor pointing to the page where rebalancing should begin.
20353  * Returns 0 on success, non-zero on failure. */
mdbx_rebalance(MDBX_cursor * mc)20354 static int mdbx_rebalance(MDBX_cursor *mc) {
20355   mdbx_cassert(mc, cursor_is_tracked(mc));
20356   mdbx_cassert(mc, mc->mc_snum > 0);
20357   mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth ||
20358                        IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1]));
20359   const int pagetype = PAGETYPE(mc->mc_pg[mc->mc_top]);
20360 
20361   STATIC_ASSERT(P_BRANCH == 1);
20362   const unsigned minkeys = (pagetype & P_BRANCH) + 1;
20363 
20364   /* Pages emptier than this are candidates for merging. */
20365   unsigned room_threshold = likely(mc->mc_dbi != FREE_DBI)
20366                                 ? mc->mc_txn->mt_env->me_merge_threshold
20367                                 : mc->mc_txn->mt_env->me_merge_threshold_gc;
20368 
20369   const MDBX_page *const tp = mc->mc_pg[mc->mc_top];
20370   const unsigned numkeys = page_numkeys(tp);
20371   const unsigned room = page_room(tp);
20372   mdbx_debug("rebalancing %s page %" PRIaPGNO
20373              " (has %u keys, full %.1f%%, used %u, room %u bytes )",
20374              (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys,
20375              page_fill(mc->mc_txn->mt_env, tp),
20376              page_used(mc->mc_txn->mt_env, tp), room);
20377 
20378   if (unlikely(numkeys < minkeys)) {
20379     mdbx_debug("page %" PRIaPGNO " must be merged due keys < %u threshold",
20380                tp->mp_pgno, minkeys);
20381   } else if (unlikely(room > room_threshold)) {
20382     mdbx_debug("page %" PRIaPGNO " should be merged due room %u > %u threshold",
20383                tp->mp_pgno, room, room_threshold);
20384   } else {
20385     mdbx_debug("no need to rebalance page %" PRIaPGNO
20386                ", room %u < %u threshold",
20387                tp->mp_pgno, room, room_threshold);
20388     mdbx_cassert(mc, mc->mc_db->md_entries > 0);
20389     return MDBX_SUCCESS;
20390   }
20391 
20392   int rc;
20393   if (mc->mc_snum < 2) {
20394     MDBX_page *const mp = mc->mc_pg[0];
20395     const unsigned nkeys = page_numkeys(mp);
20396     mdbx_cassert(mc, (mc->mc_db->md_entries == 0) == (nkeys == 0));
20397     if (IS_SUBP(mp)) {
20398       mdbx_debug("%s", "Can't rebalance a subpage, ignoring");
20399       mdbx_cassert(mc, pagetype & P_LEAF);
20400       return MDBX_SUCCESS;
20401     }
20402     if (nkeys == 0) {
20403       mdbx_cassert(mc, IS_LEAF(mp));
20404       mdbx_debug("%s", "tree is completely empty");
20405       mc->mc_db->md_root = P_INVALID;
20406       mc->mc_db->md_depth = 0;
20407       mdbx_cassert(mc, mc->mc_db->md_branch_pages == 0 &&
20408                            mc->mc_db->md_overflow_pages == 0 &&
20409                            mc->mc_db->md_leaf_pages == 1);
20410       /* Adjust cursors pointing to mp */
20411       for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2;
20412            m2 = m2->mc_next) {
20413         MDBX_cursor *m3 =
20414             (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
20415         if (m3 == mc || !(m3->mc_flags & C_INITIALIZED))
20416           continue;
20417         if (m3->mc_pg[0] == mp) {
20418           m3->mc_snum = 0;
20419           m3->mc_top = 0;
20420           m3->mc_flags &= ~C_INITIALIZED;
20421         }
20422       }
20423       mc->mc_snum = 0;
20424       mc->mc_top = 0;
20425       mc->mc_flags &= ~C_INITIALIZED;
20426 
20427       rc = mdbx_page_retire(mc, mp);
20428       if (unlikely(rc != MDBX_SUCCESS))
20429         return rc;
20430     } else if (IS_BRANCH(mp) && nkeys == 1) {
20431       mdbx_debug("%s", "collapsing root page!");
20432       mc->mc_db->md_root = node_pgno(page_node(mp, 0));
20433       rc = mdbx_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0],
20434                          pp_txnid4chk(mp, mc->mc_txn));
20435       if (unlikely(rc != MDBX_SUCCESS))
20436         return rc;
20437       mc->mc_db->md_depth--;
20438       mc->mc_ki[0] = mc->mc_ki[1];
20439       for (int i = 1; i < mc->mc_db->md_depth; i++) {
20440         mc->mc_pg[i] = mc->mc_pg[i + 1];
20441         mc->mc_ki[i] = mc->mc_ki[i + 1];
20442       }
20443 
20444       /* Adjust other cursors pointing to mp */
20445       for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2;
20446            m2 = m2->mc_next) {
20447         MDBX_cursor *m3 =
20448             (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
20449         if (m3 == mc || !(m3->mc_flags & C_INITIALIZED))
20450           continue;
20451         if (m3->mc_pg[0] == mp) {
20452           for (int i = 0; i < mc->mc_db->md_depth; i++) {
20453             m3->mc_pg[i] = m3->mc_pg[i + 1];
20454             m3->mc_ki[i] = m3->mc_ki[i + 1];
20455           }
20456           m3->mc_snum--;
20457           m3->mc_top--;
20458         }
20459       }
20460       mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) ||
20461                            PAGETYPE(mc->mc_pg[mc->mc_top]) == pagetype);
20462       mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth ||
20463                            IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1]));
20464 
20465       rc = mdbx_page_retire(mc, mp);
20466       if (unlikely(rc != MDBX_SUCCESS))
20467         return rc;
20468     } else {
20469       mdbx_debug("root page %" PRIaPGNO
20470                  " doesn't need rebalancing (flags 0x%x)",
20471                  mp->mp_pgno, mp->mp_flags);
20472     }
20473     return MDBX_SUCCESS;
20474   }
20475 
20476   /* The parent (branch page) must have at least 2 pointers,
20477    * otherwise the tree is invalid. */
20478   const unsigned pre_top = mc->mc_top - 1;
20479   mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[pre_top]));
20480   mdbx_cassert(mc, !IS_SUBP(mc->mc_pg[0]));
20481   mdbx_cassert(mc, page_numkeys(mc->mc_pg[pre_top]) > 1);
20482 
20483   /* Leaf page fill factor is below the threshold.
20484    * Try to move keys from left or right neighbor, or
20485    * merge with a neighbor page. */
20486 
20487   /* Find neighbors. */
20488   MDBX_cursor mn;
20489   cursor_copy(mc, &mn);
20490 
20491   MDBX_page *left = nullptr, *right = nullptr;
20492   if (mn.mc_ki[pre_top] > 0) {
20493     rc = mdbx_page_get(
20494         &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] - 1)),
20495         &left, pp_txnid4chk(mn.mc_pg[pre_top], mc->mc_txn));
20496     if (unlikely(rc != MDBX_SUCCESS))
20497       return rc;
20498     mdbx_cassert(mc, PAGETYPE(left) == PAGETYPE(mc->mc_pg[mc->mc_top]));
20499   }
20500   if (mn.mc_ki[pre_top] + 1u < page_numkeys(mn.mc_pg[pre_top])) {
20501     rc = mdbx_page_get(
20502         &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] + 1)),
20503         &right, pp_txnid4chk(mn.mc_pg[pre_top], mc->mc_txn));
20504     if (unlikely(rc != MDBX_SUCCESS))
20505       return rc;
20506     mdbx_cassert(mc, PAGETYPE(right) == PAGETYPE(mc->mc_pg[mc->mc_top]));
20507   }
20508   mdbx_cassert(mc, left || right);
20509 
20510   const unsigned ki_top = mc->mc_ki[mc->mc_top];
20511   const unsigned ki_pre_top = mn.mc_ki[pre_top];
20512   const unsigned nkeys = page_numkeys(mn.mc_pg[mn.mc_top]);
20513 
20514   const unsigned left_room = left ? page_room(left) : 0;
20515   const unsigned right_room = right ? page_room(right) : 0;
20516   const unsigned left_nkeys = left ? page_numkeys(left) : 0;
20517   const unsigned right_nkeys = right ? page_numkeys(right) : 0;
20518 retry:
20519   if (left_room > room_threshold && left_room >= right_room) {
20520     /* try merge with left */
20521     mdbx_cassert(mc, left_nkeys >= minkeys);
20522     mn.mc_pg[mn.mc_top] = left;
20523     mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1);
20524     mn.mc_ki[mn.mc_top] = (indx_t)(left_nkeys - 1);
20525     mc->mc_ki[mc->mc_top] = 0;
20526     const unsigned new_ki = ki_top + left_nkeys;
20527     mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1;
20528     /* We want mdbx_rebalance to find mn when doing fixups */
20529     WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn));
20530     if (likely(rc != MDBX_RESULT_TRUE)) {
20531       cursor_restore(&mn, mc);
20532       mc->mc_ki[mc->mc_top] = (indx_t)new_ki;
20533       mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys);
20534       return rc;
20535     }
20536   }
20537   if (right_room > room_threshold) {
20538     /* try merge with right */
20539     mdbx_cassert(mc, right_nkeys >= minkeys);
20540     mn.mc_pg[mn.mc_top] = right;
20541     mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1);
20542     mn.mc_ki[mn.mc_top] = 0;
20543     mc->mc_ki[mc->mc_top] = (indx_t)nkeys;
20544     WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(&mn, mc));
20545     if (likely(rc != MDBX_RESULT_TRUE)) {
20546       mc->mc_ki[mc->mc_top] = (indx_t)ki_top;
20547       mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys);
20548       return rc;
20549     }
20550   }
20551 
20552   if (left_nkeys > minkeys &&
20553       (right_nkeys <= left_nkeys || right_room >= left_room)) {
20554     /* try move from left */
20555     mn.mc_pg[mn.mc_top] = left;
20556     mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1);
20557     mn.mc_ki[mn.mc_top] = (indx_t)(left_nkeys - 1);
20558     mc->mc_ki[mc->mc_top] = 0;
20559     WITH_CURSOR_TRACKING(mn, rc = mdbx_node_move(&mn, mc, true));
20560     if (likely(rc != MDBX_RESULT_TRUE)) {
20561       mc->mc_ki[mc->mc_top] = (indx_t)(ki_top + 1);
20562       mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys);
20563       return rc;
20564     }
20565   }
20566   if (right_nkeys > minkeys) {
20567     /* try move from right */
20568     mn.mc_pg[mn.mc_top] = right;
20569     mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1);
20570     mn.mc_ki[mn.mc_top] = 0;
20571     mc->mc_ki[mc->mc_top] = (indx_t)nkeys;
20572     WITH_CURSOR_TRACKING(mn, rc = mdbx_node_move(&mn, mc, false));
20573     if (likely(rc != MDBX_RESULT_TRUE)) {
20574       mc->mc_ki[mc->mc_top] = (indx_t)ki_top;
20575       mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys);
20576       return rc;
20577     }
20578   }
20579 
20580   if (nkeys >= minkeys) {
20581     mc->mc_ki[mc->mc_top] = (indx_t)ki_top;
20582     if (!mdbx_audit_enabled())
20583       return MDBX_SUCCESS;
20584     return mdbx_cursor_check(mc, C_UPDATING);
20585   }
20586 
20587   if (likely(room_threshold > 0)) {
20588     room_threshold = 0;
20589     goto retry;
20590   }
20591   mdbx_error("Unable to merge/rebalance %s page %" PRIaPGNO
20592              " (has %u keys, full %.1f%%, used %u, room %u bytes )",
20593              (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys,
20594              page_fill(mc->mc_txn->mt_env, tp),
20595              page_used(mc->mc_txn->mt_env, tp), room);
20596   return MDBX_PROBLEM;
20597 }
20598 
mdbx_page_check(MDBX_cursor * const mc,const MDBX_page * const mp,unsigned options)20599 __cold static int mdbx_page_check(MDBX_cursor *const mc,
20600                                   const MDBX_page *const mp, unsigned options) {
20601   DKBUF;
20602   options |= mc->mc_flags;
20603   MDBX_env *const env = mc->mc_txn->mt_env;
20604   const unsigned nkeys = page_numkeys(mp);
20605   char *const end_of_page = (char *)mp + env->me_psize;
20606   if (unlikely(mp->mp_pgno < MIN_PAGENO || mp->mp_pgno > MAX_PAGENO))
20607     return bad_page(mp, "invalid pgno (%u)\n", mp->mp_pgno);
20608   if (IS_OVERFLOW(mp)) {
20609     if (unlikely(mp->mp_pages < 1 && mp->mp_pages >= MAX_PAGENO / 2))
20610       return bad_page(mp, "invalid overflow n-pages (%u)\n", mp->mp_pages);
20611     if (unlikely(mp->mp_pgno + mp->mp_pages > mc->mc_txn->mt_next_pgno))
20612       return bad_page(mp, "overflow page beyond (%u) next-pgno\n",
20613                       mp->mp_pgno + mp->mp_pages);
20614     if (unlikely((options & (C_SUB | C_COPYING)) == C_SUB))
20615       return bad_page(mp,
20616                       "unexpected overflow-page for dupsort db (flags 0x%x)\n",
20617                       mc->mc_db->md_flags);
20618     return MDBX_SUCCESS;
20619   }
20620 
20621   int rc = MDBX_SUCCESS;
20622   if ((options & C_UPDATING) == 0 || !IS_MODIFIABLE(mc->mc_txn, mp)) {
20623     if (unlikely(nkeys < 2 && IS_BRANCH(mp)))
20624       rc = bad_page(mp, "branch-page nkey (%u) < 2\n", nkeys);
20625   }
20626   if (IS_LEAF2(mp) && unlikely((options & (C_SUB | C_COPYING)) == 0))
20627     rc = bad_page(mp, "unexpected leaf2-page (db flags 0x%x)\n",
20628                   mc->mc_db->md_flags);
20629 
20630   MDBX_val here, prev = {0, 0};
20631   for (unsigned i = 0; i < nkeys; ++i) {
20632     if (IS_LEAF2(mp)) {
20633       const size_t ksize = mp->mp_leaf2_ksize;
20634       char *const key = page_leaf2key(mp, i, ksize);
20635       if (unlikely(end_of_page < key + ksize)) {
20636         rc = bad_page(mp, "leaf2-key beyond (%zu) page-end\n",
20637                       key + ksize - end_of_page);
20638         continue;
20639       }
20640 
20641       if ((options & C_COPYING) == 0) {
20642         if (unlikely(ksize != mc->mc_dbx->md_klen_min)) {
20643           if (unlikely(ksize < mc->mc_dbx->md_klen_min ||
20644                        ksize > mc->mc_dbx->md_klen_max))
20645             rc = bad_page(
20646                 mp, "leaf2-key size (%zu) <> min/max key-length (%zu/%zu)\n",
20647                 ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max);
20648           else
20649             mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = ksize;
20650         }
20651         if ((options & C_SKIPORD) == 0) {
20652           here.iov_len = ksize;
20653           here.iov_base = key;
20654           if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0))
20655             rc = bad_page(mp, "leaf2-key #%u wrong order (%s >= %s)\n", i,
20656                           DKEY(&prev), DVAL(&here));
20657           prev = here;
20658         }
20659       }
20660     } else {
20661       const MDBX_node *const node = page_node(mp, i);
20662       const char *node_end = (char *)node + NODESIZE;
20663       if (unlikely(node_end > end_of_page)) {
20664         rc = bad_page(mp, "node[%u] (%zu) beyond page-end\n", i,
20665                       node_end - end_of_page);
20666         continue;
20667       }
20668       size_t ksize = node_ks(node);
20669       char *key = node_key(node);
20670       if (unlikely(end_of_page < key + ksize)) {
20671         rc = bad_page(mp, "node[%u] key (%zu) beyond page-end\n", i,
20672                       key + ksize - end_of_page);
20673         continue;
20674       }
20675       if ((IS_LEAF(mp) || i > 0) && (options & C_COPYING) == 0) {
20676         if (unlikely(ksize < mc->mc_dbx->md_klen_min ||
20677                      ksize > mc->mc_dbx->md_klen_max))
20678           rc = bad_page(
20679               mp, "node[%u] key size (%zu) <> min/max key-length (%zu/%zu)\n",
20680               i, ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max);
20681         if ((options & C_SKIPORD) == 0) {
20682           here.iov_base = key;
20683           here.iov_len = ksize;
20684           if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0))
20685             rc = bad_page(mp, "node[%u] key wrong order (%s >= %s)\n", i,
20686                           DKEY(&prev), DVAL(&here));
20687           prev = here;
20688         }
20689       }
20690       if (IS_BRANCH(mp)) {
20691         if ((options & C_UPDATING) == 0 && i == 0 && unlikely(ksize != 0))
20692           rc = bad_page(mp, "branch-node[%u] wrong 0-node key-length (%zu)\n",
20693                         i, ksize);
20694         if ((options & C_RETIRING) == 0) {
20695           const pgno_t ref = node_pgno(node);
20696           if (unlikely(ref < MIN_PAGENO || ref >= mc->mc_txn->mt_next_pgno))
20697             rc = bad_page(mp, "branch-node[%u] wrong pgno (%u)\n", i, ref);
20698         }
20699         if (unlikely(node_flags(node)))
20700           rc = bad_page(mp, "branch-node[%u] wrong flags (%u)\n", i,
20701                         node_flags(node));
20702         continue;
20703       }
20704 
20705       switch (node_flags(node)) {
20706       default:
20707         rc = bad_page(mp, "invalid node[%u] flags (%u)\n", i, node_flags(node));
20708         break;
20709       case F_BIGDATA /* data on large-page */:
20710       case 0 /* usual */:
20711       case F_SUBDATA /* sub-db */:
20712       case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */:
20713       case F_DUPDATA /* short sub-page */:
20714         break;
20715       }
20716 
20717       const size_t dsize = node_ds(node);
20718       const char *const data = node_data(node);
20719       if (node_flags(node) & F_BIGDATA) {
20720         if (unlikely(end_of_page < data + sizeof(pgno_t))) {
20721           rc = bad_page(
20722               mp, "node-%s(%u of %u, %zu bytes) beyond (%zu) page-end\n",
20723               "bigdata-pgno", i, nkeys, dsize, data + dsize - end_of_page);
20724           continue;
20725         }
20726         if ((options & C_COPYING) == 0) {
20727           if (unlikely(dsize <= mc->mc_dbx->md_vlen_min ||
20728                        dsize > mc->mc_dbx->md_vlen_max))
20729             rc = bad_page(
20730                 mp,
20731                 "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n",
20732                 dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max);
20733         }
20734         if ((options & C_RETIRING) == 0) {
20735           MDBX_page *lp;
20736           int err = mdbx_page_get(mc, node_largedata_pgno(node), &lp,
20737                                   pp_txnid4chk(mp, mc->mc_txn));
20738           if (unlikely(err != MDBX_SUCCESS))
20739             return err;
20740           if (unlikely(!IS_OVERFLOW(lp))) {
20741             rc = bad_page(mp, "big-node refs to non-overflow page (%u)\n",
20742                           lp->mp_pgno);
20743             continue;
20744           }
20745           if (unlikely(number_of_ovpages(env, dsize) > lp->mp_pages))
20746             rc =
20747                 bad_page(mp, "big-node size (%zu) mismatch n-pages size (%u)\n",
20748                          dsize, lp->mp_pages);
20749         }
20750         continue;
20751       }
20752 
20753       if (unlikely(end_of_page < data + dsize)) {
20754         rc =
20755             bad_page(mp, "node-%s(%u of %u, %zu bytes) beyond (%zu) page-end\n",
20756                      "data", i, nkeys, dsize, data + dsize - end_of_page);
20757         continue;
20758       }
20759 
20760       switch (node_flags(node)) {
20761       default:
20762         /* wrong, but already handled */
20763         continue;
20764       case 0 /* usual */:
20765         if ((options & C_COPYING) == 0) {
20766           if (unlikely(dsize < mc->mc_dbx->md_vlen_min ||
20767                        dsize > mc->mc_dbx->md_vlen_max)) {
20768             rc = bad_page(
20769                 mp, "node-data size (%zu) <> min/max value-length (%zu/%zu)\n",
20770                 dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max);
20771             continue;
20772           }
20773         }
20774         break;
20775       case F_SUBDATA /* sub-db */:
20776         if (unlikely(dsize != sizeof(MDBX_db))) {
20777           rc = bad_page(mp, "invalid sub-db record size (%zu)\n", dsize);
20778           continue;
20779         }
20780         break;
20781       case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */:
20782         if (unlikely(dsize != sizeof(MDBX_db))) {
20783           rc = bad_page(mp, "invalid nested-db record size (%zu)\n", dsize);
20784           continue;
20785         }
20786         break;
20787       case F_DUPDATA /* short sub-page */:
20788         if (unlikely(dsize <= PAGEHDRSZ)) {
20789           rc = bad_page(mp, "invalid nested/sub-page record size (%zu)\n",
20790                         dsize);
20791           continue;
20792         } else {
20793           const MDBX_page *const sp = (MDBX_page *)data;
20794           const char *const end_of_subpage = data + dsize;
20795           const int nsubkeys = page_numkeys(sp);
20796           switch (sp->mp_flags & /* ignore legacy P_DIRTY flag */ ~0x10) {
20797           case P_LEAF | P_SUBP:
20798           case P_LEAF | P_LEAF2 | P_SUBP:
20799             break;
20800           default:
20801             rc = bad_page(mp, "invalid nested/sub-page flags (0x%02x)\n",
20802                           sp->mp_flags);
20803             continue;
20804           }
20805 
20806           MDBX_val sub_here, sub_prev = {0, 0};
20807           for (int j = 0; j < nsubkeys; j++) {
20808             if (IS_LEAF2(sp)) {
20809               /* LEAF2 pages have no mp_ptrs[] or node headers */
20810               size_t sub_ksize = sp->mp_leaf2_ksize;
20811               char *sub_key = page_leaf2key(sp, j, sub_ksize);
20812               if (unlikely(end_of_subpage < sub_key + sub_ksize)) {
20813                 rc = bad_page(mp, "nested-leaf2-key beyond (%zu) nested-page\n",
20814                               sub_key + sub_ksize - end_of_subpage);
20815                 continue;
20816               }
20817 
20818               if ((options & C_COPYING) == 0) {
20819                 if (unlikely(sub_ksize != mc->mc_dbx->md_vlen_min)) {
20820                   if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min ||
20821                                sub_ksize > mc->mc_dbx->md_vlen_max)) {
20822                     rc = bad_page(mp,
20823                                   "nested-leaf2-key size (%zu) <> min/max "
20824                                   "value-length (%zu/%zu)\n",
20825                                   sub_ksize, mc->mc_dbx->md_vlen_min,
20826                                   mc->mc_dbx->md_vlen_max);
20827                     continue;
20828                   }
20829                   mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max = sub_ksize;
20830                 }
20831                 if ((options & C_SKIPORD) == 0) {
20832                   sub_here.iov_len = sub_ksize;
20833                   sub_here.iov_base = sub_key;
20834                   if (sub_prev.iov_base &&
20835                       unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0))
20836                     rc = bad_page(
20837                         mp, "nested-leaf2-key #%u wrong order (%s >= %s)\n", j,
20838                         DKEY(&sub_prev), DVAL(&sub_here));
20839                   sub_prev = sub_here;
20840                 }
20841               }
20842             } else {
20843               const MDBX_node *const sub_node = page_node(sp, j);
20844               const char *sub_node_end = (char *)sub_node + NODESIZE;
20845               if (unlikely(sub_node_end > end_of_subpage)) {
20846                 rc = bad_page(mp, "nested-node beyond (%zu) nested-page\n",
20847                               end_of_subpage - sub_node_end);
20848                 continue;
20849               }
20850               if (unlikely(node_flags(sub_node) != 0))
20851                 rc = bad_page(mp, "nested-node invalid flags (%u)\n",
20852                               node_flags(sub_node));
20853 
20854               size_t sub_ksize = node_ks(sub_node);
20855               char *sub_key = node_key(sub_node);
20856               size_t sub_dsize = node_ds(sub_node);
20857               /* char *sub_data = node_data(sub_node); */
20858 
20859               if ((options & C_COPYING) == 0) {
20860                 if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min ||
20861                              sub_ksize > mc->mc_dbx->md_vlen_max))
20862                   rc = bad_page(mp,
20863                                 "nested-node-key size (%zu) <> min/max "
20864                                 "value-length (%zu/%zu)\n",
20865                                 sub_ksize, mc->mc_dbx->md_vlen_min,
20866                                 mc->mc_dbx->md_vlen_max);
20867 
20868                 if ((options & C_SKIPORD) == 0) {
20869                   sub_here.iov_len = sub_ksize;
20870                   sub_here.iov_base = sub_key;
20871                   if (sub_prev.iov_base &&
20872                       unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0))
20873                     rc = bad_page(
20874                         mp, "nested-node-key #%u wrong order (%s >= %s)\n", j,
20875                         DKEY(&sub_prev), DVAL(&sub_here));
20876                   sub_prev = sub_here;
20877                 }
20878               }
20879               if (unlikely(sub_dsize != 0))
20880                 rc = bad_page(mp, "nested-node non-empty data size (%zu)\n",
20881                               sub_dsize);
20882               if (unlikely(end_of_subpage < sub_key + sub_ksize))
20883                 rc = bad_page(mp, "nested-node-key beyond (%zu) nested-page\n",
20884                               sub_key + sub_ksize - end_of_subpage);
20885             }
20886           }
20887         }
20888         break;
20889       }
20890     }
20891   }
20892   return rc;
20893 }
20894 
mdbx_cursor_check(MDBX_cursor * mc,unsigned options)20895 __cold static int mdbx_cursor_check(MDBX_cursor *mc, unsigned options) {
20896   mdbx_cassert(mc,
20897                mc->mc_txn->tw.dirtyroom + mc->mc_txn->tw.dirtylist->length ==
20898                    (mc->mc_txn->mt_parent
20899                         ? mc->mc_txn->mt_parent->tw.dirtyroom
20900                         : mc->mc_txn->mt_env->me_options.dp_limit));
20901   mdbx_cassert(mc, mc->mc_top == mc->mc_snum - 1 || (options & C_UPDATING));
20902   if (unlikely(mc->mc_top != mc->mc_snum - 1) && (options & C_UPDATING) == 0)
20903     return MDBX_CURSOR_FULL;
20904   mdbx_cassert(mc, (options & C_UPDATING) ? mc->mc_snum <= mc->mc_db->md_depth
20905                                           : mc->mc_snum == mc->mc_db->md_depth);
20906   if (unlikely((options & C_UPDATING) ? mc->mc_snum > mc->mc_db->md_depth
20907                                       : mc->mc_snum != mc->mc_db->md_depth))
20908     return MDBX_CURSOR_FULL;
20909 
20910   for (int n = 0; n < (int)mc->mc_snum; ++n) {
20911     MDBX_page *mp = mc->mc_pg[n];
20912     const unsigned nkeys = page_numkeys(mp);
20913     const bool expect_branch = (n < mc->mc_db->md_depth - 1) ? true : false;
20914     const bool expect_nested_leaf =
20915         (n + 1 == mc->mc_db->md_depth - 1) ? true : false;
20916     const bool branch = IS_BRANCH(mp) ? true : false;
20917     mdbx_cassert(mc, branch == expect_branch);
20918     if (unlikely(branch != expect_branch))
20919       return MDBX_CURSOR_FULL;
20920     if ((options & C_UPDATING) == 0) {
20921       mdbx_cassert(mc,
20922                    nkeys > mc->mc_ki[n] || (!branch && nkeys == mc->mc_ki[n] &&
20923                                             (mc->mc_flags & C_EOF) != 0));
20924       if (unlikely(nkeys <= mc->mc_ki[n] &&
20925                    !(!branch && nkeys == mc->mc_ki[n] &&
20926                      (mc->mc_flags & C_EOF) != 0)))
20927         return MDBX_CURSOR_FULL;
20928     } else {
20929       mdbx_cassert(mc, nkeys + 1 >= mc->mc_ki[n]);
20930       if (unlikely(nkeys + 1 < mc->mc_ki[n]))
20931         return MDBX_CURSOR_FULL;
20932     }
20933 
20934     int err = mdbx_page_check(mc, mp, options);
20935     if (unlikely(err != MDBX_SUCCESS))
20936       return err;
20937 
20938     for (unsigned i = 0; i < nkeys; ++i) {
20939       if (branch) {
20940         MDBX_node *node = page_node(mp, i);
20941         mdbx_cassert(mc, node_flags(node) == 0);
20942         if (unlikely(node_flags(node) != 0))
20943           return MDBX_CURSOR_FULL;
20944         pgno_t pgno = node_pgno(node);
20945         MDBX_page *np;
20946         int rc = mdbx_page_get(mc, pgno, &np, pp_txnid4chk(mp, mc->mc_txn));
20947         mdbx_cassert(mc, rc == MDBX_SUCCESS);
20948         if (unlikely(rc != MDBX_SUCCESS))
20949           return rc;
20950         const bool nested_leaf = IS_LEAF(np) ? true : false;
20951         mdbx_cassert(mc, nested_leaf == expect_nested_leaf);
20952         if (unlikely(nested_leaf != expect_nested_leaf))
20953           return MDBX_CURSOR_FULL;
20954         err = mdbx_page_check(mc, np, options);
20955         if (unlikely(err != MDBX_SUCCESS))
20956           return err;
20957       }
20958     }
20959   }
20960   return MDBX_SUCCESS;
20961 }
20962 
/* Complete a delete operation started by mdbx_cursor_del().
 *
 * Removes the node the cursor points at, fixes up all other tracked
 * cursors of the same dbi, rebalances the tree, and then fixes up the
 * cursors again for any page movement caused by the rebalance.
 * On failure the transaction is marked MDBX_TXN_ERROR. */
static int mdbx_cursor_del0(MDBX_cursor *mc) {
  int rc;
  MDBX_page *mp;
  indx_t ki;
  unsigned nkeys;
  MDBX_dbi dbi = mc->mc_dbi;

  mdbx_cassert(mc, cursor_is_tracked(mc));
  mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
  /* Remember position, then physically remove the node. */
  ki = mc->mc_ki[mc->mc_top];
  mp = mc->mc_pg[mc->mc_top];
  mdbx_node_del(mc, mc->mc_db->md_xsize);
  mc->mc_db->md_entries--;

  /* Adjust other cursors pointing to mp */
  for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) {
    /* For sub-databases the tracked list holds outer cursors; the
     * actual positioned cursor is the nested mx_cursor. */
    MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
    if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
      continue;
    if (m3->mc_snum < mc->mc_snum)
      continue;
    if (m3->mc_pg[mc->mc_top] == mp) {
      if (m3->mc_ki[mc->mc_top] == ki) {
        /* Cursor sat exactly on the deleted node. */
        m3->mc_flags |= C_DEL;
        if (mc->mc_db->md_flags & MDBX_DUPSORT) {
          /* Sub-cursor referred into dataset which is gone */
          m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF);
        }
        continue;
      } else if (m3->mc_ki[mc->mc_top] > ki) {
        /* Nodes after the deleted one shifted left by one. */
        m3->mc_ki[mc->mc_top]--;
      }
      if (XCURSOR_INITED(m3))
        XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]);
    }
  }

  rc = mdbx_rebalance(mc);
  if (unlikely(rc != MDBX_SUCCESS))
    goto bailout;

  if (unlikely(!mc->mc_snum)) {
    /* DB is totally empty now, just bail out.
     * Other cursors adjustments were already done
     * by mdbx_rebalance and aren't needed here. */
    mdbx_cassert(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 &&
                         mc->mc_db->md_root == P_INVALID);
    mc->mc_flags |= C_EOF;
    return MDBX_SUCCESS;
  }

  /* Re-read position: rebalance may have merged/moved pages. */
  ki = mc->mc_ki[mc->mc_top];
  mp = mc->mc_pg[mc->mc_top];
  mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
  nkeys = page_numkeys(mp);
  mdbx_cassert(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) ||
                       ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 &&
                        nkeys == 0));

  /* Adjust this and other cursors pointing to mp */
  for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) {
    MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
    if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
      continue;
    if (m3->mc_snum < mc->mc_snum)
      continue;
    if (m3->mc_pg[mc->mc_top] == mp) {
      /* if m3 points past last node in page, find next sibling */
      if (m3->mc_ki[mc->mc_top] >= nkeys) {
        rc = mdbx_cursor_sibling(m3, SIBLING_RIGHT);
        if (rc == MDBX_NOTFOUND) {
          m3->mc_flags |= C_EOF;
          rc = MDBX_SUCCESS;
          continue;
        }
        if (unlikely(rc != MDBX_SUCCESS))
          goto bailout;
      }
      if (m3->mc_ki[mc->mc_top] >= ki ||
          /* moved to right sibling */ m3->mc_pg[mc->mc_top] != mp) {
        if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) {
          MDBX_node *node =
              page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]);
          /* If this node has dupdata, it may need to be reinited
           * because its data has moved.
           * If the xcursor was not inited it must be reinited.
           * Else if node points to a subDB, nothing is needed. */
          if (node_flags(node) & F_DUPDATA) {
            if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
              if (!(node_flags(node) & F_SUBDATA))
                m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node);
            } else {
              rc = mdbx_xcursor_init1(m3, node, m3->mc_pg[m3->mc_top]);
              if (unlikely(rc != MDBX_SUCCESS))
                goto bailout;
              rc = mdbx_cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL);
              if (unlikely(rc != MDBX_SUCCESS))
                goto bailout;
            }
          }
          m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL;
        }
        m3->mc_flags |= C_DEL;
      }
    }
  }

  mdbx_cassert(mc, rc == MDBX_SUCCESS);
  if (mdbx_audit_enabled())
    rc = mdbx_cursor_check(mc, 0);
  return rc;

bailout:
  mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
  return rc;
}
21080 
/* Public entry point: delete key (and optionally a specific duplicate
 * `data`) from database `dbi` within a write transaction.
 * Validates the txn and dbi, then delegates to mdbx_del0(). */
int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
             const MDBX_val *data) {
  const int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (unlikely(key == NULL))
    return MDBX_EINVAL;
  if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
    return MDBX_BAD_DBI;
  if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED)))
    return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN;
  return mdbx_del0(txn, dbi, key, data, 0);
}
21098 
/* Internal delete: position a temporary cursor on `key` (or on the
 * exact key/data pair when `data` is given) and delete via
 * mdbx_cursor_del(). Without `data`, all duplicates are removed. */
static int mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
                     const MDBX_val *data, unsigned flags) {
  DKBUF_DEBUG;
  mdbx_debug("====> delete db %u key [%s], data [%s]", dbi, DKEY_DEBUG(key),
             DVAL_DEBUG(data));

  MDBX_cursor_couple cx;
  int rc = mdbx_cursor_init(&cx.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  MDBX_val rdata;
  MDBX_cursor_op op = MDBX_SET;
  if (data) {
    /* Target one exact key/value pair. */
    rdata = *data;
    data = &rdata;
    op = MDBX_GET_BOTH;
  } else {
    /* No value given: wipe out the key with all of its duplicates. */
    flags |= MDBX_ALLDUPS;
  }

  rc = mdbx_cursor_set(&cx.outer, (MDBX_val *)key, (MDBX_val *)data, op).err;
  if (likely(rc == MDBX_SUCCESS)) {
    /* let mdbx_page_split know about this cursor if needed:
     * delete will trigger a rebalance; if it needs to move
     * a node from one page to another, it will have to
     * update the parent's separator key(s). If the new sepkey
     * is larger than the current one, the parent page may
     * run out of space, triggering a split. We need this
     * cursor to be consistent until the end of the rebalance. */
    cx.outer.mc_next = txn->tw.cursors[dbi];
    txn->tw.cursors[dbi] = &cx.outer;
    rc = mdbx_cursor_del(&cx.outer, flags);
    txn->tw.cursors[dbi] = cx.outer.mc_next;
  }
  return rc;
}
21138 
21139 /* Split a page and insert a new node.
21140  * Set MDBX_TXN_ERROR on failure.
21141  * [in,out] mc Cursor pointing to the page and desired insertion index.
21142  * The cursor will be updated to point to the actual page and index where
21143  * the node got inserted after the split.
21144  * [in] newkey The key for the newly inserted node.
21145  * [in] newdata The data for the newly inserted node.
21146  * [in] newpgno The page number, if the new node is a branch node.
21147  * [in] nflags The NODE_ADD_FLAGS for the new node.
21148  * Returns 0 on success, non-zero on failure. */
static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey,
                           MDBX_val *const newdata, pgno_t newpgno,
                           unsigned nflags) {
  unsigned flags;
  int rc = MDBX_SUCCESS, foliage = 0;
  unsigned i, ptop;
  MDBX_env *const env = mc->mc_txn->mt_env;
  MDBX_val sepkey, rkey, xdata;
  MDBX_page *tmp_ki_copy = NULL;
  DKBUF;

  MDBX_page *const mp = mc->mc_pg[mc->mc_top];
  const unsigned newindx = mc->mc_ki[mc->mc_top];
  unsigned nkeys = page_numkeys(mp);
  if (mdbx_audit_enabled()) {
    rc = mdbx_cursor_check(mc, C_UPDATING);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }
  STATIC_ASSERT(P_BRANCH == 1);
  /* A branch page must retain at least 2 keys after the split, a leaf at
   * least 1; P_BRANCH == 1 lets this be computed without a conditional. */
  const unsigned minkeys = (mp->mp_flags & P_BRANCH) + 1;

  mdbx_debug(">> splitting %s-page %" PRIaPGNO
             " and adding %zu+%zu [%s] at %i, nkeys %i",
             IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, newkey->iov_len,
             newdata ? newdata->iov_len : 0, DKEY_DEBUG(newkey),
             mc->mc_ki[mc->mc_top], nkeys);
  mdbx_cassert(mc, nkeys + 1 >= minkeys * 2);

  /* Create a new sibling page. */
  struct page_result npr = mdbx_page_new(mc, mp->mp_flags, 1);
  if (unlikely(npr.err != MDBX_SUCCESS))
    return npr.err;
  MDBX_page *const sister = npr.page;
  sister->mp_leaf2_ksize = mp->mp_leaf2_ksize;
  mdbx_debug("new sibling: page %" PRIaPGNO, sister->mp_pgno);

  /* Usually when splitting the root page, the cursor
   * height is 1. But when called from mdbx_update_key,
   * the cursor height may be greater because it walks
   * up the stack while finding the branch slot to update. */
  if (mc->mc_top < 1) {
    /* Root split: allocate a fresh branch page to become the new root. */
    npr = mdbx_page_new(mc, P_BRANCH, 1);
    rc = npr.err;
    if (unlikely(rc != MDBX_SUCCESS))
      goto done;
    MDBX_page *const pp = npr.page;
    /* shift current top to make room for new parent */
    mdbx_cassert(mc, mc->mc_snum < 2 && mc->mc_db->md_depth > 0);
#if MDBX_DEBUG
    memset(mc->mc_pg + 3, 0, sizeof(mc->mc_pg) - sizeof(mc->mc_pg[0]) * 3);
    memset(mc->mc_ki + 3, -1, sizeof(mc->mc_ki) - sizeof(mc->mc_ki[0]) * 3);
#endif
    mc->mc_pg[2] = mc->mc_pg[1];
    mc->mc_ki[2] = mc->mc_ki[1];
    mc->mc_pg[1] = mc->mc_pg[0];
    mc->mc_ki[1] = mc->mc_ki[0];
    mc->mc_pg[0] = pp;
    mc->mc_ki[0] = 0;
    mc->mc_db->md_root = pp->mp_pgno;
    mdbx_debug("root split! new root = %" PRIaPGNO, pp->mp_pgno);
    /* foliage != 0 marks "root was split" for the cursor-fixup loop below */
    foliage = mc->mc_db->md_depth++;

    /* Add left (implicit) pointer. */
    rc = mdbx_node_add_branch(mc, 0, NULL, mp->mp_pgno);
    if (unlikely(rc != MDBX_SUCCESS)) {
      /* undo the pre-push */
      mc->mc_pg[0] = mc->mc_pg[1];
      mc->mc_ki[0] = mc->mc_ki[1];
      mc->mc_db->md_root = mp->mp_pgno;
      mc->mc_db->md_depth--;
      goto done;
    }
    mc->mc_snum++;
    mc->mc_top++;
    ptop = 0;
    if (mdbx_audit_enabled()) {
      rc = mdbx_cursor_check(mc, C_UPDATING);
      if (unlikely(rc != MDBX_SUCCESS))
        goto done;
    }
  } else {
    ptop = mc->mc_top - 1;
    mdbx_debug("parent branch page is %" PRIaPGNO, mc->mc_pg[ptop]->mp_pgno);
  }

  /* mn is a shadow of mc that points at the new sister page; it is used
   * both for inserting the separator into the parent and for fixing up
   * cursors that end up on the right-hand page. */
  MDBX_cursor mn;
  cursor_copy(mc, &mn);
  mn.mc_pg[mn.mc_top] = sister;
  mn.mc_ki[mn.mc_top] = 0;
  mn.mc_ki[ptop] = mc->mc_ki[ptop] + 1;

  unsigned split_indx =
      (newindx < nkeys)
          ? /* split at the middle */ (nkeys + 1) / 2
          : /* split at the end (i.e. like append-mode ) */ nkeys - minkeys + 1;

  mdbx_cassert(mc, !IS_BRANCH(mp) || newindx > 0);
  /* It is reasonable and possible to split the page at the begin */
  if (unlikely(newindx < minkeys)) {
    split_indx = minkeys;
    if (newindx == 0 && foliage == 0 && !(nflags & MDBX_SPLIT_REPLACE)) {
      split_indx = 0;
      /* Checking for ability of splitting by the left-side insertion
       * of a pure page with the new key */
      for (i = 0; i < mc->mc_top; ++i)
        if (mc->mc_ki[i]) {
          get_key(page_node(mc->mc_pg[i], mc->mc_ki[i]), &sepkey);
          if (mc->mc_dbx->md_cmp(newkey, &sepkey) >= 0)
            split_indx = minkeys;
          break;
        }
      if (split_indx == 0) {
        /* Save the current first key which was omitted on the parent branch
         * page and should be updated if the new first entry will be added */
        if (IS_LEAF2(mp)) {
          sepkey.iov_len = mp->mp_leaf2_ksize;
          sepkey.iov_base = page_leaf2key(mp, 0, sepkey.iov_len);
        } else
          get_key(page_node(mp, 0), &sepkey);
        mdbx_cassert(mc, mc->mc_dbx->md_cmp(newkey, &sepkey) < 0);
        /* Avoiding rare complex cases of split the parent page */
        if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey))
          split_indx = minkeys;
      }
    }
  }

  /* pure_right/pure_left: degenerate "splits" where the existing page is
   * kept intact and the new key goes alone onto the fresh sister page. */
  const bool pure_right = split_indx == nkeys;
  const bool pure_left = split_indx == 0;
  if (unlikely(pure_right)) {
    /* newindx == split_indx == nkeys */
    mdbx_trace("no-split, but add new pure page at the %s", "right/after");
    mdbx_cassert(mc, newindx == nkeys && split_indx == nkeys && minkeys == 1);
    sepkey = *newkey;
  } else if (unlikely(pure_left)) {
    /* newindx == split_indx == 0 */
    mdbx_trace("no-split, but add new pure page at the %s", "left/before");
    mdbx_cassert(mc, newindx == 0 && split_indx == 0 && minkeys == 1);
    mdbx_trace("old-first-key is %s", DKEY_DEBUG(&sepkey));
  } else {
    if (IS_LEAF2(sister)) {
      char *split, *ins;
      unsigned lsize, rsize, ksize;
      /* Move half of the keys to the right sibling */
      const int x = mc->mc_ki[mc->mc_top] - split_indx;
      ksize = mc->mc_db->md_xsize;
      split = page_leaf2key(mp, split_indx, ksize);
      rsize = (nkeys - split_indx) * ksize;
      lsize = (nkeys - split_indx) * sizeof(indx_t);
      mdbx_cassert(mc, mp->mp_lower >= lsize);
      mp->mp_lower -= (indx_t)lsize;
      mdbx_cassert(mc, sister->mp_lower + lsize <= UINT16_MAX);
      sister->mp_lower += (indx_t)lsize;
      mdbx_cassert(mc, mp->mp_upper + rsize - lsize <= UINT16_MAX);
      mp->mp_upper += (indx_t)(rsize - lsize);
      mdbx_cassert(mc, sister->mp_upper >= rsize - lsize);
      sister->mp_upper -= (indx_t)(rsize - lsize);
      sepkey.iov_len = ksize;
      sepkey.iov_base = (newindx != split_indx) ? split : newkey->iov_base;
      if (x < 0) {
        /* new key lands in the left (original) page */
        mdbx_cassert(mc, ksize >= sizeof(indx_t));
        ins = page_leaf2key(mp, mc->mc_ki[mc->mc_top], ksize);
        memcpy(sister->mp_ptrs, split, rsize);
        sepkey.iov_base = sister->mp_ptrs;
        memmove(ins + ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
        memcpy(ins, newkey->iov_base, ksize);
        mdbx_cassert(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t));
        mp->mp_lower += sizeof(indx_t);
        mdbx_cassert(mc, mp->mp_upper >= ksize - sizeof(indx_t));
        mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t));
      } else {
        /* new key lands in the right (sister) page at offset x */
        memcpy(sister->mp_ptrs, split, x * ksize);
        ins = page_leaf2key(sister, x, ksize);
        memcpy(ins, newkey->iov_base, ksize);
        memcpy(ins + ksize, split + x * ksize, rsize - x * ksize);
        mdbx_cassert(mc, UINT16_MAX - sister->mp_lower >= (int)sizeof(indx_t));
        sister->mp_lower += sizeof(indx_t);
        mdbx_cassert(mc, sister->mp_upper >= ksize - sizeof(indx_t));
        sister->mp_upper -= (indx_t)(ksize - sizeof(indx_t));
        mdbx_cassert(mc, x <= (int)UINT16_MAX);
        mc->mc_ki[mc->mc_top] = (indx_t)x;
      }

      if (mdbx_audit_enabled()) {
        rc = mdbx_cursor_check(mc, C_UPDATING);
        if (unlikely(rc != MDBX_SUCCESS))
          goto done;
        rc = mdbx_cursor_check(&mn, C_UPDATING);
        if (unlikely(rc != MDBX_SUCCESS))
          goto done;
      }
    } else {
      /* Maximum free space in an empty page */
      const unsigned max_space = page_space(env);
      const size_t new_size = IS_LEAF(mp) ? leaf_size(env, newkey, newdata)
                                          : branch_size(env, newkey);

      /* grab a page to hold a temporary copy */
      tmp_ki_copy = mdbx_page_malloc(mc->mc_txn, 1);
      if (unlikely(tmp_ki_copy == NULL)) {
        rc = MDBX_ENOMEM;
        goto done;
      }

      /* prepare to insert: copy mp's entry offsets into tmp_ki_copy,
       * leaving a zeroed gap at newindx for the incoming node */
      for (unsigned j = i = 0; i < nkeys; ++i, ++j) {
        tmp_ki_copy->mp_ptrs[j] = 0;
        j += (i == newindx);
        tmp_ki_copy->mp_ptrs[j] = mp->mp_ptrs[i];
      }
      tmp_ki_copy->mp_pgno = mp->mp_pgno;
      tmp_ki_copy->mp_flags = mp->mp_flags;
      tmp_ki_copy->mp_txnid = INVALID_TXNID;
      tmp_ki_copy->mp_lower = 0;
      tmp_ki_copy->mp_upper = (indx_t)max_space;

      /* When items are relatively large the split point needs
       * to be checked, because being off-by-one will make the
       * difference between success or failure in mdbx_node_add.
       *
       * It's also relevant if a page happens to be laid out
       * such that one half of its nodes are all "small" and
       * the other half of its nodes are "large". If the new
       * item is also "large" and falls on the half with
       * "large" nodes, it also may not fit.
       *
       * As a final tweak, if the new item goes on the last
       * spot on the page (and thus, onto the new page), bias
       * the split so the new page is emptier than the old page.
       * This yields better packing during sequential inserts. */

      if (nkeys < 32 || new_size > max_space / 16) {
        /* Find split point */
        int dir;
        if (newindx <= split_indx) {
          i = 0;
          dir = 1;
        } else {
          i = nkeys;
          dir = -1;
        }
        size_t before = 0, after = new_size + page_used(env, mp);
        int best = split_indx;
        int best_offset = nkeys + 1;

        mdbx_trace("seek separator from %u, step %i, default %u, new-idx %u, "
                   "new-size %zu",
                   i, dir, split_indx, newindx, new_size);
        do {
          mdbx_cassert(mc, i <= nkeys);
          size_t size = new_size;
          if (i != newindx) {
            MDBX_node *node =
                (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ);
            size = NODESIZE + node_ks(node) + sizeof(indx_t);
            if (IS_LEAF(mp))
              size += F_ISSET(node_flags(node), F_BIGDATA) ? sizeof(pgno_t)
                                                           : node_ds(node);
            size = EVEN(size);
          }

          before += size;
          after -= size;
          mdbx_trace("step %u, size %zu, before %zu, after %zu, max %u", i,
                     size, before, after, max_space);

          /* prefer the feasible split point closest to the default one */
          if (before <= max_space && after <= max_space) {
            int offset = branchless_abs(split_indx - i);
            if (offset >= best_offset)
              break;
            best_offset = offset;
            best = i;
          }
          i += dir;
        } while (i < nkeys);

        /* clamp the chosen point so both halves keep >= minkeys entries */
        split_indx = best + (dir > 0);
        split_indx = (split_indx <= nkeys - minkeys + 1) ? split_indx
                                                         : nkeys - minkeys + 1;
        split_indx = (split_indx >= minkeys) ? split_indx : minkeys;
        mdbx_trace("chosen %u", split_indx);
      }

      sepkey.iov_len = newkey->iov_len;
      sepkey.iov_base = newkey->iov_base;
      if (split_indx != newindx) {
        MDBX_node *node =
            (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[split_indx] +
                          PAGEHDRSZ);
        sepkey.iov_len = node_ks(node);
        sepkey.iov_base = node_key(node);
      }
    }
  }
  mdbx_debug("separator is %d [%s]", split_indx, DKEY_DEBUG(&sepkey));

  bool did_split_parent = false;
  /* Copy separator key to the parent. */
  if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) {
    /* parent has no room for the separator: recursively split it */
    mdbx_trace("need split parent branch-page for key %s", DKEY_DEBUG(&sepkey));
    mdbx_cassert(mc, page_numkeys(mn.mc_pg[ptop]) > 2);
    mdbx_cassert(mc, !pure_left);
    const int snum = mc->mc_snum;
    const int depth = mc->mc_db->md_depth;
    mn.mc_snum--;
    mn.mc_top--;
    did_split_parent = true;
    /* We want other splits to find mn when doing fixups */
    WITH_CURSOR_TRACKING(
        mn, rc = mdbx_page_split(&mn, &sepkey, NULL, sister->mp_pgno, 0));
    if (unlikely(rc != MDBX_SUCCESS))
      goto done;
    mdbx_cassert(mc, (int)mc->mc_snum - snum == mc->mc_db->md_depth - depth);
    if (mdbx_audit_enabled()) {
      rc = mdbx_cursor_check(mc, C_UPDATING);
      if (unlikely(rc != MDBX_SUCCESS))
        goto done;
    }

    /* root split? */
    ptop += mc->mc_snum - snum;

    /* Right page might now have changed parent.
     * Check if left page also changed parent. */
    if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
        mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) {
      for (i = 0; i < ptop; i++) {
        mc->mc_pg[i] = mn.mc_pg[i];
        mc->mc_ki[i] = mn.mc_ki[i];
      }
      mc->mc_pg[ptop] = mn.mc_pg[ptop];
      if (mn.mc_ki[ptop]) {
        mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
      } else {
        /* find right page's left sibling */
        mc->mc_ki[ptop] = mn.mc_ki[ptop];
        rc = mdbx_cursor_sibling(mc, SIBLING_LEFT);
        if (unlikely(rc != MDBX_SUCCESS)) {
          if (rc == MDBX_NOTFOUND) /* improper mdbx_cursor_sibling() result */ {
            mdbx_error("unexpected %i error going left sibling", rc);
            rc = MDBX_PROBLEM;
          }
          goto done;
        }
      }
    }
  } else if (unlikely(pure_left)) {
    /* insert the new pure page BEFORE mp; the parent's implicit (keyless)
     * first entry must then be re-pointed and the old first key restored */
    MDBX_page *ptop_page = mc->mc_pg[ptop];
    mdbx_debug("adding to parent page %u node[%u] left-leaf page #%u key %s",
               ptop_page->mp_pgno, mc->mc_ki[ptop], sister->mp_pgno,
               DKEY(mc->mc_ki[ptop] ? newkey : NULL));
    mc->mc_top--;
    rc = mdbx_node_add_branch(mc, mc->mc_ki[ptop],
                              mc->mc_ki[ptop] ? newkey : NULL, sister->mp_pgno);
    mdbx_cassert(mc, mp == mc->mc_pg[ptop + 1] &&
                         newindx == mc->mc_ki[ptop + 1] && ptop == mc->mc_top);

    if (likely(rc == MDBX_SUCCESS) && mc->mc_ki[ptop] == 0) {
      mdbx_debug("update prev-first key on parent %s", DKEY(&sepkey));
      MDBX_node *node = page_node(mc->mc_pg[ptop], 1);
      mdbx_cassert(mc, node_ks(node) == 0 && node_pgno(node) == mp->mp_pgno);
      mdbx_cassert(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 0);
      mc->mc_ki[ptop] = 1;
      rc = mdbx_update_key(mc, &sepkey);
      mdbx_cassert(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 1);
      mdbx_cassert(mc,
                   mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1]);
      mc->mc_ki[ptop] = 0;
    }

    mc->mc_top++;
    if (unlikely(rc != MDBX_SUCCESS))
      goto done;

    MDBX_node *node = page_node(mc->mc_pg[ptop], mc->mc_ki[ptop] + 1);
    mdbx_cassert(mc, node_pgno(node) == mp->mp_pgno &&
                         mc->mc_pg[ptop] == ptop_page);
  } else {
    /* parent has room: just add the separator entry for the sister */
    mn.mc_top--;
    mdbx_trace("add-to-parent the right-entry[%u] for new sibling-page",
               mn.mc_ki[ptop]);
    rc = mdbx_node_add_branch(&mn, mn.mc_ki[ptop], &sepkey, sister->mp_pgno);
    mn.mc_top++;
    if (unlikely(rc != MDBX_SUCCESS))
      goto done;
  }

  if (unlikely(pure_left | pure_right)) {
    /* the whole new item goes alone onto the (still empty) sister page */
    mc->mc_pg[mc->mc_top] = sister;
    mc->mc_ki[mc->mc_top] = 0;
    switch (PAGETYPE(sister)) {
    case P_LEAF: {
      mdbx_cassert(mc, newpgno == 0 || newpgno == P_INVALID);
      rc = mdbx_node_add_leaf(mc, 0, newkey, newdata, nflags);
    } break;
    case P_LEAF | P_LEAF2: {
      mdbx_cassert(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0);
      mdbx_cassert(mc, newpgno == 0 || newpgno == P_INVALID);
      rc = mdbx_node_add_leaf2(mc, 0, newkey);
    } break;
    default:
      rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE(sister));
    }
    if (unlikely(rc != MDBX_SUCCESS))
      goto done;

    if (pure_right) {
      for (i = 0; i < mc->mc_top; i++)
        mc->mc_ki[i] = mn.mc_ki[i];
    } else if (mc->mc_ki[mc->mc_top - 1] == 0) {
      /* pure-left: walk up to find an ancestor whose separator must be
       * lowered to the new (smaller) first key */
      for (i = 2; i <= mc->mc_top; ++i)
        if (mc->mc_ki[mc->mc_top - i]) {
          get_key(
              page_node(mc->mc_pg[mc->mc_top - i], mc->mc_ki[mc->mc_top - i]),
              &sepkey);
          if (mc->mc_dbx->md_cmp(newkey, &sepkey) < 0) {
            mc->mc_top -= i;
            mdbx_debug("update new-first on parent [%i] page %u key %s",
                       mc->mc_ki[mc->mc_top], mc->mc_pg[mc->mc_top]->mp_pgno,
                       DKEY(newkey));
            rc = mdbx_update_key(mc, newkey);
            mc->mc_top += i;
            if (unlikely(rc != MDBX_SUCCESS))
              goto done;
          }
          break;
        }
    }
  } else if (!IS_LEAF2(mp)) {
    /* Move nodes */
    mc->mc_pg[mc->mc_top] = sister;
    i = split_indx;
    unsigned n = 0;
    pgno_t pgno = 0;
    /* Re-add entries from tmp_ki_copy's order: first fill the sister page
     * starting at split_indx, wrap around past nkeys, then rebuild mp. */
    do {
      mdbx_trace("i %u, nkeys %u => n %u, rp #%u", i, nkeys, n,
                 sister->mp_pgno);
      MDBX_val *rdata = NULL;
      if (i == newindx) {
        rkey.iov_base = newkey->iov_base;
        rkey.iov_len = newkey->iov_len;
        if (IS_LEAF(mp))
          rdata = newdata;
        else
          pgno = newpgno;
        flags = nflags;
        /* Update index for the new key. */
        mc->mc_ki[mc->mc_top] = (indx_t)n;
      } else {
        MDBX_node *node =
            (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ);
        rkey.iov_base = node_key(node);
        rkey.iov_len = node_ks(node);
        if (IS_LEAF(mp)) {
          xdata.iov_base = node_data(node);
          xdata.iov_len = node_ds(node);
          rdata = &xdata;
        } else
          pgno = node_pgno(node);
        flags = node_flags(node);
      }

      switch (PAGETYPE(sister)) {
      case P_BRANCH: {
        mdbx_cassert(mc, 0 == (uint16_t)flags);
        /* First branch index doesn't need key data. */
        rc = mdbx_node_add_branch(mc, n, n ? &rkey : NULL, pgno);
      } break;
      case P_LEAF: {
        mdbx_cassert(mc, pgno == 0);
        mdbx_cassert(mc, rdata != NULL);
        rc = mdbx_node_add_leaf(mc, n, &rkey, rdata, flags);
      } break;
      /* case P_LEAF | P_LEAF2: {
        mdbx_cassert(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0);
        mdbx_cassert(mc, pgno == 0);
        rc = mdbx_node_add_leaf2(mc, n, &rkey);
      } break; */
      default:
        rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE(sister));
      }
      if (unlikely(rc != MDBX_SUCCESS))
        goto done;

      ++n;
      if (++i > nkeys) {
        i = 0;
        n = 0;
        mc->mc_pg[mc->mc_top] = tmp_ki_copy;
        mdbx_trace("switch to mp #%u", tmp_ki_copy->mp_pgno);
      }
    } while (i != split_indx);

    mdbx_trace("i %u, nkeys %u, n %u, pgno #%u", i, nkeys, n,
               mc->mc_pg[mc->mc_top]->mp_pgno);

    /* copy the rebuilt left half from tmp_ki_copy back into mp */
    nkeys = page_numkeys(tmp_ki_copy);
    for (i = 0; i < nkeys; i++)
      mp->mp_ptrs[i] = tmp_ki_copy->mp_ptrs[i];
    mp->mp_lower = tmp_ki_copy->mp_lower;
    mp->mp_upper = tmp_ki_copy->mp_upper;
    memcpy(page_node(mp, nkeys - 1), page_node(tmp_ki_copy, nkeys - 1),
           env->me_psize - tmp_ki_copy->mp_upper - PAGEHDRSZ);

    /* reset back to original page */
    if (newindx < split_indx) {
      mc->mc_pg[mc->mc_top] = mp;
    } else {
      mc->mc_pg[mc->mc_top] = sister;
      mc->mc_ki[ptop]++;
      /* Make sure mc_ki is still valid. */
      if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
          mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) {
        for (i = 0; i <= ptop; i++) {
          mc->mc_pg[i] = mn.mc_pg[i];
          mc->mc_ki[i] = mn.mc_ki[i];
        }
      }
    }
  } else if (newindx >= split_indx) {
    mc->mc_pg[mc->mc_top] = sister;
    mc->mc_ki[ptop]++;
    /* Make sure mc_ki is still valid. */
    if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
        mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) {
      for (i = 0; i <= ptop; i++) {
        mc->mc_pg[i] = mn.mc_pg[i];
        mc->mc_ki[i] = mn.mc_ki[i];
      }
    }
  }

  /* Adjust other cursors pointing to mp and/or to parent page */
  nkeys = page_numkeys(mp);
  for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2;
       m2 = m2->mc_next) {
    MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
    if (m3 == mc)
      continue;
    if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
      continue;
    if (foliage) {
      /* sub cursors may be on different DB */
      if (m3->mc_pg[0] != mp)
        continue;
      /* root split */
      for (int k = foliage; k >= 0; k--) {
        m3->mc_ki[k + 1] = m3->mc_ki[k];
        m3->mc_pg[k + 1] = m3->mc_pg[k];
      }
      m3->mc_ki[0] = (m3->mc_ki[0] >= nkeys) ? 1 : 0;
      m3->mc_pg[0] = mc->mc_pg[0];
      m3->mc_snum++;
      m3->mc_top++;
    }

    if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp && !pure_left) {
      if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDBX_SPLIT_REPLACE))
        m3->mc_ki[mc->mc_top]++;
      if (m3->mc_ki[mc->mc_top] >= nkeys) {
        /* cursor's position moved onto the sister page */
        m3->mc_pg[mc->mc_top] = sister;
        mdbx_cassert(mc, m3->mc_ki[mc->mc_top] >= nkeys);
        m3->mc_ki[mc->mc_top] -= (indx_t)nkeys;
        for (i = 0; i < mc->mc_top; i++) {
          m3->mc_ki[i] = mn.mc_ki[i];
          m3->mc_pg[i] = mn.mc_pg[i];
        }
      }
    } else if (!did_split_parent && m3->mc_top >= ptop &&
               m3->mc_pg[ptop] == mc->mc_pg[ptop] &&
               m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {
      m3->mc_ki[ptop]++; /* also for the `pure-left` case */
    }
    if (XCURSOR_INITED(m3) && IS_LEAF(mp))
      XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]);
  }
  mdbx_trace("mp #%u left: %d, sister #%u left: %d", mp->mp_pgno, page_room(mp),
             sister->mp_pgno, page_room(sister));

done:
  if (tmp_ki_copy)
    mdbx_dpage_free(env, tmp_ki_copy, 1);

  if (unlikely(rc != MDBX_SUCCESS))
    mc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
  else {
    if (mdbx_audit_enabled())
      rc = mdbx_cursor_check(mc, C_UPDATING);
    if (unlikely(nflags & MDBX_RESERVE)) {
      /* hand back the in-page location of the reserved data */
      MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
      if (!(node_flags(node) & F_BIGDATA))
        newdata->iov_base = node_data(node);
    }
#if MDBX_ENABLE_PGOP_STAT
    env->me_lck->mti_pgop_stat.split.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
  }

  mdbx_debug("<< mp #%u, rc %d", mp->mp_pgno, rc);
  return rc;
}
21751 
/* Store items into a database.
 *
 * Validates arguments, flags and transaction state, then performs the
 * actual put via a temporary cursor which is linked into the txn's
 * tracked-cursor list for the duration of the operation so that page
 * splits/rebalances keep it consistent.
 *
 * [in] txn       A write transaction (must not be read-only or blocked).
 * [in] dbi       A valid user database handle.
 * [in] key       The key to store (required).
 * [in,out] data  The data to store; for MDBX_RESERVE the iov_base is
 *                updated by mdbx_cursor_put() to point into the page.
 * [in] flags     Special options, see the mask below.
 *
 * Returns MDBX_SUCCESS or an error code. */
int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data,
             unsigned flags) {
  int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!key || !data))
    return MDBX_EINVAL;

  if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
    return MDBX_BAD_DBI;

  /* Reject unknown flags. Note: the previous mask listed MDBX_ALLDUPS
   * twice; the redundant duplicate is removed (OR is idempotent, so the
   * accepted set of flags is unchanged). */
  if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS |
                         MDBX_RESERVE | MDBX_APPEND | MDBX_APPENDDUP |
                         MDBX_CURRENT | MDBX_MULTIPLE)))
    return MDBX_EINVAL;

  if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED)))
    return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN;

  MDBX_cursor_couple cx;
  rc = mdbx_cursor_init(&cx.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  /* track the temporary cursor within the txn */
  cx.outer.mc_next = txn->tw.cursors[dbi];
  txn->tw.cursors[dbi] = &cx.outer;

  /* LY: support for update (explicit overwrite) */
  if (flags & MDBX_CURRENT) {
    rc = mdbx_cursor_get(&cx.outer, (MDBX_val *)key, NULL, MDBX_SET);
    if (likely(rc == MDBX_SUCCESS) &&
        (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) &&
        (flags & MDBX_ALLDUPS) == 0) {
      /* LY: allows update (explicit overwrite) only for unique keys */
      MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top],
                                  cx.outer.mc_ki[cx.outer.mc_top]);
      if (F_ISSET(node_flags(node), F_DUPDATA)) {
        mdbx_tassert(txn, XCURSOR_INITED(&cx.outer) &&
                              cx.outer.mc_xcursor->mx_db.md_entries > 1);
        rc = MDBX_EMULTIVAL;
      }
    }
  }

  if (likely(rc == MDBX_SUCCESS))
    rc = mdbx_cursor_put(&cx.outer, key, data, flags);
  /* unlink the temporary cursor before returning */
  txn->tw.cursors[dbi] = cx.outer.mc_next;

  return rc;
}
21802 
21803 /**** COPYING *****************************************************************/
21804 
/* State needed for a double-buffering compacting copy.
 * The reader (tree-walking) thread fills one buffer while the dedicated
 * writer thread (mdbx_env_copythr) drains the other; mc_head/mc_tail
 * select the active halves of the two-slot ring. */
typedef struct mdbx_copy {
  MDBX_env *mc_env;              /* environment being copied */
  MDBX_txn *mc_txn;              /* read transaction used for the walk */
  mdbx_condpair_t mc_condpair;   /* mutex + condvar pair for hand-off */
  uint8_t *mc_wbuf[2];           /* double write buffers */
  uint8_t *mc_over[2];           /* overflow-page tails, written after wbuf */
  size_t mc_wlen[2];             /* valid bytes in each wbuf slot */
  size_t mc_olen[2];             /* valid bytes in each overflow slot */
  mdbx_filehandle_t mc_fd;       /* destination file descriptor */
  /* Error code.  Never cleared if set.  Both threads can set nonzero
   * to fail the copy.  Not mutex-protected, MDBX expects atomic int. */
  volatile int mc_error;
  pgno_t mc_next_pgno;           /* next page number to assign in the copy */
  volatile unsigned mc_head;     /* producer index (buffers queued) */
  volatile unsigned mc_tail;     /* consumer index (buffers written) */
} mdbx_copy;
21822 
/* Dedicated writer thread for compacting copy.
 * Drains buffers queued by the reader via mc_head/mc_tail: waits until a
 * buffer is pending, writes its main part and (if any) its overflow tail
 * to mc_fd, then advances mc_tail and signals the producer. A queued
 * buffer with wsize == 0 means EOF. Any error is latched in mc_error. */
__cold static THREAD_RESULT THREAD_CALL mdbx_env_copythr(void *arg) {
  mdbx_copy *my = arg;

#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64))
  /* Block SIGPIPE so a broken destination pipe surfaces as EPIPE from
   * write() instead of killing the process. */
  sigset_t sigset;
  sigemptyset(&sigset);
  sigaddset(&sigset, SIGPIPE);
  my->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL);
#endif /* EPIPE */

  mdbx_condpair_lock(&my->mc_condpair);
  while (!my->mc_error) {
    /* wait for the producer to queue a buffer */
    while (my->mc_tail == my->mc_head && !my->mc_error) {
      int err = mdbx_condpair_wait(&my->mc_condpair, true);
      if (err != MDBX_SUCCESS) {
        my->mc_error = err;
        goto bailout;
      }
    }
    const unsigned toggle = my->mc_tail & 1; /* which of the two slots */
    size_t wsize = my->mc_wlen[toggle];
    if (wsize == 0) {
      my->mc_tail += 1;
      break /* EOF */;
    }
    my->mc_wlen[toggle] = 0;
    uint8_t *ptr = my->mc_wbuf[toggle];
  again:
    if (!my->mc_error) {
      int err = mdbx_write(my->mc_fd, ptr, wsize);
      if (err != MDBX_SUCCESS) {
#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64))
        if (err == EPIPE) {
          /* Collect the pending SIGPIPE,
           * otherwise at least OS X gives it to the process on thread-exit. */
          int unused;
          sigwait(&sigset, &unused);
        }
#endif /* EPIPE */
        my->mc_error = err;
        goto bailout;
      }
    }

    /* If there's an overflow page tail, write it too */
    wsize = my->mc_olen[toggle];
    if (wsize) {
      my->mc_olen[toggle] = 0;
      ptr = my->mc_over[toggle];
      goto again;
    }
    /* buffer fully written: release it back to the producer */
    my->mc_tail += 1;
    mdbx_condpair_signal(&my->mc_condpair, false);
  }
bailout:
  mdbx_condpair_unlock(&my->mc_condpair);
  return (THREAD_RESULT)0;
}
21882 
/* Give buffer and/or MDBX_EOF to writer thread, await unused buffer.
 * Producer side: publishes the currently-filled buffer by bumping mc_head,
 * wakes the writer, and blocks while both buffers are still in flight.
 * Returns mc_error, i.e. nonzero if either thread has failed. */
__cold static int mdbx_env_cthr_toggle(mdbx_copy *my) {
  mdbx_condpair_lock(&my->mc_condpair);
  /* at most one buffer may already be in flight, unless an error occurred */
  mdbx_assert(my->mc_env, my->mc_head - my->mc_tail < 2 || my->mc_error);
  my->mc_head += 1;
  mdbx_condpair_signal(&my->mc_condpair, true);
  while (!my->mc_error &&
         my->mc_head - my->mc_tail == 2 /* both buffers in use */) {
    int err = mdbx_condpair_wait(&my->mc_condpair, false);
    if (err != MDBX_SUCCESS)
      my->mc_error = err;
  }
  mdbx_condpair_unlock(&my->mc_condpair);
  return my->mc_error;
}
21898 
21899 /* Depth-first tree traversal for compacting copy.
21900  * [in] my control structure.
21901  * [in,out] pg database root.
21902  * [in] flags includes F_DUPDATA if it is a sorted-duplicate sub-DB. */
mdbx_env_cwalk(mdbx_copy * my,pgno_t * pg,int flags)21903 __cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) {
21904   MDBX_cursor_couple couple;
21905   MDBX_page *mo, *mp, *leaf;
21906   char *buf, *ptr;
21907   int rc;
21908   unsigned i;
21909 
21910   /* Empty DB, nothing to do */
21911   if (*pg == P_INVALID)
21912     return MDBX_SUCCESS;
21913 
21914   memset(&couple, 0, sizeof(couple));
21915   couple.outer.mc_snum = 1;
21916   couple.outer.mc_txn = my->mc_txn;
21917   couple.outer.mc_flags = couple.inner.mx_cursor.mc_flags =
21918       C_COPYING | C_SKIPORD;
21919 
21920   rc = mdbx_page_get(&couple.outer, *pg, &couple.outer.mc_pg[0],
21921                      my->mc_txn->mt_txnid);
21922   if (unlikely(rc != MDBX_SUCCESS))
21923     return rc;
21924   rc = mdbx_page_search_root(&couple.outer, NULL, MDBX_PS_FIRST);
21925   if (unlikely(rc != MDBX_SUCCESS))
21926     return rc;
21927 
21928   /* Make cursor pages writable */
21929   buf = ptr = mdbx_malloc(pgno2bytes(my->mc_env, couple.outer.mc_snum));
21930   if (buf == NULL)
21931     return MDBX_ENOMEM;
21932 
21933   for (i = 0; i < couple.outer.mc_top; i++) {
21934     mdbx_page_copy((MDBX_page *)ptr, couple.outer.mc_pg[i],
21935                    my->mc_env->me_psize);
21936     couple.outer.mc_pg[i] = (MDBX_page *)ptr;
21937     ptr += my->mc_env->me_psize;
21938   }
21939 
21940   /* This is writable space for a leaf page. Usually not needed. */
21941   leaf = (MDBX_page *)ptr;
21942 
21943   while (couple.outer.mc_snum > 0) {
21944     mp = couple.outer.mc_pg[couple.outer.mc_top];
21945     unsigned n = page_numkeys(mp);
21946 
21947     if (IS_LEAF(mp)) {
21948       if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) {
21949         for (i = 0; i < n; i++) {
21950           MDBX_node *node = page_node(mp, i);
21951           if (node_flags(node) & F_BIGDATA) {
21952             MDBX_page *omp;
21953 
21954             /* Need writable leaf */
21955             if (mp != leaf) {
21956               couple.outer.mc_pg[couple.outer.mc_top] = leaf;
21957               mdbx_page_copy(leaf, mp, my->mc_env->me_psize);
21958               mp = leaf;
21959               node = page_node(mp, i);
21960             }
21961 
21962             const pgno_t pgno = node_largedata_pgno(node);
21963             poke_pgno(node_data(node), my->mc_next_pgno);
21964             rc = mdbx_page_get(&couple.outer, pgno, &omp,
21965                                pp_txnid4chk(mp, my->mc_txn));
21966             if (unlikely(rc != MDBX_SUCCESS))
21967               goto done;
21968             unsigned toggle = my->mc_head & 1;
21969             if (my->mc_wlen[toggle] + my->mc_env->me_psize >
21970                 ((size_t)(MDBX_ENVCOPY_WRITEBUF))) {
21971               rc = mdbx_env_cthr_toggle(my);
21972               if (unlikely(rc != MDBX_SUCCESS))
21973                 goto done;
21974               toggle = my->mc_head & 1;
21975             }
21976             mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
21977             memcpy(mo, omp, my->mc_env->me_psize);
21978             mo->mp_pgno = my->mc_next_pgno;
21979             my->mc_next_pgno += omp->mp_pages;
21980             my->mc_wlen[toggle] += my->mc_env->me_psize;
21981             if (omp->mp_pages > 1) {
21982               my->mc_olen[toggle] = pgno2bytes(my->mc_env, omp->mp_pages - 1);
21983               my->mc_over[toggle] = (uint8_t *)omp + my->mc_env->me_psize;
21984               rc = mdbx_env_cthr_toggle(my);
21985               if (unlikely(rc != MDBX_SUCCESS))
21986                 goto done;
21987               toggle = my->mc_head & 1;
21988             }
21989           } else if (node_flags(node) & F_SUBDATA) {
21990             if (!MDBX_DISABLE_PAGECHECKS &&
21991                 unlikely(node_ds(node) != sizeof(MDBX_db))) {
21992               rc = MDBX_CORRUPTED;
21993               goto done;
21994             }
21995 
21996             /* Need writable leaf */
21997             if (mp != leaf) {
21998               couple.outer.mc_pg[couple.outer.mc_top] = leaf;
21999               mdbx_page_copy(leaf, mp, my->mc_env->me_psize);
22000               mp = leaf;
22001               node = page_node(mp, i);
22002             }
22003 
22004             MDBX_db db;
22005             memcpy(&db, node_data(node), sizeof(MDBX_db));
22006             rc = mdbx_env_cwalk(my, &db.md_root, node_flags(node) & F_DUPDATA);
22007             if (rc)
22008               goto done;
22009             memcpy(node_data(node), &db, sizeof(MDBX_db));
22010           }
22011         }
22012       }
22013     } else {
22014       couple.outer.mc_ki[couple.outer.mc_top]++;
22015       if (couple.outer.mc_ki[couple.outer.mc_top] < n) {
22016       again:
22017         rc = mdbx_page_get(
22018             &couple.outer,
22019             node_pgno(page_node(mp, couple.outer.mc_ki[couple.outer.mc_top])),
22020             &mp, pp_txnid4chk(mp, my->mc_txn));
22021         if (unlikely(rc != MDBX_SUCCESS))
22022           goto done;
22023         couple.outer.mc_top++;
22024         couple.outer.mc_snum++;
22025         couple.outer.mc_ki[couple.outer.mc_top] = 0;
22026         if (IS_BRANCH(mp)) {
22027           /* Whenever we advance to a sibling branch page,
22028            * we must proceed all the way down to its first leaf. */
22029           mdbx_page_copy(couple.outer.mc_pg[couple.outer.mc_top], mp,
22030                          my->mc_env->me_psize);
22031           goto again;
22032         } else
22033           couple.outer.mc_pg[couple.outer.mc_top] = mp;
22034         continue;
22035       }
22036     }
22037     unsigned toggle = my->mc_head & 1;
22038     if (my->mc_wlen[toggle] + my->mc_wlen[toggle] >
22039         ((size_t)(MDBX_ENVCOPY_WRITEBUF))) {
22040       rc = mdbx_env_cthr_toggle(my);
22041       if (unlikely(rc != MDBX_SUCCESS))
22042         goto done;
22043       toggle = my->mc_head & 1;
22044     }
22045     mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
22046     mdbx_page_copy(mo, mp, my->mc_env->me_psize);
22047     mo->mp_pgno = my->mc_next_pgno++;
22048     my->mc_wlen[toggle] += my->mc_env->me_psize;
22049     if (couple.outer.mc_top) {
22050       /* Update parent if there is one */
22051       node_set_pgno(page_node(couple.outer.mc_pg[couple.outer.mc_top - 1],
22052                               couple.outer.mc_ki[couple.outer.mc_top - 1]),
22053                     mo->mp_pgno);
22054       mdbx_cursor_pop(&couple.outer);
22055     } else {
22056       /* Otherwise we're done */
22057       *pg = mo->mp_pgno;
22058       break;
22059     }
22060   }
22061 done:
22062   mdbx_free(buf);
22063   return rc;
22064 }
22065 
compact_fixup_meta(MDBX_env * env,MDBX_meta * meta)22066 __cold static void compact_fixup_meta(MDBX_env *env, MDBX_meta *meta) {
22067   /* Calculate filesize taking in account shrink/growing thresholds */
22068   if (meta->mm_geo.next != meta->mm_geo.now) {
22069     meta->mm_geo.now = meta->mm_geo.next;
22070     const pgno_t aligner = pv2pages(
22071         meta->mm_geo.grow_pv ? meta->mm_geo.grow_pv : meta->mm_geo.shrink_pv);
22072     if (aligner) {
22073       const pgno_t aligned = pgno_align2os_pgno(
22074           env, meta->mm_geo.next + aligner - meta->mm_geo.next % aligner);
22075       meta->mm_geo.now = aligned;
22076     }
22077   }
22078 
22079   if (meta->mm_geo.now < meta->mm_geo.lower)
22080     meta->mm_geo.now = meta->mm_geo.lower;
22081   if (meta->mm_geo.now > meta->mm_geo.upper)
22082     meta->mm_geo.now = meta->mm_geo.upper;
22083 
22084   /* Update signature */
22085   assert(meta->mm_geo.now >= meta->mm_geo.next);
22086   unaligned_poke_u64(4, meta->mm_datasync_sign, mdbx_meta_sign(meta));
22087 }
22088 
22089 /* Make resizeable */
make_sizeable(MDBX_meta * meta)22090 __cold static void make_sizeable(MDBX_meta *meta) {
22091   meta->mm_geo.lower = MIN_PAGENO;
22092   if (meta->mm_geo.grow_pv == 0) {
22093     const pgno_t step = 1 + (meta->mm_geo.upper - meta->mm_geo.lower) / 42;
22094     meta->mm_geo.grow_pv = pages2pv(step);
22095   }
22096   if (meta->mm_geo.shrink_pv == 0) {
22097     const pgno_t step = pv2pages(meta->mm_geo.grow_pv) << 1;
22098     meta->mm_geo.shrink_pv = pages2pv(step);
22099   }
22100 }
22101 
/* Copy environment with compaction: rebuilds the B-tree so the destination
 * contains only used pages (no GC/free pages).
 *
 * [in] read_txn     read-only snapshot of the source environment.
 * [in] fd           destination file handle.
 * [in] buffer       page-aligned scratch: meta-page snapshot + write buffers.
 * [in] dest_is_pipe true if fd is a pipe (cannot seek/truncate).
 * [in] flags        MDBX_CP_* options. */
__cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn,
                                   mdbx_filehandle_t fd, uint8_t *buffer,
                                   const bool dest_is_pipe, const int flags) {
  const size_t meta_bytes = pgno2bytes(env, NUM_METAS);
  /* the data write-buffers follow the meta snapshot, OS-page aligned */
  uint8_t *const data_buffer =
      buffer + ceil_powerof2(meta_bytes, env->me_os_psize);
  MDBX_meta *const meta = mdbx_init_metas(env, buffer);
  mdbx_meta_set_txnid(env, meta, read_txn->mt_txnid);

  if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
    make_sizeable(meta);

  /* copy canary sequences if present */
  if (read_txn->mt_canary.v) {
    meta->mm_canary = read_txn->mt_canary;
    meta->mm_canary.v = mdbx_meta_txnid_stable(env, meta);
  }

  /* Set metapage 1 with current main DB */
  pgno_t new_root, root = read_txn->mt_dbs[MAIN_DBI].md_root;
  if ((new_root = root) == P_INVALID) {
    /* When the DB is empty, handle it specially to
     * fix any breakage like page leaks from ITS#8174. */
    meta->mm_dbs[MAIN_DBI].md_flags = read_txn->mt_dbs[MAIN_DBI].md_flags;
    compact_fixup_meta(env, meta);
    if (dest_is_pipe) {
      /* a pipe cannot be rewound, so the meta-pages must go out first */
      int rc = mdbx_write(fd, buffer, meta_bytes);
      if (rc != MDBX_SUCCESS)
        return rc;
    }
  } else {
    /* Count free pages + GC pages.  Subtract from last_pg
     * to find the new last_pg, which also becomes the new root. */
    pgno_t freecount = 0;
    MDBX_cursor_couple couple;
    MDBX_val key, data;

    int rc = mdbx_cursor_init(&couple.outer, read_txn, FREE_DBI);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    /* each GC record starts with the length of its page-number list */
    while ((rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) == 0)
      freecount += *(pgno_t *)data.iov_base;
    if (unlikely(rc != MDBX_NOTFOUND))
      return rc;

    /* the GC's own tree pages will also vanish from the copy */
    freecount += read_txn->mt_dbs[FREE_DBI].md_branch_pages +
                 read_txn->mt_dbs[FREE_DBI].md_leaf_pages +
                 read_txn->mt_dbs[FREE_DBI].md_overflow_pages;

    new_root = read_txn->mt_next_pgno - 1 - freecount;
    meta->mm_geo.next = new_root + 1;
    meta->mm_dbs[MAIN_DBI] = read_txn->mt_dbs[MAIN_DBI];
    meta->mm_dbs[MAIN_DBI].md_root = new_root;

    mdbx_copy ctx;
    memset(&ctx, 0, sizeof(ctx));
    rc = mdbx_condpair_init(&ctx.mc_condpair);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;

    memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF)) * 2);
    ctx.mc_wbuf[0] = data_buffer;
    ctx.mc_wbuf[1] = data_buffer + ((size_t)(MDBX_ENVCOPY_WRITEBUF));
    ctx.mc_next_pgno = NUM_METAS;
    ctx.mc_env = env;
    ctx.mc_fd = fd;
    ctx.mc_txn = read_txn;

    mdbx_thread_t thread;
    int thread_err = mdbx_thread_create(&thread, mdbx_env_copythr, &ctx);
    if (likely(thread_err == MDBX_SUCCESS)) {
      if (dest_is_pipe) {
        compact_fixup_meta(env, meta);
        rc = mdbx_write(fd, buffer, meta_bytes);
      }
      if (rc == MDBX_SUCCESS)
        rc = mdbx_env_cwalk(&ctx, &root, 0);
      /* two toggles: flush the partially-filled buffer, then publish an
       * empty one as the EOF mark for the writer thread */
      mdbx_env_cthr_toggle(&ctx);
      mdbx_env_cthr_toggle(&ctx);
      thread_err = mdbx_thread_join(thread);
      mdbx_assert(env, (ctx.mc_tail == ctx.mc_head &&
                        ctx.mc_wlen[ctx.mc_head & 1] == 0) ||
                           ctx.mc_error);
      mdbx_condpair_destroy(&ctx.mc_condpair);
    }
    /* error precedence: thread creation/join, then walk, then I/O thread */
    if (unlikely(thread_err != MDBX_SUCCESS))
      return thread_err;
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    if (unlikely(ctx.mc_error != MDBX_SUCCESS))
      return ctx.mc_error;

    if (dest_is_pipe) {
      /* meta already went out the pipe: any root mismatch is fatal now */
      if (unlikely(root != new_root)) {
        mdbx_error("post-compactification root %" PRIaPGNO
                   " NE expected %" PRIaPGNO
                   " (source DB corrupted or has a page leak(s))",
                   root, new_root);
        return MDBX_CORRUPTED; /* page leak or corrupt DB */
      }
    } else {
      if (unlikely(root > new_root)) {
        mdbx_error("post-compactification root %" PRIaPGNO
                   " GT expected %" PRIaPGNO " (source DB corrupted)",
                   root, new_root);
        return MDBX_CORRUPTED; /* page leak or corrupt DB */
      }
      if (unlikely(root < new_root)) {
        /* fewer pages than predicted: source leaked pages, but the copy
         * itself is coherent — adjust the meta to the actual root */
        mdbx_warning("post-compactification root %" PRIaPGNO
                     " LT expected %" PRIaPGNO " (page leak(s) in source DB)",
                     root, new_root);
        /* fixup meta */
        meta->mm_dbs[MAIN_DBI].md_root = root;
        meta->mm_geo.next = root + 1;
      }
      compact_fixup_meta(env, meta);
    }
  }

  /* Extend file if required */
  if (meta->mm_geo.now != meta->mm_geo.next) {
    const size_t whole_size = pgno2bytes(env, meta->mm_geo.now);
    if (!dest_is_pipe)
      return mdbx_ftruncate(fd, whole_size);

    /* a pipe cannot be truncated/extended, so pad with zero writes */
    const size_t used_size = pgno2bytes(env, meta->mm_geo.next);
    memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF)));
    for (size_t offset = used_size; offset < whole_size;) {
      const size_t chunk =
          (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < whole_size - offset)
              ? ((size_t)(MDBX_ENVCOPY_WRITEBUF))
              : whole_size - offset;
      /* copy to avoid EFAULT in case swapped-out */
      int rc = mdbx_write(fd, data_buffer, chunk);
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
      offset += chunk;
    }
  }
  return MDBX_SUCCESS;
}
22244 
/* Copy environment as-is: a byte-for-byte snapshot of the used portion of
 * the map, using sendfile/copy_file_range fast paths where available. */
__cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn,
                                     mdbx_filehandle_t fd, uint8_t *buffer,
                                     const bool dest_is_pipe, const int flags) {
  /* We must start the actual read txn after blocking writers */
  int rc = mdbx_txn_end(read_txn, MDBX_END_RESET_TMP);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* Temporarily block writers until we snapshot the meta pages */
  rc = mdbx_txn_lock(env, false);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  rc = mdbx_txn_renew0(read_txn, MDBX_TXN_RDONLY);
  if (unlikely(rc != MDBX_SUCCESS)) {
    mdbx_txn_unlock(env);
    return rc;
  }

  mdbx_jitter4testing(false);
  const size_t meta_bytes = pgno2bytes(env, NUM_METAS);
  /* Make a snapshot of meta-pages,
   * but writing ones after the data was flushed */
  memcpy(buffer, env->me_map, meta_bytes);
  MDBX_meta *const headcopy = /* LY: get pointer to the snapshot copy */
      (MDBX_meta *)(buffer + ((uint8_t *)mdbx_meta_head(env) - env->me_map));
  /* writers may resume: the copy reads only pages of our read snapshot */
  mdbx_txn_unlock(env);

  if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
    make_sizeable(headcopy);
  /* Update signature to steady */
  unaligned_poke_u64(4, headcopy->mm_datasync_sign, mdbx_meta_sign(headcopy));

  /* Copy the data */
  const size_t whole_size = pgno_align2os_bytes(env, read_txn->mt_end_pgno);
  const size_t used_size = pgno2bytes(env, read_txn->mt_next_pgno);
  mdbx_jitter4testing(false);

  if (dest_is_pipe)
    rc = mdbx_write(fd, buffer, meta_bytes);

  uint8_t *const data_buffer =
      buffer + ceil_powerof2(meta_bytes, env->me_os_psize);
  for (size_t offset = meta_bytes; rc == MDBX_SUCCESS && offset < used_size;) {
#if MDBX_USE_SENDFILE
    /* zero-copy fast path for pipes; disabled permanently after the
     * first ENOSYS-like failure */
    static bool sendfile_unavailable;
    if (dest_is_pipe && likely(!sendfile_unavailable)) {
      off_t in_offset = offset;
      const ssize_t written =
          sendfile(fd, env->me_lazy_fd, &in_offset, used_size - offset);
      if (likely(written > 0)) {
        offset = in_offset;
        continue;
      }
      rc = MDBX_ENODATA;
      if (written == 0 || ignore_enosys(rc = errno) != MDBX_RESULT_TRUE)
        break;
      sendfile_unavailable = true;
    }
#endif /* MDBX_USE_SENDFILE */

#if MDBX_USE_COPYFILERANGE
    /* in-kernel copy fast path for regular files; likewise disabled
     * permanently after the first ENOSYS-like failure */
    static bool copyfilerange_unavailable;
    if (!dest_is_pipe && likely(!copyfilerange_unavailable)) {
      off_t in_offset = offset, out_offset = offset;
      ssize_t bytes_copied = copy_file_range(
          env->me_lazy_fd, &in_offset, fd, &out_offset, used_size - offset, 0);
      if (likely(bytes_copied > 0)) {
        offset = in_offset;
        continue;
      }
      rc = MDBX_ENODATA;
      if (bytes_copied == 0 || ignore_enosys(rc = errno) != MDBX_RESULT_TRUE)
        break;
      copyfilerange_unavailable = true;
    }
#endif /* MDBX_USE_COPYFILERANGE */

    /* fallback to portable */
    const size_t chunk =
        (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < used_size - offset)
            ? ((size_t)(MDBX_ENVCOPY_WRITEBUF))
            : used_size - offset;
    /* copy to avoid EFAULT in case swapped-out */
    memcpy(data_buffer, env->me_map + offset, chunk);
    rc = mdbx_write(fd, data_buffer, chunk);
    offset += chunk;
  }

  /* Extend file if required */
  if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) {
    if (!dest_is_pipe)
      rc = mdbx_ftruncate(fd, whole_size);
    else {
      /* a pipe cannot be truncated/extended, so pad with zero writes */
      memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF)));
      for (size_t offset = used_size;
           rc == MDBX_SUCCESS && offset < whole_size;) {
        const size_t chunk =
            (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < whole_size - offset)
                ? ((size_t)(MDBX_ENVCOPY_WRITEBUF))
                : whole_size - offset;
        /* copy to avoid EFAULT in case swapped-out */
        rc = mdbx_write(fd, data_buffer, chunk);
        offset += chunk;
      }
    }
  }

  return rc;
}
22356 
/* Copy an open environment to the given file handle, either as-is or
 * with compaction (MDBX_CP_COMPACT). For a seekable destination, stub
 * meta-pages are written first and the real ones are written (and synced)
 * only after the data copy succeeded, so an interrupted copy is never
 * mistaken for a valid database. */
__cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd,
                            unsigned flags) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  const int dest_is_pipe = mdbx_is_pipe(fd);
  if (MDBX_IS_ERROR(dest_is_pipe))
    return dest_is_pipe;

  if (!dest_is_pipe) {
    rc = mdbx_fseek(fd, 0);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }

  /* room for the meta snapshot plus one write buffer
   * (two buffers for the double-buffered compacting copy) */
  const size_t buffer_size =
      pgno_align2os_bytes(env, NUM_METAS) +
      ceil_powerof2(((flags & MDBX_CP_COMPACT)
                         ? ((size_t)(MDBX_ENVCOPY_WRITEBUF)) * 2
                         : ((size_t)(MDBX_ENVCOPY_WRITEBUF))),
                    env->me_os_psize);

  uint8_t *buffer = NULL;
  rc = mdbx_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  MDBX_txn *read_txn = NULL;
  /* Do the lock/unlock of the reader mutex before starting the
   * write txn. Otherwise other read txns could block writers. */
  rc = mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &read_txn);
  if (unlikely(rc != MDBX_SUCCESS)) {
    mdbx_memalign_free(buffer);
    return rc;
  }

  if (!dest_is_pipe) {
    /* Firstly write a stub to meta-pages.
     * Now we sure to incomplete copy will not be used. */
    memset(buffer, -1, pgno2bytes(env, NUM_METAS));
    rc = mdbx_write(fd, buffer, pgno2bytes(env, NUM_METAS));
  }

  if (likely(rc == MDBX_SUCCESS)) {
    memset(buffer, 0, pgno2bytes(env, NUM_METAS));
    rc = ((flags & MDBX_CP_COMPACT) ? mdbx_env_compact : mdbx_env_copy_asis)(
        env, read_txn, fd, buffer, dest_is_pipe, flags);
  }
  mdbx_txn_abort(read_txn);

  if (!dest_is_pipe) {
    /* sync the data, then overwrite the stub with the actual meta-pages,
     * then sync again so the destination becomes valid atomically */
    if (likely(rc == MDBX_SUCCESS))
      rc = mdbx_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE);

    /* Write actual meta */
    if (likely(rc == MDBX_SUCCESS))
      rc = mdbx_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0);

    if (likely(rc == MDBX_SUCCESS))
      rc = mdbx_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
  }

  mdbx_memalign_free(buffer);
  return rc;
}
22423 
/* Copy an environment to a new file at dest_path: creates and exclusively
 * locks the destination, delegates to mdbx_env_copy2fd(), and removes the
 * destination file on any failure. */
__cold int mdbx_env_copy(MDBX_env *env, const char *dest_path,
                         MDBX_copy_flags_t flags) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!dest_path))
    return MDBX_EINVAL;

  /* The destination path must exist, but the destination file must not.
   * We don't want the OS to cache the writes, since the source data is
   * already in the OS cache. */
  mdbx_filehandle_t newfd;
  rc = mdbx_openfile(MDBX_OPEN_COPY, env, dest_path, &newfd,
#if defined(_WIN32) || defined(_WIN64)
                     (mdbx_mode_t)-1
#else
                     S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP
#endif
  );

  if (rc == MDBX_SUCCESS) {
    /* take an exclusive lock on the new file so nobody can open the
     * half-written copy as a database meanwhile */
#if defined(_WIN32) || defined(_WIN64)
    OVERLAPPED ov;
    memset(&ov, 0, sizeof(ov));
    if (!LockFileEx(newfd, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY,
                    0, 0, INT32_MAX, &ov))
      rc = GetLastError();
#else
    struct flock lock_op;
    memset(&lock_op, 0, sizeof(lock_op));
    lock_op.l_type = F_WRLCK;
    lock_op.l_whence = SEEK_SET;
    lock_op.l_start = 0;
    /* lock a huge region, rounded down to keep the tail unlocked
     * (mirrors the locking scheme used for regular databases) */
    lock_op.l_len =
        (sizeof(lock_op.l_len) > 4 ? INT64_MAX : INT32_MAX) & ~(size_t)0xffff;
    if (fcntl(newfd, F_SETLK, &lock_op)
#if (defined(__linux__) || defined(__gnu_linux__)) && defined(LOCK_EX) &&      \
    (!defined(__ANDROID_API__) || __ANDROID_API__ >= 24)
        || flock(newfd, LOCK_EX | LOCK_NB)
#endif /* Linux */
    )
      rc = errno;
#endif /* Windows / POSIX */
  }

  if (rc == MDBX_SUCCESS)
    rc = mdbx_env_copy2fd(env, newfd, flags);

  if (newfd != INVALID_HANDLE_VALUE) {
    int err = mdbx_closefile(newfd);
    if (rc == MDBX_SUCCESS && err != rc)
      rc = err;
    /* don't leave a partial/locked file behind on failure */
    if (rc != MDBX_SUCCESS)
      (void)mdbx_removefile(dest_path);
  }

  return rc;
}
22483 
22484 /******************************************************************************/
22485 
mdbx_env_set_flags(MDBX_env * env,MDBX_env_flags_t flags,bool onoff)22486 __cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags,
22487                               bool onoff) {
22488   int rc = check_env(env, false);
22489   if (unlikely(rc != MDBX_SUCCESS))
22490     return rc;
22491 
22492   if (unlikely(flags &
22493                ((env->me_flags & MDBX_ENV_ACTIVE) ? ~ENV_CHANGEABLE_FLAGS
22494                                                   : ~ENV_USABLE_FLAGS)))
22495     return MDBX_EPERM;
22496 
22497   if (unlikely(env->me_flags & MDBX_RDONLY))
22498     return MDBX_EACCESS;
22499 
22500   if ((env->me_flags & MDBX_ENV_ACTIVE) &&
22501       unlikely(env->me_txn0->mt_owner == mdbx_thread_self()))
22502     return MDBX_BUSY;
22503 
22504   const bool lock_needed = (env->me_flags & MDBX_ENV_ACTIVE) &&
22505                            env->me_txn0->mt_owner != mdbx_thread_self();
22506   bool should_unlock = false;
22507   if (lock_needed) {
22508     rc = mdbx_txn_lock(env, false);
22509     if (unlikely(rc))
22510       return rc;
22511     should_unlock = true;
22512   }
22513 
22514   if (onoff)
22515     env->me_flags = merge_sync_flags(env->me_flags, flags);
22516   else
22517     env->me_flags &= ~flags;
22518 
22519   if (should_unlock)
22520     mdbx_txn_unlock(env);
22521   return MDBX_SUCCESS;
22522 }
22523 
mdbx_env_get_flags(const MDBX_env * env,unsigned * arg)22524 __cold int mdbx_env_get_flags(const MDBX_env *env, unsigned *arg) {
22525   int rc = check_env(env, false);
22526   if (unlikely(rc != MDBX_SUCCESS))
22527     return rc;
22528 
22529   if (unlikely(!arg))
22530     return MDBX_EINVAL;
22531 
22532   *arg = env->me_flags & ENV_USABLE_FLAGS;
22533   return MDBX_SUCCESS;
22534 }
22535 
mdbx_env_set_userctx(MDBX_env * env,void * ctx)22536 __cold int mdbx_env_set_userctx(MDBX_env *env, void *ctx) {
22537   int rc = check_env(env, false);
22538   if (unlikely(rc != MDBX_SUCCESS))
22539     return rc;
22540 
22541   env->me_userctx = ctx;
22542   return MDBX_SUCCESS;
22543 }
22544 
mdbx_env_get_userctx(const MDBX_env * env)22545 __cold void *mdbx_env_get_userctx(const MDBX_env *env) {
22546   return env ? env->me_userctx : NULL;
22547 }
22548 
mdbx_env_set_assert(MDBX_env * env,MDBX_assert_func * func)22549 __cold int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) {
22550   int rc = check_env(env, false);
22551   if (unlikely(rc != MDBX_SUCCESS))
22552     return rc;
22553 
22554 #if MDBX_DEBUG
22555   env->me_assert_func = func;
22556   return MDBX_SUCCESS;
22557 #else
22558   (void)func;
22559   return MDBX_ENOSYS;
22560 #endif
22561 }
22562 
mdbx_env_get_path(const MDBX_env * env,const char ** arg)22563 __cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) {
22564   int rc = check_env(env, true);
22565   if (unlikely(rc != MDBX_SUCCESS))
22566     return rc;
22567 
22568   if (unlikely(!arg))
22569     return MDBX_EINVAL;
22570 
22571   *arg = env->me_pathname;
22572   return MDBX_SUCCESS;
22573 }
22574 
mdbx_env_get_fd(const MDBX_env * env,mdbx_filehandle_t * arg)22575 __cold int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) {
22576   int rc = check_env(env, true);
22577   if (unlikely(rc != MDBX_SUCCESS))
22578     return rc;
22579 
22580   if (unlikely(!arg))
22581     return MDBX_EINVAL;
22582 
22583   *arg = env->me_lazy_fd;
22584   return MDBX_SUCCESS;
22585 }
22586 
#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Legacy ABI shim: forwards to the inline implementation. */
__cold int mdbx_env_stat(const MDBX_env *env, MDBX_stat *stat, size_t bytes) {
  return __inline_mdbx_env_stat(env, stat, bytes);
}
#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
22592 
/* Export the counters of a single sub-database into *st.
 * ms_mod_txnid is filled only when the caller's structure (as declared
 * by `bytes`) is large enough to contain that extended field. */
static void stat_get(const MDBX_db *db, MDBX_stat *st, size_t bytes) {
  const size_t with_mod_txnid =
      offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid);
  st->ms_entries = db->md_entries;
  st->ms_depth = db->md_depth;
  st->ms_branch_pages = db->md_branch_pages;
  st->ms_leaf_pages = db->md_leaf_pages;
  st->ms_overflow_pages = db->md_overflow_pages;
  if (likely(bytes >= with_mod_txnid))
    st->ms_mod_txnid = db->md_mod_txnid;
}
22603 
stat_add(const MDBX_db * db,MDBX_stat * const st,const size_t bytes)22604 static void stat_add(const MDBX_db *db, MDBX_stat *const st,
22605                      const size_t bytes) {
22606   st->ms_depth += db->md_depth;
22607   st->ms_branch_pages += db->md_branch_pages;
22608   st->ms_leaf_pages += db->md_leaf_pages;
22609   st->ms_overflow_pages += db->md_overflow_pages;
22610   st->ms_entries += db->md_entries;
22611   if (likely(bytes >=
22612              offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid)))
22613     st->ms_mod_txnid = (st->ms_mod_txnid > db->md_mod_txnid) ? st->ms_mod_txnid
22614                                                              : db->md_mod_txnid;
22615 }
22616 
/* Accumulate whole-environment statistics into *st: the main DB, every
 * opened named sub-DB, and — by scanning the main tree — named sub-DBs
 * that are not currently opened as handles. */
__cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) {
  int err = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  st->ms_psize = txn->mt_env->me_psize;
#if 1
  /* assuming GC is internal and not subject for accounting */
  stat_get(&txn->mt_dbs[MAIN_DBI], st, bytes);
#else
  stat_get(&txn->mt_dbs[FREE_DBI], st, bytes);
  stat_add(&txn->mt_dbs[MAIN_DBI], st, bytes);
#endif

  /* account opened named subDBs */
  for (MDBX_dbi dbi = CORE_DBS; dbi < txn->mt_numdbs; dbi++)
    if ((txn->mt_dbistate[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID)
      stat_add(txn->mt_dbs + dbi, st, bytes);

  /* a main DB with DUPSORT/INTEGERKEY cannot contain named sub-DBs;
   * an empty main DB has nothing to scan */
  if (!(txn->mt_dbs[MAIN_DBI].md_flags & (MDBX_DUPSORT | MDBX_INTEGERKEY)) &&
      txn->mt_dbs[MAIN_DBI].md_entries /* TODO: use `md_subs` field */) {
    MDBX_cursor_couple cx;
    err = mdbx_cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI);
    if (unlikely(err != MDBX_SUCCESS))
      return err;

    /* scan and account not opened named subDBs */
    err = mdbx_page_search(&cx.outer, NULL, MDBX_PS_FIRST);
    while (err == MDBX_SUCCESS) {
      const MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top];
      for (unsigned i = 0; i < page_numkeys(mp); i++) {
        const MDBX_node *node = page_node(mp, i);
        if (node_flags(node) != F_SUBDATA)
          continue;
        if (unlikely(node_ds(node) != sizeof(MDBX_db)))
          return MDBX_CORRUPTED;

        /* skip opened and already accounted */
        for (MDBX_dbi dbi = CORE_DBS; dbi < txn->mt_numdbs; dbi++)
          if ((txn->mt_dbistate[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID &&
              node_ks(node) == txn->mt_dbxs[dbi].md_name.iov_len &&
              memcmp(node_key(node), txn->mt_dbxs[dbi].md_name.iov_base,
                     node_ks(node)) == 0) {
            node = NULL; /* mark as already counted via the opened handle */
            break;
          }

        if (node) {
          MDBX_db db;
          memcpy(&db, node_data(node), sizeof(db));
          stat_add(&db, st, bytes);
        }
      }
      err = mdbx_cursor_sibling(&cx.outer, SIBLING_RIGHT);
    }
    /* MDBX_NOTFOUND is the normal end-of-scan condition */
    if (unlikely(err != MDBX_NOTFOUND))
      return err;
  }

  return MDBX_SUCCESS;
}
22678 
/* Public API: fills `dest` with aggregated statistics of the whole
 * environment (MAIN table plus all named subDBs).
 *
 * Either `txn` or `env` may be given: with a txn the stats come from
 * its snapshot; with only an env the current write-txn is used when
 * this thread owns it, otherwise a temporary read-only txn is begun
 * and aborted around the accounting. */
__cold int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn,
                            MDBX_stat *dest, size_t bytes) {
  if (unlikely(!dest))
    return MDBX_EINVAL;
  /* accept both the legacy layout (without ms_mod_txnid) and the full one */
  const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid);
  if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid)
    return MDBX_EINVAL;

  if (likely(txn)) {
    if (env && unlikely(txn->mt_env != env))
      return MDBX_EINVAL;
    return stat_acc(txn, dest, bytes);
  }

  int err = check_env(env, true);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  if (env->me_txn0 && env->me_txn0->mt_owner == mdbx_thread_self())
    /* inside write-txn */
    return stat_acc(env->me_txn, dest, bytes);

  MDBX_txn *tmp_txn;
  err = mdbx_txn_begin((MDBX_env *)env, NULL, MDBX_TXN_RDONLY, &tmp_txn);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  /* propagate an abort failure in preference to the accounting result */
  const int rc = stat_acc(tmp_txn, dest, bytes);
  err = mdbx_txn_abort(tmp_txn);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  return rc;
}
22712 
/* Public API: builds a bitmask of dup-chain depths over all keys of a
 * MDBX_DUPSORT table (bit N set means at least one key stores its
 * duplicates at depth N: 0 = single value, 1 = in-page sub-page,
 * >=2 = nested sub-tree of that depth).
 *
 * Returns MDBX_RESULT_TRUE when the table is not MDBX_DUPSORT.
 * NOTE(review): in that case *mask is left untouched — confirm callers
 * expect this. */
__cold int mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi,
                                      uint32_t *mask) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!mask))
    return MDBX_EINVAL;

  if (unlikely(!check_dbi(txn, dbi, DBI_VALID)))
    return MDBX_BAD_DBI;

  MDBX_cursor_couple cx;
  rc = mdbx_cursor_init(&cx.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if ((cx.outer.mc_db->md_flags & MDBX_DUPSORT) == 0)
    return MDBX_RESULT_TRUE;

  /* walk distinct keys only (MDBX_NEXT_NODUP) and classify how each
   * key's duplicates are stored */
  MDBX_val key, data;
  rc = mdbx_cursor_first(&cx.outer, &key, &data);
  *mask = 0;
  while (rc == MDBX_SUCCESS) {
    const MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top],
                                      cx.outer.mc_ki[cx.outer.mc_top]);
    const MDBX_db *db = node_data(node);
    const unsigned flags = node_flags(node);
    switch (flags) {
    case F_BIGDATA:
    case 0:
      /* single-value entry, deep = 0 */
      *mask |= 1 << 0;
      break;
    case F_DUPDATA:
      /* single sub-page, deep = 1 */
      *mask |= 1 << 1;
      break;
    case F_DUPDATA | F_SUBDATA:
      /* sub-tree */
      *mask |= 1 << UNALIGNED_PEEK_16(db, MDBX_db, md_depth);
      break;
    default:
      mdbx_error("wrong node-flags %u", flags);
      return MDBX_CORRUPTED;
    }
    rc = mdbx_cursor_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP);
  }

  /* MDBX_NOTFOUND is the normal end-of-iteration indicator */
  return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc;
}
22763 
22764 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Legacy ABI shim: forwards to the inline implementation, which calls
 * mdbx_env_info_ex() without a transaction. */
__cold int mdbx_env_info(const MDBX_env *env, MDBX_envinfo *info,
                         size_t bytes) {
  return __inline_mdbx_env_info(env, info, bytes);
}
22769 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
22770 
fetch_envinfo_ex(const MDBX_env * env,const MDBX_txn * txn,MDBX_envinfo * arg,const size_t bytes)22771 __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn,
22772                                    MDBX_envinfo *arg, const size_t bytes) {
22773 
22774   const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid);
22775   const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat);
22776 
22777   /* is the environment open? (https://github.com/erthink/libmdbx/issues/171) */
22778   if (unlikely(!env->me_map)) {
22779     /* environment not yet opened */
22780 #if 1
22781     /* default behavior: returns the available info but zeroed the rest */
22782     memset(arg, 0, bytes);
22783     arg->mi_geo.lower = env->me_dbgeo.lower;
22784     arg->mi_geo.upper = env->me_dbgeo.upper;
22785     arg->mi_geo.shrink = env->me_dbgeo.shrink;
22786     arg->mi_geo.grow = env->me_dbgeo.grow;
22787     arg->mi_geo.current = env->me_dbgeo.now;
22788     arg->mi_maxreaders = env->me_maxreaders;
22789     arg->mi_dxb_pagesize = env->me_psize;
22790     arg->mi_sys_pagesize = env->me_os_psize;
22791     if (likely(bytes > size_before_bootid)) {
22792       arg->mi_bootid.current.x = bootid.x;
22793       arg->mi_bootid.current.y = bootid.y;
22794     }
22795     return MDBX_SUCCESS;
22796 #else
22797     /* some users may prefer this behavior: return appropriate error */
22798     return MDBX_EPERM;
22799 #endif
22800   }
22801 
22802   const MDBX_meta *const meta0 = METAPAGE(env, 0);
22803   const MDBX_meta *const meta1 = METAPAGE(env, 1);
22804   const MDBX_meta *const meta2 = METAPAGE(env, 2);
22805   if (unlikely(env->me_flags & MDBX_FATAL_ERROR))
22806     return MDBX_PANIC;
22807 
22808   const MDBX_meta *const recent_meta = mdbx_meta_head(env);
22809   arg->mi_recent_txnid = mdbx_meta_txnid_fluid(env, recent_meta);
22810   arg->mi_meta0_txnid = mdbx_meta_txnid_fluid(env, meta0);
22811   arg->mi_meta0_sign = unaligned_peek_u64(4, meta0->mm_datasync_sign);
22812   arg->mi_meta1_txnid = mdbx_meta_txnid_fluid(env, meta1);
22813   arg->mi_meta1_sign = unaligned_peek_u64(4, meta1->mm_datasync_sign);
22814   arg->mi_meta2_txnid = mdbx_meta_txnid_fluid(env, meta2);
22815   arg->mi_meta2_sign = unaligned_peek_u64(4, meta2->mm_datasync_sign);
22816   if (likely(bytes > size_before_bootid)) {
22817     memcpy(&arg->mi_bootid.meta0, &meta0->mm_bootid, 16);
22818     memcpy(&arg->mi_bootid.meta1, &meta1->mm_bootid, 16);
22819     memcpy(&arg->mi_bootid.meta2, &meta2->mm_bootid, 16);
22820   }
22821 
22822   const MDBX_meta *txn_meta = recent_meta;
22823   arg->mi_last_pgno = txn_meta->mm_geo.next - 1;
22824   arg->mi_geo.current = pgno2bytes(env, txn_meta->mm_geo.now);
22825   if (txn) {
22826     arg->mi_last_pgno = txn->mt_next_pgno - 1;
22827     arg->mi_geo.current = pgno2bytes(env, txn->mt_end_pgno);
22828 
22829     const txnid_t wanna_meta_txnid = (txn->mt_flags & MDBX_TXN_RDONLY)
22830                                          ? txn->mt_txnid
22831                                          : txn->mt_txnid - xMDBX_TXNID_STEP;
22832     txn_meta = (arg->mi_meta0_txnid == wanna_meta_txnid) ? meta0 : txn_meta;
22833     txn_meta = (arg->mi_meta1_txnid == wanna_meta_txnid) ? meta1 : txn_meta;
22834     txn_meta = (arg->mi_meta2_txnid == wanna_meta_txnid) ? meta2 : txn_meta;
22835   }
22836   arg->mi_geo.lower = pgno2bytes(env, txn_meta->mm_geo.lower);
22837   arg->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper);
22838   arg->mi_geo.shrink = pgno2bytes(env, pv2pages(txn_meta->mm_geo.shrink_pv));
22839   arg->mi_geo.grow = pgno2bytes(env, pv2pages(txn_meta->mm_geo.grow_pv));
22840   const pgno_t unsynced_pages =
22841       atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) +
22842       (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) !=
22843        (uint32_t)arg->mi_last_pgno);
22844 
22845   arg->mi_mapsize = env->me_dxb_mmap.limit;
22846 
22847   const MDBX_lockinfo *const lck = env->me_lck;
22848   arg->mi_maxreaders = env->me_maxreaders;
22849   arg->mi_numreaders = env->me_lck_mmap.lck
22850                            ? atomic_load32(&lck->mti_numreaders, mo_Relaxed)
22851                            : INT32_MAX;
22852   arg->mi_dxb_pagesize = env->me_psize;
22853   arg->mi_sys_pagesize = env->me_os_psize;
22854 
22855   if (likely(bytes > size_before_bootid)) {
22856     arg->mi_unsync_volume = pgno2bytes(env, unsynced_pages);
22857     const uint64_t monotime_now = mdbx_osal_monotime();
22858     uint64_t ts = atomic_load64(&lck->mti_sync_timestamp, mo_Relaxed);
22859     arg->mi_since_sync_seconds16dot16 =
22860         ts ? mdbx_osal_monotime_to_16dot16(monotime_now - ts) : 0;
22861     ts = atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed);
22862     arg->mi_since_reader_check_seconds16dot16 =
22863         ts ? mdbx_osal_monotime_to_16dot16(monotime_now - ts) : 0;
22864     arg->mi_autosync_threshold = pgno2bytes(
22865         env, atomic_load32(&lck->mti_autosync_threshold, mo_Relaxed));
22866     arg->mi_autosync_period_seconds16dot16 = mdbx_osal_monotime_to_16dot16(
22867         atomic_load64(&lck->mti_autosync_period, mo_Relaxed));
22868     arg->mi_bootid.current.x = bootid.x;
22869     arg->mi_bootid.current.y = bootid.y;
22870     arg->mi_mode = env->me_lck_mmap.lck ? lck->mti_envmode.weak : env->me_flags;
22871   }
22872 
22873   if (likely(bytes > size_before_pgop_stat)) {
22874 #if MDBX_ENABLE_PGOP_STAT
22875     arg->mi_pgop_stat.newly =
22876         atomic_load64(&lck->mti_pgop_stat.newly, mo_Relaxed);
22877     arg->mi_pgop_stat.cow = atomic_load64(&lck->mti_pgop_stat.cow, mo_Relaxed);
22878     arg->mi_pgop_stat.clone =
22879         atomic_load64(&lck->mti_pgop_stat.clone, mo_Relaxed);
22880     arg->mi_pgop_stat.split =
22881         atomic_load64(&lck->mti_pgop_stat.split, mo_Relaxed);
22882     arg->mi_pgop_stat.merge =
22883         atomic_load64(&lck->mti_pgop_stat.merge, mo_Relaxed);
22884     arg->mi_pgop_stat.spill =
22885         atomic_load64(&lck->mti_pgop_stat.spill, mo_Relaxed);
22886     arg->mi_pgop_stat.unspill =
22887         atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed);
22888     arg->mi_pgop_stat.wops =
22889         atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed);
22890 #else
22891     memset(&arg->mi_pgop_stat, 0, sizeof(arg->mi_pgop_stat));
22892 #endif /* MDBX_ENABLE_PGOP_STAT*/
22893   }
22894 
22895   arg->mi_self_latter_reader_txnid = arg->mi_latter_reader_txnid = 0;
22896   if (lck) {
22897     arg->mi_self_latter_reader_txnid = arg->mi_latter_reader_txnid =
22898         arg->mi_recent_txnid;
22899     for (unsigned i = 0; i < arg->mi_numreaders; ++i) {
22900       const uint32_t pid =
22901           atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease);
22902       if (pid) {
22903         const txnid_t txnid = safe64_read(&lck->mti_readers[i].mr_txnid);
22904         if (arg->mi_latter_reader_txnid > txnid)
22905           arg->mi_latter_reader_txnid = txnid;
22906         if (pid == env->me_pid && arg->mi_self_latter_reader_txnid > txnid)
22907           arg->mi_self_latter_reader_txnid = txnid;
22908       }
22909     }
22910   }
22911 
22912   mdbx_compiler_barrier();
22913   return MDBX_SUCCESS;
22914 }
22915 
/* Public API: fills `arg` with environment information.
 *
 * fetch_envinfo_ex() reads lock-free and may observe a torn state, so
 * snapshots are taken repeatedly until two consecutive ones compare
 * bit-identical, which guarantees a consistent result. */
__cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn,
                            MDBX_envinfo *arg, size_t bytes) {
  if (unlikely((env == NULL && txn == NULL) || arg == NULL))
    return MDBX_EINVAL;

  if (txn) {
    int err = check_txn(txn, MDBX_TXN_BLOCKED);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
  }
  if (env) {
    int err = check_env(env, false);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
    if (txn && unlikely(txn->mt_env != env))
      return MDBX_EINVAL;
  } else {
    /* derive the environment from the given transaction */
    env = txn->mt_env;
  }

  /* accept the current layout and the two historical shorter ones */
  const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid);
  const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat);
  if (unlikely(bytes != sizeof(MDBX_envinfo)) && bytes != size_before_bootid &&
      bytes != size_before_pgop_stat)
    return MDBX_EINVAL;

  MDBX_envinfo snap;
  int rc = fetch_envinfo_ex(env, txn, &snap, sizeof(snap));
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  while (1) {
    rc = fetch_envinfo_ex(env, txn, arg, bytes);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    /* done as soon as two consecutive snapshots match */
    if (likely(memcmp(&snap, arg, bytes) == 0))
      return MDBX_SUCCESS;
    memcpy(&snap, arg, bytes);
  }
}
22956 
get_default_keycmp(unsigned flags)22957 static __inline MDBX_cmp_func *get_default_keycmp(unsigned flags) {
22958   return (flags & MDBX_REVERSEKEY)   ? cmp_reverse
22959          : (flags & MDBX_INTEGERKEY) ? cmp_int_align2
22960                                      : cmp_lexical;
22961 }
22962 
get_default_datacmp(unsigned flags)22963 static __inline MDBX_cmp_func *get_default_datacmp(unsigned flags) {
22964   return !(flags & MDBX_DUPSORT)
22965              ? cmp_lenfast
22966              : ((flags & MDBX_INTEGERDUP)
22967                     ? cmp_int_unaligned
22968                     : ((flags & MDBX_REVERSEDUP) ? cmp_reverse : cmp_lexical));
22969 }
22970 
/* Binds a DBI slot to the requested persistent flags and comparators.
 *
 * \returns MDBX_SUCCESS; MDBX_INCOMPATIBLE when the requested flags
 * conflict with the stored ones; MDBX_EACCESS for a re-create attempt
 * inside a read-only txn; MDBX_EINVAL when a different comparator was
 * already bound to the slot. */
static int mdbx_dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags,
                         MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) {
  /* LY: so, accepting only three cases for the table's flags:
   * 1) user_flags and both comparators are zero
   *    = assume that a by-default mode/flags is requested for reading;
   * 2) user_flags exactly the same
   *    = assume that the target mode/flags are requested properly;
   * 3) user_flags differs, but table is empty and MDBX_CREATE is provided
   *    = assume that a properly create request with custom flags;
   */
  if ((user_flags ^ txn->mt_dbs[dbi].md_flags) & DB_PERSISTENT_FLAGS) {
    /* flags are differs, check other conditions */
    if ((!user_flags && (!keycmp || keycmp == txn->mt_dbxs[dbi].md_cmp) &&
         (!datacmp || datacmp == txn->mt_dbxs[dbi].md_dcmp)) ||
        user_flags == MDBX_ACCEDE) {
      /* no comparators were provided and flags are zero,
       * seems that is case #1 above */
      user_flags = txn->mt_dbs[dbi].md_flags;
    } else if ((user_flags & MDBX_CREATE) && txn->mt_dbs[dbi].md_entries == 0) {
      if (txn->mt_flags & MDBX_TXN_RDONLY)
        return /* FIXME: return extended info */ MDBX_EACCESS;
      /* make sure flags changes get committed */
      txn->mt_dbs[dbi].md_flags = user_flags & DB_PERSISTENT_FLAGS;
      txn->mt_flags |= MDBX_TXN_DIRTY;
    } else {
      return /* FIXME: return extended info */ MDBX_INCOMPATIBLE;
    }
  }

  /* bind the key comparator; an already-bound different one is an error */
  if (!keycmp)
    keycmp = txn->mt_dbxs[dbi].md_cmp ? txn->mt_dbxs[dbi].md_cmp
                                      : get_default_keycmp(user_flags);
  if (txn->mt_dbxs[dbi].md_cmp != keycmp) {
    if (txn->mt_dbxs[dbi].md_cmp)
      return MDBX_EINVAL;
    txn->mt_dbxs[dbi].md_cmp = keycmp;
  }

  /* bind the data comparator likewise */
  if (!datacmp)
    datacmp = txn->mt_dbxs[dbi].md_dcmp ? txn->mt_dbxs[dbi].md_dcmp
                                        : get_default_datacmp(user_flags);
  if (txn->mt_dbxs[dbi].md_dcmp != datacmp) {
    if (txn->mt_dbxs[dbi].md_dcmp)
      return MDBX_EINVAL;
    txn->mt_dbxs[dbi].md_dcmp = datacmp;
  }

  return MDBX_SUCCESS;
}
23020 
/* Opens (or creates) a table and binds its comparators.
 *
 * Handles, in order: the MAIN table (table_name == NULL); a slot already
 * open in this txn; after taking me_dbi_lock and importing handles from
 * the environment, a re-scan; and finally lookup/creation of the
 * sub-database record inside the MAIN table.
 *
 * On success *dbi receives the handle; on failure it is zeroed (via the
 * bailout paths). */
static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags,
                    MDBX_dbi *dbi, MDBX_cmp_func *keycmp,
                    MDBX_cmp_func *datacmp) {
  int rc = MDBX_EINVAL;
  if (unlikely(!dbi))
    return rc;

  if (unlikely((user_flags & ~DB_USABLE_FLAGS) != 0)) {
  early_bailout:
    *dbi = 0;
    return rc;
  }

  rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    goto early_bailout;

  /* validate the combination of dupsort-related flags: only the listed
   * combinations are meaningful, and MDBX_ACCEDE excludes MDBX_CREATE */
  switch (user_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED | MDBX_DUPSORT |
                        MDBX_REVERSEDUP | MDBX_ACCEDE)) {
  case MDBX_ACCEDE:
    if ((user_flags & MDBX_CREATE) == 0)
      break;
    __fallthrough /* fall through */;
  default:
    rc = MDBX_EINVAL;
    goto early_bailout;

  case MDBX_DUPSORT:
  case MDBX_DUPSORT | MDBX_REVERSEDUP:
  case MDBX_DUPSORT | MDBX_DUPFIXED:
  case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP:
  case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP:
  case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP:
  case 0:
    break;
  }

  /* main table? */
  if (!table_name) {
    rc = mdbx_dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp);
    if (unlikely(rc != MDBX_SUCCESS))
      goto early_bailout;
    *dbi = MAIN_DBI;
    return rc;
  }

  MDBX_env *env = txn->mt_env;
  size_t len = strlen(table_name);
  /* the name plus the MDBX_db record must fit into a single leaf node
   * NOTE(review): this path returns without zeroing *dbi, unlike
   * early_bailout — confirm the asymmetry is intended */
  if (len > env->me_leaf_nodemax - NODESIZE - sizeof(MDBX_db))
    return MDBX_EINVAL;

  /* lazily install default comparators for the MAIN table */
  if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) {
    txn->mt_dbxs[MAIN_DBI].md_cmp =
        get_default_keycmp(txn->mt_dbs[MAIN_DBI].md_flags);
    txn->mt_dbxs[MAIN_DBI].md_dcmp =
        get_default_datacmp(txn->mt_dbs[MAIN_DBI].md_flags);
  }

  /* Is the DB already open? */
  MDBX_dbi scan, slot;
  for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) {
    if (!txn->mt_dbxs[scan].md_name.iov_len) {
      /* Remember this free slot */
      slot = scan;
      continue;
    }
    if (len == txn->mt_dbxs[scan].md_name.iov_len &&
        !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) {
      rc = mdbx_dbi_bind(txn, scan, user_flags, keycmp, datacmp);
      if (unlikely(rc != MDBX_SUCCESS))
        goto early_bailout;
      *dbi = scan;
      return rc;
    }
  }

  /* Fail, if no free slot and max hit */
  if (unlikely(slot >= env->me_maxdbs)) {
    rc = MDBX_DBS_FULL;
    goto early_bailout;
  }

  /* Cannot mix named table with some main-table flags */
  if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags &
               (MDBX_DUPSORT | MDBX_INTEGERKEY))) {
    rc = (user_flags & MDBX_CREATE) ? MDBX_INCOMPATIBLE : MDBX_NOTFOUND;
    goto early_bailout;
  }

  /* Find the DB info */
  MDBX_val key, data;
  key.iov_len = len;
  key.iov_base = (void *)table_name;
  MDBX_cursor_couple couple;
  rc = mdbx_cursor_init(&couple.outer, txn, MAIN_DBI);
  if (unlikely(rc != MDBX_SUCCESS))
    goto early_bailout;
  rc = mdbx_cursor_set(&couple.outer, &key, &data, MDBX_SET).err;
  if (unlikely(rc != MDBX_SUCCESS)) {
    /* absent is only acceptable when creation was requested */
    if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE))
      goto early_bailout;
  } else {
    /* make sure this is actually a table */
    MDBX_node *node = page_node(couple.outer.mc_pg[couple.outer.mc_top],
                                couple.outer.mc_ki[couple.outer.mc_top]);
    if (unlikely((node_flags(node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) {
      rc = MDBX_INCOMPATIBLE;
      goto early_bailout;
    }
    if (!MDBX_DISABLE_PAGECHECKS && unlikely(data.iov_len != sizeof(MDBX_db))) {
      rc = MDBX_CORRUPTED;
      goto early_bailout;
    }
  }

  /* creation is only possible inside a write transaction */
  if (rc != MDBX_SUCCESS && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) {
    rc = MDBX_EACCESS;
    goto early_bailout;
  }

  /* Done here so we cannot fail after creating a new DB */
  char *namedup = mdbx_strdup(table_name);
  if (unlikely(!namedup)) {
    rc = MDBX_ENOMEM;
    goto early_bailout;
  }

  int err = mdbx_fastmutex_acquire(&env->me_dbi_lock);
  if (unlikely(err != MDBX_SUCCESS)) {
    rc = err;
    mdbx_free(namedup);
    goto early_bailout;
  }

  /* Import handles from env */
  dbi_import_locked(txn);

  /* Rescan after mutex acquisition & import handles */
  for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) {
    if (!txn->mt_dbxs[scan].md_name.iov_len) {
      /* Remember this free slot */
      slot = scan;
      continue;
    }
    if (len == txn->mt_dbxs[scan].md_name.iov_len &&
        !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) {
      rc = mdbx_dbi_bind(txn, scan, user_flags, keycmp, datacmp);
      if (unlikely(rc != MDBX_SUCCESS))
        goto later_bailout;
      *dbi = scan;
      goto later_exit;
    }
  }

  if (unlikely(slot >= env->me_maxdbs)) {
    rc = MDBX_DBS_FULL;
    goto later_bailout;
  }

  unsigned dbiflags = DBI_FRESH | DBI_VALID | DBI_USRVALID;
  MDBX_db db_dummy;
  if (unlikely(rc)) {
    /* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */
    mdbx_tassert(txn, rc == MDBX_NOTFOUND);
    memset(&db_dummy, 0, sizeof(db_dummy));
    db_dummy.md_root = P_INVALID;
    db_dummy.md_mod_txnid = txn->mt_txnid;
    db_dummy.md_flags = user_flags & DB_PERSISTENT_FLAGS;
    data.iov_len = sizeof(db_dummy);
    data.iov_base = &db_dummy;
    WITH_CURSOR_TRACKING(couple.outer,
                         rc = mdbx_cursor_put(&couple.outer, &key, &data,
                                              F_SUBDATA | MDBX_NOOVERWRITE));

    if (unlikely(rc != MDBX_SUCCESS))
      goto later_bailout;

    dbiflags |= DBI_DIRTY | DBI_CREAT;
    txn->mt_flags |= MDBX_TXN_DIRTY;
  }

  /* Got info, register DBI in this txn */
  memset(txn->mt_dbxs + slot, 0, sizeof(MDBX_dbx));
  memcpy(&txn->mt_dbs[slot], data.iov_base, sizeof(MDBX_db));
  env->me_dbflags[slot] = 0;
  rc = mdbx_dbi_bind(txn, slot, user_flags, keycmp, datacmp);
  if (unlikely(rc != MDBX_SUCCESS)) {
    mdbx_tassert(txn, (dbiflags & DBI_CREAT) == 0);
  later_bailout:
    *dbi = 0;
  later_exit:
    mdbx_free(namedup);
  } else {
    txn->mt_dbistate[slot] = (uint8_t)dbiflags;
    txn->mt_dbxs[slot].md_name.iov_base = namedup;
    txn->mt_dbxs[slot].md_name.iov_len = len;
    /* bump the sequence so stale handles in other txns are detectable */
    txn->mt_dbiseqs[slot] = ++env->me_dbiseqs[slot];
    if (!(dbiflags & DBI_CREAT))
      env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID;
    if (txn->mt_numdbs == slot) {
      mdbx_compiler_barrier();
      txn->mt_numdbs = env->me_numdbs = slot + 1;
      if (!(txn->mt_flags & MDBX_TXN_RDONLY))
        txn->tw.cursors[slot] = NULL;
    }
    mdbx_assert(env, env->me_numdbs > slot);
    *dbi = slot;
  }

  mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS);
  return rc;
}
23233 
/* Public API: opens a table with default comparators. */
int mdbx_dbi_open(MDBX_txn *txn, const char *table_name,
                  MDBX_db_flags_t table_flags, MDBX_dbi *dbi) {
  return dbi_open(txn, table_name, table_flags, dbi, nullptr, nullptr);
}
23238 
/* Public API: opens a table with caller-supplied comparators
 * (either may be NULL to use the default). */
int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name,
                     MDBX_db_flags_t table_flags, MDBX_dbi *dbi,
                     MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) {
  return dbi_open(txn, table_name, table_flags, dbi, keycmp, datacmp);
}
23244 
/* Public API: fills `dest` with statistics of a single table,
 * refreshing a stale DBI record from the b-tree first if needed. */
__cold int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest,
                         size_t bytes) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!dest))
    return MDBX_EINVAL;

  if (unlikely(!check_dbi(txn, dbi, DBI_VALID)))
    return MDBX_BAD_DBI;

  /* accept both the legacy layout (without ms_mod_txnid) and the full one */
  const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid);
  if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid)
    return MDBX_EINVAL;

  /* NOTE(review): check_txn(txn, MDBX_TXN_BLOCKED) above already rejects
   * blocked transactions, so this re-check looks redundant — confirm */
  if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED))
    return MDBX_BAD_TXN;

  /* a stale slot must be re-read from the MAIN table before reporting */
  if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) {
    rc = mdbx_fetch_sdb(txn, dbi);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }

  dest->ms_psize = txn->mt_env->me_psize;
  stat_get(&txn->mt_dbs[dbi], dest, bytes);
  return MDBX_SUCCESS;
}
23274 
/* Closes a DBI slot; the caller must hold me_dbi_lock.
 * The name length is zeroed before the base pointer (with a fence in
 * between) so concurrent lock-free readers never observe a non-empty
 * name paired with a dangling pointer. */
static int mdbx_dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) {
  mdbx_assert(env, dbi >= CORE_DBS);
  if (unlikely(dbi >= env->me_numdbs))
    return MDBX_BAD_DBI;

  char *ptr = env->me_dbxs[dbi].md_name.iov_base;
  /* If there was no name, this was already closed */
  if (unlikely(!ptr))
    return MDBX_BAD_DBI;

  env->me_dbflags[dbi] = 0;
  env->me_dbiseqs[dbi]++; /* invalidate the handle for open transactions */
  env->me_dbxs[dbi].md_name.iov_len = 0;
  mdbx_memory_fence(mo_AcquireRelease, true);
  env->me_dbxs[dbi].md_name.iov_base = NULL;
  mdbx_free(ptr);

  /* shrink me_numdbs past the trailing run of closed slots */
  if (env->me_numdbs == dbi + 1) {
    unsigned i = env->me_numdbs;
    do
      --i;
    while (i > CORE_DBS && !env->me_dbxs[i - 1].md_name.iov_base);
    env->me_numdbs = i;
  }

  return MDBX_SUCCESS;
}
23302 
/* Public API: closes a named-table handle.
 * Validates the environment and handle range, then performs the actual
 * close under me_dbi_lock; the validity of the slot is re-checked after
 * the lock is taken. */
int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(dbi < CORE_DBS || dbi >= env->me_maxdbs))
    return MDBX_BAD_DBI;

  rc = mdbx_fastmutex_acquire(&env->me_dbi_lock);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (dbi < env->me_maxdbs && (env->me_dbflags[dbi] & DB_VALID))
    rc = mdbx_dbi_close_locked(env, dbi);
  else
    rc = MDBX_BAD_DBI;
  mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS);
  return rc;
}
23320 
/* Public API: reports a table's persistent flags and its per-txn
 * handle state bits. */
int mdbx_dbi_flags_ex(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags,
                      unsigned *state) {
  const int err = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  if (unlikely(flags == NULL || state == NULL))
    return MDBX_EINVAL;

  if (unlikely(!check_dbi(txn, dbi, DBI_VALID)))
    return MDBX_BAD_DBI;

  *flags = txn->mt_dbs[dbi].md_flags & DB_PERSISTENT_FLAGS;
  *state =
      txn->mt_dbistate[dbi] & (DBI_FRESH | DBI_CREAT | DBI_DIRTY | DBI_STALE);
  return MDBX_SUCCESS;
}
23339 
23340 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Legacy ABI shim: forwards to the inline implementation, which calls
 * mdbx_dbi_flags_ex() and discards the state bits. */
int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) {
  return __inline_mdbx_dbi_flags(txn, dbi, flags);
}
23344 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
23345 
/* Retires every page of the (sub-)tree addressed by `mc`, walking the
 * b-tree level by level.  When `may_have_subDBs` is false and the table
 * has no overflow pages, the leaf level is skipped entirely since no
 * leaf node can then reference other pages. */
static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) {
  int rc = mdbx_page_search(mc, NULL, MDBX_PS_FIRST);
  if (likely(rc == MDBX_SUCCESS)) {
    MDBX_txn *txn = mc->mc_txn;

    /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves.
     * This also avoids any P_LEAF2 pages, which have no nodes.
     * Also if the DB doesn't have sub-DBs and has no overflow
     * pages, omit scanning leaves. */
    if (!(may_have_subDBs | mc->mc_db->md_overflow_pages))
      mdbx_cursor_pop(mc);

    /* pre-reserve room for every page number we are going to retire */
    rc = mdbx_pnl_need(&txn->tw.retired_pages,
                       mc->mc_db->md_branch_pages + mc->mc_db->md_leaf_pages +
                           mc->mc_db->md_overflow_pages);
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;

    /* keep a copy of the cursor stack to restore each level's path */
    MDBX_cursor mx;
    cursor_copy(mc, &mx);
    while (mc->mc_snum > 0) {
      MDBX_page *const mp = mc->mc_pg[mc->mc_top];
      const unsigned nkeys = page_numkeys(mp);
      if (IS_LEAF(mp)) {
        mdbx_cassert(mc, mc->mc_snum == mc->mc_db->md_depth);
        for (unsigned i = 0; i < nkeys; i++) {
          MDBX_node *node = page_node(mp, i);
          if (node_flags(node) & F_BIGDATA) {
            /* retire the overflow chain referenced by a big-data node */
            rc = mdbx_page_retire_ex(mc, node_largedata_pgno(node), NULL, 0);
            if (unlikely(rc != MDBX_SUCCESS))
              goto bailout;
            if (!(may_have_subDBs | mc->mc_db->md_overflow_pages))
              goto pop;
          } else if (node_flags(node) & F_SUBDATA) {
            if (unlikely((node_flags(node) & F_DUPDATA) == 0)) {
              rc = /* disallowing implicit subDB deletion */ MDBX_INCOMPATIBLE;
              goto bailout;
            }
            /* recurse into the nested dupsort sub-tree */
            rc = mdbx_xcursor_init1(mc, node, mp);
            if (unlikely(rc != MDBX_SUCCESS))
              goto bailout;
            rc = mdbx_drop_tree(&mc->mc_xcursor->mx_cursor, false);
            if (unlikely(rc != MDBX_SUCCESS))
              goto bailout;
          }
        }
      } else {
        mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth);
        if (mdbx_audit_enabled())
          mc->mc_flags |= C_RETIRING;
        /* children of this branch are leaves iff we are one level above
         * the bottom of the tree */
        const int pagetype =
            (IS_FROZEN(txn, mp) ? P_FROZEN : 0) +
            ((mc->mc_snum + 1 == mc->mc_db->md_depth) ? P_LEAF : P_BRANCH);
        for (unsigned i = 0; i < nkeys; i++) {
          MDBX_node *node = page_node(mp, i);
          mdbx_tassert(txn, (node_flags(node) &
                             (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0);
          const pgno_t pgno = node_pgno(node);
          rc = mdbx_page_retire_ex(mc, pgno, NULL, pagetype);
          if (unlikely(rc != MDBX_SUCCESS))
            goto bailout;
        }
        if (mdbx_audit_enabled())
          mc->mc_flags -= C_RETIRING;
      }
      if (!mc->mc_top)
        break;
      mdbx_cassert(mc, nkeys > 0);
      mc->mc_ki[mc->mc_top] = (indx_t)nkeys;
      rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT);
      if (unlikely(rc != MDBX_SUCCESS)) {
        if (unlikely(rc != MDBX_NOTFOUND))
          goto bailout;
      /* no more siblings, go back to beginning
       * of previous level. */
      pop:
        mdbx_cursor_pop(mc);
        mc->mc_ki[0] = 0;
        for (unsigned i = 1; i < mc->mc_snum; i++) {
          mc->mc_ki[i] = 0;
          mc->mc_pg[i] = mx.mc_pg[i];
        }
      }
    }
    /* finally retire the root page itself */
    rc = mdbx_page_retire(mc, mc->mc_pg[0]);
  bailout:
    if (unlikely(rc != MDBX_SUCCESS))
      txn->mt_flags |= MDBX_TXN_ERROR;
  } else if (rc == MDBX_NOTFOUND) {
    /* an empty tree: nothing to drop */
    rc = MDBX_SUCCESS;
  }
  mc->mc_flags &= ~C_INITIALIZED;
  return rc;
}
23440 
/* Empty a database and, optionally (del=true, named sub-DB only), also delete
 * its record from MAIN_DBI and close the DBI handle.
 * Returns MDBX_SUCCESS or an error code; on failure the transaction is
 * flagged with MDBX_TXN_ERROR. */
int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) {
  int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  MDBX_cursor *mc;
  rc = mdbx_cursor_open(txn, dbi, &mc);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* Free the whole tree. For MAIN_DBI or a DUPSORT db the leaves may
   * reference sub-trees, which mdbx_drop_tree() must then scan for. */
  rc = mdbx_drop_tree(mc, dbi == MAIN_DBI ||
                              (mc->mc_db->md_flags & MDBX_DUPSORT) != 0);
  /* Invalidate the dropped DB's cursors */
  for (MDBX_cursor *m2 = txn->tw.cursors[dbi]; m2; m2 = m2->mc_next)
    m2->mc_flags &= ~(C_INITIALIZED | C_EOF);
  if (unlikely(rc))
    goto bailout;

  /* Can't delete the main DB */
  if (del && dbi >= CORE_DBS) {
    /* Remove the sub-DB's name record from the main DB. */
    rc = mdbx_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA);
    if (likely(rc == MDBX_SUCCESS)) {
      mdbx_tassert(txn, txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY);
      mdbx_tassert(txn, txn->mt_flags & MDBX_TXN_DIRTY);
      txn->mt_dbistate[dbi] = DBI_STALE;
      MDBX_env *env = txn->mt_env;
      /* Closing the handle mutates env-wide DBI tables: serialize it. */
      rc = mdbx_fastmutex_acquire(&env->me_dbi_lock);
      if (unlikely(rc != MDBX_SUCCESS)) {
        txn->mt_flags |= MDBX_TXN_ERROR;
        goto bailout;
      }
      mdbx_dbi_close_locked(env, dbi);
      mdbx_ensure(env,
                  mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS);
    } else {
      txn->mt_flags |= MDBX_TXN_ERROR;
    }
  } else {
    /* reset the DB record, mark it dirty */
    txn->mt_dbistate[dbi] |= DBI_DIRTY;
    txn->mt_dbs[dbi].md_depth = 0;
    txn->mt_dbs[dbi].md_branch_pages = 0;
    txn->mt_dbs[dbi].md_leaf_pages = 0;
    txn->mt_dbs[dbi].md_overflow_pages = 0;
    txn->mt_dbs[dbi].md_entries = 0;
    txn->mt_dbs[dbi].md_root = P_INVALID;
    txn->mt_dbs[dbi].md_seq = 0;
    txn->mt_flags |= MDBX_TXN_DIRTY;
  }

bailout:
  mdbx_cursor_close(mc);
  return rc;
}
23495 
/* Install a custom key-comparison function for the given database. */
int mdbx_set_compare(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) {
  const int err = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
    return MDBX_BAD_DBI;
  txn->mt_dbxs[dbi].md_cmp = cmp;
  return MDBX_SUCCESS;
}
23507 
/* Install a custom data(value)-comparison function for a DUPSORT database. */
int mdbx_set_dupsort(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) {
  const int err = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(err != MDBX_SUCCESS))
    return err;
  if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
    return MDBX_BAD_DBI;
  txn->mt_dbxs[dbi].md_dcmp = cmp;
  return MDBX_SUCCESS;
}
23519 
/* Enumerate registered reader slots, invoking `func` for each live reader
 * with a consistent snapshot of its slot. Returns the callback's non-success
 * result, MDBX_RESULT_TRUE when no reader was reported, or an error code. */
__cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func,
                            void *ctx) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!func))
    return MDBX_EINVAL;

  rc = MDBX_RESULT_TRUE;
  int serial = 0;
  MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
  if (likely(lck)) {
    const unsigned snap_nreaders =
        atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
    for (unsigned i = 0; i < snap_nreaders; i++) {
      const MDBX_reader *r = lck->mti_readers + i;
    retry_reader:;
      /* Optimistically read all slot fields, then re-read them: retry
       * until no concurrent update could have torn the snapshot. */
      const uint32_t pid = atomic_load32(&r->mr_pid, mo_AcquireRelease);
      if (!pid)
        continue;
      txnid_t txnid = safe64_read(&r->mr_txnid);
      const uint64_t tid = atomic_load64(&r->mr_tid, mo_Relaxed);
      const pgno_t pages_used =
          atomic_load32(&r->mr_snapshot_pages_used, mo_Relaxed);
      const uint64_t reader_pages_retired =
          atomic_load64(&r->mr_snapshot_pages_retired, mo_Relaxed);
      if (unlikely(
              txnid != safe64_read(&r->mr_txnid) ||
              pid != atomic_load32(&r->mr_pid, mo_AcquireRelease) ||
              tid != atomic_load64(&r->mr_tid, mo_Relaxed) ||
              pages_used !=
                  atomic_load32(&r->mr_snapshot_pages_used, mo_Relaxed) ||
              reader_pages_retired !=
                  atomic_load64(&r->mr_snapshot_pages_retired, mo_Relaxed)))
        goto retry_reader;

      mdbx_assert(env, txnid > 0);
      /* A txnid above the threshold marks a slot without a valid snapshot. */
      if (txnid >= SAFE64_INVALID_THRESHOLD)
        txnid = 0;

      size_t bytes_used = 0;
      size_t bytes_retained = 0;
      uint64_t lag = 0;
      if (txnid) {
      retry_header:;
        /* Same optimistic-retry scheme for the meta-page snapshot. */
        const MDBX_meta *const recent_meta = mdbx_meta_head(env);
        const uint64_t head_pages_retired =
            unaligned_peek_u64(4, recent_meta->mm_pages_retired);
        const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, recent_meta);
        mdbx_compiler_barrier();
        if (unlikely(
                recent_meta != mdbx_meta_head(env) ||
                head_pages_retired !=
                    unaligned_peek_u64(4, recent_meta->mm_pages_retired)) ||
            head_txnid != mdbx_meta_txnid_fluid(env, recent_meta))
          goto retry_header;

        lag = (head_txnid - txnid) / xMDBX_TXNID_STEP;
        bytes_used = pgno2bytes(env, pages_used);
        bytes_retained = (head_pages_retired > reader_pages_retired)
                             ? pgno2bytes(env, (pgno_t)(head_pages_retired -
                                                        reader_pages_retired))
                             : 0;
      }
      rc = func(ctx, ++serial, i, pid, (mdbx_tid_t)tid, txnid, lag, bytes_used,
                bytes_retained);
      if (unlikely(rc != MDBX_SUCCESS))
        break;
    }
  }

  return rc;
}
23594 
23595 /* Insert pid into list if not already present.
23596  * return -1 if already present. */
mdbx_pid_insert(uint32_t * ids,uint32_t pid)23597 __cold static bool mdbx_pid_insert(uint32_t *ids, uint32_t pid) {
23598   /* binary search of pid in list */
23599   unsigned base = 0;
23600   unsigned cursor = 1;
23601   int val = 0;
23602   unsigned n = ids[0];
23603 
23604   while (n > 0) {
23605     unsigned pivot = n >> 1;
23606     cursor = base + pivot + 1;
23607     val = pid - ids[cursor];
23608 
23609     if (val < 0) {
23610       n = pivot;
23611     } else if (val > 0) {
23612       base = cursor;
23613       n -= pivot + 1;
23614     } else {
23615       /* found, so it's a duplicate */
23616       return false;
23617     }
23618   }
23619 
23620   if (val > 0)
23621     ++cursor;
23622 
23623   ids[0]++;
23624   for (n = ids[0]; n > cursor; n--)
23625     ids[n] = ids[n - 1];
23626   ids[n] = pid;
23627   return true;
23628 }
23629 
mdbx_reader_check(MDBX_env * env,int * dead)23630 __cold int mdbx_reader_check(MDBX_env *env, int *dead) {
23631   if (dead)
23632     *dead = 0;
23633   return mdbx_cleanup_dead_readers(env, false, dead);
23634 }
23635 
/* Scan the reader table and clear slots owned by dead processes.
 *
 * rdt_locked: non-zero when the caller already holds the reader-table lock;
 *             otherwise the lock is taken lazily, only once a stale entry
 *             is actually discovered.
 * dead:       optional out-parameter, receives the number of cleared slots.
 *
 * Return:
 *  MDBX_RESULT_TRUE - done and mutex recovered
 *  MDBX_SUCCESS     - done
 *  Otherwise errcode. */
__cold MDBX_INTERNAL_FUNC int
mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  mdbx_assert(env, rdt_locked >= 0);
  MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
  if (unlikely(lck == NULL)) {
    /* exclusive mode: no shared reader table, nothing can be stale */
    if (dead)
      *dead = 0;
    return MDBX_SUCCESS;
  }

  const unsigned snap_nreaders =
      atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
  /* Sorted pid-list used to de-duplicate liveness checks; pids[0] is the
   * element count (see mdbx_pid_insert()). Use the on-stack buffer when
   * large enough, otherwise fall back to the heap. */
  uint32_t pidsbuf_onstask[142];
  uint32_t *const pids =
      (snap_nreaders < ARRAY_LENGTH(pidsbuf_onstask))
          ? pidsbuf_onstask
          : mdbx_malloc((snap_nreaders + 1) * sizeof(uint32_t));
  if (unlikely(!pids))
    return MDBX_ENOMEM;

  pids[0] = 0;
  int count = 0;
  for (unsigned i = 0; i < snap_nreaders; i++) {
    const uint32_t pid =
        atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease);
    if (pid == 0)
      continue /* skip empty */;
    if (pid == env->me_pid)
      continue /* skip self */;
    if (!mdbx_pid_insert(pids, pid))
      continue /* such pid already processed */;

    int err = mdbx_rpid_check(env, pid);
    if (err == MDBX_RESULT_TRUE)
      continue /* reader is live */;

    if (err != MDBX_SUCCESS) {
      rc = err;
      break /* mdbx_rpid_check() failed */;
    }

    /* stale reader found */
    if (!rdt_locked) {
      err = mdbx_rdt_lock(env);
      if (MDBX_IS_ERROR(err)) {
        rc = err;
        break;
      }

      /* negative value: remember to unlock before returning */
      rdt_locked = -1;
      if (err == MDBX_RESULT_TRUE) {
        /* mutex recovered, the mdbx_ipclock_failed() checked all readers */
        rc = MDBX_RESULT_TRUE;
        break;
      }

      /* a other process may have clean and reused slot, recheck */
      if (lck->mti_readers[i].mr_pid.weak != pid)
        continue;

      err = mdbx_rpid_check(env, pid);
      if (MDBX_IS_ERROR(err)) {
        rc = err;
        break;
      }

      if (err != MDBX_SUCCESS)
        continue /* the race with other process, slot reused */;
    }

    /* clean it: clear every slot belonging to the dead pid */
    for (unsigned j = i; j < snap_nreaders; j++) {
      if (lck->mti_readers[j].mr_pid.weak == pid) {
        mdbx_debug("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN,
                   (size_t)pid, lck->mti_readers[j].mr_txnid.weak);
        atomic_store32(&lck->mti_readers[j].mr_pid, 0, mo_Relaxed);
        atomic_store32(&lck->mti_readers_refresh_flag, true, mo_AcquireRelease);
        count++;
      }
    }
  }

  if (likely(!MDBX_IS_ERROR(rc)))
    atomic_store64(&lck->mti_reader_check_timestamp, mdbx_osal_monotime(),
                   mo_Relaxed);

  if (rdt_locked < 0)
    mdbx_rdt_unlock(env);

  if (pids != pidsbuf_onstask)
    mdbx_free(pids);

  if (dead)
    *dead = count;
  return rc;
}
23741 
mdbx_setup_debug(int loglevel,int flags,MDBX_debug_func * logger)23742 __cold int mdbx_setup_debug(int loglevel, int flags, MDBX_debug_func *logger) {
23743   const int rc = mdbx_runtime_flags | (mdbx_loglevel << 16);
23744 
23745   if (loglevel != MDBX_LOG_DONTCHANGE)
23746     mdbx_loglevel = (uint8_t)loglevel;
23747 
23748   if (flags != MDBX_DBG_DONTCHANGE) {
23749     flags &=
23750 #if MDBX_DEBUG
23751         MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_JITTER |
23752 #endif
23753         MDBX_DBG_DUMP | MDBX_DBG_LEGACY_MULTIOPEN | MDBX_DBG_LEGACY_OVERLAP;
23754     mdbx_runtime_flags = (uint8_t)flags;
23755   }
23756 
23757   if (logger != MDBX_LOGGER_DONTCHANGE)
23758     mdbx_debug_logger = logger;
23759   return rc;
23760 }
23761 
/* Invoke the Handle-Slow-Readers callback against the laggard reader(s)
 * which prevent page recycling, looping until the oldest usable txnid
 * advances past `laggard` or no further progress can be made.
 * Returns the resulting oldest reader txnid. */
__cold static txnid_t mdbx_kick_longlived_readers(MDBX_env *env,
                                                  const txnid_t laggard) {
  mdbx_debug("DB size maxed out by reading #%" PRIaTXN, laggard);

  int retry;
  for (retry = 0; retry < INT_MAX; ++retry) {
    txnid_t oldest = mdbx_recent_steady_txnid(env);
    mdbx_assert(env, oldest < env->me_txn0->mt_txnid);
    mdbx_assert(env, oldest >= laggard);
    mdbx_assert(env, oldest >= env->me_lck->mti_oldest_reader.weak);
    MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
    if (oldest == laggard || unlikely(!lck /* without-LCK mode */))
      return oldest;

    if (MDBX_IS_ERROR(mdbx_cleanup_dead_readers(env, false, NULL)))
      break;

    /* Find a reader holding exactly the laggard snapshot (if any),
     * re-reading each slot until its fields form a consistent pair. */
    MDBX_reader *asleep = nullptr;
    uint64_t oldest_retired = UINT64_MAX;
    const unsigned snap_nreaders =
        atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
    for (unsigned i = 0; i < snap_nreaders; ++i) {
    retry:
      if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) {
        /* mdbx_jitter4testing(true); */
        const uint64_t snap_retired = atomic_load64(
            &lck->mti_readers[i].mr_snapshot_pages_retired, mo_Relaxed);
        const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid);
        if (unlikely(snap_retired !=
                         atomic_load64(
                             &lck->mti_readers[i].mr_snapshot_pages_retired,
                             mo_AcquireRelease) ||
                     snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid)))
          goto retry;
        if (oldest > snap_txnid &&
            laggard <= /* ignore pending updates */ snap_txnid) {
          oldest = snap_txnid;
          oldest_retired = snap_retired;
          asleep = &lck->mti_readers[i];
        }
      }
    }

    if (laggard < oldest || !asleep) {
      /* Progress was made (or nobody holds the laggard snapshot):
       * publish the new oldest-reader txnid and finish. */
      if (retry && env->me_hsr_callback) {
        /* LY: notify end of hsr-loop */
        const txnid_t gap = oldest - laggard;
        env->me_hsr_callback(env, env->me_txn, 0, 0, laggard,
                             (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, 0,
                             -retry);
      }
      mdbx_notice("hsr-kick: update oldest %" PRIaTXN " -> %" PRIaTXN,
                  lck->mti_oldest_reader.weak, oldest);
      mdbx_assert(env, lck->mti_oldest_reader.weak <= oldest);
      return atomic_store64(&lck->mti_oldest_reader, oldest, mo_Relaxed);
    }

    if (!env->me_hsr_callback)
      break;

    uint32_t pid = atomic_load32(&asleep->mr_pid, mo_AcquireRelease);
    uint64_t tid = asleep->mr_tid.weak;
    if (safe64_read(&asleep->mr_txnid) != laggard || pid <= 0)
      continue;

    /* Estimate the amount of space involved, for reporting to the callback. */
    const MDBX_meta *head_meta = mdbx_meta_head(env);
    const txnid_t gap =
        (mdbx_meta_txnid_stable(env, head_meta) - laggard) / xMDBX_TXNID_STEP;
    const uint64_t head_retired =
        unaligned_peek_u64(4, head_meta->mm_pages_retired);
    const size_t space =
        (oldest_retired > head_retired)
            ? pgno2bytes(env, (pgno_t)(oldest_retired - head_retired))
            : 0;
    int rc = env->me_hsr_callback(
        env, env->me_txn, pid, (mdbx_tid_t)tid, laggard,
        (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, space, retry);
    if (rc < 0)
      break;

    if (rc > 0) {
      if (rc == 1) {
        /* rc == 1: reset the reader's txnid only while it still equals
         * the laggard (compare-and-reset). */
        safe64_reset_compare(&asleep->mr_txnid, laggard);
      } else {
        /* rc > 1: evict the reader slot entirely */
        safe64_reset(&asleep->mr_txnid, true);
        atomic_store64(&asleep->mr_tid, 0, mo_Relaxed);
        atomic_store32(&asleep->mr_pid, 0, mo_Relaxed);
      }
      atomic_store32(&lck->mti_readers_refresh_flag, true, mo_Relaxed);
    }
  }

  if (retry && env->me_hsr_callback) {
    /* LY: notify end of hsr-loop */
    env->me_hsr_callback(env, env->me_txn, 0, 0, laggard, 0, 0, -retry);
  }
  return mdbx_find_oldest(env->me_txn);
}
23860 
23861 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Legacy-API shim kept for binary compatibility: forwards to the inline
 * implementation of the modern option-based interface. */
__cold int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) {
  return __inline_mdbx_env_set_syncbytes(env, threshold);
}
23865 
/* Legacy-API shim kept for binary compatibility: forwards to the inline
 * implementation of the modern option-based interface. */
__cold int mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) {
  return __inline_mdbx_env_set_syncperiod(env, seconds_16dot16);
}
23869 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
23870 
mdbx_env_set_hsr(MDBX_env * env,MDBX_hsr_func * hsr)23871 __cold int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr) {
23872   int rc = check_env(env, false);
23873   if (unlikely(rc != MDBX_SUCCESS))
23874     return rc;
23875 
23876   env->me_hsr_callback = hsr;
23877   return MDBX_SUCCESS;
23878 }
23879 
mdbx_env_get_hsr(const MDBX_env * env)23880 __cold MDBX_hsr_func *mdbx_env_get_hsr(const MDBX_env *env) {
23881   return likely(env && env->me_signature.weak == MDBX_ME_SIGNATURE)
23882              ? env->me_hsr_callback
23883              : NULL;
23884 }
23885 
#ifdef __SANITIZE_THREAD__
/* LY: avoid tsan-trap by me_txn, mm_last_pg and mt_next_pgno */
__attribute__((__no_sanitize_thread__, __noinline__))
#endif
/* Returns how far a read transaction lags behind the most recent commit,
 * measured in transactions; optionally reports DB fill percentage too.
 * Errors are returned negated so they can't be mistaken for a lag. */
int mdbx_txn_straggler(const MDBX_txn *txn, int *percent)
{
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return (rc > 0) ? -rc : rc;

  MDBX_env *env = txn->mt_env;
  if (unlikely((txn->mt_flags & MDBX_TXN_RDONLY) == 0)) {
    /* A write transaction never lags; report fill percentage only. */
    if (percent)
      *percent =
          (int)((txn->mt_next_pgno * UINT64_C(100) + txn->mt_end_pgno / 2) /
                txn->mt_end_pgno);
    return 0;
  }

  txnid_t recent;
  MDBX_meta *meta;
  do {
    /* Re-read until the meta-page txnid is stable (seqlock-style). */
    meta = mdbx_meta_head(env);
    recent = mdbx_meta_txnid_fluid(env, meta);
    if (percent) {
      const pgno_t maxpg = meta->mm_geo.now;
      *percent = (int)((meta->mm_geo.next * UINT64_C(100) + maxpg / 2) / maxpg);
    }
  } while (unlikely(recent != mdbx_meta_txnid_fluid(env, meta)));

  txnid_t lag = (recent - txn->mt_txnid) / xMDBX_TXNID_STEP;
  return (lag > INT_MAX) ? INT_MAX : (int)lag;
}
23919 
/* Context threaded through the page-walker recursion (mdbx_env_pgwalk). */
typedef struct mdbx_walk_ctx {
  void *mw_user;                   /* opaque cookie forwarded to the visitor */
  MDBX_pgvisitor_func *mw_visitor; /* callback invoked for every page */
  MDBX_txn *mw_txn;                /* transaction whose trees are walked */
  MDBX_cursor *mw_cursor;          /* chain of cursors for nested sub-DBs */
  bool mw_dont_check_keys_ordering; /* when true, set C_SKIPORD on cursors */
} mdbx_walk_ctx_t;
23927 
23928 __cold static int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db,
23929                                 const char *name, int deep);
23930 
walk_page_type(const MDBX_page * mp)23931 static MDBX_page_type_t walk_page_type(const MDBX_page *mp) {
23932   if (mp)
23933     switch (mp->mp_flags) {
23934     case P_BRANCH:
23935       return MDBX_page_branch;
23936     case P_LEAF:
23937       return MDBX_page_leaf;
23938     case P_LEAF | P_LEAF2:
23939       return MDBX_page_dupfixed_leaf;
23940     case P_OVERFLOW:
23941       return MDBX_page_large;
23942     case P_META:
23943       return MDBX_page_meta;
23944     }
23945   return MDBX_page_broken;
23946 }
23947 
/* Depth-first tree traversal: fetch page `pgno`, account its space usage,
 * report it (and any large/overflow pages or sub-pages it references) via
 * ctx->mw_visitor, then recurse into child pages and sub-trees.
 * A visitor returning MDBX_RESULT_TRUE stops the walk without error.
 * Per-page validation problems are carried in `err` and passed to the
 * visitor rather than aborting the traversal immediately. */
__cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno,
                                 const char *name, int deep,
                                 txnid_t parent_txnid) {
  assert(pgno != P_INVALID);
  MDBX_page *mp = nullptr;
  int rc, err = mdbx_page_get(ctx->mw_cursor, pgno, &mp, parent_txnid);
  if (err == MDBX_SUCCESS)
    err = mdbx_page_check(ctx->mw_cursor, mp, 0);

  /* Defaults below cope with mp == NULL (page fetch failed). */
  MDBX_page_type_t type = walk_page_type(mp);
  const int nentries = (mp && !IS_OVERFLOW(mp)) ? page_numkeys(mp) : 1;
  unsigned npages = (mp && IS_OVERFLOW(mp)) ? mp->mp_pages : 1;
  size_t pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages);
  size_t header_size = (mp && !IS_LEAF2(mp) && !IS_OVERFLOW(mp))
                           ? PAGEHDRSZ + mp->mp_lower
                           : PAGEHDRSZ;
  size_t payload_size = 0;
  size_t unused_size =
      (mp && !IS_OVERFLOW(mp) ? page_room(mp) : pagesize - header_size) -
      payload_size;
  size_t align_bytes = 0;

  if (err == MDBX_SUCCESS) {
    /* LY: Don't use mask here, e.g bitwise
     * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP).
     * Pages should not me marked dirty/loose or otherwise. */
    switch (mp->mp_flags) {
    default:
      err = MDBX_CORRUPTED;
      break;
    case P_BRANCH:
      if (unlikely(nentries < 2))
        err = MDBX_CORRUPTED;
      /* fallthrough: a valid branch page needs no further checks */
    case P_LEAF:
    case P_LEAF | P_LEAF2:
      break;
    }
  }

  /* First pass: accumulate payload/align accounting and report any
   * referenced large pages and sub-pages. The odd-size padding byte is
   * accumulated into align_bytes at the end of each iteration. */
  for (int i = 0; err == MDBX_SUCCESS && i < nentries;
       align_bytes += ((payload_size + align_bytes) & 1), i++) {
    if (type == MDBX_page_dupfixed_leaf) {
      /* LEAF2 pages have no mp_ptrs[] or node headers */
      payload_size += mp->mp_leaf2_ksize;
      continue;
    }

    MDBX_node *node = page_node(mp, i);
    payload_size += NODESIZE + node_ks(node);

    if (type == MDBX_page_branch) {
      assert(i > 0 || node_ks(node) == 0);
      continue;
    }

    assert(type == MDBX_page_leaf);
    switch (node_flags(node)) {
    case 0 /* usual node */:
      payload_size += node_ds(node);
      break;

    case F_BIGDATA /* long data on the large/overflow page */: {
      payload_size += sizeof(pgno_t);
      const pgno_t large_pgno = node_largedata_pgno(node);
      const size_t over_payload = node_ds(node);
      const size_t over_header = PAGEHDRSZ;
      npages = 1;

      MDBX_page *op;
      err = mdbx_page_get(ctx->mw_cursor, large_pgno, &op,
                          pp_txnid4chk(mp, ctx->mw_txn));
      if (err == MDBX_SUCCESS)
        err = mdbx_page_check(ctx->mw_cursor, op, 0);
      if (err == MDBX_SUCCESS) {
        /* LY: Don't use mask here, e.g bitwise
         * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP).
         * Pages should not me marked dirty/loose or otherwise. */
        if (unlikely(P_OVERFLOW != op->mp_flags))
          err = bad_page(mp, "wrong page type %d for large data", op->mp_flags);
        else
          npages = op->mp_pages;
      }

      /* Report the overflow chain as its own visit. */
      pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages);
      const size_t over_unused = pagesize - over_payload - over_header;
      rc = ctx->mw_visitor(large_pgno, npages, ctx->mw_user, deep, name,
                           pagesize, MDBX_page_large, err, 1, over_payload,
                           over_header, over_unused);
      if (unlikely(rc != MDBX_SUCCESS))
        return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;
    } break;

    case F_SUBDATA /* sub-db */: {
      const size_t namelen = node_ks(node);
      payload_size += node_ds(node);
      if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db)))
        err = MDBX_CORRUPTED;
    } break;

    case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */:
      payload_size += sizeof(MDBX_db);
      if (unlikely(node_ds(node) != sizeof(MDBX_db)))
        err = MDBX_CORRUPTED;
      break;

    case F_DUPDATA /* short sub-page */: {
      if (unlikely(node_ds(node) <= PAGEHDRSZ)) {
        err = MDBX_CORRUPTED;
        break;
      }

      /* Account the embedded sub-page's nodes and report it as a visit
       * one level deeper than the hosting leaf page. */
      MDBX_page *sp = node_data(node);
      const int nsubkeys = page_numkeys(sp);
      size_t subheader_size =
          IS_LEAF2(sp) ? PAGEHDRSZ : PAGEHDRSZ + sp->mp_lower;
      size_t subunused_size = page_room(sp);
      size_t subpayload_size = 0;
      size_t subalign_bytes = 0;
      MDBX_page_type_t subtype;

      switch (sp->mp_flags & /* ignore legacy P_DIRTY flag */ ~0x10) {
      case P_LEAF | P_SUBP:
        subtype = MDBX_subpage_leaf;
        break;
      case P_LEAF | P_LEAF2 | P_SUBP:
        subtype = MDBX_subpage_dupfixed_leaf;
        break;
      default:
        subtype = MDBX_subpage_broken;
        err = MDBX_CORRUPTED;
      }

      for (int j = 0; err == MDBX_SUCCESS && j < nsubkeys;
           subalign_bytes += ((subpayload_size + subalign_bytes) & 1), j++) {

        if (subtype == MDBX_subpage_dupfixed_leaf) {
          /* LEAF2 pages have no mp_ptrs[] or node headers */
          subpayload_size += sp->mp_leaf2_ksize;
        } else {
          assert(subtype == MDBX_subpage_leaf);
          MDBX_node *subnode = page_node(sp, j);
          subpayload_size += NODESIZE + node_ks(subnode) + node_ds(subnode);
          if (unlikely(node_flags(subnode) != 0))
            err = MDBX_CORRUPTED;
        }
      }

      rc = ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, name, node_ds(node),
                           subtype, err, nsubkeys, subpayload_size,
                           subheader_size, subunused_size + subalign_bytes);
      if (unlikely(rc != MDBX_SUCCESS))
        return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;
      header_size += subheader_size;
      unused_size += subunused_size;
      payload_size += subpayload_size;
      align_bytes += subalign_bytes;
    } break;

    default:
      err = MDBX_CORRUPTED;
    }
  }

  /* Report this page itself. */
  rc = ctx->mw_visitor(pgno, 1, ctx->mw_user, deep, name,
                       ctx->mw_txn->mt_env->me_psize, type, err, nentries,
                       payload_size, header_size, unused_size + align_bytes);
  if (unlikely(rc != MDBX_SUCCESS))
    return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;

  /* Second pass: recurse into branch children and dupsort sub-trees. */
  for (int i = 0; err == MDBX_SUCCESS && i < nentries; i++) {
    if (type == MDBX_page_dupfixed_leaf)
      continue;

    MDBX_node *node = page_node(mp, i);
    if (type == MDBX_page_branch) {
      err = mdbx_walk_tree(ctx, node_pgno(node), name, deep + 1,
                           pp_txnid4chk(mp, ctx->mw_txn));
      if (unlikely(err != MDBX_SUCCESS)) {
        if (err == MDBX_RESULT_TRUE)
          break;
        return err;
      }
      continue;
    }

    assert(type == MDBX_page_leaf);
    MDBX_db db;
    switch (node_flags(node)) {
    default:
      continue;

    case F_SUBDATA /* sub-db */: {
      const size_t namelen = node_ks(node);
      if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) {
        err = MDBX_CORRUPTED;
        break;
      }

      /* Copy out the NUL-terminated sub-DB name; small names use the
       * on-stack buffer, longer ones are heap-allocated. */
      char namebuf_onstask[64];
      char *const sub_name = (namelen < sizeof(namebuf_onstask))
                                 ? namebuf_onstask
                                 : mdbx_malloc(namelen + 1);
      if (sub_name) {
        memcpy(sub_name, node_key(node), namelen);
        sub_name[namelen] = 0;
        memcpy(&db, node_data(node), sizeof(db));
        err = mdbx_walk_sdb(ctx, &db, sub_name, deep + 1);
        if (sub_name != namebuf_onstask)
          mdbx_free(sub_name);
      } else {
        err = MDBX_ENOMEM;
      }
    } break;

    case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */:
      if (unlikely(node_ds(node) != sizeof(MDBX_db) ||
                   ctx->mw_cursor->mc_xcursor == NULL))
        err = MDBX_CORRUPTED;
      else {
        /* Descend via the inner (xcursor) cursor, then restore the
         * outer cursor from the couple on the way back. */
        memcpy(&db, node_data(node), sizeof(db));
        assert(ctx->mw_cursor->mc_xcursor ==
               &container_of(ctx->mw_cursor, MDBX_cursor_couple, outer)->inner);
        ctx->mw_cursor = &ctx->mw_cursor->mc_xcursor->mx_cursor;
        err = mdbx_walk_tree(ctx, db.md_root, name, deep + 1,
                             pp_txnid4chk(mp, ctx->mw_txn));
        MDBX_xcursor *inner_xcursor =
            container_of(ctx->mw_cursor, MDBX_xcursor, mx_cursor);
        MDBX_cursor_couple *couple =
            container_of(inner_xcursor, MDBX_cursor_couple, inner);
        ctx->mw_cursor = &couple->outer;
      }
      break;
    }
  }

  return MDBX_SUCCESS;
}
24186 
/* Walk one (sub-)database: initialize a detached cursor couple over `db`
 * and traverse from its root page. */
__cold static int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db,
                                const char *name, int deep) {
  if (unlikely(db->md_root == P_INVALID))
    return MDBX_SUCCESS; /* empty db */

  MDBX_cursor_couple couple;
  MDBX_dbx dbx = {.md_klen_min = INT_MAX};
  uint8_t dbistate = DBI_VALID | DBI_AUDITED;
  int rc = mdbx_couple_init(&couple, ~0u, ctx->mw_txn, db, &dbx, &dbistate);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* Optionally relax key-ordering validation during the walk. */
  if (ctx->mw_dont_check_keys_ordering) {
    couple.outer.mc_flags |= C_SKIPORD;
    couple.inner.mx_cursor.mc_flags |= C_SKIPORD;
  }
  /* Push this cursor onto the walk chain; restore it on return. */
  couple.outer.mc_next = ctx->mw_cursor;
  ctx->mw_cursor = &couple.outer;
  rc = mdbx_walk_tree(ctx, db->md_root, name, deep, ctx->mw_txn->mt_txnid);
  ctx->mw_cursor = couple.outer.mc_next;
  return rc;
}
24209 
/* Public page-walker entry point: reports the meta pages to `visitor` first,
 * then traverses the GC (FREE_DBI) tree and the MAIN tree, invoking the
 * visitor for every page. Traversal continues while the visitor's result is
 * not an error per MDBX_IS_ERROR(); the last result is returned. */
__cold int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor,
                           void *user, bool dont_check_keys_ordering) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  mdbx_walk_ctx_t ctx;
  memset(&ctx, 0, sizeof(ctx));
  ctx.mw_txn = txn;
  ctx.mw_user = user;
  ctx.mw_visitor = visitor;
  ctx.mw_dont_check_keys_ordering = dont_check_keys_ordering;

  /* Report all NUM_METAS meta pages as a single visit: payload is the meta
   * structs; header and unused bytes are accounted separately. */
  rc = visitor(0, NUM_METAS, user, 0, MDBX_PGWALK_META,
               pgno2bytes(txn->mt_env, NUM_METAS), MDBX_page_meta, MDBX_SUCCESS,
               NUM_METAS, sizeof(MDBX_meta) * NUM_METAS, PAGEHDRSZ * NUM_METAS,
               (txn->mt_env->me_psize - sizeof(MDBX_meta) - PAGEHDRSZ) *
                   NUM_METAS);
  if (!MDBX_IS_ERROR(rc))
    rc = mdbx_walk_sdb(&ctx, &txn->mt_dbs[FREE_DBI], MDBX_PGWALK_GC, 0);
  if (!MDBX_IS_ERROR(rc))
    rc = mdbx_walk_sdb(&ctx, &txn->mt_dbs[MAIN_DBI], MDBX_PGWALK_MAIN, 0);
  return rc;
}
24234 
/* Stores the user-supplied canary triple (x, y, z) into the write txn and
 * stamps the `v` slot with the current txnid. If `canary` is non-NULL and
 * equals the stored triple, the call is a no-op and the txn is NOT dirtied.
 * A NULL `canary` only refreshes `v` and marks the txn dirty. */
int mdbx_canary_put(MDBX_txn *txn, const MDBX_canary *canary) {
  const int err = check_txn_rw(txn, MDBX_TXN_BLOCKED);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  if (likely(canary)) {
    const bool unchanged = txn->mt_canary.x == canary->x &&
                           txn->mt_canary.y == canary->y &&
                           txn->mt_canary.z == canary->z;
    if (unchanged)
      return MDBX_SUCCESS;
    txn->mt_canary.x = canary->x;
    txn->mt_canary.y = canary->y;
    txn->mt_canary.z = canary->z;
  }

  txn->mt_canary.v = txn->mt_txnid;
  txn->mt_flags |= MDBX_TXN_DIRTY;
  return MDBX_SUCCESS;
}
24253 
/* Copies the transaction's canary quadruple into *canary.
 * Returns MDBX_EINVAL for a NULL destination (checked after txn validation,
 * so a bad txn takes precedence). */
int mdbx_canary_get(const MDBX_txn *txn, MDBX_canary *canary) {
  const int err = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  if (unlikely(!canary))
    return MDBX_EINVAL;

  *canary = txn->mt_canary;
  return MDBX_SUCCESS;
}
24265 
mdbx_cursor_on_first(const MDBX_cursor * mc)24266 int mdbx_cursor_on_first(const MDBX_cursor *mc) {
24267   if (unlikely(mc == NULL))
24268     return MDBX_EINVAL;
24269 
24270   if (unlikely(mc->mc_signature != MDBX_MC_LIVE))
24271     return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
24272                                                      : MDBX_EBADSIGN;
24273 
24274   if (!(mc->mc_flags & C_INITIALIZED))
24275     return mc->mc_db->md_entries ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE;
24276 
24277   for (unsigned i = 0; i < mc->mc_snum; ++i) {
24278     if (mc->mc_ki[i])
24279       return MDBX_RESULT_FALSE;
24280   }
24281 
24282   return MDBX_RESULT_TRUE;
24283 }
24284 
mdbx_cursor_on_last(const MDBX_cursor * mc)24285 int mdbx_cursor_on_last(const MDBX_cursor *mc) {
24286   if (unlikely(mc == NULL))
24287     return MDBX_EINVAL;
24288 
24289   if (unlikely(mc->mc_signature != MDBX_MC_LIVE))
24290     return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
24291                                                      : MDBX_EBADSIGN;
24292 
24293   if (!(mc->mc_flags & C_INITIALIZED))
24294     return mc->mc_db->md_entries ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE;
24295 
24296   for (unsigned i = 0; i < mc->mc_snum; ++i) {
24297     unsigned nkeys = page_numkeys(mc->mc_pg[i]);
24298     if (mc->mc_ki[i] < nkeys - 1)
24299       return MDBX_RESULT_FALSE;
24300   }
24301 
24302   return MDBX_RESULT_TRUE;
24303 }
24304 
mdbx_cursor_eof(const MDBX_cursor * mc)24305 int mdbx_cursor_eof(const MDBX_cursor *mc) {
24306   if (unlikely(mc == NULL))
24307     return MDBX_EINVAL;
24308 
24309   if (unlikely(mc->mc_signature != MDBX_MC_LIVE))
24310     return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
24311                                                      : MDBX_EBADSIGN;
24312 
24313   return ((mc->mc_flags & (C_INITIALIZED | C_EOF)) == C_INITIALIZED &&
24314           mc->mc_snum &&
24315           mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]))
24316              ? MDBX_RESULT_FALSE
24317              : MDBX_RESULT_TRUE;
24318 }
24319 
24320 //------------------------------------------------------------------------------
24321 
/* Result of cursor_diff(): the signed distance between two cursor positions
 * plus auxiliary data consumed by estimate(). */
struct diff_result {
  ptrdiff_t diff; /* signed index distance at the level where cursors diverge */
  unsigned level; /* tree level (0 = root) where the divergence was found */
  int root_nkeys; /* number of keys in the root page (0 until level 0 seen) */
};
24327 
/* calculates: r = x - y */
/* Computes the signed positional difference between two cursors over the
 * same tree, walking their page stacks in parallel from the root until the
 * per-level indexes diverge. On success fills r->diff, r->level and
 * r->root_nkeys (see struct diff_result). Both cursors must be live,
 * C_INITIALIZED, and belong to the same txn and dbi. */
__hot static int cursor_diff(const MDBX_cursor *const __restrict x,
                             const MDBX_cursor *const __restrict y,
                             struct diff_result *const __restrict r) {
  r->diff = 0;
  r->level = 0;
  r->root_nkeys = 0;

  if (unlikely(x->mc_signature != MDBX_MC_LIVE))
    return (x->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
                                                    : MDBX_EBADSIGN;

  if (unlikely(y->mc_signature != MDBX_MC_LIVE))
    return (y->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
                                                    : MDBX_EBADSIGN;

  int rc = check_txn(x->mc_txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(x->mc_txn != y->mc_txn))
    return MDBX_BAD_TXN;

  if (unlikely(y->mc_dbi != x->mc_dbi))
    return MDBX_EINVAL;

  if (unlikely(!(y->mc_flags & x->mc_flags & C_INITIALIZED)))
    return MDBX_ENODATA;

  /* Descend while both stacks still have pages at this level. */
  while (likely(r->level < y->mc_snum && r->level < x->mc_snum)) {
    /* Cursors over the same dbi must share the same page at each common
     * level; anything else indicates internal corruption. */
    if (unlikely(y->mc_pg[r->level] != x->mc_pg[r->level])) {
      mdbx_error("Mismatch cursors's pages at %u level", r->level);
      return MDBX_PROBLEM;
    }

    int nkeys = page_numkeys(y->mc_pg[r->level]);
    assert(nkeys > 0);
    if (r->level == 0)
      r->root_nkeys = nkeys;

    /* Clamp both indexes to the page's last slot before subtracting —
     * presumably to tolerate past-the-end positions; TODO confirm. */
    const int limit_ki = nkeys - 1;
    const int x_ki = x->mc_ki[r->level];
    const int y_ki = y->mc_ki[r->level];
    r->diff = ((x_ki < limit_ki) ? x_ki : limit_ki) -
              ((y_ki < limit_ki) ? y_ki : limit_ki);
    if (r->diff == 0) {
      /* Same slot on this level: divergence (if any) lies deeper. */
      r->level += 1;
      continue;
    }

    /* Adjacent-slot divergence (x one slot after y): refine the distance by
     * descending one level — y's remaining entries plus x's preceding ones. */
    while (unlikely(r->diff == 1) &&
           likely(r->level + 1 < y->mc_snum && r->level + 1 < x->mc_snum)) {
      r->level += 1;
      /*   DB'PAGEs: 0------------------>MAX
       *
       *    CURSORs:       y < x
       *  STACK[i ]:         |
       *  STACK[+1]:  ...y++N|0++x...
       */
      nkeys = page_numkeys(y->mc_pg[r->level]);
      r->diff = (nkeys - y->mc_ki[r->level]) + x->mc_ki[r->level];
      assert(r->diff > 0);
    }

    /* Mirror case: x one slot before y. */
    while (unlikely(r->diff == -1) &&
           likely(r->level + 1 < y->mc_snum && r->level + 1 < x->mc_snum)) {
      r->level += 1;
      /*   DB'PAGEs: 0------------------>MAX
       *
       *    CURSORs:       x < y
       *  STACK[i ]:         |
       *  STACK[+1]:  ...x--N|0--y...
       */
      nkeys = page_numkeys(x->mc_pg[r->level]);
      r->diff = -(nkeys - x->mc_ki[r->level]) - y->mc_ki[r->level];
      assert(r->diff < 0);
    }

    return MDBX_SUCCESS;
  }

  /* The stacks matched over their whole common depth: distinguish the
   * positions only by the EOF flags. */
  r->diff = CMP2INT(x->mc_flags & C_EOF, y->mc_flags & C_EOF);
  return MDBX_SUCCESS;
}
24412 
/* Converts the cursor-stack distance produced by cursor_diff() into an
 * estimated number of entries, by scaling dr->diff with the average
 * fan-out of the levels lying below dr->level. */
__hot static ptrdiff_t estimate(const MDBX_db *db,
                                struct diff_result *const __restrict dr) {
  /*        root: branch-page    => scale = leaf-factor * branch-factor^(N-1)
   *     level-1: branch-page(s) => scale = leaf-factor * branch-factor^2
   *     level-2: branch-page(s) => scale = leaf-factor * branch-factor
   *     level-N: branch-page(s) => scale = leaf-factor
   *  leaf-level: leaf-page(s)   => scale = 1
   */
  /* Number of branch levels between the divergence level and the leaves,
   * minus one. Negative means the diff already counts leaf entries. */
  ptrdiff_t btree_power = (ptrdiff_t)db->md_depth - 2 - (ptrdiff_t)dr->level;
  if (btree_power < 0)
    return dr->diff;

  /* Scale by the average number of entries per leaf page. */
  ptrdiff_t estimated =
      (ptrdiff_t)db->md_entries * dr->diff / (ptrdiff_t)db->md_leaf_pages;
  if (btree_power == 0)
    return estimated;

  if (db->md_depth < 4) {
    /* Shallow tree: the root fan-out is known exactly via root_nkeys. */
    assert(dr->level == 0 && btree_power == 1);
    return (ptrdiff_t)db->md_entries * dr->diff / (ptrdiff_t)dr->root_nkeys;
  }

  /* average_branchpage_fillfactor = total(branch_entries) / branch_pages
     total(branch_entries) = leaf_pages + branch_pages - 1 (root page) */
  /* NOTE(review): fixed-point arithmetic with (sizeof(size_t) - 1) fractional
   * bits, i.e. 7 on 64-bit targets — presumably an intentionally coarse
   * precision rather than CHAR_BIT-based; confirm against upstream. */
  const size_t log2_fixedpoint = sizeof(size_t) - 1;
  const size_t half = UINT64_C(1) << (log2_fixedpoint - 1);
  const size_t factor =
      ((db->md_leaf_pages + db->md_branch_pages - 1) << log2_fixedpoint) /
      db->md_branch_pages;
  /* Multiply `estimated` by factor^btree_power, with rounding (+half) at
   * each fixed-point step; the default case squares the factor to consume
   * four powers per iteration. */
  while (1) {
    switch ((size_t)btree_power) {
    default: {
      const size_t square = (factor * factor + half) >> log2_fixedpoint;
      const size_t quad = (square * square + half) >> log2_fixedpoint;
      do {
        estimated = estimated * quad + half;
        estimated >>= log2_fixedpoint;
        btree_power -= 4;
      } while (btree_power >= 4);
      continue;
    }
    case 3:
      estimated = estimated * factor + half;
      estimated >>= log2_fixedpoint;
      __fallthrough /* fall through */;
    case 2:
      estimated = estimated * factor + half;
      estimated >>= log2_fixedpoint;
      __fallthrough /* fall through */;
    case 1:
      estimated = estimated * factor + half;
      estimated >>= log2_fixedpoint;
      __fallthrough /* fall through */;
    case 0:
      /* Clamp to the physically possible range of ±md_entries. */
      if (unlikely(estimated > (ptrdiff_t)db->md_entries))
        return (ptrdiff_t)db->md_entries;
      if (unlikely(estimated < -(ptrdiff_t)db->md_entries))
        return -(ptrdiff_t)db->md_entries;
      return estimated;
    }
  }
}
24475 
/* Estimates the number of entries between the positions of two cursors over
 * the same dbi (result may be negative when `last` precedes `first`).
 * When the outer positions coincide on a dupsort db, the distance of the
 * nested (duplicate) cursors is estimated instead. */
int mdbx_estimate_distance(const MDBX_cursor *first, const MDBX_cursor *last,
                           ptrdiff_t *distance_items) {
  if (unlikely(first == NULL || last == NULL || distance_items == NULL))
    return MDBX_EINVAL;

  *distance_items = 0;
  struct diff_result dr;
  int rc = cursor_diff(last, first, &dr);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* NOTE(review): this mask mixes the cursor-state bit C_INITIALIZED into a
   * test over db md_flags; looks suspicious (md_flags should only carry
   * MDBX_DUPSORT etc.) — verify against upstream before touching. */
  if (unlikely(dr.diff == 0) &&
      F_ISSET(first->mc_db->md_flags & last->mc_db->md_flags,
              MDBX_DUPSORT | C_INITIALIZED)) {
    /* Same outer position on a dupsort db: measure within the duplicates. */
    first = &first->mc_xcursor->mx_cursor;
    last = &last->mc_xcursor->mx_cursor;
    rc = cursor_diff(first, last, &dr);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }

  if (likely(dr.diff != 0))
    *distance_items = estimate(first->mc_db, &dr);

  return MDBX_SUCCESS;
}
24502 
/* Estimates how many entries a cursor would traverse if `move_op` were
 * applied with the given key/data. The caller's cursor is NOT moved: the
 * movement is performed on an on-stack copy, then the distance between the
 * original and the moved copy is estimated. */
int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data,
                       MDBX_cursor_op move_op, ptrdiff_t *distance_items) {
  /* MDBX_GET_CURRENT/MDBX_GET_MULTIPLE don't move the cursor — meaningless
   * here. */
  if (unlikely(cursor == NULL || distance_items == NULL ||
               move_op == MDBX_GET_CURRENT || move_op == MDBX_GET_MULTIPLE))
    return MDBX_EINVAL;

  if (unlikely(cursor->mc_signature != MDBX_MC_LIVE))
    return (cursor->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL
                                                         : MDBX_EBADSIGN;

  int rc = check_txn(cursor->mc_txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (!(cursor->mc_flags & C_INITIALIZED))
    return MDBX_ENODATA;

  /* Clone the cursor (and, for dupsort, its nested cursor) onto the stack. */
  MDBX_cursor_couple next;
  cursor_copy(cursor, &next.outer);
  if (cursor->mc_db->md_flags & MDBX_DUPSORT) {
    next.outer.mc_xcursor = &next.inner;
    rc = mdbx_xcursor_init0(&next.outer);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
    /* assumes `cursor` is the outer member of an MDBX_cursor_couple —
     * TODO confirm this holds for all callers */
    MDBX_xcursor *mx = &container_of(cursor, MDBX_cursor_couple, outer)->inner;
    cursor_copy(&mx->mx_cursor, &next.inner.mx_cursor);
  }

  /* Substitute an empty stub for absent key/data, rejecting operations that
   * require the corresponding argument. */
  MDBX_val stub = {0, 0};
  if (data == NULL) {
    const unsigned mask =
        1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | 1 << MDBX_SET_KEY;
    if (unlikely(mask & (1 << move_op)))
      return MDBX_EINVAL;
    data = &stub;
  }

  if (key == NULL) {
    const unsigned mask = 1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE |
                          1 << MDBX_SET_KEY | 1 << MDBX_SET |
                          1 << MDBX_SET_RANGE;
    if (unlikely(mask & (1 << move_op)))
      return MDBX_EINVAL;
    key = &stub;
  }

  /* Mark the stack copy live so the cursor API accepts it, then move it. */
  next.outer.mc_signature = MDBX_MC_LIVE;
  rc = mdbx_cursor_get(&next.outer, key, data, move_op);
  /* MDBX_NOTFOUND with an initialized cursor still yields a usable
   * position (e.g. EOF) for the distance estimation below. */
  if (unlikely(rc != MDBX_SUCCESS &&
               (rc != MDBX_NOTFOUND || !(next.outer.mc_flags & C_INITIALIZED))))
    return rc;

  return mdbx_estimate_distance(cursor, &next.outer, distance_items);
}
24557 
/* Estimates the number of entries within the range [begin, end) of a dbi.
 * NULL begin/end keys mean FIRST/LAST respectively; the special pointer
 * MDBX_EPSILON on one side requests the duplicate-count of the single key
 * given on the other side. The result may be negative for inverted ranges
 * (see the disabled block at the bottom). */
int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key,
                        MDBX_val *begin_data, MDBX_val *end_key,
                        MDBX_val *end_data, ptrdiff_t *size_items) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!size_items))
    return MDBX_EINVAL;

  /* Data bounds are only meaningful together with a real key on that side. */
  if (unlikely(begin_data && (begin_key == NULL || begin_key == MDBX_EPSILON)))
    return MDBX_EINVAL;

  if (unlikely(end_data && (end_key == NULL || end_key == MDBX_EPSILON)))
    return MDBX_EINVAL;

  /* EPSILON on both sides is meaningless. */
  if (unlikely(begin_key == MDBX_EPSILON && end_key == MDBX_EPSILON))
    return MDBX_EINVAL;

  if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
    return MDBX_BAD_DBI;

  MDBX_cursor_couple begin;
  /* LY: first, initialize cursor to refresh a DB in case it have DB_STALE */
  rc = mdbx_cursor_init(&begin.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(begin.outer.mc_db->md_entries == 0)) {
    *size_items = 0;
    return MDBX_SUCCESS;
  }

  if (!begin_key) {
    if (unlikely(!end_key)) {
      /* LY: FIRST..LAST case */
      *size_items = (ptrdiff_t)begin.outer.mc_db->md_entries;
      return MDBX_SUCCESS;
    }
    MDBX_val stub = {0, 0};
    rc = mdbx_cursor_first(&begin.outer, &stub, &stub);
    if (unlikely(end_key == MDBX_EPSILON)) {
      /* LY: FIRST..+epsilon case */
      return (rc == MDBX_SUCCESS)
                 ? mdbx_cursor_count(&begin.outer, (size_t *)size_items)
                 : rc;
    }
  } else {
    if (unlikely(begin_key == MDBX_EPSILON)) {
      if (end_key == NULL) {
        /* LY: -epsilon..LAST case */
        MDBX_val stub = {0, 0};
        rc = mdbx_cursor_last(&begin.outer, &stub, &stub);
        return (rc == MDBX_SUCCESS)
                   ? mdbx_cursor_count(&begin.outer, (size_t *)size_items)
                   : rc;
      }
      /* LY: -epsilon..value case */
      assert(end_key != MDBX_EPSILON);
      begin_key = end_key;
    } else if (unlikely(end_key == MDBX_EPSILON)) {
      /* LY: value..+epsilon case */
      assert(begin_key != MDBX_EPSILON);
      end_key = begin_key;
    }
    if (end_key && !begin_data && !end_data &&
        (begin_key == end_key ||
         begin.outer.mc_dbx->md_cmp(begin_key, end_key) == 0)) {
      /* LY: single key case */
      rc = mdbx_cursor_set(&begin.outer, begin_key, NULL, MDBX_SET).err;
      if (unlikely(rc != MDBX_SUCCESS)) {
        *size_items = 0;
        return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc;
      }
      *size_items = 1;
      if (begin.outer.mc_xcursor != NULL) {
        MDBX_node *node = page_node(begin.outer.mc_pg[begin.outer.mc_top],
                                    begin.outer.mc_ki[begin.outer.mc_top]);
        if (F_ISSET(node_flags(node), F_DUPDATA)) {
          /* LY: return the number of duplicates for given key */
          mdbx_tassert(txn,
                       begin.outer.mc_xcursor == &begin.inner &&
                           (begin.inner.mx_cursor.mc_flags & C_INITIALIZED));
          /* Clamp to PTRDIFF_MAX when the count wouldn't fit the result
           * type on narrow targets. */
          *size_items =
              (sizeof(*size_items) >= sizeof(begin.inner.mx_db.md_entries) ||
               begin.inner.mx_db.md_entries <= PTRDIFF_MAX)
                  ? (size_t)begin.inner.mx_db.md_entries
                  : PTRDIFF_MAX;
        }
      }
      return MDBX_SUCCESS;
    } else {
      rc = mdbx_cursor_set(&begin.outer, begin_key, begin_data,
                           begin_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE)
               .err;
    }
  }

  /* MDBX_NOTFOUND with an initialized cursor is acceptable: the cursor then
   * marks the boundary (e.g. past-the-end) for distance estimation. */
  if (unlikely(rc != MDBX_SUCCESS)) {
    if (rc != MDBX_NOTFOUND || !(begin.outer.mc_flags & C_INITIALIZED))
      return rc;
  }

  /* Position a second cursor at the end bound (or LAST). */
  MDBX_cursor_couple end;
  rc = mdbx_cursor_init(&end.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  if (!end_key) {
    MDBX_val stub = {0, 0};
    rc = mdbx_cursor_last(&end.outer, &stub, &stub);
  } else {
    rc = mdbx_cursor_set(&end.outer, end_key, end_data,
                         end_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE)
             .err;
  }
  if (unlikely(rc != MDBX_SUCCESS)) {
    if (rc != MDBX_NOTFOUND || !(end.outer.mc_flags & C_INITIALIZED))
      return rc;
  }

  rc = mdbx_estimate_distance(&begin.outer, &end.outer, size_items);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  assert(*size_items >= -(ptrdiff_t)begin.outer.mc_db->md_entries &&
         *size_items <= (ptrdiff_t)begin.outer.mc_db->md_entries);

#if 0 /* LY: Was decided to returns as-is (i.e. negative) the estimation       \
       * results for an inverted ranges. */

  /* Commit 8ddfd1f34ad7cf7a3c4aa75d2e248ca7e639ed63
     Change-Id: If59eccf7311123ab6384c4b93f9b1fed5a0a10d1 */

  if (*size_items < 0) {
    /* LY: inverted range case */
    *size_items += (ptrdiff_t)begin.outer.mc_db->md_entries;
  } else if (*size_items == 0 && begin_key && end_key) {
    int cmp = begin.outer.mc_dbx->md_cmp(&origin_begin_key, &origin_end_key);
    if (cmp == 0 && (begin.inner.mx_cursor.mc_flags & C_INITIALIZED) &&
        begin_data && end_data)
      cmp = begin.outer.mc_dbx->md_dcmp(&origin_begin_data, &origin_end_data);
    if (cmp > 0) {
      /* LY: inverted range case with empty scope */
      *size_items = (ptrdiff_t)begin.outer.mc_db->md_entries;
    }
  }
  assert(*size_items >= 0 &&
         *size_items <= (ptrdiff_t)begin.outer.mc_db->md_entries);
#endif

  return MDBX_SUCCESS;
}
24709 
24710 //------------------------------------------------------------------------------
24711 
24712 /* Позволяет обновить или удалить существующую запись с получением
24713  * в old_data предыдущего значения данных. При этом если new_data равен
24714  * нулю, то выполняется удаление, иначе обновление/вставка.
24715  *
24716  * Текущее значение может находиться в уже измененной (грязной) странице.
24717  * В этом случае страница будет перезаписана при обновлении, а само старое
24718  * значение утрачено. Поэтому исходно в old_data должен быть передан
24719  * дополнительный буфер для копирования старого значения.
24720  * Если переданный буфер слишком мал, то функция вернет -1, установив
24721  * old_data->iov_len в соответствующее значение.
24722  *
24723  * Для не-уникальных ключей также возможен второй сценарий использования,
24724  * когда посредством old_data из записей с одинаковым ключом для
24725  * удаления/обновления выбирается конкретная. Для выбора этого сценария
24726  * во flags следует одновременно указать MDBX_CURRENT и MDBX_NOOVERWRITE.
24727  * Именно эта комбинация выбрана, так как она лишена смысла, и этим позволяет
24728  * идентифицировать запрос такого сценария.
24729  *
24730  * Функция может быть замещена соответствующими операциями с курсорами
24731  * после двух доработок (TODO):
24732  *  - внешняя аллокация курсоров, в том числе на стеке (без malloc).
24733  *  - получения dirty-статуса страницы по адресу (знать о MUTABLE/WRITEABLE).
24734  */
24735 
mdbx_replace_ex(MDBX_txn * txn,MDBX_dbi dbi,const MDBX_val * key,MDBX_val * new_data,MDBX_val * old_data,MDBX_put_flags_t flags,MDBX_preserve_func preserver,void * preserver_context)24736 int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
24737                     MDBX_val *new_data, MDBX_val *old_data,
24738                     MDBX_put_flags_t flags, MDBX_preserve_func preserver,
24739                     void *preserver_context) {
24740   int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
24741   if (unlikely(rc != MDBX_SUCCESS))
24742     return rc;
24743 
24744   if (unlikely(!key || !old_data || old_data == new_data))
24745     return MDBX_EINVAL;
24746 
24747   if (unlikely(old_data->iov_base == NULL && old_data->iov_len))
24748     return MDBX_EINVAL;
24749 
24750   if (unlikely(new_data == NULL &&
24751                (flags & (MDBX_CURRENT | MDBX_RESERVE)) != MDBX_CURRENT))
24752     return MDBX_EINVAL;
24753 
24754   if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
24755     return MDBX_BAD_DBI;
24756 
24757   if (unlikely(flags &
24758                ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS |
24759                  MDBX_RESERVE | MDBX_APPEND | MDBX_APPENDDUP | MDBX_CURRENT)))
24760     return MDBX_EINVAL;
24761 
24762   MDBX_cursor_couple cx;
24763   rc = mdbx_cursor_init(&cx.outer, txn, dbi);
24764   if (unlikely(rc != MDBX_SUCCESS))
24765     return rc;
24766   cx.outer.mc_next = txn->tw.cursors[dbi];
24767   txn->tw.cursors[dbi] = &cx.outer;
24768 
24769   MDBX_val present_key = *key;
24770   if (F_ISSET(flags, MDBX_CURRENT | MDBX_NOOVERWRITE)) {
24771     /* в old_data значение для выбора конкретного дубликата */
24772     if (unlikely(!(txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT))) {
24773       rc = MDBX_EINVAL;
24774       goto bailout;
24775     }
24776 
24777     /* убираем лишний бит, он был признаком запрошенного режима */
24778     flags -= MDBX_NOOVERWRITE;
24779 
24780     rc = mdbx_cursor_get(&cx.outer, &present_key, old_data, MDBX_GET_BOTH);
24781     if (rc != MDBX_SUCCESS)
24782       goto bailout;
24783   } else {
24784     /* в old_data буфер для сохранения предыдущего значения */
24785     if (unlikely(new_data && old_data->iov_base == new_data->iov_base))
24786       return MDBX_EINVAL;
24787     MDBX_val present_data;
24788     rc = mdbx_cursor_get(&cx.outer, &present_key, &present_data, MDBX_SET_KEY);
24789     if (unlikely(rc != MDBX_SUCCESS)) {
24790       old_data->iov_base = NULL;
24791       old_data->iov_len = 0;
24792       if (rc != MDBX_NOTFOUND || (flags & MDBX_CURRENT))
24793         goto bailout;
24794     } else if (flags & MDBX_NOOVERWRITE) {
24795       rc = MDBX_KEYEXIST;
24796       *old_data = present_data;
24797       goto bailout;
24798     } else {
24799       MDBX_page *page = cx.outer.mc_pg[cx.outer.mc_top];
24800       if (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) {
24801         if (flags & MDBX_CURRENT) {
24802           /* disallow update/delete for multi-values */
24803           MDBX_node *node = page_node(page, cx.outer.mc_ki[cx.outer.mc_top]);
24804           if (F_ISSET(node_flags(node), F_DUPDATA)) {
24805             mdbx_tassert(txn, XCURSOR_INITED(&cx.outer) &&
24806                                   cx.outer.mc_xcursor->mx_db.md_entries > 1);
24807             if (cx.outer.mc_xcursor->mx_db.md_entries > 1) {
24808               rc = MDBX_EMULTIVAL;
24809               goto bailout;
24810             }
24811           }
24812           /* В оригинальной LMDB флажок MDBX_CURRENT здесь приведет
24813            * к замене данных без учета MDBX_DUPSORT сортировки,
24814            * но здесь это в любом случае допустимо, так как мы
24815            * проверили что для ключа есть только одно значение. */
24816         }
24817       }
24818 
24819       if (IS_MODIFIABLE(txn, page)) {
24820         if (new_data && cmp_lenfast(&present_data, new_data) == 0) {
24821           /* если данные совпадают, то ничего делать не надо */
24822           *old_data = *new_data;
24823           goto bailout;
24824         }
24825         rc = preserver ? preserver(preserver_context, old_data,
24826                                    present_data.iov_base, present_data.iov_len)
24827                        : MDBX_SUCCESS;
24828         if (unlikely(rc != MDBX_SUCCESS))
24829           goto bailout;
24830       } else {
24831         *old_data = present_data;
24832       }
24833       flags |= MDBX_CURRENT;
24834     }
24835   }
24836 
24837   if (likely(new_data))
24838     rc = mdbx_cursor_put(&cx.outer, key, new_data, flags);
24839   else
24840     rc = mdbx_cursor_del(&cx.outer, flags & MDBX_ALLDUPS);
24841 
24842 bailout:
24843   txn->tw.cursors[dbi] = cx.outer.mc_next;
24844   return rc;
24845 }
24846 
/* Default MDBX_preserve_func: copies the `bytes` of `src` into the buffer
 * described by `target`. When the buffer is too small, stores the required
 * size in target->iov_len, NULLs the base, and returns MDBX_RESULT_TRUE. */
static int default_value_preserver(void *context, MDBX_val *target,
                                   const void *src, size_t bytes) {
  (void)context;
  if (target->iov_len >= bytes) {
    target->iov_len = bytes;
    memcpy(target->iov_base, src, bytes);
    return MDBX_SUCCESS;
  }
  /* caller's buffer is too small: report the needed size */
  target->iov_base = nullptr;
  target->iov_len = bytes;
  return MDBX_RESULT_TRUE;
}
24858 
/* Convenience wrapper over mdbx_replace_ex() that uses the default
 * preserver, i.e. the previous value is copied into the caller-supplied
 * old_data buffer (see default_value_preserver for the too-small case). */
int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
                 MDBX_val *new_data, MDBX_val *old_data,
                 MDBX_put_flags_t flags) {
  return mdbx_replace_ex(txn, dbi, key, new_data, old_data, flags,
                         default_value_preserver, nullptr);
}
24865 
/* Reports whether the given address lies within a "dirty" page of the given
 * write transaction. Ultimately this allows avoiding needless copying of
 * data from NON-dirty pages.
 *
 * "Dirty" pages are those already modified during the write transaction.
 * Accordingly, any further changes may overwrite such pages. Therefore all
 * modifying functions must NOT receive pointers to data in such pages as
 * arguments. In turn, "non-dirty" pages will be copied (CoW) before being
 * modified.
 *
 * In other words, data from dirty pages must either be copied before being
 * passed as arguments for further modifications, or rejected at the
 * argument-validation stage.
 *
 * Thus, the function makes it possible both to avoid needless copying and
 * to perform a more complete validation of arguments.
 *
 * IMPORTANT: the supplied pointer must point to the beginning of the data.
 * Only then is it guaranteed that the actual page header is physically
 * located within the same memory page, including for multi-page P_OVERFLOW
 * pages with long data. */
int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) {
  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  const MDBX_env *env = txn->mt_env;
  /* Byte offset of ptr within the database mmap (negative if outside). */
  const ptrdiff_t offset = (uint8_t *)ptr - env->me_map;
  if (offset >= 0) {
    const pgno_t pgno = bytes2pgno(env, offset);
    if (likely(pgno < txn->mt_next_pgno)) {
      const MDBX_page *page = pgno2page(env, pgno);
      if (unlikely(page->mp_pgno != pgno ||
                   (page->mp_flags & P_ILL_BITS) != 0)) {
        /* The ptr pointed into middle of a large page,
         * not to the beginning of a data. */
        return MDBX_EINVAL;
      }
      return ((txn->mt_flags & MDBX_TXN_RDONLY) || !IS_MODIFIABLE(txn, page))
                 ? MDBX_RESULT_FALSE
                 : MDBX_RESULT_TRUE;
    }
    if ((size_t)offset < env->me_dxb_mmap.limit) {
      /* The pointer addresses something within the mmap, but beyond the
       * allocated pages. This can happen when mdbx_is_dirty() is called
       * after an operation during which a dirty page was returned to the
       * unallocated space. */
      return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EINVAL : MDBX_RESULT_TRUE;
    }
  }

  /* The page is outside the used mmap range, i.e. either an invalid address
   * was passed, or the address is within a shadow page that was allocated
   * via malloc().
   *
   * For MDBX_WRITE_MAP mode the page is definitely "not dirty", while for
   * modes without MDBX_WRITE_MAP it is definitely "not clean". */
  return (txn->mt_flags & (MDBX_WRITEMAP | MDBX_TXN_RDONLY)) ? MDBX_EINVAL
                                                             : MDBX_RESULT_TRUE;
}
24927 
/* Reads and optionally advances the persistent sequence counter of a dbi.
 * The pre-increment value is stored into *result (if non-NULL). A non-zero
 * increment requires a write transaction; on uint64 overflow the sequence
 * is left untouched and MDBX_RESULT_TRUE is returned. */
int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result,
                      uint64_t increment) {
  int err = check_txn(txn, MDBX_TXN_BLOCKED);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
    return MDBX_BAD_DBI;

  /* Refresh the dbi record if it is stale relative to this txn. */
  if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) {
    err = mdbx_fetch_sdb(txn, dbi);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
  }

  MDBX_db *const db = &txn->mt_dbs[dbi];
  if (likely(result))
    *result = db->md_seq;

  if (increment == 0)
    return MDBX_SUCCESS;

  if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY))
    return MDBX_EACCESS;

  const uint64_t bumped = db->md_seq + increment;
  if (unlikely(bumped < increment))
    return MDBX_RESULT_TRUE; /* uint64 wrap-around */

  mdbx_tassert(txn, bumped > db->md_seq);
  db->md_seq = bumped;
  txn->mt_flags |= MDBX_TXN_DIRTY;
  txn->mt_dbistate[dbi] |= DBI_DIRTY;
  return MDBX_SUCCESS;
}
24963 
24964 /*----------------------------------------------------------------------------*/
24965 
24966 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Legacy exported wrapper: forwards to the inline implementation that
 * returns the minimal supported database page size. */
__cold MDBX_NOTHROW_CONST_FUNCTION intptr_t mdbx_limits_pgsize_min(void) {
  return __inline_mdbx_limits_pgsize_min();
}
24970 
/* Legacy exported wrapper: forwards to the inline implementation that
 * returns the maximal supported database page size. */
__cold MDBX_NOTHROW_CONST_FUNCTION intptr_t mdbx_limits_pgsize_max(void) {
  return __inline_mdbx_limits_pgsize_max();
}
24974 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
24975 
mdbx_limits_dbsize_min(intptr_t pagesize)24976 __cold intptr_t mdbx_limits_dbsize_min(intptr_t pagesize) {
24977   if (pagesize < 1)
24978     pagesize = (intptr_t)mdbx_default_pagesize();
24979   else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE ||
24980                     pagesize > (intptr_t)MAX_PAGESIZE ||
24981                     !is_powerof2((size_t)pagesize)))
24982     return -1;
24983 
24984   return MIN_PAGENO * pagesize;
24985 }
24986 
mdbx_limits_dbsize_max(intptr_t pagesize)24987 __cold intptr_t mdbx_limits_dbsize_max(intptr_t pagesize) {
24988   if (pagesize < 1)
24989     pagesize = (intptr_t)mdbx_default_pagesize();
24990   else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE ||
24991                     pagesize > (intptr_t)MAX_PAGESIZE ||
24992                     !is_powerof2((size_t)pagesize)))
24993     return -1;
24994 
24995   STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX);
24996   const uint64_t limit = (1 + (uint64_t)MAX_PAGENO) * pagesize;
24997   return (limit < (intptr_t)MAX_MAPSIZE) ? (intptr_t)limit
24998                                          : (intptr_t)MAX_MAPSIZE;
24999 }
25000 
mdbx_limits_txnsize_max(intptr_t pagesize)25001 __cold intptr_t mdbx_limits_txnsize_max(intptr_t pagesize) {
25002   if (pagesize < 1)
25003     pagesize = (intptr_t)mdbx_default_pagesize();
25004   else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE ||
25005                     pagesize > (intptr_t)MAX_PAGESIZE ||
25006                     !is_powerof2((size_t)pagesize)))
25007     return -1;
25008 
25009   STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX);
25010   const uint64_t pgl_limit =
25011       pagesize * (uint64_t)(MDBX_PGL_LIMIT / 1.6180339887498948482);
25012   const uint64_t map_limit = (uint64_t)(MAX_MAPSIZE / 1.6180339887498948482);
25013   return (pgl_limit < map_limit) ? (intptr_t)pgl_limit : (intptr_t)map_limit;
25014 }
25015 
25016 /*** Key-making functions to avoid custom comparators *************************/
25017 
/* Inverse of double2key(): restore an IEEE-754 double from its
 * order-preserving integer key (the key is passed reinterpreted as a
 * signed value, so negative means the original double was >= 0). */
static __always_inline double key2double(const int64_t key) {
  union {
    uint64_t u;
    double f;
  } bits;
  if (key < 0)
    bits.u = (uint64_t)key + UINT64_C(0x8000000000000000);
  else
    bits.u = UINT64_C(0xffffFFFFffffFFFF) - (uint64_t)key;
  return bits.f;
}
25028 
/* Maps an IEEE-754 double to an uint64_t key such that unsigned integer
 * comparison of keys matches the numeric ordering of the doubles
 * (negatives included). Inverse of key2double().
 *
 * The bit pattern is read via memcpy() instead of the previous
 * `*(const int64_t *)ptr` cast: dereferencing a double through an
 * int64_t pointer violates strict aliasing (undefined behavior), while
 * memcpy compiles to the same single load. */
static __always_inline uint64_t double2key(const double *const ptr) {
  STATIC_ASSERT(sizeof(double) == sizeof(int64_t));
  int64_t i;
  memcpy(&i, ptr, sizeof(i));
  const uint64_t u = (i < 0) ? UINT64_C(0xffffFFFFffffFFFF) - i
                             : i + UINT64_C(0x8000000000000000);
  if (mdbx_assert_enabled()) {
    /* Verify the mapping round-trips bit-exactly. */
    const double f = key2double(u);
    assert(memcmp(&f, ptr, 8) == 0);
  }
  return u;
}
25040 
/* 32-bit counterpart of key2double(): decode an order-preserving key
 * back into the float whose bits it encodes. */
static __always_inline float key2float(const int32_t key) {
  union {
    uint32_t u;
    float f;
  } bits;
  if (key < 0)
    bits.u = (uint32_t)key + UINT32_C(0x80000000);
  else
    bits.u = UINT32_C(0xffffFFFF) - (uint32_t)key;
  return bits.f;
}
25051 
/* 32-bit counterpart of double2key(): maps a float to an uint32_t key
 * preserving numeric order under unsigned comparison. Inverse of
 * key2float().
 *
 * The bit pattern is read via memcpy() instead of the previous
 * `*(const int32_t *)ptr` cast, which violated strict aliasing
 * (undefined behavior); memcpy compiles to the same single load. */
static __always_inline uint32_t float2key(const float *const ptr) {
  STATIC_ASSERT(sizeof(float) == sizeof(int32_t));
  int32_t i;
  memcpy(&i, ptr, sizeof(i));
  const uint32_t u =
      (i < 0) ? UINT32_C(0xffffFFFF) - i : i + UINT32_C(0x80000000);
  if (mdbx_assert_enabled()) {
    /* Verify the mapping round-trips bit-exactly. */
    const float f = key2float(u);
    assert(memcmp(&f, ptr, 4) == 0);
  }
  return u;
}
25063 
/* Public API: converts a double into an order-preserving 8-byte key;
 * see double2key() for the mapping details. */
uint64_t mdbx_key_from_double(const double ieee754_64bit) {
  return double2key(&ieee754_64bit);
}
25067 
/* Public API: same as mdbx_key_from_double() but takes the value by
 * pointer (avoids an FPU round-trip for callers holding raw bytes). */
uint64_t mdbx_key_from_ptrdouble(const double *const ieee754_64bit) {
  return double2key(ieee754_64bit);
}
25071 
/* Public API: converts a float into an order-preserving 4-byte key;
 * see float2key() for the mapping details. */
uint32_t mdbx_key_from_float(const float ieee754_32bit) {
  return float2key(&ieee754_32bit);
}
25075 
/* Public API: same as mdbx_key_from_float() but takes the value by
 * pointer. */
uint32_t mdbx_key_from_ptrfloat(const float *const ieee754_32bit) {
  return float2key(ieee754_32bit);
}
25079 
25080 #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
/* Legacy exported wrapper: biases a signed 64-bit value so that unsigned
 * key comparison matches signed ordering (see the inline impl). */
MDBX_NOTHROW_CONST_FUNCTION uint64_t mdbx_key_from_int64(const int64_t i64) {
  return __inline_mdbx_key_from_int64(i64);
}
25084 
/* Legacy exported wrapper: 32-bit counterpart of mdbx_key_from_int64(). */
MDBX_NOTHROW_CONST_FUNCTION uint32_t mdbx_key_from_int32(const int32_t i32) {
  return __inline_mdbx_key_from_int32(i32);
}
25088 #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
25089 
/* IEEE-754 double-precision layout constants used by the JSON-integer
 * key conversion below: 52-bit stored mantissa, 11-bit exponent with
 * bias 0x3FF, and the implicit leading mantissa bit. */
#define IEEE754_DOUBLE_MANTISSA_SIZE 52
#define IEEE754_DOUBLE_EXPONENTA_BIAS 0x3FF
#define IEEE754_DOUBLE_EXPONENTA_MAX 0x7FF
#define IEEE754_DOUBLE_IMPLICIT_LEAD UINT64_C(0x0010000000000000)
#define IEEE754_DOUBLE_MANTISSA_MASK UINT64_C(0x000FFFFFFFFFFFFF)
#define IEEE754_DOUBLE_MANTISSA_AMAX UINT64_C(0x001FFFFFFFFFFFFF)
25096 
/* Counts leading zero bits of a 64-bit value.
 * NOTE: the result is undefined for value == 0 (matching the semantics
 * of the compiler builtins); callers guarantee a non-zero argument. */
static __inline int clz64(uint64_t value) {
#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_clzl)
  /* GCC/Clang: pick the builtin whose operand width matches uint64_t. */
  if (sizeof(value) == sizeof(int))
    return __builtin_clz(value);
  if (sizeof(value) == sizeof(long))
    return __builtin_clzl(value);
#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 8) ||            \
    __has_builtin(__builtin_clzll)
  return __builtin_clzll(value);
#endif /* have(long long) && long long == uint64_t */
#endif /* GNU C */

#if defined(_MSC_VER)
  unsigned long index;
#if defined(_M_AMD64) || defined(_M_ARM64) || defined(_M_X64)
  _BitScanReverse64(&index, value);
  return 63 - index;
#else
  /* 32-bit MSVC: scan the high half first, then the low half. */
  if (value > UINT32_MAX) {
    _BitScanReverse(&index, (uint32_t)(value >> 32));
    return 31 - index; /* == 63 - (index + 32) */
  }
  _BitScanReverse(&index, (uint32_t)value);
  return 63 - index;
#endif
#endif /* MSVC */

  /* Portable fallback: smear the highest set bit rightward to form a
   * mask, then locate it with a de Bruijn multiply-and-lookup. */
  value |= value >> 1;
  value |= value >> 2;
  value |= value >> 4;
  value |= value >> 8;
  value |= value >> 16;
  value |= value >> 32;
  static const uint8_t debruijn_clz64[64] = {
      63, 16, 62, 7,  15, 36, 61, 3,  6,  14, 22, 26, 35, 47, 60, 2,
      9,  5,  28, 11, 13, 21, 42, 19, 25, 31, 34, 40, 46, 52, 59, 1,
      17, 8,  37, 4,  23, 27, 48, 10, 29, 12, 43, 20, 32, 41, 53, 18,
      38, 24, 49, 30, 44, 33, 54, 39, 50, 45, 55, 51, 56, 57, 58, 0};
  return debruijn_clz64[value * UINT64_C(0x03F79D71B4CB0A89) >> 58];
}
25137 
/* Shifts u64 right by -shift bits, rounding to nearest with ties broken
 * toward an even result (IEEE-754 round-to-nearest-even). `shift` must
 * be negative and u64 non-zero. */
static __inline uint64_t round_mantissa(const uint64_t u64, int shift) {
  assert(shift < 0 && u64 > 0);
  const unsigned n = (unsigned)-shift;
  const uint64_t half = UINT64_C(1) << (n - 1);
  const uint64_t keep_lsb = (u64 >> n) & 1;
  /* Add one-half, minus one ulp when the retained lsb is even, so that
   * exact halves round toward even. */
  return (u64 + half - (1 ^ keep_lsb)) >> n;
}
25146 
mdbx_key_from_jsonInteger(const int64_t json_integer)25147 uint64_t mdbx_key_from_jsonInteger(const int64_t json_integer) {
25148   const uint64_t bias = UINT64_C(0x8000000000000000);
25149   if (json_integer > 0) {
25150     const uint64_t u64 = json_integer;
25151     int shift = clz64(u64) - (64 - IEEE754_DOUBLE_MANTISSA_SIZE - 1);
25152     uint64_t mantissa = u64 << shift;
25153     if (unlikely(shift < 0)) {
25154       mantissa = round_mantissa(u64, shift);
25155       if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX)
25156         mantissa = round_mantissa(u64, --shift);
25157     }
25158 
25159     assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD &&
25160            mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX);
25161     const uint64_t exponent =
25162         IEEE754_DOUBLE_EXPONENTA_BIAS + IEEE754_DOUBLE_MANTISSA_SIZE - shift;
25163     assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX);
25164     const uint64_t key = bias + (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) +
25165                          (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD);
25166 #if !defined(_MSC_VER) ||                                                      \
25167     defined(                                                                   \
25168         _DEBUG) /* Workaround for MSVC error LNK2019: unresolved external      \
25169                    symbol __except1 referenced in function __ftol3_except */
25170     assert(key == mdbx_key_from_double((double)json_integer));
25171 #endif /* Workaround for MSVC */
25172     return key;
25173   }
25174 
25175   if (json_integer < 0) {
25176     const uint64_t u64 = -json_integer;
25177     int shift = clz64(u64) - (64 - IEEE754_DOUBLE_MANTISSA_SIZE - 1);
25178     uint64_t mantissa = u64 << shift;
25179     if (unlikely(shift < 0)) {
25180       mantissa = round_mantissa(u64, shift);
25181       if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX)
25182         mantissa = round_mantissa(u64, --shift);
25183     }
25184 
25185     assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD &&
25186            mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX);
25187     const uint64_t exponent =
25188         IEEE754_DOUBLE_EXPONENTA_BIAS + IEEE754_DOUBLE_MANTISSA_SIZE - shift;
25189     assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX);
25190     const uint64_t key = bias - 1 - (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) -
25191                          (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD);
25192 #if !defined(_MSC_VER) ||                                                      \
25193     defined(                                                                   \
25194         _DEBUG) /* Workaround for MSVC error LNK2019: unresolved external      \
25195                    symbol __except1 referenced in function __ftol3_except */
25196     assert(key == mdbx_key_from_double((double)json_integer));
25197 #endif /* Workaround for MSVC */
25198     return key;
25199   }
25200 
25201   return bias;
25202 }
25203 
/* Inverse of mdbx_key_from_jsonInteger(): decodes an 8-byte key back to
 * the nearest integer. Magnitudes beyond the int64 range saturate to
 * INT64_MIN/INT64_MAX; magnitudes below one decode to 0. */
int64_t mdbx_jsonInteger_from_key(const MDBX_val v) {
  assert(v.iov_len == 8);
  const uint64_t key = unaligned_peek_u64(2, v.iov_base);
  const uint64_t bias = UINT64_C(0x8000000000000000);
  /* Undo the sign folding: distance from the bias, mirrored for keys of
   * negative values. */
  const uint64_t covalent = (key > bias) ? key - bias : bias - key - 1;
  /* 63 minus the unbiased binary exponent of the encoded double. */
  const int shift = IEEE754_DOUBLE_EXPONENTA_BIAS + 63 -
                    (IEEE754_DOUBLE_EXPONENTA_MAX &
                     (int)(covalent >> IEEE754_DOUBLE_MANTISSA_SIZE));
  if (unlikely(shift < 1))
    return (key < bias) ? INT64_MIN : INT64_MAX; /* out of int64 range */
  if (unlikely(shift > 63))
    return 0; /* |value| < 1 */

  /* Re-attach the implicit leading bit (it lands on the top bit, which
   * equals `bias`) and scale the mantissa down to an integer. */
  const uint64_t unscaled = ((covalent & IEEE754_DOUBLE_MANTISSA_MASK)
                             << (63 - IEEE754_DOUBLE_MANTISSA_SIZE)) +
                            bias;
  const int64_t absolute = unscaled >> shift;
  const int64_t value = (key < bias) ? -absolute : absolute;
  /* The round-trip must reproduce the key, or at least bracket it when
   * the key carries fractional precision an integer cannot. */
  assert(key == mdbx_key_from_jsonInteger(value) ||
         (mdbx_key_from_jsonInteger(value - 1) < key &&
          key < mdbx_key_from_jsonInteger(value + 1)));
  return value;
}
25227 
/* Decodes an 8-byte order-preserving key back into the double it
 * encodes (inverse of mdbx_key_from_double()). */
double mdbx_double_from_key(const MDBX_val v) {
  assert(v.iov_len == 8);
  return key2double(unaligned_peek_u64(2, v.iov_base));
}
25232 
/* Decodes a 4-byte order-preserving key back into the float it encodes
 * (inverse of mdbx_key_from_float()). */
float mdbx_float_from_key(const MDBX_val v) {
  assert(v.iov_len == 4);
  return key2float(unaligned_peek_u32(2, v.iov_base));
}
25237 
/* Decodes a 4-byte key produced by mdbx_key_from_int32(): removes the
 * sign-flipping bias to recover the original signed value. */
int32_t mdbx_int32_from_key(const MDBX_val v) {
  assert(v.iov_len == 4);
  return (int32_t)(unaligned_peek_u32(2, v.iov_base) - UINT32_C(0x80000000));
}
25242 
/* Decodes an 8-byte key produced by mdbx_key_from_int64(): removes the
 * sign-flipping bias to recover the original signed value. */
int64_t mdbx_int64_from_key(const MDBX_val v) {
  assert(v.iov_len == 8);
  return (int64_t)(unaligned_peek_u64(2, v.iov_base) -
                   UINT64_C(0x8000000000000000));
}
25248 
/* Returns the built-in key comparator that the given database flags
 * would select (e.g. MDBX_REVERSEKEY, MDBX_INTEGERKEY). */
__cold MDBX_cmp_func *mdbx_get_keycmp(unsigned flags) {
  return get_default_keycmp(flags);
}
25252 
/* Returns the built-in data comparator that the given database flags
 * would select (e.g. MDBX_DUPSORT variants). */
__cold MDBX_cmp_func *mdbx_get_datacmp(unsigned flags) {
  return get_default_datacmp(flags);
}
25256 
/* Sets a runtime option of the environment.
 * Some options are only meaningful on an open env (MDBX_ENV_ACTIVE),
 * others only before mapping; a few require taking the write-txn lock
 * to mutate shared state safely. Returns MDBX_SUCCESS, an error code,
 * or a non-error MDBX_RESULT_* value propagated from a sync poll. */
__cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option,
                               const uint64_t value) {
  int err = check_env(env, false);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  /* The lock is needed only when another thread may own the write txn;
   * cases that mutate shared option state set should_unlock. */
  const bool lock_needed = ((env->me_flags & MDBX_ENV_ACTIVE) && env->me_txn0 &&
                            env->me_txn0->mt_owner != mdbx_thread_self());
  bool should_unlock = false;
  switch (option) {
  case MDBX_opt_sync_bytes:
    if (unlikely(env->me_flags & MDBX_RDONLY))
      return MDBX_EACCESS;
    if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE)))
      return MDBX_EPERM;
    /* On 32-bit hosts reject values that do not fit into size_t. */
    if (sizeof(value) > sizeof(size_t) && unlikely(value != (size_t)value))
      return MDBX_TOO_LARGE;
    /* Threshold is stored in whole pages (rounded up). If the previous
     * threshold was non-zero, poll a sync immediately; non-error
     * MDBX_RESULT_* outcomes are kept in `err` and returned at the end. */
    if (atomic_store32(&env->me_lck->mti_autosync_threshold,
                       bytes2pgno(env, (size_t)value + env->me_psize - 1),
                       mo_Relaxed) != 0 &&
        (env->me_flags & MDBX_ENV_ACTIVE)) {
      err = mdbx_env_sync_poll(env);
      if (unlikely(MDBX_IS_ERROR(err)))
        return err;
    }
    break;

  case MDBX_opt_sync_period:
    if (unlikely(env->me_flags & MDBX_RDONLY))
      return MDBX_EACCESS;
    if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE)))
      return MDBX_EPERM;
    if (unlikely(value > UINT32_MAX))
      return MDBX_TOO_LARGE;
    /* Period arrives in 16.16 fixed-point seconds, stored as monotime. */
    if (atomic_store64(&env->me_lck->mti_autosync_period,
                       mdbx_osal_16dot16_to_monotime((uint32_t)value),
                       mo_Relaxed) != 0 &&
        (env->me_flags & MDBX_ENV_ACTIVE)) {
      err = mdbx_env_sync_poll(env);
      if (unlikely(MDBX_IS_ERROR(err)))
        return err;
    }
    break;

  case MDBX_opt_max_db:
    /* Only before the env is mapped/opened. */
    if (unlikely(value > MDBX_MAX_DBI))
      return MDBX_EINVAL;
    if (unlikely(env->me_map))
      return MDBX_EPERM;
    env->me_maxdbs = (unsigned)value + CORE_DBS;
    break;

  case MDBX_opt_max_readers:
    /* Only before the env is mapped/opened. */
    if (unlikely(value < 1 || value > MDBX_READERS_LIMIT))
      return MDBX_EINVAL;
    if (unlikely(env->me_map))
      return MDBX_EPERM;
    env->me_maxreaders = (unsigned)value;
    break;

  case MDBX_opt_dp_reserve_limit:
    if (unlikely(value > INT_MAX))
      return MDBX_EINVAL;
    if (env->me_options.dp_reserve_limit != (unsigned)value) {
      if (lock_needed) {
        err = mdbx_txn_lock(env, false);
        if (unlikely(err != MDBX_SUCCESS))
          return err;
        should_unlock = true;
      }
      env->me_options.dp_reserve_limit = (unsigned)value;
      /* Shrink the reserve list down to the new limit, releasing the
       * surplus pre-allocated dirty pages. */
      while (env->me_dp_reserve_len > env->me_options.dp_reserve_limit) {
        mdbx_assert(env, env->me_dp_reserve != NULL);
        MDBX_page *dp = env->me_dp_reserve;
        MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize);
        VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
        env->me_dp_reserve = dp->mp_next;
        VALGRIND_MEMPOOL_FREE(env, dp);
        mdbx_free(dp);
        env->me_dp_reserve_len -= 1;
      }
    }
    break;

  case MDBX_opt_rp_augment_limit:
    if (unlikely(value > MDBX_PGL_LIMIT))
      return MDBX_EINVAL;
    env->me_options.rp_augment_limit = (unsigned)value;
    break;

  case MDBX_opt_txn_dp_limit:
  case MDBX_opt_txn_dp_initial:
    if (unlikely(value > MDBX_PGL_LIMIT || value < CURSOR_STACK * 4))
      return MDBX_EINVAL;
    if (unlikely(env->me_flags & MDBX_RDONLY))
      return MDBX_EACCESS;
    if (lock_needed) {
      err = mdbx_txn_lock(env, false);
      if (unlikely(err != MDBX_SUCCESS))
        return err;
      should_unlock = true;
    }
    if (env->me_txn)
      err = MDBX_EPERM /* unable change during transaction */;
    else {
      /* Keep dp_initial <= dp_limit invariant in both directions. */
      const pgno_t value32 = (pgno_t)value;
      if (option == MDBX_opt_txn_dp_initial &&
          env->me_options.dp_initial != value32) {
        env->me_options.dp_initial = value32;
        if (env->me_options.dp_limit < value32) {
          env->me_options.dp_limit = value32;
          env->me_options.flags.non_auto.dp_limit = 1;
        }
      }
      if (option == MDBX_opt_txn_dp_limit &&
          env->me_options.dp_limit != value32) {
        env->me_options.dp_limit = value32;
        env->me_options.flags.non_auto.dp_limit = 1;
        if (env->me_options.dp_initial > value32)
          env->me_options.dp_initial = value32;
      }
    }
    break;

  case MDBX_opt_spill_max_denominator:
    if (unlikely(value > 255))
      return MDBX_EINVAL;
    env->me_options.spill_max_denominator = (uint8_t)value;
    break;
  case MDBX_opt_spill_min_denominator:
    if (unlikely(value > 255))
      return MDBX_EINVAL;
    env->me_options.spill_min_denominator = (uint8_t)value;
    break;
  case MDBX_opt_spill_parent4child_denominator:
    if (unlikely(value > 255))
      return MDBX_EINVAL;
    env->me_options.spill_parent4child_denominator = (uint8_t)value;
    break;

  case MDBX_opt_loose_limit:
    if (unlikely(value > 255))
      return MDBX_EINVAL;
    env->me_options.dp_loose_limit = (uint8_t)value;
    break;

  case MDBX_opt_merge_threshold_16dot16_percent:
    /* Accepted range is 12.5%..50% in 16.16 fixed-point. */
    if (unlikely(value < 8192 || value > 32768))
      return MDBX_EINVAL;
    env->me_options.merge_threshold_16dot16_percent = (unsigned)value;
    recalculate_merge_threshold(env);
    break;

  default:
    return MDBX_EINVAL;
  }

  if (should_unlock)
    mdbx_txn_unlock(env);
  return err;
}
25418 
mdbx_env_get_option(const MDBX_env * env,const MDBX_option_t option,uint64_t * pvalue)25419 __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option,
25420                                uint64_t *pvalue) {
25421   int err = check_env(env, false);
25422   if (unlikely(err != MDBX_SUCCESS))
25423     return err;
25424   if (unlikely(!pvalue))
25425     return MDBX_EINVAL;
25426 
25427   switch (option) {
25428   case MDBX_opt_sync_bytes:
25429     if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE)))
25430       return MDBX_EPERM;
25431     *pvalue = pgno2bytes(
25432         env, atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed));
25433     break;
25434 
25435   case MDBX_opt_sync_period:
25436     if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE)))
25437       return MDBX_EPERM;
25438     *pvalue = mdbx_osal_monotime_to_16dot16(
25439         atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed));
25440     break;
25441 
25442   case MDBX_opt_max_db:
25443     *pvalue = env->me_maxdbs - CORE_DBS;
25444     break;
25445 
25446   case MDBX_opt_max_readers:
25447     *pvalue = env->me_maxreaders;
25448     break;
25449 
25450   case MDBX_opt_dp_reserve_limit:
25451     *pvalue = env->me_options.dp_reserve_limit;
25452     break;
25453 
25454   case MDBX_opt_rp_augment_limit:
25455     *pvalue = env->me_options.rp_augment_limit;
25456     break;
25457 
25458   case MDBX_opt_txn_dp_limit:
25459     *pvalue = env->me_options.dp_limit;
25460     break;
25461   case MDBX_opt_txn_dp_initial:
25462     *pvalue = env->me_options.dp_initial;
25463     break;
25464 
25465   case MDBX_opt_spill_max_denominator:
25466     *pvalue = env->me_options.spill_max_denominator;
25467     break;
25468   case MDBX_opt_spill_min_denominator:
25469     *pvalue = env->me_options.spill_min_denominator;
25470     break;
25471   case MDBX_opt_spill_parent4child_denominator:
25472     *pvalue = env->me_options.spill_parent4child_denominator;
25473     break;
25474 
25475   case MDBX_opt_loose_limit:
25476     *pvalue = env->me_options.dp_loose_limit;
25477     break;
25478 
25479   case MDBX_opt_merge_threshold_16dot16_percent:
25480     *pvalue = env->me_options.merge_threshold_16dot16_percent;
25481     break;
25482 
25483   default:
25484     return MDBX_EINVAL;
25485   }
25486 
25487   return MDBX_SUCCESS;
25488 }
25489 
25490 /*** Attribute support functions for Nexenta **********************************/
25491 #ifdef MDBX_NEXENTA_ATTRS
25492 
/* Splits a combined record into its leading attribute and the payload:
 * optionally copies the attribute to *attrptr, then advances *data past
 * it. Payload-less records end up with iov_base == NULL. */
static __inline int mdbx_attr_peek(MDBX_val *data, mdbx_attr_t *attrptr) {
  if (unlikely(data->iov_len < sizeof(mdbx_attr_t)))
    return MDBX_INCOMPATIBLE;

  mdbx_attr_t *const head = (mdbx_attr_t *)data->iov_base;
  if (likely(attrptr != NULL))
    *attrptr = *head;
  data->iov_len -= sizeof(mdbx_attr_t);
  data->iov_base = likely(data->iov_len > 0) ? (void *)(head + 1) : NULL;

  return MDBX_SUCCESS;
}
25505 
/* Fills a region reserved by mdbx_put()/mdbx_cursor_put(): writes the
 * attribute in front and either copies the payload behind it, or (for
 * MDBX_RESERVE) reports where the caller should write the payload. */
static __inline int mdbx_attr_poke(MDBX_val *reserved, MDBX_val *data,
                                   mdbx_attr_t attr, MDBX_put_flags_t flags) {
  mdbx_attr_t *const space = reserved->iov_base;
  if ((flags & MDBX_RESERVE) == 0) {
    *space = attr;
    if (likely(data != NULL))
      memcpy(space + 1, data->iov_base, data->iov_len);
  } else if (likely(data != NULL)) {
    /* Caller fills the payload itself: point it past the attribute. */
    data->iov_base = data->iov_len ? space + 1 : NULL;
  }

  return MDBX_SUCCESS;
}
25522 
/* Positions the cursor per `op`, then splits the found record into its
 * leading attribute (*attrptr) and the remaining payload (*data). */
int mdbx_cursor_get_attr(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
                         mdbx_attr_t *attrptr, MDBX_cursor_op op) {
  int rc = mdbx_cursor_get(mc, key, data, op);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  return mdbx_attr_peek(data, attrptr);
}
25531 
/* Looks up `key`, then splits the found record into its leading
 * attribute (*attrptr) and the remaining payload (*data). */
int mdbx_get_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data,
                  uint64_t *attrptr) {
  int rc = mdbx_get(txn, dbi, key, data);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  return mdbx_attr_peek(data, attrptr);
}
25540 
/* Stores a record with a leading attribute: reserves room for the
 * attribute plus payload in one put, then fills the reserved region
 * via mdbx_attr_poke(). */
int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data,
                  mdbx_attr_t attr, MDBX_put_flags_t flags) {
  MDBX_val reserve = {.iov_base = NULL,
                      .iov_len =
                          (data ? data->iov_len : 0) + sizeof(mdbx_attr_t)};
  const int rc = mdbx_put(txn, dbi, key, &reserve, flags | MDBX_RESERVE);
  return likely(rc == MDBX_SUCCESS)
             ? mdbx_attr_poke(&reserve, data, attr, flags)
             : rc;
}
25553 
/* Cursor counterpart of mdbx_put_attr(): reserves room for the
 * attribute plus payload, then fills it via mdbx_attr_poke(). */
int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data,
                         mdbx_attr_t attr, MDBX_put_flags_t flags) {
  MDBX_val reserve = {.iov_base = NULL,
                      .iov_len =
                          (data ? data->iov_len : 0) + sizeof(mdbx_attr_t)};
  const int rc = mdbx_cursor_put(cursor, key, &reserve, flags | MDBX_RESERVE);
  return likely(rc == MDBX_SUCCESS)
             ? mdbx_attr_poke(&reserve, data, attr, flags)
             : rc;
}
25566 
/* Sets (or inserts) the attribute of the record at `key`.
 * If the key is absent and `data` is given, a new record is created;
 * if both attribute and payload are unchanged, nothing is written.
 * The temporary cursor is linked into txn->tw.cursors around each write
 * so dirty-page tracking sees it. */
int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data,
                  mdbx_attr_t attr) {
  if (unlikely(!key || !txn))
    return MDBX_EINVAL;

  if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE))
    return MDBX_EBADSIGN;

  if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)))
    return MDBX_BAD_DBI;

  if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED)))
    return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN;

  MDBX_cursor_couple cx;
  MDBX_val old_data;
  int rc = mdbx_cursor_init(&cx.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  rc = mdbx_cursor_set(&cx.outer, key, &old_data, MDBX_SET, NULL);
  if (unlikely(rc != MDBX_SUCCESS)) {
    /* Key absent: insert a fresh record when a payload was provided. */
    if (rc == MDBX_NOTFOUND && data) {
      cx.outer.mc_next = txn->tw.cursors[dbi];
      txn->tw.cursors[dbi] = &cx.outer;
      rc = mdbx_cursor_put_attr(&cx.outer, key, data, attr, 0);
      txn->tw.cursors[dbi] = cx.outer.mc_next;
    }
    return rc;
  }

  /* Key present: compare the stored attribute and payload to skip
   * a no-op rewrite. */
  mdbx_attr_t old_attr = 0;
  rc = mdbx_attr_peek(&old_data, &old_attr);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (old_attr == attr && (!data || (data->iov_len == old_data.iov_len &&
                                     memcmp(data->iov_base, old_data.iov_base,
                                            old_data.iov_len) == 0)))
    return MDBX_SUCCESS;

  /* Rewrite in place; keep the old payload when none was supplied. */
  cx.outer.mc_next = txn->tw.cursors[dbi];
  txn->tw.cursors[dbi] = &cx.outer;
  rc = mdbx_cursor_put_attr(&cx.outer, key, data ? data : &old_data, attr,
                            MDBX_CURRENT);
  txn->tw.cursors[dbi] = cx.outer.mc_next;
  return rc;
}
25614 #endif /* MDBX_NEXENTA_ATTRS */
25615 
25616 /******************************************************************************/
25617 /* *INDENT-OFF* */
25618 /* clang-format off */
25619 
25620 __dll_export
25621 #ifdef __attribute_used__
25622     __attribute_used__
25623 #elif defined(__GNUC__) || __has_attribute(__used__)
25624     __attribute__((__used__))
25625 #endif
25626 #ifdef __attribute_externally_visible__
25627         __attribute_externally_visible__
25628 #elif (defined(__GNUC__) && !defined(__clang__)) ||                            \
25629     __has_attribute(__externally_visible__)
25630     __attribute__((__externally_visible__))
25631 #endif
25632     const struct MDBX_build_info mdbx_build = {
25633 #ifdef MDBX_BUILD_TIMESTAMP
25634     MDBX_BUILD_TIMESTAMP
25635 #else
25636     "\"" __DATE__ " " __TIME__ "\""
25637 #endif /* MDBX_BUILD_TIMESTAMP */
25638 
25639     ,
25640 #ifdef MDBX_BUILD_TARGET
25641     MDBX_BUILD_TARGET
25642 #else
25643   #if defined(__ANDROID_API__)
25644     "Android" MDBX_STRINGIFY(__ANDROID_API__)
25645   #elif defined(__linux__) || defined(__gnu_linux__)
25646     "Linux"
25647   #elif defined(EMSCRIPTEN) || defined(__EMSCRIPTEN__)
25648     "webassembly"
25649   #elif defined(__CYGWIN__)
25650     "CYGWIN"
25651   #elif defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) \
25652       || defined(__WINDOWS__)
25653     "Windows"
25654   #elif defined(__APPLE__)
25655     #if (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE) \
25656       || (defined(TARGET_IPHONE_SIMULATOR) && TARGET_IPHONE_SIMULATOR)
25657       "iOS"
25658     #else
25659       "MacOS"
25660     #endif
25661   #elif defined(__FreeBSD__)
25662     "FreeBSD"
25663   #elif defined(__DragonFly__)
25664     "DragonFlyBSD"
25665   #elif defined(__NetBSD__)
25666     "NetBSD"
25667   #elif defined(__OpenBSD__)
25668     "OpenBSD"
25669   #elif defined(__bsdi__)
25670     "UnixBSDI"
25671   #elif defined(__MACH__)
25672     "MACH"
25673   #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC))
25674     "HPUX"
25675   #elif defined(_AIX)
25676     "AIX"
25677   #elif defined(__sun) && defined(__SVR4)
25678     "Solaris"
25679   #elif defined(__BSD__) || defined(BSD)
25680     "UnixBSD"
25681   #elif defined(__unix__) || defined(UNIX) || defined(__unix) \
25682       || defined(__UNIX) || defined(__UNIX__)
25683     "UNIX"
25684   #elif defined(_POSIX_VERSION)
25685     "POSIX" MDBX_STRINGIFY(_POSIX_VERSION)
25686   #else
25687     "UnknownOS"
25688   #endif /* Target OS */
25689 
25690     "-"
25691 
25692   #if defined(__amd64__)
25693     "AMD64"
25694   #elif defined(__ia32__)
25695     "IA32"
25696   #elif defined(__e2k__) || defined(__elbrus__)
25697     "Elbrus"
25698   #elif defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
25699     "Alpha"
25700   #elif defined(__aarch64__) || defined(_M_ARM64)
25701     "ARM64"
25702   #elif defined(__arm__) || defined(__thumb__) || defined(__TARGET_ARCH_ARM) \
25703       || defined(__TARGET_ARCH_THUMB) || defined(_ARM) || defined(_M_ARM) \
25704       || defined(_M_ARMT) || defined(__arm)
25705     "ARM"
25706   #elif defined(__mips64) || defined(__mips64__) || (defined(__mips) && (__mips >= 64))
25707     "MIPS64"
25708   #elif defined(__mips__) || defined(__mips) || defined(_R4000) || defined(__MIPS__)
25709     "MIPS"
25710   #elif defined(__hppa64__) || defined(__HPPA64__) || defined(__hppa64)
25711     "PARISC64"
25712   #elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa)
25713     "PARISC"
25714   #elif defined(__ia64__) || defined(__ia64) || defined(_IA64) \
25715       || defined(__IA64__) || defined(_M_IA64) || defined(__itanium__)
25716     "Itanium"
25717   #elif defined(__powerpc64__) || defined(__ppc64__) || defined(__ppc64) \
25718       || defined(__powerpc64) || defined(_ARCH_PPC64)
25719     "PowerPC64"
25720   #elif defined(__powerpc__) || defined(__ppc__) || defined(__powerpc) \
25721       || defined(__ppc) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__POWERPC__)
25722     "PowerPC"
25723   #elif defined(__sparc64__) || defined(__sparc64)
25724     "SPARC64"
25725   #elif defined(__sparc__) || defined(__sparc)
25726     "SPARC"
25727   #elif defined(__s390__) || defined(__s390) || defined(__zarch__) || defined(__zarch)
25728     "S390"
25729   #else
25730     "UnknownARCH"
25731   #endif
25732 #endif /* MDBX_BUILD_TARGET */
25733 
25734 #ifdef MDBX_BUILD_TYPE
25735 # if defined(_MSC_VER)
25736 #   pragma message("Configuration-depended MDBX_BUILD_TYPE: " MDBX_BUILD_TYPE)
25737 # endif
25738     "-" MDBX_BUILD_TYPE
25739 #endif /* MDBX_BUILD_TYPE */
25740     ,
25741     "MDBX_DEBUG=" MDBX_STRINGIFY(MDBX_DEBUG)
25742     " MDBX_WORDBITS=" MDBX_STRINGIFY(MDBX_WORDBITS)
25743     " BYTE_ORDER="
25744 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
25745     "LITTLE_ENDIAN"
25746 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
25747     "BIG_ENDIAN"
25748 #else
25749     #error "FIXME: Unsupported byte order"
25750 #endif /* __BYTE_ORDER__ */
25751     " MDBX_ENV_CHECKPID=" MDBX_ENV_CHECKPID_CONFIG
25752     " MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG
25753     " MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG
25754     " MDBX_64BIT_CAS=" MDBX_64BIT_CAS_CONFIG
25755     " MDBX_TRUST_RTC=" MDBX_TRUST_RTC_CONFIG
25756     " MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND)
25757     " MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE)
25758 #if MDBX_DISABLE_PAGECHECKS
25759     " MDBX_DISABLE_PAGECHECKS=YES"
25760 #endif /* MDBX_DISABLE_PAGECHECKS */
25761 #ifdef __SANITIZE_ADDRESS__
25762     " SANITIZE_ADDRESS=YES"
25763 #endif /* __SANITIZE_ADDRESS__ */
25764 #ifdef MDBX_USE_VALGRIND
25765     " MDBX_USE_VALGRIND=YES"
25766 #endif /* MDBX_USE_VALGRIND */
25767 #if MDBX_FORCE_ASSERTIONS
25768     " MDBX_FORCE_ASSERTIONS=YES"
25769 #endif /* MDBX_FORCE_ASSERTIONS */
25770 #ifdef _GNU_SOURCE
25771     " _GNU_SOURCE=YES"
25772 #else
25773     " _GNU_SOURCE=NO"
25774 #endif /* _GNU_SOURCE */
25775 #ifdef __APPLE__
25776     " MDBX_OSX_SPEED_INSTEADOF_DURABILITY=" MDBX_STRINGIFY(MDBX_OSX_SPEED_INSTEADOF_DURABILITY)
25777 #endif /* MacOS */
25778 #if defined(_WIN32) || defined(_WIN64)
25779     " MDBX_WITHOUT_MSVC_CRT=" MDBX_STRINGIFY(MDBX_AVOID_CRT)
25780     " MDBX_BUILD_SHARED_LIBRARY=" MDBX_STRINGIFY(MDBX_BUILD_SHARED_LIBRARY)
25781 #if !MDBX_BUILD_SHARED_LIBRARY
25782     " MDBX_MANUAL_MODULE_HANDLER=" MDBX_STRINGIFY(MDBX_MANUAL_MODULE_HANDLER)
25783 #endif
25784     " WINVER=" MDBX_STRINGIFY(WINVER)
25785 #else /* Windows */
25786     " MDBX_LOCKING=" MDBX_LOCKING_CONFIG
25787     " MDBX_USE_OFDLOCKS=" MDBX_USE_OFDLOCKS_CONFIG
25788 #endif /* !Windows */
25789     " MDBX_CACHELINE_SIZE=" MDBX_STRINGIFY(MDBX_CACHELINE_SIZE)
25790     " MDBX_CPU_WRITEBACK_INCOHERENT=" MDBX_STRINGIFY(MDBX_CPU_WRITEBACK_INCOHERENT)
25791     " MDBX_MMAP_INCOHERENT_CPU_CACHE=" MDBX_STRINGIFY(MDBX_MMAP_INCOHERENT_CPU_CACHE)
25792     " MDBX_MMAP_INCOHERENT_FILE_WRITE=" MDBX_STRINGIFY(MDBX_MMAP_INCOHERENT_FILE_WRITE)
25793     " MDBX_UNALIGNED_OK=" MDBX_STRINGIFY(MDBX_UNALIGNED_OK)
25794     " MDBX_PNL_ASCENDING=" MDBX_STRINGIFY(MDBX_PNL_ASCENDING)
25795     ,
25796 #ifdef MDBX_BUILD_COMPILER
25797     MDBX_BUILD_COMPILER
25798 #else
25799   #ifdef __INTEL_COMPILER
25800     "Intel C/C++ " MDBX_STRINGIFY(__INTEL_COMPILER)
25801   #elif defined(__apple_build_version__)
25802     "Apple clang " MDBX_STRINGIFY(__apple_build_version__)
25803   #elif defined(__ibmxl__)
25804     "IBM clang C " MDBX_STRINGIFY(__ibmxl_version__) "." MDBX_STRINGIFY(__ibmxl_release__)
25805     "." MDBX_STRINGIFY(__ibmxl_modification__) "." MDBX_STRINGIFY(__ibmxl_ptf_fix_level__)
25806   #elif defined(__clang__)
25807     "clang " MDBX_STRINGIFY(__clang_version__)
25808   #elif defined(__MINGW64__)
25809     "MINGW-64 " MDBX_STRINGIFY(__MINGW64_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW64_MINOR_VERSION)
25810   #elif defined(__MINGW32__)
25811     "MINGW-32 " MDBX_STRINGIFY(__MINGW32_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW32_MINOR_VERSION)
25812   #elif defined(__IBMC__)
25813     "IBM C " MDBX_STRINGIFY(__IBMC__)
25814   #elif defined(__GNUC__)
25815     "GNU C/C++ "
25816     #ifdef __VERSION__
25817       __VERSION__
25818     #else
25819       MDBX_STRINGIFY(__GNUC__) "." MDBX_STRINGIFY(__GNUC_MINOR__) "." MDBX_STRINGIFY(__GNUC_PATCHLEVEL__)
25820     #endif
25821   #elif defined(_MSC_VER)
25822     "MSVC " MDBX_STRINGIFY(_MSC_FULL_VER) "-" MDBX_STRINGIFY(_MSC_BUILD)
25823   #else
25824     "Unknown compiler"
25825   #endif
25826 #endif /* MDBX_BUILD_COMPILER */
25827     ,
25828 #ifdef MDBX_BUILD_FLAGS_CONFIG
25829     MDBX_BUILD_FLAGS_CONFIG
25830 #endif /* MDBX_BUILD_FLAGS_CONFIG */
25831 #ifdef MDBX_BUILD_FLAGS
25832     MDBX_BUILD_FLAGS
25833 #endif /* MDBX_BUILD_FLAGS */
25834 #if !(defined(MDBX_BUILD_FLAGS_CONFIG) || defined(MDBX_BUILD_FLAGS))
25835     "undefined (please use correct build script)"
25836 #ifdef _MSC_VER
25837 #pragma message("warning: Build flags undefined. Please use correct build script")
25838 #else
25839 #warning "Build flags undefined. Please use correct build script"
25840 #endif // _MSC_VER
25841 #endif
25842 };
25843 
#ifdef __SANITIZE_ADDRESS__
/* Default AddressSanitizer runtime options for libmdbx builds.
 * Declared weak so that an application can provide its own
 * __asan_default_options() which will take precedence over this one. */
LIBMDBX_API __attribute__((__weak__)) const char *__asan_default_options() {
  return "symbolize=1:allow_addr2line=1:"
#if MDBX_DEBUG
         /* extra ASAN verbosity only for debug builds */
         "debug=1:"
         "verbosity=2:"
#endif /* MDBX_DEBUG */
         "log_threads=1:"
         "report_globals=1:"
         "replace_str=1:replace_intrin=1:"
         "malloc_context_size=9:"
         "detect_leaks=1:"
         "check_printf=1:"
         "detect_deadlocks=1:"
#ifndef LTO_ENABLED
         /* omitted when building with LTO (see the LTO_ENABLED guard) */
         "check_initialization_order=1:"
#endif
         "detect_stack_use_after_return=1:"
         "intercept_tls_get_addr=1:"
         "decorate_proc_maps=1:"
         "abort_on_error=1";
}
#endif /* __SANITIZE_ADDRESS__ */
25867 
25868 /* *INDENT-ON* */
25869 /* clang-format on */
25870 /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */
25871 
25872 /*
25873  * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>
25874  * and other libmdbx authors: please see AUTHORS file.
25875  * All rights reserved.
25876  *
25877  * Redistribution and use in source and binary forms, with or without
25878  * modification, are permitted only as authorized by the OpenLDAP
25879  * Public License.
25880  *
25881  * A copy of this license is available in the file LICENSE in the
25882  * top-level directory of the distribution or, alternatively, at
25883  * <http://www.OpenLDAP.org/license.html>.
25884  */
25885 
25886 
25887 #if defined(_WIN32) || defined(_WIN64)
25888 
25889 #include <winioctl.h>
25890 
waitstatus2errcode(DWORD result)25891 static int waitstatus2errcode(DWORD result) {
25892   switch (result) {
25893   case WAIT_OBJECT_0:
25894     return MDBX_SUCCESS;
25895   case WAIT_FAILED:
25896     return (int)GetLastError();
25897   case WAIT_ABANDONED:
25898     return ERROR_ABANDONED_WAIT_0;
25899   case WAIT_IO_COMPLETION:
25900     return ERROR_USER_APC;
25901   case WAIT_TIMEOUT:
25902     return ERROR_TIMEOUT;
25903   default:
25904     return ERROR_UNHANDLED_ERROR;
25905   }
25906 }
25907 
25908 /* Map a result from an NTAPI call to WIN32 error code. */
ntstatus2errcode(NTSTATUS status)25909 static int ntstatus2errcode(NTSTATUS status) {
25910   DWORD dummy;
25911   OVERLAPPED ov;
25912   memset(&ov, 0, sizeof(ov));
25913   ov.Internal = status;
25914   return GetOverlappedResult(NULL, &ov, &dummy, FALSE) ? MDBX_SUCCESS
25915                                                        : (int)GetLastError();
25916 }
25917 
/* We use native NT APIs to setup the memory map, so that we can
 * let the DB file grow incrementally instead of always preallocating
 * the full size. These APIs are defined in <wdm.h> and <ntifs.h>
 * but those headers are meant for driver-level development and
 * conflict with the regular user-level headers, so we explicitly
 * declare them here. Using these APIs also means we must link to
 * ntdll.dll, which is not linked by default in user code. */

extern NTSTATUS NTAPI NtCreateSection(
    OUT PHANDLE SectionHandle, IN ACCESS_MASK DesiredAccess,
    IN OPTIONAL POBJECT_ATTRIBUTES ObjectAttributes,
    IN OPTIONAL PLARGE_INTEGER MaximumSize, IN ULONG SectionPageProtection,
    IN ULONG AllocationAttributes, IN OPTIONAL HANDLE FileHandle);

/* Minimal mirror of the native section-information layout; only
 * SectionSize is meaningful to this unit, the first field's purpose is
 * undocumented (hence `Unknown`). */
typedef struct _SECTION_BASIC_INFORMATION {
  ULONG Unknown;
  ULONG SectionAttributes;
  LARGE_INTEGER SectionSize;
} SECTION_BASIC_INFORMATION, *PSECTION_BASIC_INFORMATION;

extern NTSTATUS NTAPI NtMapViewOfSection(
    IN HANDLE SectionHandle, IN HANDLE ProcessHandle, IN OUT PVOID *BaseAddress,
    IN ULONG_PTR ZeroBits, IN SIZE_T CommitSize,
    IN OUT OPTIONAL PLARGE_INTEGER SectionOffset, IN OUT PSIZE_T ViewSize,
    IN SECTION_INHERIT InheritDisposition, IN ULONG AllocationType,
    IN ULONG Win32Protect);

extern NTSTATUS NTAPI NtUnmapViewOfSection(IN HANDLE ProcessHandle,
                                           IN OPTIONAL PVOID BaseAddress);

extern NTSTATUS NTAPI NtClose(HANDLE Handle);

extern NTSTATUS NTAPI NtAllocateVirtualMemory(
    IN HANDLE ProcessHandle, IN OUT PVOID *BaseAddress, IN ULONG_PTR ZeroBits,
    IN OUT PSIZE_T RegionSize, IN ULONG AllocationType, IN ULONG Protect);

extern NTSTATUS NTAPI NtFreeVirtualMemory(IN HANDLE ProcessHandle,
                                          IN PVOID *BaseAddress,
                                          IN OUT PSIZE_T RegionSize,
                                          IN ULONG FreeType);

/* Fallback definitions for the WOF/WIM external-backing structures and the
 * FSCTL_GET_EXTERNAL_BACKING ioctl. Each is guarded by the SDK's own marker
 * macro so that definitions from a sufficiently new Windows SDK win. */
#ifndef WOF_CURRENT_VERSION
typedef struct _WOF_EXTERNAL_INFO {
  DWORD Version;
  DWORD Provider;
} WOF_EXTERNAL_INFO, *PWOF_EXTERNAL_INFO;
#endif /* WOF_CURRENT_VERSION */

#ifndef WIM_PROVIDER_CURRENT_VERSION
#define WIM_PROVIDER_HASH_SIZE 20

typedef struct _WIM_PROVIDER_EXTERNAL_INFO {
  DWORD Version;
  DWORD Flags;
  LARGE_INTEGER DataSourceId;
  BYTE ResourceHash[WIM_PROVIDER_HASH_SIZE];
} WIM_PROVIDER_EXTERNAL_INFO, *PWIM_PROVIDER_EXTERNAL_INFO;
#endif /* WIM_PROVIDER_CURRENT_VERSION */

#ifndef FILE_PROVIDER_CURRENT_VERSION
typedef struct _FILE_PROVIDER_EXTERNAL_INFO_V1 {
  ULONG Version;
  ULONG Algorithm;
  ULONG Flags;
} FILE_PROVIDER_EXTERNAL_INFO_V1, *PFILE_PROVIDER_EXTERNAL_INFO_V1;
#endif /* FILE_PROVIDER_CURRENT_VERSION */

/* NTSTATUS codes that older SDK headers may lack. */
#ifndef STATUS_OBJECT_NOT_EXTERNALLY_BACKED
#define STATUS_OBJECT_NOT_EXTERNALLY_BACKED ((NTSTATUS)0xC000046DL)
#endif
#ifndef STATUS_INVALID_DEVICE_REQUEST
#define STATUS_INVALID_DEVICE_REQUEST ((NTSTATUS)0xC0000010L)
#endif
#ifndef STATUS_NOT_SUPPORTED
#define STATUS_NOT_SUPPORTED ((NTSTATUS)0xC00000BBL)
#endif

#ifndef FILE_DEVICE_FILE_SYSTEM
#define FILE_DEVICE_FILE_SYSTEM 0x00000009
#endif

#ifndef FSCTL_GET_EXTERNAL_BACKING
#define FSCTL_GET_EXTERNAL_BACKING                                             \
  CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 196, METHOD_BUFFERED, FILE_ANY_ACCESS)
#endif

#ifndef ERROR_NOT_CAPABLE
#define ERROR_NOT_CAPABLE 775L
#endif

#endif /* _WIN32 || _WIN64 */
26009 
26010 /*----------------------------------------------------------------------------*/
26011 
/* Bind each platform's assertion-failure routine to the common
 * __assert_fail(assertion, file, line, function) shape that is used by
 * mdbx_assert_fail() and mdbx_panic() below. Only a declaration/macro
 * region — no code is generated here. */
#if defined(__UCLIBC__)
__extern_C void __assert(const char *, const char *, unsigned int, const char *)
#ifdef __THROW
    __THROW
#else
    __nothrow
#endif /* __THROW */
    MDBX_NORETURN;
#define __assert_fail(assertion, file, line, function)                         \
  __assert(assertion, file, line, function)

#elif _POSIX_C_SOURCE > 200212 &&                                              \
    /* workaround for avoid musl libc wrong prototype */ (                     \
        defined(__GLIBC__) || defined(__GNU_LIBRARY__))
/* Prototype should match libc runtime. ISO POSIX (2003) & LSB 1.x-3.x */
__extern_C void __assert_fail(const char *assertion, const char *file,
                              unsigned line, const char *function)
#ifdef __THROW
    __THROW
#else
    __nothrow
#endif /* __THROW */
    MDBX_NORETURN;

#elif defined(__APPLE__) || defined(__MACH__)
__extern_C void __assert_rtn(const char *function, const char *file, int line,
                             const char *assertion) /* __nothrow */
#ifdef __dead2
    __dead2
#else
    MDBX_NORETURN
#endif /* __dead2 */
#ifdef __disable_tail_calls
    __disable_tail_calls
#endif /* __disable_tail_calls */
    ;

#define __assert_fail(assertion, file, line, function)                         \
  __assert_rtn(function, file, line, assertion)
#elif defined(__sun) || defined(__SVR4) || defined(__svr4__)
__extern_C void __assert_c99(const char *assertion, const char *file, int line,
                             const char *function) MDBX_NORETURN;
#define __assert_fail(assertion, file, line, function)                         \
  __assert_c99(assertion, file, line, function)
#elif defined(__OpenBSD__)
__extern_C __dead void __assert2(const char *file, int line,
                                 const char *function,
                                 const char *assertion) /* __nothrow */;
#define __assert_fail(assertion, file, line, function)                         \
  __assert2(file, line, function, assertion)
#elif defined(__NetBSD__)
__extern_C __dead void __assert13(const char *file, int line,
                                  const char *function,
                                  const char *assertion) /* __nothrow */;
#define __assert_fail(assertion, file, line, function)                         \
  __assert13(file, line, function, assertion)
#elif defined(__FreeBSD__) || defined(__BSD__) || defined(__bsdi__) ||         \
    defined(__DragonFly__)
__extern_C void __assert(const char *function, const char *file, int line,
                         const char *assertion) /* __nothrow */
#ifdef __dead2
    __dead2
#else
    MDBX_NORETURN
#endif /* __dead2 */
#ifdef __disable_tail_calls
    __disable_tail_calls
#endif /* __disable_tail_calls */
    ;
#define __assert_fail(assertion, file, line, function)                         \
  __assert(function, file, line, assertion)

#endif /* __assert_fail */
26085 
26086 #if !defined(__ANDROID_API__) || MDBX_DEBUG
26087 
mdbx_assert_fail(const MDBX_env * env,const char * msg,const char * func,int line)26088 __cold void mdbx_assert_fail(const MDBX_env *env, const char *msg,
26089                              const char *func, int line) {
26090 #if MDBX_DEBUG
26091   if (env && env->me_assert_func) {
26092     env->me_assert_func(env, msg, func, line);
26093     return;
26094   }
26095 #else
26096   (void)env;
26097 #endif /* MDBX_DEBUG */
26098 
26099   if (mdbx_debug_logger)
26100     mdbx_debug_log(MDBX_LOG_FATAL, func, line, "assert: %s\n", msg);
26101   else {
26102 #if defined(_WIN32) || defined(_WIN64)
26103     char *message = nullptr;
26104     const int num = mdbx_asprintf(&message, "\r\nMDBX-ASSERTION: %s, %s:%u",
26105                                   msg, func ? func : "unknown", line);
26106     if (num < 1 || !message)
26107       message = "<troubles with assertion-message preparation>";
26108     OutputDebugStringA(message);
26109     if (IsDebuggerPresent())
26110       DebugBreak();
26111 #elif defined(__ANDROID_API__)
26112     __android_log_assert(msg, "mdbx", "%s:%u", func, line);
26113 #else
26114     __assert_fail(msg, "mdbx", line, func);
26115 #endif
26116   }
26117 
26118 #if defined(_WIN32) || defined(_WIN64)
26119   FatalExit(ERROR_UNHANDLED_ERROR);
26120 #else
26121   abort();
26122 #endif
26123 }
26124 
26125 #endif /* __ANDROID_API__ || MDBX_DEBUG */
26126 
mdbx_panic(const char * fmt,...)26127 __cold void mdbx_panic(const char *fmt, ...) {
26128   va_list ap;
26129   va_start(ap, fmt);
26130 
26131   char *message = nullptr;
26132   const int num = mdbx_vasprintf(&message, fmt, ap);
26133   va_end(ap);
26134   const char *const const_message =
26135       (num < 1 || !message) ? "<troubles with panic-message preparation>"
26136                             : message;
26137 
26138 #if defined(_WIN32) || defined(_WIN64)
26139   OutputDebugStringA("\r\nMDBX-PANIC: ");
26140   OutputDebugStringA(const_message);
26141   if (IsDebuggerPresent())
26142     DebugBreak();
26143   FatalExit(ERROR_UNHANDLED_ERROR);
26144 #else
26145 #if defined(__ANDROID_API__)
26146   __android_log_assert("panic", "mdbx", "%s", const_message);
26147 #else
26148   __assert_fail(const_message, "mdbx", 0, "panic");
26149 #endif /* __ANDROID_API__ */
26150   abort();
26151 #endif
26152 }
26153 
26154 /*----------------------------------------------------------------------------*/
26155 
26156 #ifndef mdbx_vasprintf
mdbx_vasprintf(char ** strp,const char * fmt,va_list ap)26157 MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt,
26158                                       va_list ap) {
26159   va_list ones;
26160   va_copy(ones, ap);
26161   int needed = vsnprintf(nullptr, 0, fmt, ap);
26162 
26163   if (unlikely(needed < 0 || needed >= INT_MAX)) {
26164     *strp = nullptr;
26165     va_end(ones);
26166     return needed;
26167   }
26168 
26169   *strp = mdbx_malloc(needed + 1);
26170   if (unlikely(*strp == nullptr)) {
26171     va_end(ones);
26172 #if defined(_WIN32) || defined(_WIN64)
26173     SetLastError(MDBX_ENOMEM);
26174 #else
26175     errno = MDBX_ENOMEM;
26176 #endif
26177     return -1;
26178   }
26179 
26180   int actual = vsnprintf(*strp, needed + 1, fmt, ones);
26181   va_end(ones);
26182 
26183   assert(actual == needed);
26184   if (unlikely(actual < 0)) {
26185     mdbx_free(*strp);
26186     *strp = nullptr;
26187   }
26188   return actual;
26189 }
26190 #endif /* mdbx_vasprintf */
26191 
26192 #ifndef mdbx_asprintf
mdbx_asprintf(char ** strp,const char * fmt,...)26193 MDBX_INTERNAL_FUNC int mdbx_asprintf(char **strp, const char *fmt, ...) {
26194   va_list ap;
26195   va_start(ap, fmt);
26196   int rc = mdbx_vasprintf(strp, fmt, ap);
26197   va_end(ap);
26198   return rc;
26199 }
26200 #endif /* mdbx_asprintf */
26201 
26202 #ifndef mdbx_memalign_alloc
mdbx_memalign_alloc(size_t alignment,size_t bytes,void ** result)26203 MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes,
26204                                            void **result) {
26205   assert(is_powerof2(alignment) && alignment >= sizeof(void *));
26206 #if defined(_WIN32) || defined(_WIN64)
26207   (void)alignment;
26208   *result = VirtualAlloc(NULL, bytes, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
26209   return *result ? MDBX_SUCCESS : MDBX_ENOMEM /* ERROR_OUTOFMEMORY */;
26210 #elif defined(_ISOC11_SOURCE)
26211   *result = aligned_alloc(alignment, ceil_powerof2(bytes, alignment));
26212   return *result ? MDBX_SUCCESS : errno;
26213 #elif _POSIX_VERSION >= 200112L &&                                             \
26214     (!defined(__ANDROID_API__) || __ANDROID_API__ >= 17)
26215   *result = nullptr;
26216   return posix_memalign(result, alignment, bytes);
26217 #elif __GLIBC_PREREQ(2, 16) || __STDC_VERSION__ >= 201112L
26218   *result = memalign(alignment, bytes);
26219   return *result ? MDBX_SUCCESS : errno;
26220 #else
26221 #error FIXME
26222 #endif
26223 }
26224 #endif /* mdbx_memalign_alloc */
26225 
26226 #ifndef mdbx_memalign_free
mdbx_memalign_free(void * ptr)26227 MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr) {
26228 #if defined(_WIN32) || defined(_WIN64)
26229   VirtualFree(ptr, 0, MEM_RELEASE);
26230 #else
26231   mdbx_free(ptr);
26232 #endif
26233 }
26234 #endif /* mdbx_memalign_free */
26235 
26236 #ifndef mdbx_strdup
mdbx_strdup(const char * str)26237 char *mdbx_strdup(const char *str) {
26238   if (!str)
26239     return NULL;
26240   size_t bytes = strlen(str) + 1;
26241   char *dup = mdbx_malloc(bytes);
26242   if (dup)
26243     memcpy(dup, str, bytes);
26244   return dup;
26245 }
26246 #endif /* mdbx_strdup */
26247 
26248 /*----------------------------------------------------------------------------*/
26249 
mdbx_condpair_init(mdbx_condpair_t * condpair)26250 MDBX_INTERNAL_FUNC int mdbx_condpair_init(mdbx_condpair_t *condpair) {
26251   int rc;
26252   memset(condpair, 0, sizeof(mdbx_condpair_t));
26253 #if defined(_WIN32) || defined(_WIN64)
26254   if ((condpair->mutex = CreateMutexW(NULL, FALSE, NULL)) == NULL) {
26255     rc = (int)GetLastError();
26256     goto bailout_mutex;
26257   }
26258   if ((condpair->event[0] = CreateEventW(NULL, FALSE, FALSE, NULL)) == NULL) {
26259     rc = (int)GetLastError();
26260     goto bailout_event;
26261   }
26262   if ((condpair->event[1] = CreateEventW(NULL, FALSE, FALSE, NULL)) != NULL)
26263     return MDBX_SUCCESS;
26264 
26265   rc = (int)GetLastError();
26266   (void)CloseHandle(condpair->event[0]);
26267 bailout_event:
26268   (void)CloseHandle(condpair->mutex);
26269 #else
26270   rc = pthread_mutex_init(&condpair->mutex, NULL);
26271   if (unlikely(rc != 0))
26272     goto bailout_mutex;
26273   rc = pthread_cond_init(&condpair->cond[0], NULL);
26274   if (unlikely(rc != 0))
26275     goto bailout_cond;
26276   rc = pthread_cond_init(&condpair->cond[1], NULL);
26277   if (likely(rc == 0))
26278     return MDBX_SUCCESS;
26279 
26280   (void)pthread_cond_destroy(&condpair->cond[0]);
26281 bailout_cond:
26282   (void)pthread_mutex_destroy(&condpair->mutex);
26283 #endif
26284 bailout_mutex:
26285   memset(condpair, 0, sizeof(mdbx_condpair_t));
26286   return rc;
26287 }
26288 
mdbx_condpair_destroy(mdbx_condpair_t * condpair)26289 MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair) {
26290 #if defined(_WIN32) || defined(_WIN64)
26291   int rc = CloseHandle(condpair->mutex) ? MDBX_SUCCESS : (int)GetLastError();
26292   rc = CloseHandle(condpair->event[0]) ? rc : (int)GetLastError();
26293   rc = CloseHandle(condpair->event[1]) ? rc : (int)GetLastError();
26294 #else
26295   int err, rc = pthread_mutex_destroy(&condpair->mutex);
26296   rc = (err = pthread_cond_destroy(&condpair->cond[0])) ? err : rc;
26297   rc = (err = pthread_cond_destroy(&condpair->cond[1])) ? err : rc;
26298 #endif
26299   memset(condpair, 0, sizeof(mdbx_condpair_t));
26300   return rc;
26301 }
26302 
mdbx_condpair_lock(mdbx_condpair_t * condpair)26303 MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair) {
26304 #if defined(_WIN32) || defined(_WIN64)
26305   DWORD code = WaitForSingleObject(condpair->mutex, INFINITE);
26306   return waitstatus2errcode(code);
26307 #else
26308   return pthread_mutex_lock(&condpair->mutex);
26309 #endif
26310 }
26311 
mdbx_condpair_unlock(mdbx_condpair_t * condpair)26312 MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair) {
26313 #if defined(_WIN32) || defined(_WIN64)
26314   return ReleaseMutex(condpair->mutex) ? MDBX_SUCCESS : (int)GetLastError();
26315 #else
26316   return pthread_mutex_unlock(&condpair->mutex);
26317 #endif
26318 }
26319 
mdbx_condpair_signal(mdbx_condpair_t * condpair,bool part)26320 MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair,
26321                                             bool part) {
26322 #if defined(_WIN32) || defined(_WIN64)
26323   return SetEvent(condpair->event[part]) ? MDBX_SUCCESS : (int)GetLastError();
26324 #else
26325   return pthread_cond_signal(&condpair->cond[part]);
26326 #endif
26327 }
26328 
mdbx_condpair_wait(mdbx_condpair_t * condpair,bool part)26329 MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair,
26330                                           bool part) {
26331 #if defined(_WIN32) || defined(_WIN64)
26332   DWORD code = SignalObjectAndWait(condpair->mutex, condpair->event[part],
26333                                    INFINITE, FALSE);
26334   if (code == WAIT_OBJECT_0) {
26335     code = WaitForSingleObject(condpair->mutex, INFINITE);
26336     if (code == WAIT_OBJECT_0)
26337       return MDBX_SUCCESS;
26338   }
26339   return waitstatus2errcode(code);
26340 #else
26341   return pthread_cond_wait(&condpair->cond[part], &condpair->mutex);
26342 #endif
26343 }
26344 
26345 /*----------------------------------------------------------------------------*/
26346 
mdbx_fastmutex_init(mdbx_fastmutex_t * fastmutex)26347 MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex) {
26348 #if defined(_WIN32) || defined(_WIN64)
26349   InitializeCriticalSection(fastmutex);
26350   return MDBX_SUCCESS;
26351 #else
26352   return pthread_mutex_init(fastmutex, NULL);
26353 #endif
26354 }
26355 
mdbx_fastmutex_destroy(mdbx_fastmutex_t * fastmutex)26356 MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex) {
26357 #if defined(_WIN32) || defined(_WIN64)
26358   DeleteCriticalSection(fastmutex);
26359   return MDBX_SUCCESS;
26360 #else
26361   return pthread_mutex_destroy(fastmutex);
26362 #endif
26363 }
26364 
mdbx_fastmutex_acquire(mdbx_fastmutex_t * fastmutex)26365 MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex) {
26366 #if defined(_WIN32) || defined(_WIN64)
26367   __try {
26368     EnterCriticalSection(fastmutex);
26369   } __except (
26370       (GetExceptionCode() ==
26371        0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */)
26372           ? EXCEPTION_EXECUTE_HANDLER
26373           : EXCEPTION_CONTINUE_SEARCH) {
26374     return ERROR_POSSIBLE_DEADLOCK;
26375   }
26376   return MDBX_SUCCESS;
26377 #else
26378   return pthread_mutex_lock(fastmutex);
26379 #endif
26380 }
26381 
mdbx_fastmutex_release(mdbx_fastmutex_t * fastmutex)26382 MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex) {
26383 #if defined(_WIN32) || defined(_WIN64)
26384   LeaveCriticalSection(fastmutex);
26385   return MDBX_SUCCESS;
26386 #else
26387   return pthread_mutex_unlock(fastmutex);
26388 #endif
26389 }
26390 
26391 /*----------------------------------------------------------------------------*/
26392 
mdbx_removefile(const char * pathname)26393 MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname) {
26394 #if defined(_WIN32) || defined(_WIN64)
26395   const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX);
26396   if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX)
26397     return ERROR_INVALID_NAME;
26398   wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t));
26399   if (wlen != mbstowcs(pathnameW, pathname, wlen + 1))
26400     return ERROR_INVALID_NAME;
26401   return DeleteFileW(pathnameW) ? MDBX_SUCCESS : (int)GetLastError();
26402 #else
26403   return unlink(pathname) ? errno : MDBX_SUCCESS;
26404 #endif
26405 }
26406 
26407 #if !(defined(_WIN32) || defined(_WIN64))
is_valid_fd(int fd)26408 static bool is_valid_fd(int fd) { return !(isatty(fd) < 0 && errno == EBADF); }
26409 #endif /*! Windows */
26410 
mdbx_removedirectory(const char * pathname)26411 MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname) {
26412 #if defined(_WIN32) || defined(_WIN64)
26413   const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX);
26414   if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX)
26415     return ERROR_INVALID_NAME;
26416   wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t));
26417   if (wlen != mbstowcs(pathnameW, pathname, wlen + 1))
26418     return ERROR_INVALID_NAME;
26419   return RemoveDirectoryW(pathnameW) ? MDBX_SUCCESS : (int)GetLastError();
26420 #else
26421   return rmdir(pathname) ? errno : MDBX_SUCCESS;
26422 #endif
26423 }
26424 
mdbx_openfile(const enum mdbx_openfile_purpose purpose,const MDBX_env * env,const char * pathname,mdbx_filehandle_t * fd,mdbx_mode_t unix_mode_bits)26425 MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose,
26426                                      const MDBX_env *env, const char *pathname,
26427                                      mdbx_filehandle_t *fd,
26428                                      mdbx_mode_t unix_mode_bits) {
26429   *fd = INVALID_HANDLE_VALUE;
26430 
26431 #if defined(_WIN32) || defined(_WIN64)
26432   const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX);
26433   if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX)
26434     return ERROR_INVALID_NAME;
26435   wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t));
26436   if (wlen != mbstowcs(pathnameW, pathname, wlen + 1))
26437     return ERROR_INVALID_NAME;
26438 
26439   DWORD CreationDisposition = unix_mode_bits ? OPEN_ALWAYS : OPEN_EXISTING;
26440   DWORD FlagsAndAttributes =
26441       FILE_FLAG_POSIX_SEMANTICS | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED;
26442   DWORD DesiredAccess = FILE_READ_ATTRIBUTES;
26443   DWORD ShareMode = (env->me_flags & MDBX_EXCLUSIVE)
26444                         ? 0
26445                         : (FILE_SHARE_READ | FILE_SHARE_WRITE);
26446 
26447   switch (purpose) {
26448   default:
26449     return ERROR_INVALID_PARAMETER;
26450   case MDBX_OPEN_LCK:
26451     CreationDisposition = OPEN_ALWAYS;
26452     DesiredAccess |= GENERIC_READ | GENERIC_WRITE;
26453     FlagsAndAttributes |= FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_TEMPORARY;
26454     break;
26455   case MDBX_OPEN_DXB_READ:
26456     CreationDisposition = OPEN_EXISTING;
26457     DesiredAccess |= GENERIC_READ;
26458     ShareMode |= FILE_SHARE_READ;
26459     break;
26460   case MDBX_OPEN_DXB_LAZY:
26461     DesiredAccess |= GENERIC_READ | GENERIC_WRITE;
26462     break;
26463   case MDBX_OPEN_DXB_DSYNC:
26464     CreationDisposition = OPEN_EXISTING;
26465     DesiredAccess |= GENERIC_WRITE;
26466     FlagsAndAttributes |= FILE_FLAG_WRITE_THROUGH;
26467     break;
26468   case MDBX_OPEN_COPY:
26469     CreationDisposition = CREATE_NEW;
26470     ShareMode = 0;
26471     DesiredAccess |= GENERIC_WRITE;
26472     FlagsAndAttributes |=
26473         (env->me_psize < env->me_os_psize) ? 0 : FILE_FLAG_NO_BUFFERING;
26474     break;
26475   case MDBX_OPEN_DELETE:
26476     CreationDisposition = OPEN_EXISTING;
26477     ShareMode |= FILE_SHARE_DELETE;
26478     DesiredAccess =
26479         FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES | DELETE | SYNCHRONIZE;
26480     break;
26481   }
26482 
26483   *fd = CreateFileW(pathnameW, DesiredAccess, ShareMode, NULL,
26484                     CreationDisposition, FlagsAndAttributes, NULL);
26485   if (*fd == INVALID_HANDLE_VALUE)
26486     return (int)GetLastError();
26487 
26488   BY_HANDLE_FILE_INFORMATION info;
26489   if (!GetFileInformationByHandle(*fd, &info)) {
26490     int err = (int)GetLastError();
26491     CloseHandle(*fd);
26492     *fd = INVALID_HANDLE_VALUE;
26493     return err;
26494   }
26495   const DWORD AttributesDiff =
26496       (info.dwFileAttributes ^ FlagsAndAttributes) &
26497       (FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED |
26498        FILE_ATTRIBUTE_TEMPORARY | FILE_ATTRIBUTE_COMPRESSED);
26499   if (AttributesDiff)
26500     (void)SetFileAttributesW(pathnameW, info.dwFileAttributes ^ AttributesDiff);
26501 
26502 #else
26503   int flags = unix_mode_bits ? O_CREAT : 0;
26504   switch (purpose) {
26505   default:
26506     return EINVAL;
26507   case MDBX_OPEN_LCK:
26508     flags |= O_RDWR;
26509     break;
26510   case MDBX_OPEN_DXB_READ:
26511     flags = O_RDONLY;
26512     break;
26513   case MDBX_OPEN_DXB_LAZY:
26514     flags |= O_RDWR;
26515     break;
26516   case MDBX_OPEN_COPY:
26517     flags = O_CREAT | O_WRONLY | O_EXCL;
26518     break;
26519   case MDBX_OPEN_DXB_DSYNC:
26520     flags |= O_WRONLY;
26521 #if defined(O_DSYNC)
26522     flags |= O_DSYNC;
26523 #elif defined(O_SYNC)
26524     flags |= O_SYNC;
26525 #elif defined(O_FSYNC)
26526     flags |= O_FSYNC;
26527 #endif
26528     break;
26529   case MDBX_OPEN_DELETE:
26530     flags = O_RDWR;
26531     break;
26532   }
26533 
26534   const bool direct_nocache_for_copy =
26535       env->me_psize >= env->me_os_psize && purpose == MDBX_OPEN_COPY;
26536   if (direct_nocache_for_copy) {
26537 #if defined(O_DIRECT)
26538     flags |= O_DIRECT;
26539 #endif /* O_DIRECT */
26540 #if defined(O_NOCACHE)
26541     flags |= O_NOCACHE;
26542 #endif /* O_NOCACHE */
26543   }
26544 
26545 #ifdef O_CLOEXEC
26546   flags |= O_CLOEXEC;
26547 #endif /* O_CLOEXEC */
26548 
26549   /* Safeguard for https://github.com/erthink/libmdbx/issues/144 */
26550 #if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2
26551   int stub_fd0 = -1, stub_fd1 = -1, stub_fd2 = -1;
26552   static const char dev_null[] = "/dev/null";
26553   if (!is_valid_fd(STDIN_FILENO)) {
26554     mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub", "IN",
26555                  STDIN_FILENO, dev_null);
26556     stub_fd0 = open(dev_null, O_RDONLY | O_NOCTTY);
26557   }
26558   if (!is_valid_fd(STDOUT_FILENO)) {
26559     mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub",
26560                  "OUT", STDOUT_FILENO, dev_null);
26561     stub_fd1 = open(dev_null, O_WRONLY | O_NOCTTY);
26562   }
26563   if (!is_valid_fd(STDERR_FILENO)) {
26564     mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub",
26565                  "ERR", STDERR_FILENO, dev_null);
26566     stub_fd2 = open(dev_null, O_WRONLY | O_NOCTTY);
26567   }
26568 #else
26569 #error "Unexpected or unsupported UNIX or POSIX system"
26570 #endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */
26571 
26572   *fd = open(pathname, flags, unix_mode_bits);
26573 #if defined(O_DIRECT)
26574   if (*fd < 0 && (flags & O_DIRECT) &&
26575       (errno == EINVAL || errno == EAFNOSUPPORT)) {
26576     flags &= ~(O_DIRECT | O_EXCL);
26577     *fd = open(pathname, flags, unix_mode_bits);
26578   }
26579 #endif /* O_DIRECT */
26580 
26581   /* Safeguard for https://github.com/erthink/libmdbx/issues/144 */
26582 #if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2
26583   if (*fd == STDIN_FILENO) {
26584     mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "IN",
26585                  STDIN_FILENO);
26586     assert(stub_fd0 == -1);
26587     *fd = dup(stub_fd0 = *fd);
26588   }
26589   if (*fd == STDOUT_FILENO) {
26590     mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "OUT",
26591                  STDOUT_FILENO);
26592     assert(stub_fd1 == -1);
26593     *fd = dup(stub_fd1 = *fd);
26594   }
26595   if (*fd == STDERR_FILENO) {
26596     mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "ERR",
26597                  STDERR_FILENO);
26598     assert(stub_fd2 == -1);
26599     *fd = dup(stub_fd2 = *fd);
26600   }
26601   if (stub_fd0 != -1)
26602     close(stub_fd0);
26603   if (stub_fd1 != -1)
26604     close(stub_fd1);
26605   if (stub_fd2 != -1)
26606     close(stub_fd2);
26607   if (*fd >= STDIN_FILENO && *fd <= STDERR_FILENO) {
26608     mdbx_error(
26609         "Rejecting the use of a FD in the range "
26610         "STDIN_FILENO/%d..STDERR_FILENO/%d to prevent database corruption",
26611         STDIN_FILENO, STDERR_FILENO);
26612     close(*fd);
26613     return EBADF;
26614   }
26615 #else
26616 #error "Unexpected or unsupported UNIX or POSIX system"
26617 #endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */
26618 
26619   if (*fd < 0)
26620     return errno;
26621 
26622 #if defined(FD_CLOEXEC) && !defined(O_CLOEXEC)
26623   const int fd_flags = fcntl(*fd, F_GETFD);
26624   if (fd_flags != -1)
26625     (void)fcntl(*fd, F_SETFD, fd_flags | FD_CLOEXEC);
26626 #endif /* FD_CLOEXEC && !O_CLOEXEC */
26627 
26628   if (direct_nocache_for_copy) {
26629 #if defined(F_NOCACHE) && !defined(O_NOCACHE)
26630     (void)fcntl(*fd, F_NOCACHE, 1);
26631 #endif /* F_NOCACHE */
26632   }
26633 
26634 #endif
26635   return MDBX_SUCCESS;
26636 }
26637 
mdbx_closefile(mdbx_filehandle_t fd)26638 MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd) {
26639 #if defined(_WIN32) || defined(_WIN64)
26640   return CloseHandle(fd) ? MDBX_SUCCESS : (int)GetLastError();
26641 #else
26642   assert(fd > STDERR_FILENO);
26643   return (close(fd) == 0) ? MDBX_SUCCESS : errno;
26644 #endif
26645 }
26646 
mdbx_pread(mdbx_filehandle_t fd,void * buf,size_t bytes,uint64_t offset)26647 MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes,
26648                                   uint64_t offset) {
26649   if (bytes > MAX_WRITE)
26650     return MDBX_EINVAL;
26651 #if defined(_WIN32) || defined(_WIN64)
26652   OVERLAPPED ov;
26653   ov.hEvent = 0;
26654   ov.Offset = (DWORD)offset;
26655   ov.OffsetHigh = HIGH_DWORD(offset);
26656 
26657   DWORD read = 0;
26658   if (unlikely(!ReadFile(fd, buf, (DWORD)bytes, &read, &ov))) {
26659     int rc = (int)GetLastError();
26660     return (rc == MDBX_SUCCESS) ? /* paranoia */ ERROR_READ_FAULT : rc;
26661   }
26662 #else
26663   STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
26664                     "libmdbx requires 64-bit file I/O on 64-bit systems");
26665   intptr_t read = pread(fd, buf, bytes, offset);
26666   if (read < 0) {
26667     int rc = errno;
26668     return (rc == MDBX_SUCCESS) ? /* paranoia */ MDBX_EIO : rc;
26669   }
26670 #endif
26671   return (bytes == (size_t)read) ? MDBX_SUCCESS : MDBX_ENODATA;
26672 }
26673 
/* Writes `bytes` bytes from `buf` to the file at absolute `offset`,
 * looping until everything is written or a hard error occurs.
 * Oversized requests are issued in MAX_WRITE-sized chunks; short writes
 * advance buf/offset/bytes and retry; EINTR (POSIX) is retried silently.
 * Returns MDBX_SUCCESS or an OS error code. */
MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf,
                                   size_t bytes, uint64_t offset) {
  while (true) {
#if defined(_WIN32) || defined(_WIN64)
    /* synchronous positioned write via OVERLAPPED offset fields */
    OVERLAPPED ov;
    ov.hEvent = 0;
    ov.Offset = (DWORD)offset;
    ov.OffsetHigh = HIGH_DWORD(offset);

    DWORD written;
    if (unlikely(!WriteFile(
            fd, buf, likely(bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE,
            &written, &ov)))
      return (int)GetLastError();
    if (likely(bytes == written))
      return MDBX_SUCCESS;
#else
    STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
                      "libmdbx requires 64-bit file I/O on 64-bit systems");
    const intptr_t written =
        pwrite(fd, buf, likely(bytes <= MAX_WRITE) ? bytes : MAX_WRITE, offset);
    if (likely(bytes == (size_t)written))
      return MDBX_SUCCESS;
    if (written < 0) {
      const int rc = errno;
      if (rc != EINTR)
        return rc;
      /* interrupted before anything was written — retry unchanged */
      continue;
    }
#endif
    /* partial write: advance past the written prefix and loop */
    bytes -= written;
    offset += written;
    buf = (char *)buf + written;
  }
}
26709 
mdbx_write(mdbx_filehandle_t fd,const void * buf,size_t bytes)26710 MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf,
26711                                   size_t bytes) {
26712   while (true) {
26713 #if defined(_WIN32) || defined(_WIN64)
26714     DWORD written;
26715     if (unlikely(!WriteFile(
26716             fd, buf, likely(bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE,
26717             &written, nullptr)))
26718       return (int)GetLastError();
26719     if (likely(bytes == written))
26720       return MDBX_SUCCESS;
26721 #else
26722     STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
26723                       "libmdbx requires 64-bit file I/O on 64-bit systems");
26724     const intptr_t written =
26725         write(fd, buf, likely(bytes <= MAX_WRITE) ? bytes : MAX_WRITE);
26726     if (likely(bytes == (size_t)written))
26727       return MDBX_SUCCESS;
26728     if (written < 0) {
26729       const int rc = errno;
26730       if (rc != EINTR)
26731         return rc;
26732       continue;
26733     }
26734 #endif
26735     bytes -= written;
26736     buf = (char *)buf + written;
26737   }
26738 }
26739 
/* Gather-writes `iovcnt` buffers to the file at absolute `offset`.
 * `expected_written` is the sum of all segment lengths; anything else is
 * reported as MDBX_EIO. Returns MDBX_SUCCESS or an error code. */
int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt,
                 uint64_t offset, size_t expected_written) {
#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) ||                \
    (defined(__ANDROID_API__) && __ANDROID_API__ < 24)
  /* No usable pwritev() here: emulate with one mdbx_pwrite() per segment. */
  size_t total = 0;
  for (int i = 0; i < iovcnt; ++i) {
    const int err = mdbx_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
    total += iov[i].iov_len;
    offset += iov[i].iov_len;
  }
  return (total == expected_written) ? MDBX_SUCCESS
                                     : MDBX_EIO /* ERROR_WRITE_FAULT */;
#else
  STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
                    "libmdbx requires 64-bit file I/O on 64-bit systems");
  for (;;) {
    const intptr_t done = pwritev(fd, iov, iovcnt, offset);
    if (likely((size_t)done == expected_written))
      return MDBX_SUCCESS;
    const int err = errno;
    if (err != EINTR)
      return (done < 0) ? err : MDBX_EIO /* Use which error code? */;
  }
#endif
}
26768 
/* Flushes file data and/or metadata to durable storage, as selected by
 * mode_bits (combination of MDBX_SYNC_DATA, MDBX_SYNC_SIZE, MDBX_SYNC_IODQ).
 * Returns MDBX_SUCCESS or an OS error code; EINTR is retried. */
MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd,
                                  enum mdbx_syncmode_bits mode_bits) {
#if defined(_WIN32) || defined(_WIN64)
  /* a single FlushFileBuffers() covers both data and metadata on Windows */
  if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) && !FlushFileBuffers(fd))
    return (int)GetLastError();
  return MDBX_SUCCESS;
#else

#if defined(__APPLE__) &&                                                      \
    MDBX_OSX_SPEED_INSTEADOF_DURABILITY == MDBX_OSX_WANNA_DURABILITY
  /* macOS: use F_FULLFSYNC when draining the I/O device queue is requested
   * (plain fsync() is handled by the switch below) */
  if (mode_bits & MDBX_SYNC_IODQ)
    return likely(fcntl(fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS : errno;
#endif /* MacOS */

  /* LY: This approach is always safe and without appreciable performance
   * degradation, even on a kernel with fdatasync's bug.
   *
   * For more info about of a corresponding fdatasync() bug
   * see http://www.spinics.net/lists/linux-ext4/msg33714.html */
  while (1) {
    switch (mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_SIZE)) {
    case MDBX_SYNC_NONE:
      return MDBX_SUCCESS /* nothing to do */;
#if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0
    case MDBX_SYNC_DATA:
      /* data-only sync: fdatasync() avoids a needless metadata flush */
      if (fdatasync(fd) == 0)
        return MDBX_SUCCESS;
      break /* error */;
#if defined(__linux__) || defined(__gnu_linux__)
    case MDBX_SYNC_SIZE:
      /* NOTE(review): on kernels >= 3.6 a size-only sync is treated as
       * already satisfied (no syscall); older kernels fall through to a
       * full fsync() — confirm rationale against upstream history. */
      if (mdbx_linux_kernel_version >= 0x03060000)
        return MDBX_SUCCESS;
      __fallthrough /* fall through */;
#endif /* Linux */
#endif /* _POSIX_SYNCHRONIZED_IO > 0 */
    default:
      /* full sync of data and metadata */
      if (fsync(fd) == 0)
        return MDBX_SUCCESS;
    }

    int rc = errno;
    if (rc != EINTR)
      return rc;
  }
#endif
}
26815 
/* Queries the current size of the file behind `fd` into *length.
 * Returns MDBX_SUCCESS or an OS error code. */
int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length) {
#if defined(_WIN32) || defined(_WIN64)
  BY_HANDLE_FILE_INFORMATION info;
  if (!GetFileInformationByHandle(fd, &info))
    return (int)GetLastError();
  /* combine the two 32-bit halves into the 64-bit size */
  *length = ((uint64_t)info.nFileSizeHigh << 32) | info.nFileSizeLow;
#else
  STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(uint64_t),
                    "libmdbx requires 64-bit file I/O on 64-bit systems");
  struct stat st;
  if (fstat(fd, &st) != 0)
    return errno;
  *length = st.st_size;
#endif
  return MDBX_SUCCESS;
}
26834 
mdbx_is_pipe(mdbx_filehandle_t fd)26835 MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd) {
26836 #if defined(_WIN32) || defined(_WIN64)
26837   switch (GetFileType(fd)) {
26838   case FILE_TYPE_DISK:
26839     return MDBX_RESULT_FALSE;
26840   case FILE_TYPE_CHAR:
26841   case FILE_TYPE_PIPE:
26842     return MDBX_RESULT_TRUE;
26843   default:
26844     return (int)GetLastError();
26845   }
26846 #else
26847   struct stat info;
26848   if (fstat(fd, &info))
26849     return errno;
26850   switch (info.st_mode & S_IFMT) {
26851   case S_IFBLK:
26852   case S_IFREG:
26853     return MDBX_RESULT_FALSE;
26854   case S_IFCHR:
26855   case S_IFIFO:
26856   case S_IFSOCK:
26857     return MDBX_RESULT_TRUE;
26858   case S_IFDIR:
26859   case S_IFLNK:
26860   default:
26861     return MDBX_INCOMPATIBLE;
26862   }
26863 #endif
26864 }
26865 
mdbx_ftruncate(mdbx_filehandle_t fd,uint64_t length)26866 MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length) {
26867 #if defined(_WIN32) || defined(_WIN64)
26868   if (mdbx_SetFileInformationByHandle) {
26869     FILE_END_OF_FILE_INFO EndOfFileInfo;
26870     EndOfFileInfo.EndOfFile.QuadPart = length;
26871     return mdbx_SetFileInformationByHandle(fd, FileEndOfFileInfo,
26872                                            &EndOfFileInfo,
26873                                            sizeof(FILE_END_OF_FILE_INFO))
26874                ? MDBX_SUCCESS
26875                : (int)GetLastError();
26876   } else {
26877     LARGE_INTEGER li;
26878     li.QuadPart = length;
26879     return (SetFilePointerEx(fd, li, NULL, FILE_BEGIN) && SetEndOfFile(fd))
26880                ? MDBX_SUCCESS
26881                : (int)GetLastError();
26882   }
26883 #else
26884   STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
26885                     "libmdbx requires 64-bit file I/O on 64-bit systems");
26886   return ftruncate(fd, length) == 0 ? MDBX_SUCCESS : errno;
26887 #endif
26888 }
26889 
mdbx_fseek(mdbx_filehandle_t fd,uint64_t pos)26890 MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos) {
26891 #if defined(_WIN32) || defined(_WIN64)
26892   LARGE_INTEGER li;
26893   li.QuadPart = pos;
26894   return SetFilePointerEx(fd, li, NULL, FILE_BEGIN) ? MDBX_SUCCESS
26895                                                     : (int)GetLastError();
26896 #else
26897   STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
26898                     "libmdbx requires 64-bit file I/O on 64-bit systems");
26899   return (lseek(fd, pos, SEEK_SET) < 0) ? errno : MDBX_SUCCESS;
26900 #endif
26901 }
26902 
26903 /*----------------------------------------------------------------------------*/
26904 
26905 MDBX_INTERNAL_FUNC int
mdbx_thread_create(mdbx_thread_t * thread,THREAD_RESULT (THREAD_CALL * start_routine)(void *),void * arg)26906 mdbx_thread_create(mdbx_thread_t *thread,
26907                    THREAD_RESULT(THREAD_CALL *start_routine)(void *),
26908                    void *arg) {
26909 #if defined(_WIN32) || defined(_WIN64)
26910   *thread = CreateThread(NULL, 0, start_routine, arg, 0, NULL);
26911   return *thread ? MDBX_SUCCESS : (int)GetLastError();
26912 #else
26913   return pthread_create(thread, NULL, start_routine, arg);
26914 #endif
26915 }
26916 
/* Blocks until the given thread terminates; the thread's exit value is
 * deliberately discarded. Returns MDBX_SUCCESS/0 or an error code. */
MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread) {
#if defined(_WIN32) || defined(_WIN64)
  DWORD code = WaitForSingleObject(thread, INFINITE);
  return waitstatus2errcode(code);
#else
  /* self-initialized so the pointer is never read uninitialized;
   * NOTE(review): presumably this quiets valgrind/analyzer warnings about
   * the discarded retval — confirm. */
  void *unused_retval = &unused_retval;
  return pthread_join(thread, &unused_retval);
#endif
}
26926 
26927 /*----------------------------------------------------------------------------*/
26928 
/* Synchronizes `length` bytes of the mapping at `offset` with its backing
 * file: flushes the mapped range first, then delegates any remaining
 * durability bits to mdbx_fsync(). Returns MDBX_SUCCESS or an error code. */
MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset,
                                  size_t length,
                                  enum mdbx_syncmode_bits mode_bits) {
  uint8_t *ptr = (uint8_t *)map->address + offset;
#if defined(_WIN32) || defined(_WIN64)
  if (!FlushViewOfFile(ptr, length))
    return (int)GetLastError();
#else
#if defined(__linux__) || defined(__gnu_linux__)
  if (mode_bits == MDBX_SYNC_NONE && mdbx_linux_kernel_version > 0x02061300)
    /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly
     * tracks dirty pages and flushes them to storage as necessary. */
    return MDBX_SUCCESS;
#endif /* Linux */
  if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? MS_SYNC : MS_ASYNC))
    return errno;
  /* MS_SYNC already made the data durable; avoid a second flush below */
  mode_bits &= ~MDBX_SYNC_DATA;
#endif
  return mdbx_fsync(map->fd, mode_bits);
}
26949 
/* Checks whether the filesystem holding the database is mounted read-only,
 * to decide if a write-open failure `err` is excusable.
 * Returns MDBX_SUCCESS when the volume is read-only; otherwise propagates
 * `err` (or MDBX_EACCESS), or an OS error from the probing calls. */
MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle,
                                            const char *pathname, int err) {
#if defined(_WIN32) || defined(_WIN64)
  (void)pathname;
  (void)err;
  if (!mdbx_GetVolumeInformationByHandleW)
    return MDBX_ENOSYS;
  DWORD unused, flags;
  if (!mdbx_GetVolumeInformationByHandleW(handle, nullptr, 0, nullptr, &unused,
                                          &flags, nullptr, 0))
    return (int)GetLastError();
  if ((flags & FILE_READ_ONLY_VOLUME) == 0)
    return MDBX_EACCESS;
#else
  struct statvfs info;
  if (err != MDBX_ENOFILE) {
    /* the file exists: probe by path first and keep the original error if
     * the filesystem turns out to be writable */
    if (statvfs(pathname, &info))
      return errno;
    if ((info.f_flag & ST_RDONLY) == 0)
      return err;
  }
  /* re-check via the open handle */
  if (fstatvfs(handle, &info))
    return errno;
  if ((info.f_flag & ST_RDONLY) == 0)
    return (err == MDBX_ENOFILE) ? MDBX_EACCESS : err;
#endif /* !Windows */
  return MDBX_SUCCESS;
}
26978 
/* Verifies that the filesystem behind `handle` is safe for a memory-mapped
 * database: local (or opened with MDBX_EXCLUSIVE), online, and not hosted
 * on compressed/remote/write-once storage. `flags` are the environment
 * open flags (MDBX_RDONLY, MDBX_EXCLUSIVE, ...).
 * Returns MDBX_SUCCESS, MDBX_EREMOTE/MDBX_INCOMPATIBLE-style errors, or an
 * OS error code. */
static int mdbx_check_fs_local(mdbx_filehandle_t handle, int flags) {
#if defined(_WIN32) || defined(_WIN64)
  if (mdbx_RunningUnderWine() && !(flags & MDBX_EXCLUSIVE))
    return ERROR_NOT_CAPABLE /* workaround for Wine */;

  if (GetFileType(handle) != FILE_TYPE_DISK)
    return ERROR_FILE_OFFLINE;

  if (mdbx_GetFileInformationByHandleEx) {
    FILE_REMOTE_PROTOCOL_INFO RemoteProtocolInfo;
    if (mdbx_GetFileInformationByHandleEx(handle, FileRemoteProtocolInfo,
                                          &RemoteProtocolInfo,
                                          sizeof(RemoteProtocolInfo))) {
      /* offline storage is unacceptable for writing; non-loopback remote
       * protocols require exclusive access */
      if ((RemoteProtocolInfo.Flags & REMOTE_PROTOCOL_INFO_FLAG_OFFLINE) &&
          !(flags & MDBX_RDONLY))
        return ERROR_FILE_OFFLINE;
      if (!(RemoteProtocolInfo.Flags & REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK) &&
          !(flags & MDBX_EXCLUSIVE))
        return ERROR_REMOTE_STORAGE_MEDIA_ERROR;
    }
  }

  if (mdbx_NtFsControlFile) {
    /* detect WOF/WIM externally-backed files (e.g. CompactOS compression) */
    NTSTATUS rc;
    struct {
      WOF_EXTERNAL_INFO wof_info;
      union {
        WIM_PROVIDER_EXTERNAL_INFO wim_info;
        FILE_PROVIDER_EXTERNAL_INFO_V1 file_info;
      };
      size_t reserved_for_microsoft_madness[42];
    } GetExternalBacking_OutputBuffer;
    IO_STATUS_BLOCK StatusBlock;
    rc = mdbx_NtFsControlFile(handle, NULL, NULL, NULL, &StatusBlock,
                              FSCTL_GET_EXTERNAL_BACKING, NULL, 0,
                              &GetExternalBacking_OutputBuffer,
                              sizeof(GetExternalBacking_OutputBuffer));
    if (NT_SUCCESS(rc)) {
      /* file IS externally backed — allow only with exclusive access */
      if (!(flags & MDBX_EXCLUSIVE))
        return ERROR_REMOTE_STORAGE_MEDIA_ERROR;
    } else if (rc != STATUS_OBJECT_NOT_EXTERNALLY_BACKED &&
               rc != STATUS_INVALID_DEVICE_REQUEST &&
               rc != STATUS_NOT_SUPPORTED)
      return ntstatus2errcode(rc);
  }

  if (mdbx_GetVolumeInformationByHandleW && mdbx_GetFinalPathNameByHandleW) {
    WCHAR *PathBuffer = mdbx_malloc(sizeof(WCHAR) * INT16_MAX);
    if (!PathBuffer)
      return MDBX_ENOMEM;

    int rc = MDBX_SUCCESS;
    DWORD VolumeSerialNumber, FileSystemFlags;
    if (!mdbx_GetVolumeInformationByHandleW(handle, PathBuffer, INT16_MAX,
                                            &VolumeSerialNumber, NULL,
                                            &FileSystemFlags, NULL, 0)) {
      rc = (int)GetLastError();
      goto bailout;
    }

    if ((flags & MDBX_RDONLY) == 0) {
      /* write-once, read-only and compressed volumes can't host a
       * writable database */
      if (FileSystemFlags &
          (FILE_SEQUENTIAL_WRITE_ONCE | FILE_READ_ONLY_VOLUME |
           FILE_VOLUME_IS_COMPRESSED)) {
        rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR;
        goto bailout;
      }
    }

    if (!mdbx_GetFinalPathNameByHandleW(handle, PathBuffer, INT16_MAX,
                                        FILE_NAME_NORMALIZED |
                                            VOLUME_NAME_NT)) {
      rc = (int)GetLastError();
      goto bailout;
    }

    /* "\Device\Mup\..." = Multiple UNC Provider, i.e. a network path */
    if (_wcsnicmp(PathBuffer, L"\\Device\\Mup\\", 12) == 0) {
      if (!(flags & MDBX_EXCLUSIVE)) {
        rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR;
        goto bailout;
      }
    } else if (mdbx_GetFinalPathNameByHandleW(handle, PathBuffer, INT16_MAX,
                                              FILE_NAME_NORMALIZED |
                                                  VOLUME_NAME_DOS)) {
      UINT DriveType = GetDriveTypeW(PathBuffer);
      if (DriveType == DRIVE_NO_ROOT_DIR &&
          _wcsnicmp(PathBuffer, L"\\\\?\\", 4) == 0 &&
          _wcsnicmp(PathBuffer + 5, L":\\", 2) == 0) {
        /* retry with a bare "X:\" root after the "\\?\" prefix */
        PathBuffer[7] = 0;
        DriveType = GetDriveTypeW(PathBuffer + 4);
      }
      switch (DriveType) {
      case DRIVE_CDROM:
        if (flags & MDBX_RDONLY)
          break;
      // fall through
      case DRIVE_UNKNOWN:
      case DRIVE_NO_ROOT_DIR:
      case DRIVE_REMOTE:
      default:
        if (!(flags & MDBX_EXCLUSIVE))
          rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR;
      // fall through
      case DRIVE_REMOVABLE:
      case DRIVE_FIXED:
      case DRIVE_RAMDISK:
        break;
      }
    }
  bailout:
    mdbx_free(PathBuffer);
    return rc;
  }

#else

  struct statvfs statvfs_info;
  if (fstatvfs(handle, &statvfs_info))
    return errno;
#if defined(ST_LOCAL) || defined(ST_EXPORTED)
  const unsigned long st_flags = statvfs_info.f_flag;
#endif /* ST_LOCAL || ST_EXPORTED */

  /* Per-platform discovery of the filesystem type (numeric `type` and/or
   * textual `name`), checked further below against known network FS. */
#if defined(__NetBSD__)
  const unsigned type = 0;
  const char *const name = statvfs_info.f_fstypename;
  const size_t name_len = VFS_NAMELEN;
#elif defined(_AIX) || defined(__OS400__)
  const char *const name = statvfs_info.f_basetype;
  const size_t name_len = sizeof(statvfs_info.f_basetype);
  struct stat st;
  if (fstat(handle, &st))
    return errno;
  const unsigned type = st.st_vfstype;
  if ((st.st_flag & FS_REMOTE) != 0 && !(flags & MDBX_EXCLUSIVE))
    return MDBX_EREMOTE;
#elif defined(FSTYPSZ) || defined(_FSTYPSZ)
  const unsigned type = 0;
  const char *const name = statvfs_info.f_basetype;
  const size_t name_len = sizeof(statvfs_info.f_basetype);
#elif defined(__sun) || defined(__SVR4) || defined(__svr4__) ||                \
    defined(ST_FSTYPSZ) || defined(_ST_FSTYPSZ)
  const unsigned type = 0;
  struct stat st;
  if (fstat(handle, &st))
    return errno;
  const char *const name = st.st_fstype;
  const size_t name_len = strlen(name);
#else
  struct statfs statfs_info;
  if (fstatfs(handle, &statfs_info))
    return errno;
#if defined(__OpenBSD__)
  const unsigned type = 0;
#else
  const unsigned type = statfs_info.f_type;
#endif
#if defined(MNT_LOCAL) || defined(MNT_EXPORTED)
  const unsigned long mnt_flags = statfs_info.f_flags;
#endif /* MNT_LOCAL || MNT_EXPORTED */
#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) ||     \
    defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) ||         \
    defined(__APPLE__) || defined(__MACH__) || defined(MFSNAMELEN) ||          \
    defined(MFSTYPENAMELEN) || defined(VFS_NAMELEN)
  const char *const name = statfs_info.f_fstypename;
  const size_t name_len = sizeof(statfs_info.f_fstypename);
#elif defined(__ANDROID_API__) && __ANDROID_API__ < 21
  const char *const name = "";
  const unsigned name_len = 0;
#else

  /* last resort: scan the mount table for the entry matching st_dev */
  const char *name = "";
  unsigned name_len = 0;

  struct stat st;
  if (fstat(handle, &st))
    return errno;

  char pathbuf[PATH_MAX];
  FILE *mounted = nullptr;
#if defined(__linux__) || defined(__gnu_linux__)
  mounted = setmntent("/proc/mounts", "r");
#endif /* Linux */
  if (!mounted)
    mounted = setmntent("/etc/mtab", "r");
  if (mounted) {
    const struct mntent *ent;
#if defined(_BSD_SOURCE) || defined(_SVID_SOURCE) || defined(__BIONIC__) ||    \
    (defined(_DEFAULT_SOURCE) && __GLIBC_PREREQ(2, 19))
    /* reentrant variant available: entries point into our own buffers */
    struct mntent entbuf;
    const bool should_copy = false;
    while (nullptr !=
           (ent = getmntent_r(mounted, &entbuf, pathbuf, sizeof(pathbuf))))
#else
    /* plain getmntent() returns static storage — copy before reuse */
    const bool should_copy = true;
    while (nullptr != (ent = getmntent(mounted)))
#endif
    {
      struct stat mnt;
      if (!stat(ent->mnt_dir, &mnt) && mnt.st_dev == st.st_dev) {
        if (should_copy) {
          name =
              strncpy(pathbuf, ent->mnt_fsname, name_len = sizeof(pathbuf) - 1);
          pathbuf[name_len] = 0;
        } else {
          name = ent->mnt_fsname;
          name_len = strlen(name);
        }
        break;
      }
    }
    endmntent(mounted);
  }
#endif /* !xBSD && !Android/Bionic */
#endif

  /* reject well-known network filesystems by name (unless exclusive) */
  if (name_len) {
    if (((name_len > 2 && strncasecmp("nfs", name, 3) == 0) ||
         strncasecmp("cifs", name, name_len) == 0 ||
         strncasecmp("ncpfs", name, name_len) == 0 ||
         strncasecmp("smbfs", name, name_len) == 0 ||
         strcasecmp("9P" /* WSL2 */, name) == 0 ||
         ((name_len > 3 && strncasecmp("fuse", name, 4) == 0) &&
          strncasecmp("fuseblk", name, name_len) != 0)) &&
        !(flags & MDBX_EXCLUSIVE))
      return MDBX_EREMOTE;
    if (strcasecmp("ftp", name) == 0 || strcasecmp("http", name) == 0 ||
        strcasecmp("sshfs", name) == 0)
      return MDBX_EREMOTE;
  }

#ifdef ST_LOCAL
  if ((st_flags & ST_LOCAL) == 0 && !(flags & MDBX_EXCLUSIVE))
    return MDBX_EREMOTE;
#elif defined(MNT_LOCAL)
  if ((mnt_flags & MNT_LOCAL) == 0 && !(flags & MDBX_EXCLUSIVE))
    return MDBX_EREMOTE;
#endif /* ST/MNT_LOCAL */

#ifdef ST_EXPORTED
  /* NFS-exported volumes can't be written safely while shared */
  if ((st_flags & ST_EXPORTED) != 0 && !(flags & MDBX_RDONLY))
    return MDBX_EREMOTE;
#elif defined(MNT_EXPORTED)
  if ((mnt_flags & MNT_EXPORTED) != 0 && !(flags & MDBX_RDONLY))
    return MDBX_EREMOTE;
#endif /* ST/MNT_EXPORTED */

  /* reject well-known network filesystems by magic number */
  switch (type) {
  case 0xFF534D42 /* CIFS_MAGIC_NUMBER */:
  case 0x6969 /* NFS_SUPER_MAGIC */:
  case 0x564c /* NCP_SUPER_MAGIC */:
  case 0x517B /* SMB_SUPER_MAGIC */:
#if defined(__digital__) || defined(__osf__) || defined(__osf)
  case 0x0E /* Tru64 NFS */:
#endif
#ifdef ST_FST_NFS
  case ST_FST_NFS:
#endif
    if ((flags & MDBX_EXCLUSIVE) == 0)
      return MDBX_EREMOTE;
    /* fallthrough: exclusive access makes remote FS tolerable */
  case 0:
  default:
    break;
  }
#endif /* Unix */

  return MDBX_SUCCESS;
}
27247 
check_mmap_limit(const size_t limit)27248 static int check_mmap_limit(const size_t limit) {
27249   const bool should_check =
27250 #if defined(__SANITIZE_ADDRESS__)
27251       true;
27252 #else
27253       RUNNING_ON_VALGRIND;
27254 #endif /* __SANITIZE_ADDRESS__ */
27255 
27256   if (should_check) {
27257     intptr_t pagesize, total_ram_pages, avail_ram_pages;
27258     int err =
27259         mdbx_get_sysraminfo(&pagesize, &total_ram_pages, &avail_ram_pages);
27260     if (unlikely(err != MDBX_SUCCESS))
27261       return err;
27262 
27263     const int log2page = log2n_powerof2(pagesize);
27264     if ((limit >> (log2page + 7)) > (size_t)total_ram_pages ||
27265         (limit >> (log2page + 6)) > (size_t)avail_ram_pages) {
27266       mdbx_error(
27267           "%s (%zu pages) is too large for available (%zu pages) or total "
27268           "(%zu pages) system RAM",
27269           "database upper size limit", limit >> log2page, avail_ram_pages,
27270           total_ram_pages);
27271       return MDBX_TOO_LARGE;
27272     }
27273   }
27274 
27275   return MDBX_SUCCESS;
27276 }
27277 
/* Creates a memory mapping over map->fd.
 *
 * `size`  - the initially used/committed portion of the mapping;
 * `limit` - the total reserved address-space size (size <= limit);
 * `flags` - MDBX environment flags (MDBX_RDONLY / MDBX_WRITEMAP / etc.);
 * `options` - MMAP_OPTION_* bits (truncate-to-size, semaphore placement).
 *
 * On success fills map->address/current/limit/filesize (and map->section
 * on Windows) and returns MDBX_SUCCESS; on failure returns a system error
 * code with the map fields reset. */
MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map,
                                 const size_t size, const size_t limit,
                                 const unsigned options) {
  assert(size <= limit);
  /* Reset the descriptor so a failed attempt leaves a clean state. */
  map->limit = 0;
  map->current = 0;
  map->address = nullptr;
  map->filesize = 0;
#if defined(_WIN32) || defined(_WIN64)
  map->section = NULL;
#endif /* Windows */

  /* Reject filesystems unsuitable for memory-mapped databases. */
  int err = mdbx_check_fs_local(map->fd, flags);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  /* Sanity-check the requested reservation against system limits/RAM. */
  err = check_mmap_limit(limit);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_TRUNCATE) != 0) {
    /* Writable mapping with explicit truncation: force the file to `size`. */
    err = mdbx_ftruncate(map->fd, size);
    if (err != MDBX_SUCCESS)
      return err;
    map->filesize = size;
#if !(defined(_WIN32) || defined(_WIN64))
    map->current = size;
#endif /* !Windows */
  } else {
    /* Otherwise take the file as-is; current is clamped to `limit`.
     * On Windows `current` is derived from the section size below. */
    err = mdbx_filesize(map->fd, &map->filesize);
    if (err != MDBX_SUCCESS)
      return err;
#if !(defined(_WIN32) || defined(_WIN64))
    map->current = (map->filesize > limit) ? limit : (size_t)map->filesize;
#endif /* !Windows */
  }

#if defined(_WIN32) || defined(_WIN64)
  /* Windows: create a pagefile-backed section over the file, then map a
   * view of it. SEC_RESERVE reserves address space without committing. */
  LARGE_INTEGER SectionSize;
  SectionSize.QuadPart = size;
  err = NtCreateSection(
      &map->section,
      /* DesiredAccess */
      (flags & MDBX_WRITEMAP)
          ? SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE |
                SECTION_MAP_WRITE
          : SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE,
      /* ObjectAttributes */ NULL, /* MaximumSize (InitialSize) */ &SectionSize,
      /* SectionPageProtection */
      (flags & MDBX_RDONLY) ? PAGE_READONLY : PAGE_READWRITE,
      /* AllocationAttributes */ SEC_RESERVE, map->fd);
  if (!NT_SUCCESS(err))
    return ntstatus2errcode(err);

  /* Under Wine reserving the full `limit` is avoided — map only `size`.
   * For read-only mappings ViewSize=0 maps the whole section. */
  SIZE_T ViewSize = (flags & MDBX_RDONLY)     ? 0
                    : mdbx_RunningUnderWine() ? size
                                              : limit;
  err = NtMapViewOfSection(
      map->section, GetCurrentProcess(), &map->address,
      /* ZeroBits */ 0,
      /* CommitSize */ 0,
      /* SectionOffset */ NULL, &ViewSize,
      /* InheritDisposition */ ViewUnmap,
      /* AllocationType */ (flags & MDBX_RDONLY) ? 0 : MEM_RESERVE,
      /* Win32Protect */
      (flags & MDBX_WRITEMAP) ? PAGE_READWRITE : PAGE_READONLY);
  if (!NT_SUCCESS(err)) {
    /* Roll back the section on mapping failure. */
    NtClose(map->section);
    map->section = 0;
    map->address = nullptr;
    return ntstatus2errcode(err);
  }
  assert(map->address != MAP_FAILED);

  map->current = (size_t)SectionSize.QuadPart;
  map->limit = ViewSize;

#else /* Windows */

  /* Provide zero fallbacks for optional BSD/Linux mmap flags so the
   * expression below compiles on every POSIX platform. */
#ifndef MAP_TRYFIXED
#define MAP_TRYFIXED 0
#endif

#ifndef MAP_HASSEMAPHORE
#define MAP_HASSEMAPHORE 0
#endif

#ifndef MAP_CONCEAL
#define MAP_CONCEAL 0
#endif

#ifndef MAP_NOSYNC
#define MAP_NOSYNC 0
#endif

#ifndef MAP_FIXED_NOREPLACE
#define MAP_FIXED_NOREPLACE 0
#endif

#ifndef MAP_NORESERVE
#define MAP_NORESERVE 0
#endif

  /* POSIX: reserve the whole `limit` at once; the file backs the mapping.
   * MAP_NOSYNC (where available) defers write-back for UTTERLY_NOSYNC or
   * semaphore regions; MAP_HASSEMAPHORE marks lock/semaphore placements. */
  map->address = mmap(
      NULL, limit, (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ,
      MAP_SHARED | MAP_FILE | MAP_NORESERVE |
          (F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0) |
          ((options & MMAP_OPTION_SEMAPHORE) ? MAP_HASSEMAPHORE | MAP_NOSYNC
                                             : MAP_CONCEAL),
      map->fd, 0);

  if (unlikely(map->address == MAP_FAILED)) {
    map->limit = 0;
    map->current = 0;
    map->address = nullptr;
    return errno;
  }
  map->limit = limit;

#if MDBX_ENABLE_MADVISE
#ifdef MADV_DONTFORK
  /* The mapping must not be inherited by forked children. */
  if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0))
    return errno;
#endif /* MADV_DONTFORK */
#ifdef MADV_NOHUGEPAGE
  /* Best-effort; failure is ignored deliberately. */
  (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE);
#endif /* MADV_NOHUGEPAGE */
#endif /* MDBX_ENABLE_MADVISE */

#endif /* ! Windows */

  /* Tell Valgrind/ASAN the used portion is valid application memory. */
  VALGRIND_MAKE_MEM_DEFINED(map->address, map->current);
  MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, map->current);
  return MDBX_SUCCESS;
}
27413 
mdbx_munmap(mdbx_mmap_t * map)27414 MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map) {
27415   VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current);
27416   /* Unpoisoning is required for ASAN to avoid false-positive diagnostic
27417    * when this memory will re-used by malloc or another mmapping.
27418    * See https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 */
27419   MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address,
27420                                    (map->filesize && map->filesize < map->limit)
27421                                        ? map->filesize
27422                                        : map->limit);
27423 #if defined(_WIN32) || defined(_WIN64)
27424   if (map->section)
27425     NtClose(map->section);
27426   NTSTATUS rc = NtUnmapViewOfSection(GetCurrentProcess(), map->address);
27427   if (!NT_SUCCESS(rc))
27428     ntstatus2errcode(rc);
27429 #else
27430   if (unlikely(munmap(map->address, map->limit)))
27431     return errno;
27432 #endif /* ! Windows */
27433 
27434   map->limit = 0;
27435   map->current = 0;
27436   map->address = nullptr;
27437   return MDBX_SUCCESS;
27438 }
27439 
/* Resizes an existing mapping: `size` is the new used/committed portion,
 * `limit` the new total reserved address space (size <= limit).
 *
 * Returns:
 *  - MDBX_SUCCESS on a clean resize;
 *  - MDBX_RESULT_TRUE when the request could not be satisfied in-place and
 *    the caller must handle it (e.g. unmapping was not permitted), or the
 *    mapping was restored with the previous geometry;
 *  - MDBX_UNABLE_EXTEND_MAPSIZE when adjacent address space is unavailable;
 *  - a system error code otherwise.
 *
 * Flags used here: MDBX_MRESIZE_MAY_UNMAP allows a full unmap/remap cycle,
 * MDBX_MRESIZE_MAY_MOVE additionally allows the base address to change. */
MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map,
                                    size_t size, size_t limit) {
  assert(size <= limit);
#if defined(_WIN32) || defined(_WIN64)
  /* The caller must request an actual change. */
  assert(size != map->current || limit != map->limit || size < map->filesize);

  NTSTATUS status;
  LARGE_INTEGER SectionSize;
  int err, rc = MDBX_SUCCESS;

  /* Fast path: growing a writable section in place via NtExtendSection,
   * which avoids the unmap/remap cycle. Only when the reservation (limit)
   * is unchanged and the entry point is available (absent under Wine). */
  if (!(flags & MDBX_RDONLY) && limit == map->limit && size > map->current &&
      /* workaround for Wine */ mdbx_NtExtendSection) {
    /* growth rw-section */
    SectionSize.QuadPart = size;
    status = mdbx_NtExtendSection(map->section, &SectionSize);
    if (!NT_SUCCESS(status))
      return ntstatus2errcode(status);
    map->current = size;
    if (map->filesize < size)
      map->filesize = size;
    return MDBX_SUCCESS;
  }

  if (limit > map->limit) {
    err = check_mmap_limit(limit);
    if (unlikely(err != MDBX_SUCCESS))
      return err;

    /* check ability of address space for growth before unmap:
     * probe-reserve the region just past the current view, then release it. */
    PVOID BaseAddress = (PBYTE)map->address + map->limit;
    SIZE_T RegionSize = limit - map->limit;
    status = NtAllocateVirtualMemory(GetCurrentProcess(), &BaseAddress, 0,
                                     &RegionSize, MEM_RESERVE, PAGE_NOACCESS);
    if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018)
      return MDBX_UNABLE_EXTEND_MAPSIZE;
    if (!NT_SUCCESS(status))
      return ntstatus2errcode(status);

    status = NtFreeVirtualMemory(GetCurrentProcess(), &BaseAddress, &RegionSize,
                                 MEM_RELEASE);
    if (!NT_SUCCESS(status))
      return ntstatus2errcode(status);
  }

  /* Windows unable:
   *  - shrink a mapped file;
   *  - change size of mapped view;
   *  - extend read-only mapping;
   * Therefore we should unmap/map entire section. */
  if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0)
    return MDBX_RESULT_TRUE;

  /* Unpoisoning is required for ASAN to avoid false-positive diagnostic
   * when this memory will re-used by malloc or another mmapping.
   * See https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 */
  MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, map->limit);
  status = NtUnmapViewOfSection(GetCurrentProcess(), map->address);
  if (!NT_SUCCESS(status))
    return ntstatus2errcode(status);
  status = NtClose(map->section);
  map->section = NULL;
  PVOID ReservedAddress = NULL;
  SIZE_T ReservedSize = limit;

  /* Shared bailout: convert status (bailout_ntstatus) or take err as-is
   * (bailout), wipe the descriptor and free any still-held reservation. */
  if (!NT_SUCCESS(status)) {
  bailout_ntstatus:
    err = ntstatus2errcode(status);
  bailout:
    map->address = NULL;
    map->current = map->limit = 0;
    if (ReservedAddress) {
      ReservedSize = 0;
      status = NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress,
                                   &ReservedSize, MEM_RELEASE);
      assert(NT_SUCCESS(status));
      (void)status;
    }
    return err;
  }

retry_file_and_section:
  /* resizing of the file may take a while,
   * therefore we reserve address space to avoid occupy it by other threads */
  ReservedAddress = map->address;
  status = NtAllocateVirtualMemory(GetCurrentProcess(), &ReservedAddress, 0,
                                   &ReservedSize, MEM_RESERVE, PAGE_NOACCESS);
  if (!NT_SUCCESS(status)) {
    ReservedAddress = NULL;
    if (status != (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018)
      goto bailout_ntstatus /* no way to recovery */;

    if (flags & MDBX_MRESIZE_MAY_MOVE)
      /* the base address could be changed */
      map->address = NULL;
  }

  err = mdbx_filesize(map->fd, &map->filesize);
  if (err != MDBX_SUCCESS)
    goto bailout;

  if ((flags & MDBX_RDONLY) == 0 && map->filesize != size) {
    err = mdbx_ftruncate(map->fd, size);
    if (err == MDBX_SUCCESS)
      map->filesize = size;
    /* ignore error, because Windows unable shrink file
     * that already mapped (by another process) */
  }

  /* Recreate the section with the new size over the (possibly resized) file. */
  SectionSize.QuadPart = size;
  status = NtCreateSection(
      &map->section,
      /* DesiredAccess */
      (flags & MDBX_WRITEMAP)
          ? SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE |
                SECTION_MAP_WRITE
          : SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE,
      /* ObjectAttributes */ NULL,
      /* MaximumSize (InitialSize) */ &SectionSize,
      /* SectionPageProtection */
      (flags & MDBX_RDONLY) ? PAGE_READONLY : PAGE_READWRITE,
      /* AllocationAttributes */ SEC_RESERVE, map->fd);

  if (!NT_SUCCESS(status))
    goto bailout_ntstatus;

  if (ReservedAddress) {
    /* release reserved address space */
    ReservedSize = 0;
    status = NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress,
                                 &ReservedSize, MEM_RELEASE);
    ReservedAddress = NULL;
    if (!NT_SUCCESS(status))
      goto bailout_ntstatus;
  }

retry_mapview:;
  SIZE_T ViewSize = (flags & MDBX_RDONLY) ? size : limit;
  status = NtMapViewOfSection(
      map->section, GetCurrentProcess(), &map->address,
      /* ZeroBits */ 0,
      /* CommitSize */ 0,
      /* SectionOffset */ NULL, &ViewSize,
      /* InheritDisposition */ ViewUnmap,
      /* AllocationType */ (flags & MDBX_RDONLY) ? 0 : MEM_RESERVE,
      /* Win32Protect */
      (flags & MDBX_WRITEMAP) ? PAGE_READWRITE : PAGE_READONLY);

  if (!NT_SUCCESS(status)) {
    if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018 &&
        map->address && (flags & MDBX_MRESIZE_MAY_MOVE) != 0) {
      /* try remap at another base address */
      map->address = NULL;
      goto retry_mapview;
    }
    NtClose(map->section);
    map->section = NULL;

    if (map->address && (size != map->current || limit != map->limit)) {
      /* try remap with previously size and limit,
       * but will return MDBX_UNABLE_EXTEND_MAPSIZE on success */
      rc = (limit > map->limit) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_RESULT_TRUE;
      size = map->current;
      ReservedSize = limit = map->limit;
      goto retry_file_and_section;
    }

    /* no way to recovery */
    goto bailout_ntstatus;
  }
  assert(map->address != MAP_FAILED);

  map->current = (size_t)SectionSize.QuadPart;
  map->limit = ViewSize;

#else /* Windows */

  map->filesize = 0;
  int rc = mdbx_filesize(map->fd, &map->filesize);
  if (rc != MDBX_SUCCESS)
    return rc;

  if (flags & MDBX_RDONLY) {
    /* Read-only: cannot change the file; report whether the request can be
     * satisfied by the file's actual size. */
    map->current = (map->filesize > limit) ? limit : (size_t)map->filesize;
    if (map->current != size)
      rc =
          (size > map->current) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_RESULT_TRUE;
  } else {
    if (map->filesize != size) {
      rc = mdbx_ftruncate(map->fd, size);
      if (rc != MDBX_SUCCESS)
        return rc;
      map->filesize = size;
    }

    if (map->current > size) {
      /* Clearing asan's bitmask for the region which released in shrinking,
       * since:
       *  - after the shrinking we will get an exception when accessing
       *    this region and (therefore) do not need the help of ASAN.
       *  - this allows us to clear the mask only within the file size
       *    when closing the mapping. */
      MDBX_ASAN_UNPOISON_MEMORY_REGION(
          (char *)map->address + size,
          ((map->current < map->limit) ? map->current : map->limit) - size);
    }
    map->current = size;
  }

  if (limit == map->limit)
    return rc;

  if (limit < map->limit) {
    /* unmap an excess at end of mapping. */
    // coverity[offset_free : FALSE]
    if (unlikely(munmap(map->dxb + limit, map->limit - limit)))
      return errno;
    map->limit = limit;
    return rc;
  }

  int err = check_mmap_limit(limit);
  if (unlikely(err != MDBX_SUCCESS))
    return err;

  assert(limit > map->limit);
  uint8_t *ptr = MAP_FAILED;

#if defined(MREMAP_MAYMOVE)
  /* Strategy 1 (Linux): grow in place (or move, if permitted) via mremap. */
  ptr = mremap(map->address, map->limit, limit,
               (flags & MDBX_MRESIZE_MAY_MOVE) ? MREMAP_MAYMOVE : 0);
  if (ptr == MAP_FAILED) {
    err = errno;
    switch (err) {
    default:
      return err;
    case EAGAIN:
    case ENOMEM:
      return MDBX_UNABLE_EXTEND_MAPSIZE;
    case EFAULT /* MADV_DODUMP / MADV_DONTDUMP are mixed for mmap-range */:
      break;
    }
  }
#endif /* MREMAP_MAYMOVE */

  const unsigned mmap_flags =
      MAP_CONCEAL | MAP_SHARED | MAP_FILE | MAP_NORESERVE |
      (F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0);
  const unsigned mmap_prot =
      (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ;

  if (ptr == MAP_FAILED) {
    /* Strategy 2: try to mmap additional space beyond the end of mapping. */
    ptr = mmap(map->dxb + map->limit, limit - map->limit, mmap_prot,
               mmap_flags | MAP_FIXED_NOREPLACE, map->fd, map->limit);
    if (ptr == map->dxb + map->limit)
      /* landed exactly adjacent — mapping grew contiguously */
      ptr = map->dxb;
    else if (ptr != MAP_FAILED) {
      /* the desired address is busy, unmap unsuitable one */
      if (unlikely(munmap(ptr, limit - map->limit)))
        return errno;
      ptr = MAP_FAILED;
    } else {
      err = errno;
      switch (err) {
      default:
        return err;
      case EAGAIN:
      case ENOMEM:
        return MDBX_UNABLE_EXTEND_MAPSIZE;
      case EEXIST: /* address busy */
      case EINVAL: /* kernel don't support MAP_FIXED_NOREPLACE */
        break;
      }
    }
  }

  if (ptr == MAP_FAILED) {
    /* Strategy 3: unmap and map again whole region */
    if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) {
      /* TODO: Perhaps here it is worth to implement suspend/resume threads
       * and perform unmap/map as like for Windows. */
      return MDBX_UNABLE_EXTEND_MAPSIZE;
    }

    if (unlikely(munmap(map->address, map->limit)))
      return errno;

    /* Without MAY_MOVE, pin the old base with MAP_FIXED_NOREPLACE (falling
     * back to MAP_FIXED where the kernel lacks it, signaled by EINVAL). */
    // coverity[pass_freed_arg : FALSE]
    ptr = mmap(map->address, limit, mmap_prot,
               (flags & MDBX_MRESIZE_MAY_MOVE)
                   ? mmap_flags
                   : mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE
                                                       : MAP_FIXED),
               map->fd, 0);
    if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED &&
        unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) &&
        errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL)
      // coverity[pass_freed_arg : FALSE]
      ptr = mmap(map->address, limit, mmap_prot, mmap_flags | MAP_FIXED,
                 map->fd, 0);

    if (unlikely(ptr == MAP_FAILED)) {
      /* try to restore prev mapping */
      // coverity[pass_freed_arg : FALSE]
      ptr = mmap(map->address, map->limit, mmap_prot,
                 (flags & MDBX_MRESIZE_MAY_MOVE)
                     ? mmap_flags
                     : mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE
                                                         : MAP_FIXED),
                 map->fd, 0);
      if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED &&
          unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) &&
          errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL)
        // coverity[pass_freed_arg : FALSE]
        ptr = mmap(map->address, map->limit, mmap_prot, mmap_flags | MAP_FIXED,
                   map->fd, 0);
      if (unlikely(ptr == MAP_FAILED)) {
        /* Even the restore failed: the mapping is lost. */
        VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current);
        /* Unpoisoning is required for ASAN to avoid false-positive diagnostic
         * when this memory will re-used by malloc or another mmapping.
         * See https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203
         */
        MDBX_ASAN_UNPOISON_MEMORY_REGION(
            map->address,
            (map->current < map->limit) ? map->current : map->limit);
        map->limit = 0;
        map->current = 0;
        map->address = nullptr;
        return errno;
      }
      /* Restored with the old geometry: report the growth failure. */
      rc = MDBX_UNABLE_EXTEND_MAPSIZE;
      limit = map->limit;
    }
  }

  assert(ptr && ptr != MAP_FAILED);
  if (map->address != ptr) {
    /* The mapping moved: transfer the Valgrind/ASAN annotations. */
    VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current);
    /* Unpoisoning is required for ASAN to avoid false-positive diagnostic
     * when this memory will re-used by malloc or another mmapping.
     * See https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 */
    MDBX_ASAN_UNPOISON_MEMORY_REGION(
        map->address, (map->current < map->limit) ? map->current : map->limit);

    VALGRIND_MAKE_MEM_DEFINED(ptr, map->current);
    MDBX_ASAN_UNPOISON_MEMORY_REGION(ptr, map->current);
    map->address = ptr;
  }
  map->limit = limit;

#if MDBX_ENABLE_MADVISE
#ifdef MADV_DONTFORK
  /* Re-apply madvise: a moved/regrown mapping loses previous advice. */
  if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0))
    return errno;
#endif /* MADV_DONTFORK */
#ifdef MADV_NOHUGEPAGE
  (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE);
#endif /* MADV_NOHUGEPAGE */
#endif /* MDBX_ENABLE_MADVISE */

#endif /* POSIX / Windows */

  return rc;
}
27804 
27805 /*----------------------------------------------------------------------------*/
27806 
mdbx_osal_jitter(bool tiny)27807 __cold MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny) {
27808   for (;;) {
27809 #if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) ||                \
27810     defined(__x86_64__)
27811     const unsigned salt = 277u * (unsigned)__rdtsc();
27812 #else
27813     const unsigned salt = rand();
27814 #endif
27815 
27816     const unsigned coin = salt % (tiny ? 29u : 43u);
27817     if (coin < 43 / 3)
27818       break;
27819 #if defined(_WIN32) || defined(_WIN64)
27820     SwitchToThread();
27821     if (coin > 43 * 2 / 3)
27822       Sleep(1);
27823 #else
27824     sched_yield();
27825     if (coin > 43 * 2 / 3)
27826       usleep(coin);
27827 #endif
27828   }
27829 }
27830 
27831 #if defined(_WIN32) || defined(_WIN64)
27832 #elif defined(__APPLE__) || defined(__MACH__)
27833 #include <mach/mach_time.h>
27834 #elif defined(__linux__) || defined(__gnu_linux__)
choice_monoclock(void)27835 __cold static clockid_t choice_monoclock(void) {
27836   struct timespec probe;
27837 #if defined(CLOCK_BOOTTIME)
27838   if (clock_gettime(CLOCK_BOOTTIME, &probe) == 0)
27839     return CLOCK_BOOTTIME;
27840 #elif defined(CLOCK_MONOTONIC_RAW)
27841   if (clock_gettime(CLOCK_MONOTONIC_RAW, &probe) == 0)
27842     return CLOCK_MONOTONIC_RAW;
27843 #elif defined(CLOCK_MONOTONIC_COARSE)
27844   if (clock_gettime(CLOCK_MONOTONIC_COARSE, &probe) == 0)
27845     return CLOCK_MONOTONIC_COARSE;
27846 #endif
27847   return CLOCK_MONOTONIC;
27848 }
27849 #endif
27850 
27851 /*----------------------------------------------------------------------------*/
27852 
#if defined(_WIN32) || defined(_WIN64)
/* QueryPerformanceFrequency() result; zero until first use (lazy init). */
static LARGE_INTEGER performance_frequency;
#elif defined(__APPLE__) || defined(__MACH__)
/* Monotonic-clock units per second, derived lazily via mach_timebase_info(). */
static uint64_t ratio_16dot16_to_monotine;
#endif
27858 
27859 MDBX_INTERNAL_FUNC uint64_t
mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16)27860 mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16) {
27861 #if defined(_WIN32) || defined(_WIN64)
27862   if (unlikely(performance_frequency.QuadPart == 0))
27863     QueryPerformanceFrequency(&performance_frequency);
27864   const uint64_t ratio = performance_frequency.QuadPart;
27865 #elif defined(__APPLE__) || defined(__MACH__)
27866   if (unlikely(ratio_16dot16_to_monotine == 0)) {
27867     mach_timebase_info_data_t ti;
27868     mach_timebase_info(&ti);
27869     ratio_16dot16_to_monotine = UINT64_C(1000000000) * ti.denom / ti.numer;
27870   }
27871   const uint64_t ratio = ratio_16dot16_to_monotine;
27872 #else
27873   const uint64_t ratio = UINT64_C(1000000000);
27874 #endif
27875   const uint64_t ret = (ratio * seconds_16dot16 + 32768) >> 16;
27876   return likely(ret || seconds_16dot16 == 0) ? ret : /* fix underflow */ 1;
27877 }
27878 
mdbx_osal_monotime_to_16dot16(uint64_t monotime)27879 MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime) {
27880   static uint64_t limit;
27881   if (unlikely(monotime > limit)) {
27882     if (limit != 0)
27883       return UINT32_MAX;
27884     limit = mdbx_osal_16dot16_to_monotime(UINT32_MAX - 1);
27885     if (monotime > limit)
27886       return UINT32_MAX;
27887   }
27888   const uint32_t ret =
27889 #if defined(_WIN32) || defined(_WIN64)
27890       (uint32_t)((monotime << 16) / performance_frequency.QuadPart);
27891 #elif defined(__APPLE__) || defined(__MACH__)
27892       (uint32_t)((monotime << 16) / ratio_16dot16_to_monotine);
27893 #else
27894       (uint32_t)(monotime * 128 / 1953125);
27895 #endif
27896   return likely(ret || monotime == 0) ? ret : /* fix underflow */ 1;
27897 }
27898 
mdbx_osal_monotime(void)27899 MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void) {
27900 #if defined(_WIN32) || defined(_WIN64)
27901   LARGE_INTEGER counter;
27902   counter.QuadPart = 0;
27903   QueryPerformanceCounter(&counter);
27904   return counter.QuadPart;
27905 #elif defined(__APPLE__) || defined(__MACH__)
27906   return mach_absolute_time();
27907 #else
27908 
27909 #if defined(__linux__) || defined(__gnu_linux__)
27910   static clockid_t posix_clockid = -1;
27911   if (unlikely(posix_clockid < 0))
27912     posix_clockid = choice_monoclock();
27913 #elif defined(CLOCK_MONOTONIC)
27914 #define posix_clockid CLOCK_MONOTONIC
27915 #else
27916 #define posix_clockid CLOCK_REALTIME
27917 #endif
27918 
27919   struct timespec ts;
27920   if (unlikely(clock_gettime(posix_clockid, &ts) != 0)) {
27921     ts.tv_nsec = 0;
27922     ts.tv_sec = 0;
27923   }
27924   return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec;
27925 #endif
27926 }
27927 
27928 /*----------------------------------------------------------------------------*/
27929 
/* 32-bit rotate-left helper for the PRNG round below. */
static uint32_t bootid_rot32(uint32_t v, unsigned left) {
  return v << left | v >> (32 - left);
}

/* One mixing round of Bob Jenkins's small noncryptographic PRNG:
 * https://burtleburtle.net/bob/rand/smallprng.html */
static void bootid_shake(bin128_t *p) {
  const uint32_t carry = p->a - bootid_rot32(p->b, 23);
  p->a = p->b ^ bootid_rot32(p->c, 16);
  p->b = p->c + bootid_rot32(p->d, 11);
  p->c = p->d + carry;
  p->d = carry + p->a;
}
27938 
/* Folds `n` bytes of entropy from `s` into the 128-bit accumulator `p`,
 * interleaving PRNG rounds so differing inputs diverge quickly.
 *
 * FIX: the "tomfoolery" rotations computed `p->y >> (64 - z)` where
 * z = p->x % 61 (and q = p->x % 59) may be zero, making the shift count 64 —
 * undefined behavior in C. A rotate by zero is the identity, so skipping
 * the rotation when the count is zero preserves the de-facto result on
 * shift-masking CPUs while removing the UB. */
static void bootid_collect(bin128_t *p, const void *s, size_t n) {
  p->y += UINT64_C(64526882297375213);
  bootid_shake(p);
  for (size_t i = 0; i < n; ++i) {
    bootid_shake(p);
    p->y ^= UINT64_C(48797879452804441) * ((const uint8_t *)s)[i];
    bootid_shake(p);
    p->y += 14621231;
  }
  bootid_shake(p);

  /* minor non-linear tomfoolery */
  const unsigned z = p->x % 61;
  if (z) /* avoid UB: `>> 64` when the rotate count is zero */
    p->y = p->y << z | p->y >> (64 - z);
  bootid_shake(p);
  bootid_shake(p);
  const unsigned q = p->x % 59;
  if (q) /* ditto */
    p->y = p->y << q | p->y >> (64 - q);
  bootid_shake(p);
  bootid_shake(p);
  bootid_shake(p);
}
27961 
27962 #if defined(_WIN32) || defined(_WIN64)
27963 
windows_systemtime_ms()27964 static uint64_t windows_systemtime_ms() {
27965   FILETIME ft;
27966   GetSystemTimeAsFileTime(&ft);
27967   return ((uint64_t)ft.dwHighDateTime << 32 | ft.dwLowDateTime) / 10000ul;
27968 }
27969 
/* Estimates the system boot time (ms since the FILETIME epoch) as
 * (system time - uptime), sampling both clocks until the difference is
 * observed stable. Returns 0 when no stable estimate is reached before
 * the loop's time budget expires. */
static uint64_t windows_bootime(void) {
  unsigned confirmed = 0;
  uint64_t boottime = 0;
  uint64_t up0 = mdbx_GetTickCount64();      /* uptime, ms */
  uint64_t st0 = windows_systemtime_ms();    /* wall clock, ms */
  /* `fuse` advances only on confirming samples; the loop gives up roughly
   * 1000*1000/42 ms (~23.8 s) after the last progress — TODO confirm the
   * intended budget behind this constant. */
  for (uint64_t fuse = st0; up0 && st0 < fuse + 1000 * 1000u / 42;) {
    YieldProcessor();
    const uint64_t up1 = mdbx_GetTickCount64();
    const uint64_t st1 = windows_systemtime_ms();
    /* Only accept samples where neither clock ticked between reads,
     * i.e. both values belong to the same instant. */
    if (st1 > fuse && st1 == st0 && up1 == up0) {
      uint64_t diff = st1 - up1;
      if (boottime == diff) {
        /* Require several consecutive identical estimates. */
        if (++confirmed > 4)
          return boottime;
      } else {
        confirmed = 0;
        boottime = diff;
      }
      fuse = st1;
      Sleep(1);
    }
    st0 = st1;
    up0 = up1;
  }
  return 0;
}
27996 
mdbx_RegGetValue(HKEY hKey,LPCSTR lpSubKey,LPCSTR lpValue,PVOID pvData,LPDWORD pcbData)27997 static LSTATUS mdbx_RegGetValue(HKEY hKey, LPCSTR lpSubKey, LPCSTR lpValue,
27998                                 PVOID pvData, LPDWORD pcbData) {
27999   LSTATUS rc;
28000   if (!mdbx_RegGetValueA) {
28001     /* an old Windows 2000/XP */
28002     HKEY hSubKey;
28003     rc = RegOpenKeyA(hKey, lpSubKey, &hSubKey);
28004     if (rc == ERROR_SUCCESS) {
28005       rc = RegQueryValueExA(hSubKey, lpValue, NULL, NULL, pvData, pcbData);
28006       RegCloseKey(hSubKey);
28007     }
28008     return rc;
28009   }
28010 
28011   rc = mdbx_RegGetValueA(hKey, lpSubKey, lpValue, RRF_RT_ANY, NULL, pvData,
28012                          pcbData);
28013   if (rc != ERROR_FILE_NOT_FOUND)
28014     return rc;
28015 
28016   rc = mdbx_RegGetValueA(hKey, lpSubKey, lpValue,
28017                          RRF_RT_ANY | 0x00010000 /* RRF_SUBKEY_WOW6464KEY */,
28018                          NULL, pvData, pcbData);
28019   if (rc != ERROR_FILE_NOT_FOUND)
28020     return rc;
28021   return mdbx_RegGetValueA(hKey, lpSubKey, lpValue,
28022                            RRF_RT_ANY | 0x00020000 /* RRF_SUBKEY_WOW6432KEY */,
28023                            NULL, pvData, pcbData);
28024 }
28025 #endif
28026 
/* Mixes a boot/machine identifier of `n` bytes at `p` into `s`.
 * First tries to parse it as a textual UUID (hex digits, any separators);
 * otherwise folds it in as raw binary. Returns true when the input looked
 * like a complete UUID or was long enough to be useful as binary. */
__cold MDBX_MAYBE_UNUSED static bool
bootid_parse_uuid(bin128_t *s, const void *p, const size_t n) {
  if (n > 31) {
    unsigned bits = 0;
    for (unsigned i = 0; i < n; ++i) /* try parse an UUID in text form */ {
      uint8_t c = ((const uint8_t *)p)[i];
      if (c >= '0' && c <= '9')
        c -= '0';
      else if (c >= 'a' && c <= 'f')
        c -= 'a' - 10;
      else if (c >= 'A' && c <= 'F')
        c -= 'A' - 10;
      else
        continue; /* skip separators ('-', spaces, etc.) */
      assert(c <= 15);
      /* Shift the accumulated nibbles left by 4 across the 128-bit pair
       * (y:x), feeding the outgoing top nibble of y back into the new one. */
      c ^= s->y >> 60;
      s->y = s->y << 4 | s->x >> 60;
      s->x = s->x << 4 | c;
      bits += 4;
    }
    /* 42*3 == 126: a full textual UUID contributes 32 nibbles == 128 bits. */
    if (bits > 42 * 3)
      /* UUID parsed successfully */
      return true;
  }

  if (n > 15) /* is enough handle it as a binary? */ {
    if (n == sizeof(bin128_t)) {
      /* `p` may be unaligned — copy before the 64-bit accesses. */
      bin128_t aligned;
      memcpy(&aligned, p, sizeof(bin128_t));
      s->x += aligned.x;
      s->y += aligned.y;
    } else
      bootid_collect(s, p, n);
    return true;
  }

  /* Too short to be trustworthy, but still mix it in. */
  if (n)
    bootid_collect(s, p, n);
  return false;
}
28067 
mdbx_osal_bootid(void)28068 __cold MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void) {
28069   bin128_t bin = {{0, 0}};
28070   bool got_machineid = false, got_boottime = false, got_bootseq = false;
28071 
28072 #if defined(__linux__) || defined(__gnu_linux__)
28073   {
28074     const int fd =
28075         open("/proc/sys/kernel/random/boot_id", O_RDONLY | O_NOFOLLOW);
28076     if (fd != -1) {
28077       struct statfs fs;
28078       char buf[42];
28079       const ssize_t len =
28080           (fstatfs(fd, &fs) == 0 && fs.f_type == /* procfs */ 0x9FA0)
28081               ? read(fd, buf, sizeof(buf))
28082               : -1;
28083       const int err = close(fd);
28084       assert(err == 0);
28085       (void)err;
28086       if (len > 0 && bootid_parse_uuid(&bin, buf, len))
28087         return bin;
28088     }
28089   }
28090 #endif /* Linux */
28091 
28092 #if defined(__APPLE__) || defined(__MACH__)
28093   {
28094     char buf[42];
28095     size_t len = sizeof(buf);
28096     if (!sysctlbyname("kern.bootsessionuuid", buf, &len, nullptr, 0) &&
28097         bootid_parse_uuid(&bin, buf, len))
28098       return bin;
28099 
28100 #if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) &&                                \
28101     __MAC_OS_X_VERSION_MIN_REQUIRED > 1050
28102     uuid_t uuid;
28103     struct timespec wait = {0, 1000000000u / 42};
28104     if (!gethostuuid(uuid, &wait) &&
28105         bootid_parse_uuid(&bin, uuid, sizeof(uuid)))
28106       got_machineid = true;
28107 #endif /* > 10.5 */
28108 
28109     struct timeval boottime;
28110     len = sizeof(boottime);
28111     if (!sysctlbyname("kern.boottime", &boottime, &len, nullptr, 0) &&
28112         len == sizeof(boottime) && boottime.tv_sec)
28113       got_boottime = true;
28114   }
28115 #endif /* Apple/Darwin */
28116 
28117 #if defined(_WIN32) || defined(_WIN64)
28118   {
28119     union buf {
28120       DWORD BootId;
28121       DWORD BaseTime;
28122       SYSTEM_TIMEOFDAY_INFORMATION SysTimeOfDayInfo;
28123       struct {
28124         LARGE_INTEGER BootTime;
28125         LARGE_INTEGER CurrentTime;
28126         LARGE_INTEGER TimeZoneBias;
28127         ULONG TimeZoneId;
28128         ULONG Reserved;
28129         ULONGLONG BootTimeBias;
28130         ULONGLONG SleepTimeBias;
28131       } SysTimeOfDayInfoHacked;
28132       wchar_t MachineGuid[42];
28133       char DigitalProductId[248];
28134     } buf;
28135 
28136     static const char HKLM_MicrosoftCryptography[] =
28137         "SOFTWARE\\Microsoft\\Cryptography";
28138     DWORD len = sizeof(buf);
28139     /* Windows is madness and must die */
28140     if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_MicrosoftCryptography,
28141                          "MachineGuid", &buf.MachineGuid,
28142                          &len) == ERROR_SUCCESS &&
28143         len < sizeof(buf))
28144       got_machineid = bootid_parse_uuid(&bin, &buf.MachineGuid, len);
28145 
28146     if (!got_machineid) {
28147       /* again, Windows is madness */
28148       static const char HKLM_WindowsNT[] =
28149           "SOFTWARE\\Microsoft\\Windows NT\\CurrentVersion";
28150       static const char HKLM_WindowsNT_DPK[] =
28151           "SOFTWARE\\Microsoft\\Windows "
28152           "NT\\CurrentVersion\\DefaultProductKey";
28153       static const char HKLM_WindowsNT_DPK2[] =
28154           "SOFTWARE\\Microsoft\\Windows "
28155           "NT\\CurrentVersion\\DefaultProductKey2";
28156 
28157       len = sizeof(buf);
28158       if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT,
28159                            "DigitalProductId", &buf.DigitalProductId,
28160                            &len) == ERROR_SUCCESS &&
28161           len > 42 && len < sizeof(buf)) {
28162         bootid_collect(&bin, &buf.DigitalProductId, len);
28163         got_machineid = true;
28164       }
28165       len = sizeof(buf);
28166       if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT_DPK,
28167                            "DigitalProductId", &buf.DigitalProductId,
28168                            &len) == ERROR_SUCCESS &&
28169           len > 42 && len < sizeof(buf)) {
28170         bootid_collect(&bin, &buf.DigitalProductId, len);
28171         got_machineid = true;
28172       }
28173       len = sizeof(buf);
28174       if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT_DPK2,
28175                            "DigitalProductId", &buf.DigitalProductId,
28176                            &len) == ERROR_SUCCESS &&
28177           len > 42 && len < sizeof(buf)) {
28178         bootid_collect(&bin, &buf.DigitalProductId, len);
28179         got_machineid = true;
28180       }
28181     }
28182 
28183     static const char HKLM_PrefetcherParams[] =
28184         "SYSTEM\\CurrentControlSet\\Control\\Session Manager\\Memory "
28185         "Management\\PrefetchParameters";
28186     len = sizeof(buf);
28187     if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_PrefetcherParams, "BootId",
28188                          &buf.BootId, &len) == ERROR_SUCCESS &&
28189         len > 1 && len < sizeof(buf)) {
28190       bootid_collect(&bin, &buf.BootId, len);
28191       got_bootseq = true;
28192     }
28193 
28194     len = sizeof(buf);
28195     if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_PrefetcherParams, "BaseTime",
28196                          &buf.BaseTime, &len) == ERROR_SUCCESS &&
28197         len >= sizeof(buf.BaseTime) && buf.BaseTime) {
28198       bootid_collect(&bin, &buf.BaseTime, len);
28199       got_boottime = true;
28200     }
28201 
28202     /* BootTime from SYSTEM_TIMEOFDAY_INFORMATION */
28203     NTSTATUS status = NtQuerySystemInformation(
        0x03 /* SystemTimeOfDayInformation */, &buf.SysTimeOfDayInfo,
28205         sizeof(buf.SysTimeOfDayInfo), &len);
28206     if (NT_SUCCESS(status) &&
28207         len >= offsetof(union buf, SysTimeOfDayInfoHacked.BootTimeBias) +
28208                    sizeof(buf.SysTimeOfDayInfoHacked.BootTimeBias) &&
28209         buf.SysTimeOfDayInfoHacked.BootTime.QuadPart) {
28210       const uint64_t UnbiasedBootTime =
28211           buf.SysTimeOfDayInfoHacked.BootTime.QuadPart -
28212           buf.SysTimeOfDayInfoHacked.BootTimeBias;
28213       if (UnbiasedBootTime) {
28214         bootid_collect(&bin, &UnbiasedBootTime, sizeof(UnbiasedBootTime));
28215         got_boottime = true;
28216       }
28217     }
28218 
28219     if (!got_boottime) {
28220       uint64_t boottime = windows_bootime();
28221       if (boottime) {
28222         bootid_collect(&bin, &boottime, sizeof(boottime));
28223         got_boottime = true;
28224       }
28225     }
28226   }
28227 #endif /* Windows */
28228 
28229 #if defined(CTL_HW) && defined(HW_UUID)
28230   if (!got_machineid) {
28231     static const int mib[] = {CTL_HW, HW_UUID};
28232     char buf[42];
28233     size_t len = sizeof(buf);
28234     if (sysctl(
28235 #ifdef SYSCTL_LEGACY_NONCONST_MIB
28236             (int *)
28237 #endif
28238                 mib,
28239             ARRAY_LENGTH(mib), &buf, &len, NULL, 0) == 0)
28240       got_machineid = bootid_parse_uuid(&bin, buf, len);
28241   }
28242 #endif /* CTL_HW && HW_UUID */
28243 
28244 #if defined(CTL_KERN) && defined(KERN_HOSTUUID)
28245   if (!got_machineid) {
28246     static const int mib[] = {CTL_KERN, KERN_HOSTUUID};
28247     char buf[42];
28248     size_t len = sizeof(buf);
28249     if (sysctl(
28250 #ifdef SYSCTL_LEGACY_NONCONST_MIB
28251             (int *)
28252 #endif
28253                 mib,
28254             ARRAY_LENGTH(mib), &buf, &len, NULL, 0) == 0)
28255       got_machineid = bootid_parse_uuid(&bin, buf, len);
28256   }
28257 #endif /* CTL_KERN && KERN_HOSTUUID */
28258 
28259 #if defined(__NetBSD__)
28260   if (!got_machineid) {
28261     char buf[42];
28262     size_t len = sizeof(buf);
28263     if (sysctlbyname("machdep.dmi.system-uuid", buf, &len, NULL, 0) == 0)
28264       got_machineid = bootid_parse_uuid(&bin, buf, len);
28265   }
28266 #endif /* __NetBSD__ */
28267 
28268 #if _XOPEN_SOURCE_EXTENDED
28269   if (!got_machineid) {
28270     const int hostid = gethostid();
28271     if (hostid > 0) {
28272       bootid_collect(&bin, &hostid, sizeof(hostid));
28273       got_machineid = true;
28274     }
28275   }
28276 #endif /* _XOPEN_SOURCE_EXTENDED */
28277 
28278   if (!got_machineid) {
28279   lack:
28280     bin.x = bin.y = 0;
28281     return bin;
28282   }
28283 
28284   /*--------------------------------------------------------------------------*/
28285 
28286 #if defined(CTL_KERN) && defined(KERN_BOOTTIME)
28287   if (!got_boottime) {
28288     static const int mib[] = {CTL_KERN, KERN_BOOTTIME};
28289     struct timeval boottime;
28290     size_t len = sizeof(boottime);
28291     if (sysctl(
28292 #ifdef SYSCTL_LEGACY_NONCONST_MIB
28293             (int *)
28294 #endif
28295                 mib,
28296             ARRAY_LENGTH(mib), &boottime, &len, NULL, 0) == 0 &&
28297         len == sizeof(boottime) && boottime.tv_sec) {
28298       bootid_collect(&bin, &boottime, len);
28299       got_boottime = true;
28300     }
28301   }
28302 #endif /* CTL_KERN && KERN_BOOTTIME */
28303 
28304 #if defined(__sun) || defined(__SVR4) || defined(__svr4__)
28305   if (!got_boottime) {
28306     kstat_ctl_t *kc = kstat_open();
28307     if (kc) {
28308       kstat_t *kp = kstat_lookup(kc, "unix", 0, "system_misc");
28309       if (kp && kstat_read(kc, kp, 0) != -1) {
28310         kstat_named_t *kn = (kstat_named_t *)kstat_data_lookup(kp, "boot_time");
28311         if (kn) {
28312           switch (kn->data_type) {
28313           case KSTAT_DATA_INT32:
28314           case KSTAT_DATA_UINT32:
28315             bootid_collect(&bin, &kn->value, sizeof(int32_t));
28316             got_boottime = true;
28317           case KSTAT_DATA_INT64:
28318           case KSTAT_DATA_UINT64:
28319             bootid_collect(&bin, &kn->value, sizeof(int64_t));
28320             got_boottime = true;
28321           }
28322         }
28323       }
28324       kstat_close(kc);
28325     }
28326   }
28327 #endif /* SunOS / Solaris */
28328 
28329 #if _XOPEN_SOURCE_EXTENDED && defined(BOOT_TIME)
28330   if (!got_boottime) {
28331     setutxent();
28332     const struct utmpx id = {.ut_type = BOOT_TIME};
28333     const struct utmpx *entry = getutxid(&id);
28334     if (entry) {
28335       bootid_collect(&bin, entry, sizeof(*entry));
28336       got_boottime = true;
28337       while (unlikely((entry = getutxid(&id)) != nullptr)) {
28338         /* have multiple reboot records, assuming we can distinguish next
28339          * bootsession even if RTC is wrong or absent */
28340         bootid_collect(&bin, entry, sizeof(*entry));
28341         got_bootseq = true;
28342       }
28343     }
28344     endutxent();
28345   }
28346 #endif /* _XOPEN_SOURCE_EXTENDED && BOOT_TIME */
28347 
28348   if (!got_bootseq) {
28349     if (!got_boottime || !MDBX_TRUST_RTC)
28350       goto lack;
28351 
28352 #if defined(_WIN32) || defined(_WIN64)
28353     FILETIME now;
28354     GetSystemTimeAsFileTime(&now);
28355     if (0x1CCCCCC > now.dwHighDateTime)
28356 #else
28357     struct timespec mono, real;
28358     if (clock_gettime(CLOCK_MONOTONIC, &mono) ||
28359         clock_gettime(CLOCK_REALTIME, &real) ||
28360         /* wrong time, RTC is mad or absent */
28361         1555555555l > real.tv_sec ||
28362         /* seems no adjustment by RTC/NTP, i.e. a fake time */
28363         real.tv_sec < mono.tv_sec || 1234567890l > real.tv_sec - mono.tv_sec ||
28364         (real.tv_sec - mono.tv_sec) % 900u == 0)
28365 #endif
28366       goto lack;
28367   }
28368 
28369   return bin;
28370 }
28371 
mdbx_get_sysraminfo(intptr_t * page_size,intptr_t * total_pages,intptr_t * avail_pages)28372 __cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages,
28373                                intptr_t *avail_pages) {
28374   if (!page_size && !total_pages && !avail_pages)
28375     return MDBX_EINVAL;
28376   if (total_pages)
28377     *total_pages = -1;
28378   if (avail_pages)
28379     *avail_pages = -1;
28380 
28381   const intptr_t pagesize = mdbx_syspagesize();
28382   if (page_size)
28383     *page_size = pagesize;
28384   if (unlikely(pagesize < MIN_PAGESIZE || !is_powerof2(pagesize)))
28385     return MDBX_INCOMPATIBLE;
28386 
28387   MDBX_MAYBE_UNUSED const int log2page = log2n_powerof2(pagesize);
28388   assert(pagesize == (INT64_C(1) << log2page));
28389   (void)log2page;
28390 
28391 #if defined(_WIN32) || defined(_WIN64)
28392   MEMORYSTATUSEX info;
28393   memset(&info, 0, sizeof(info));
28394   info.dwLength = sizeof(info);
28395   if (!GlobalMemoryStatusEx(&info))
28396     return (int)GetLastError();
28397 #endif
28398 
28399   if (total_pages) {
28400 #if defined(_WIN32) || defined(_WIN64)
28401     const intptr_t total_ram_pages = (intptr_t)(info.ullTotalPhys >> log2page);
28402 #elif defined(_SC_PHYS_PAGES)
28403     const intptr_t total_ram_pages = sysconf(_SC_PHYS_PAGES);
28404     if (total_ram_pages == -1)
28405       return errno;
28406 #elif defined(_SC_AIX_REALMEM)
28407     const intptr_t total_ram_Kb = sysconf(_SC_AIX_REALMEM);
28408     if (total_ram_Kb == -1)
28409       return errno;
28410     const intptr_t total_ram_pages = (total_ram_Kb << 10) >> log2page;
28411 #elif defined(HW_USERMEM) || defined(HW_PHYSMEM64) || defined(HW_MEMSIZE) ||   \
28412     defined(HW_PHYSMEM)
28413     size_t ram, len = sizeof(ram);
28414     static const int mib[] = {
28415       CTL_HW,
28416 #if defined(HW_USERMEM)
28417       HW_USERMEM
28418 #elif defined(HW_PHYSMEM64)
28419       HW_PHYSMEM64
28420 #elif defined(HW_MEMSIZE)
28421       HW_MEMSIZE
28422 #else
28423       HW_PHYSMEM
28424 #endif
28425     };
28426     if (sysctl(
28427 #ifdef SYSCTL_LEGACY_NONCONST_MIB
28428             (int *)
28429 #endif
28430                 mib,
28431             ARRAY_LENGTH(mib), &ram, &len, NULL, 0) != 0)
28432       return errno;
28433     if (len != sizeof(ram))
28434       return MDBX_ENOSYS;
28435     const intptr_t total_ram_pages = (intptr_t)(ram >> log2page);
28436 #else
28437 #error "FIXME: Get User-accessible or physical RAM"
28438 #endif
28439     *total_pages = total_ram_pages;
28440     if (total_ram_pages < 1)
28441       return MDBX_ENOSYS;
28442   }
28443 
28444   if (avail_pages) {
28445 #if defined(_WIN32) || defined(_WIN64)
28446     const intptr_t avail_ram_pages = (intptr_t)(info.ullAvailPhys >> log2page);
28447 #elif defined(_SC_AVPHYS_PAGES)
28448     const intptr_t avail_ram_pages = sysconf(_SC_AVPHYS_PAGES);
28449     if (avail_ram_pages == -1)
28450       return errno;
28451 #elif defined(__MACH__)
28452     mach_msg_type_number_t count = HOST_VM_INFO_COUNT;
28453     vm_statistics_data_t vmstat;
28454     mach_port_t mport = mach_host_self();
28455     kern_return_t kerr = host_statistics(mach_host_self(), HOST_VM_INFO,
28456                                          (host_info_t)&vmstat, &count);
28457     mach_port_deallocate(mach_task_self(), mport);
28458     if (unlikely(kerr != KERN_SUCCESS))
28459       return MDBX_ENOSYS;
28460     const intptr_t avail_ram_pages = vmstat.free_count;
28461 #elif defined(VM_TOTAL) || defined(VM_METER)
28462     struct vmtotal info;
28463     size_t len = sizeof(info);
28464     static const int mib[] = {
28465       CTL_VM,
28466 #if defined(VM_TOTAL)
28467       VM_TOTAL
28468 #elif defined(VM_METER)
28469       VM_METER
28470 #endif
28471     };
28472     if (sysctl(
28473 #ifdef SYSCTL_LEGACY_NONCONST_MIB
28474             (int *)
28475 #endif
28476                 mib,
28477             ARRAY_LENGTH(mib), &info, &len, NULL, 0) != 0)
28478       return errno;
28479     if (len != sizeof(info))
28480       return MDBX_ENOSYS;
28481     const intptr_t avail_ram_pages = info.t_free;
28482 #else
28483 #error "FIXME: Get Available RAM"
28484 #endif
28485     *avail_pages = avail_ram_pages;
28486     if (avail_ram_pages < 1)
28487       return MDBX_ENOSYS;
28488   }
28489 
28490   return MDBX_SUCCESS;
28491 }
28492 /* This is CMake-template for libmdbx's version.c
28493  ******************************************************************************/
28494 
28495 
28496 #if MDBX_VERSION_MAJOR != 0 ||                             \
28497     MDBX_VERSION_MINOR != 11
28498 #error "API version mismatch! Had `git fetch --tags` done?"
28499 #endif
28500 
/* Build fingerprint (stringified MDBX_BUILD_SOURCERY) used as an anchor to
 * verify that a linked library matches the headers it was built against. */
static const char sourcery[] = MDBX_STRINGIFY(MDBX_BUILD_SOURCERY);

/* Exported version information. The used/externally_visible attributes
 * keep the symbol alive even when nothing in this translation unit
 * references it (e.g. under LTO/GC-sections). */
__dll_export
#ifdef __attribute_used__
    __attribute_used__
#elif defined(__GNUC__) || __has_attribute(__used__)
    __attribute__((__used__))
#endif
#ifdef __attribute_externally_visible__
        __attribute_externally_visible__
#elif (defined(__GNUC__) && !defined(__clang__)) ||                            \
    __has_attribute(__externally_visible__)
    __attribute__((__externally_visible__))
#endif
    const struct MDBX_version_info mdbx_version = {
        0,  /* major — matches the MDBX_VERSION_MAJOR guard above */
        11, /* minor — matches the MDBX_VERSION_MINOR guard above */
        2,  /* release (per the "v0.11.2-0-gd47eed0" describe string) */
        0,  /* revision */
        /* git info: presumably {datetime, tree, commit, describe} —
         * confirm field names against MDBX_version_info in mdbx.h */
        {"2021-12-02T21:55:52+03:00", "6698013814a3d9954c5af147be987ff986f0224a", "d47eed079e71062ef5dd41a147df060ad13d42b2",
         "v0.11.2-0-gd47eed0"},
        sourcery};
28523 
/* Exported pointer to the build-fingerprint string. Linking against this
 * anchor lets a client detect a mismatch between the library binary and
 * the amalgamated sources/headers it was generated from. Same
 * used/externally_visible treatment as for mdbx_version. */
__dll_export
#ifdef __attribute_used__
    __attribute_used__
#elif defined(__GNUC__) || __has_attribute(__used__)
    __attribute__((__used__))
#endif
#ifdef __attribute_externally_visible__
        __attribute_externally_visible__
#elif (defined(__GNUC__) && !defined(__clang__)) ||                            \
    __has_attribute(__externally_visible__)
    __attribute__((__externally_visible__))
#endif
    const char *const mdbx_sourcery_anchor = sourcery;
28537 /*
28538  * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>
28539  * and other libmdbx authors: please see AUTHORS file.
28540  * All rights reserved.
28541  *
28542  * Redistribution and use in source and binary forms, with or without
28543  * modification, are permitted only as authorized by the OpenLDAP
28544  * Public License.
28545  *
28546  * A copy of this license is available in the file LICENSE in the
28547  * top-level directory of the distribution or, alternatively, at
28548  * <http://www.OpenLDAP.org/license.html>.
28549  */
28550 
28551 #if defined(_WIN32) || defined(_WIN64) /* Windows LCK-implementation */
28552 
28553 /* PREAMBLE FOR WINDOWS:
28554  *
28555  * We are not concerned for performance here.
28556  * If you are running Windows a performance could NOT be the goal.
28557  * Otherwise please use Linux. */
28558 
28559 
28560 static void mdbx_winnt_import(void);
28561 
#if MDBX_BUILD_SHARED_LIBRARY
#if MDBX_WITHOUT_MSVC_CRT && defined(NDEBUG)
/* DEBUG/CHECKED builds still require MSVC's CRT for runtime checks.
 *
 * Define dll's entry point only for Release build when NDEBUG is defined and
 * MDBX_WITHOUT_MSVC_CRT=ON. if the entry point isn't defined then MSVC's will
 * automatically use DllMainCRTStartup() from CRT library, which also
 * automatically call DllMain() from our mdbx.dll */
#pragma comment(linker, "/ENTRY:DllMain")
#endif /* MDBX_WITHOUT_MSVC_CRT */

/* Process/thread lifecycle handler: compiled either as the DLL entry point
 * (shared-library build) or as a TLS-callback routine (static build, see
 * the .CRT$XLB registration below). Runs the one-time global init on
 * process attach, the global destructor on process detach, and the
 * per-thread reader-slot destructor on thread detach. */
BOOL APIENTRY DllMain(HANDLE module, DWORD reason, LPVOID reserved)
#else
#if !MDBX_MANUAL_MODULE_HANDLER
static
#endif /* !MDBX_MANUAL_MODULE_HANDLER */
    void NTAPI
    mdbx_module_handler(PVOID module, DWORD reason, PVOID reserved)
#endif /* MDBX_BUILD_SHARED_LIBRARY */
{
  (void)reserved;
  switch (reason) {
  case DLL_PROCESS_ATTACH:
    /* resolve dynamically-imported NT/kernel32 entry points first, then
     * initialize the reader-thread-cleanup machinery */
    mdbx_winnt_import();
    mdbx_rthc_global_init();
    break;
  case DLL_PROCESS_DETACH:
    mdbx_rthc_global_dtor();
    break;

  case DLL_THREAD_ATTACH:
    break;
  case DLL_THREAD_DETACH:
    /* release per-thread resources for the detaching thread */
    mdbx_rthc_thread_dtor(module);
    break;
  }
#if MDBX_BUILD_SHARED_LIBRARY
  return TRUE;
#endif
}
28602 
28603 #if !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER
28604 /* *INDENT-OFF* */
28605 /* clang-format off */
28606 #if defined(_MSC_VER)
28607 #  pragma const_seg(push)
28608 #  pragma data_seg(push)
28609 
28610 #  ifdef _WIN64
28611      /* kick a linker to create the TLS directory if not already done */
28612 #    pragma comment(linker, "/INCLUDE:_tls_used")
28613      /* Force some symbol references. */
28614 #    pragma comment(linker, "/INCLUDE:mdbx_tls_anchor")
28615      /* specific const-segment for WIN64 */
28616 #    pragma const_seg(".CRT$XLB")
28617      const
28618 #  else
28619      /* kick a linker to create the TLS directory if not already done */
28620 #    pragma comment(linker, "/INCLUDE:__tls_used")
28621      /* Force some symbol references. */
28622 #    pragma comment(linker, "/INCLUDE:_mdbx_tls_anchor")
28623      /* specific data-segment for WIN32 */
28624 #    pragma data_seg(".CRT$XLB")
28625 #  endif
28626 
28627    __declspec(allocate(".CRT$XLB")) PIMAGE_TLS_CALLBACK mdbx_tls_anchor = mdbx_module_handler;
28628 #  pragma data_seg(pop)
28629 #  pragma const_seg(pop)
28630 
28631 #elif defined(__GNUC__)
28632 #  ifdef _WIN64
28633      const
28634 #  endif
28635    PIMAGE_TLS_CALLBACK mdbx_tls_anchor __attribute__((__section__(".CRT$XLB"), used)) = mdbx_module_handler;
28636 #else
28637 #  error FIXME
28638 #endif
28639 /* *INDENT-ON* */
28640 /* clang-format on */
28641 #endif /* !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER */
28642 
28643 /*----------------------------------------------------------------------------*/
28644 
28645 #define LCK_SHARED 0
28646 #define LCK_EXCLUSIVE LOCKFILE_EXCLUSIVE_LOCK
28647 #define LCK_WAITFOR 0
28648 #define LCK_DONTWAIT LOCKFILE_FAIL_IMMEDIATELY
28649 
flock(mdbx_filehandle_t fd,DWORD flags,uint64_t offset,size_t bytes)28650 static __inline BOOL flock(mdbx_filehandle_t fd, DWORD flags, uint64_t offset,
28651                            size_t bytes) {
28652   OVERLAPPED ov;
28653   ov.hEvent = 0;
28654   ov.Offset = (DWORD)offset;
28655   ov.OffsetHigh = HIGH_DWORD(offset);
28656   return LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov);
28657 }
28658 
funlock(mdbx_filehandle_t fd,uint64_t offset,size_t bytes)28659 static __inline BOOL funlock(mdbx_filehandle_t fd, uint64_t offset,
28660                              size_t bytes) {
28661   return UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes,
28662                     HIGH_DWORD(bytes));
28663 }
28664 
28665 /*----------------------------------------------------------------------------*/
/* global `write` lock for write-txn processing,
 * exclusive locking both meta-pages */
28668 
28669 #define LCK_MAXLEN (1u + ((~(size_t)0) >> 1))
28670 #define LCK_META_OFFSET 0
28671 #define LCK_META_LEN (MAX_PAGESIZE * NUM_METAS)
28672 #define LCK_BODY_OFFSET LCK_META_LEN
28673 #define LCK_BODY_LEN (LCK_MAXLEN - LCK_BODY_OFFSET)
28674 #define LCK_BODY LCK_BODY_OFFSET, LCK_BODY_LEN
28675 #define LCK_WHOLE 0, LCK_MAXLEN
28676 
/* Acquires the global write-transaction lock: first a process-local
 * critical section (the "windowsbug" workaround lock), then an exclusive
 * OS-level file lock over the DB body.
 *
 * \param env       The environment handle.
 * \param dontwait  When true, fail immediately with MDBX_BUSY instead of
 *                  blocking on either the critical section or the file lock.
 *
 * \returns MDBX_SUCCESS, MDBX_BUSY (dontwait case), ERROR_POSSIBLE_DEADLOCK,
 *          or a system error code from LockFileEx. */
int mdbx_txn_lock(MDBX_env *env, bool dontwait) {
  if (dontwait) {
    if (!TryEnterCriticalSection(&env->me_windowsbug_lock))
      return MDBX_BUSY;
  } else {
    __try {
      EnterCriticalSection(&env->me_windowsbug_lock);
    }
    /* EnterCriticalSection may raise STATUS_POSSIBLE_DEADLOCK; convert it
     * into an error code instead of letting the exception propagate. */
    __except ((GetExceptionCode() ==
                 0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */)
                    ? EXCEPTION_EXECUTE_HANDLER
                    : EXCEPTION_CONTINUE_SEARCH) {
      return ERROR_POSSIBLE_DEADLOCK;
    }
  }

  /* in exclusive mode the whole file is owned already,
   * so the OS-level body lock may be skipped */
  if ((env->me_flags & MDBX_EXCLUSIVE) ||
      flock(env->me_lazy_fd,
            dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT)
                     : (LCK_EXCLUSIVE | LCK_WAITFOR),
            LCK_BODY))
    return MDBX_SUCCESS;
  /* flock failed: release the critical section and map the error;
   * a lock-violation in dontwait mode just means "busy" */
  int rc = (int)GetLastError();
  LeaveCriticalSection(&env->me_windowsbug_lock);
  return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? rc : MDBX_BUSY;
}
28703 
/* Releases the global write-transaction lock taken by mdbx_txn_lock():
 * drops the OS-level body lock (skipped in exclusive mode), then leaves
 * the process-local critical section. Panics if the unlock fails. */
void mdbx_txn_unlock(MDBX_env *env) {
  int ok = TRUE;
  if ((env->me_flags & MDBX_EXCLUSIVE) == 0)
    ok = funlock(env->me_lazy_fd, LCK_BODY);
  LeaveCriticalSection(&env->me_windowsbug_lock);
  if (!ok)
    mdbx_panic("%s failed: err %u", __func__, (int)GetLastError());
}
28712 
28713 /*----------------------------------------------------------------------------*/
28714 /* global `read` lock for readers registration,
28715  * exclusive locking `mti_numreaders` (second) cacheline */
28716 
28717 #define LCK_LO_OFFSET 0
28718 #define LCK_LO_LEN offsetof(MDBX_lockinfo, mti_numreaders)
28719 #define LCK_UP_OFFSET LCK_LO_LEN
28720 #define LCK_UP_LEN (sizeof(MDBX_lockinfo) - LCK_UP_OFFSET)
28721 #define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN
28722 #define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN
28723 
mdbx_rdt_lock(MDBX_env * env)28724 MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) {
28725   mdbx_srwlock_AcquireShared(&env->me_remap_guard);
28726   if (env->me_lfd == INVALID_HANDLE_VALUE)
28727     return MDBX_SUCCESS; /* readonly database in readonly filesystem */
28728 
28729   /* transition from S-? (used) to S-E (locked),
28730    * e.g. exclusive lock upper-part */
28731   if ((env->me_flags & MDBX_EXCLUSIVE) ||
28732       flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER))
28733     return MDBX_SUCCESS;
28734 
28735   int rc = (int)GetLastError();
28736   mdbx_srwlock_ReleaseShared(&env->me_remap_guard);
28737   return rc;
28738 }
28739 
mdbx_rdt_unlock(MDBX_env * env)28740 MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) {
28741   if (env->me_lfd != INVALID_HANDLE_VALUE) {
28742     /* transition from S-E (locked) to S-? (used), e.g. unlock upper-part */
28743     if ((env->me_flags & MDBX_EXCLUSIVE) == 0 &&
28744         !funlock(env->me_lfd, LCK_UPPER))
28745       mdbx_panic("%s failed: err %u", __func__, (int)GetLastError());
28746   }
28747   mdbx_srwlock_ReleaseShared(&env->me_remap_guard);
28748 }
28749 
mdbx_lockfile(mdbx_filehandle_t fd,bool wait)28750 MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait) {
28751   return flock(fd,
28752                wait ? LCK_EXCLUSIVE | LCK_WAITFOR
28753                     : LCK_EXCLUSIVE | LCK_DONTWAIT,
28754                0, LCK_MAXLEN)
28755              ? MDBX_SUCCESS
28756              : (int)GetLastError();
28757 }
28758 
/* Opens and suspends the thread with the given id, appending its handle to
 * `*array`. The array grows by doubling; the very first instance lives on
 * the caller's stack, so it is copied (never realloc'ed/freed) on the first
 * growth. A thread that is already gone — SuspendThread fails and the
 * thread is not STILL_ACTIVE, or the Win10 UCRT ERROR_ACCESS_DENIED quirk —
 * is treated as success, since there is nothing left to suspend.
 *
 * \returns MDBX_SUCCESS, MDBX_ENOMEM, or a system error code. */
static int suspend_and_append(mdbx_handle_array_t **array,
                              const DWORD ThreadId) {
  const unsigned limit = (*array)->limit;
  if ((*array)->count == limit) {
    /* grow to twice the capacity; pass NULL to realloc while the array is
     * still the initial on-stack one, then copy its contents over */
    void *ptr = mdbx_realloc(
        (limit > ARRAY_LENGTH((*array)->handles))
            ? *array
            : /* don't free initial array on the stack */ NULL,
        sizeof(mdbx_handle_array_t) +
            sizeof(HANDLE) * (limit * 2 - ARRAY_LENGTH((*array)->handles)));
    if (!ptr)
      return MDBX_ENOMEM;
    if (limit == ARRAY_LENGTH((*array)->handles))
      memcpy(ptr, *array, sizeof(mdbx_handle_array_t));
    *array = (mdbx_handle_array_t *)ptr;
    (*array)->limit = limit * 2;
  }

  HANDLE hThread = OpenThread(THREAD_SUSPEND_RESUME | THREAD_QUERY_INFORMATION,
                              FALSE, ThreadId);
  if (hThread == NULL)
    return (int)GetLastError();

  if (SuspendThread(hThread) == (DWORD)-1) {
    int err = (int)GetLastError();
    DWORD ExitCode;
    /* an already-exited thread is not an error — nothing to suspend */
    if (err == /* workaround for Win10 UCRT bug */ ERROR_ACCESS_DENIED ||
        !GetExitCodeThread(hThread, &ExitCode) || ExitCode != STILL_ACTIVE)
      err = MDBX_SUCCESS;
    CloseHandle(hThread);
    return err;
  }

  (*array)->handles[(*array)->count++] = hThread;
  return MDBX_SUCCESS;
}
28795 
/* Suspends the other threads of the current process before the mapping is
 * changed (see callers around remap/resize).
 *
 * When a LCK file is available, only the threads registered in the reader
 * table (plus the write-transaction owner, if any) are suspended; otherwise
 * (read-only mode) a Toolhelp32 snapshot of all process threads is walked.
 * The current thread and the write-txn owner's thread are never suspended.
 * On any failure the threads suspended so far are resumed before returning.
 *
 * \returns MDBX_SUCCESS, or a system/MDBX error code. */
MDBX_INTERNAL_FUNC int
mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) {
  mdbx_assert(env, (env->me_flags & MDBX_NOTLS) == 0);
  const uintptr_t CurrentTid = GetCurrentThreadId();
  int rc;
  if (env->me_lck_mmap.lck) {
    /* Scan LCK for threads of the current process */
    const MDBX_reader *const begin = env->me_lck_mmap.lck->mti_readers;
    const MDBX_reader *const end =
        begin +
        atomic_load32(&env->me_lck_mmap.lck->mti_numreaders, mo_AcquireRelease);
    const uintptr_t WriteTxnOwner = env->me_txn0 ? env->me_txn0->mt_owner : 0;
    for (const MDBX_reader *reader = begin; reader < end; ++reader) {
      /* skip empty slots, other processes' readers, ourselves,
       * and the write-transaction owner */
      if (reader->mr_pid.weak != env->me_pid || !reader->mr_tid.weak) {
      skip_lck:
        continue;
      }
      if (reader->mr_tid.weak == CurrentTid ||
          reader->mr_tid.weak == WriteTxnOwner)
        goto skip_lck;

      rc = suspend_and_append(array, (mdbx_tid_t)reader->mr_tid.weak);
      if (rc != MDBX_SUCCESS) {
      bailout_lck:
        /* undo: resume whatever was suspended before the failure */
        (void)mdbx_resume_threads_after_remap(*array);
        return rc;
      }
    }
    /* the writer may not be registered as a reader — suspend it explicitly */
    if (WriteTxnOwner && WriteTxnOwner != CurrentTid) {
      rc = suspend_and_append(array, (mdbx_tid_t)WriteTxnOwner);
      if (rc != MDBX_SUCCESS)
        goto bailout_lck;
    }
  } else {
    /* Without LCK (i.e. read-only mode).
     * Walk through a snapshot of all running threads */
    mdbx_assert(env, env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY));
    const HANDLE hSnapshot = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0);
    if (hSnapshot == INVALID_HANDLE_VALUE)
      return (int)GetLastError();

    THREADENTRY32 entry;
    entry.dwSize = sizeof(THREADENTRY32);

    if (!Thread32First(hSnapshot, &entry)) {
      rc = (int)GetLastError();
    bailout_toolhelp:
      CloseHandle(hSnapshot);
      (void)mdbx_resume_threads_after_remap(*array);
      return rc;
    }

    do {
      if (entry.th32OwnerProcessID != env->me_pid ||
          entry.th32ThreadID == CurrentTid)
        continue;

      rc = suspend_and_append(array, entry.th32ThreadID);
      if (rc != MDBX_SUCCESS)
        goto bailout_toolhelp;

    } while (Thread32Next(hSnapshot, &entry));

    /* Thread32Next signals exhaustion via ERROR_NO_MORE_FILES;
     * any other error is a real failure */
    rc = (int)GetLastError();
    if (rc != ERROR_NO_MORE_FILES)
      goto bailout_toolhelp;
    CloseHandle(hSnapshot);
  }

  return MDBX_SUCCESS;
}
28867 
28868 MDBX_INTERNAL_FUNC int
mdbx_resume_threads_after_remap(mdbx_handle_array_t * array)28869 mdbx_resume_threads_after_remap(mdbx_handle_array_t *array) {
28870   int rc = MDBX_SUCCESS;
28871   for (unsigned i = 0; i < array->count; ++i) {
28872     const HANDLE hThread = array->handles[i];
28873     if (ResumeThread(hThread) == (DWORD)-1) {
28874       const int err = (int)GetLastError();
28875       DWORD ExitCode;
28876       if (err != /* workaround for Win10 UCRT bug */ ERROR_ACCESS_DENIED &&
28877           GetExitCodeThread(hThread, &ExitCode) && ExitCode == STILL_ACTIVE)
28878         rc = err;
28879     }
28880     CloseHandle(hThread);
28881   }
28882   return rc;
28883 }
28884 
28885 /*----------------------------------------------------------------------------*/
28886 /* global `initial` lock for lockfile initialization,
28887  * exclusive/shared locking first cacheline */
28888 
28889 /* Briefly description of locking schema/algorithm:
28890  *  - Windows does not support upgrading or downgrading for file locking.
28891  *  - Therefore upgrading/downgrading is emulated by shared and exclusive
28892  *    locking of upper and lower halves.
28893  *  - In other words, we have FSM with possible 9 states,
28894  *    i.e. free/shared/exclusive x free/shared/exclusive == 9.
28895  *    Only 6 states of FSM are used, which 2 of ones are transitive.
28896  *
28897  * States:
28898  *   ?-?  = free, i.e. unlocked
28899  *   S-?  = used, i.e. shared lock
28900  *   E-?  = exclusive-read, i.e. operational exclusive
28901  *   ?-S
28902  *   ?-E  = middle (transitive state)
28903  *   S-S
28904  *   S-E  = locked (transitive state)
28905  *   E-S
28906  *   E-E  = exclusive-write, i.e. exclusive due (re)initialization
28907  *
28908  *  The mdbx_lck_seize() moves the locking-FSM from the initial free/unlocked
28909  *  state to the "exclusive write" (and returns MDBX_RESULT_TRUE) if possible,
28910  *  or to the "used" (and returns MDBX_RESULT_FALSE).
28911  *
28912  *  The mdbx_lck_downgrade() moves the locking-FSM from "exclusive write"
28913  *  state to the "used" (i.e. shared) state.
28914  *
28915  *  The mdbx_lck_upgrade() moves the locking-FSM from "used" (i.e. shared)
28916  *  state to the "exclusive write" state.
28917  */
28918 
lck_unlock(MDBX_env * env)28919 static void lck_unlock(MDBX_env *env) {
28920   int err;
28921 
28922   if (env->me_lfd != INVALID_HANDLE_VALUE) {
28923     /* double `unlock` for robustly remove overlapped shared/exclusive locks */
28924     while (funlock(env->me_lfd, LCK_LOWER))
28925       ;
28926     err = (int)GetLastError();
28927     assert(err == ERROR_NOT_LOCKED ||
28928            (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
28929     (void)err;
28930     SetLastError(ERROR_SUCCESS);
28931 
28932     while (funlock(env->me_lfd, LCK_UPPER))
28933       ;
28934     err = (int)GetLastError();
28935     assert(err == ERROR_NOT_LOCKED ||
28936            (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
28937     (void)err;
28938     SetLastError(ERROR_SUCCESS);
28939   }
28940 
28941   if (env->me_lazy_fd != INVALID_HANDLE_VALUE) {
28942     /* explicitly unlock to avoid latency for other processes (windows kernel
28943      * releases such locks via deferred queues) */
28944     while (funlock(env->me_lazy_fd, LCK_BODY))
28945       ;
28946     err = (int)GetLastError();
28947     assert(err == ERROR_NOT_LOCKED ||
28948            (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
28949     (void)err;
28950     SetLastError(ERROR_SUCCESS);
28951 
28952     while (funlock(env->me_lazy_fd, LCK_WHOLE))
28953       ;
28954     err = (int)GetLastError();
28955     assert(err == ERROR_NOT_LOCKED ||
28956            (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
28957     (void)err;
28958     SetLastError(ERROR_SUCCESS);
28959   }
28960 }
28961 
/* Seize state as 'exclusive-write' (E-E and returns MDBX_RESULT_TRUE)
 * or as 'used' (S-? and returns MDBX_RESULT_FALSE).
 * Otherwise returns an error.
 *
 * Walks the locking FSM described above: first exclusively locks the upper
 * half (possibly waiting), then tries to promote to full exclusive; if the
 * lower half is contended it falls back to a shared lower lock ("used").
 * The upper lock is always released before returning, so the final state
 * is E-E, S-?, or ?-? (on failure). */
static int internal_seize_lck(HANDLE lfd) {
  int rc;
  assert(lfd != INVALID_HANDLE_VALUE);

  /* 1) now on ?-? (free), get ?-E (middle) */
  mdbx_jitter4testing(false);
  if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) {
    rc = (int)GetLastError() /* 2) something went wrong, give up */;
    mdbx_error("%s, err %u", "?-?(free) >> ?-E(middle)", rc);
    return rc;
  }

  /* 3) now on ?-E (middle), try E-E (exclusive-write) */
  mdbx_jitter4testing(false);
  if (flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER))
    return MDBX_RESULT_TRUE /* 4) got E-E (exclusive-write), done */;

  /* 5) still on ?-E (middle) */
  rc = (int)GetLastError();
  mdbx_jitter4testing(false);
  if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) {
    /* 6) something went wrong, give up */
    if (!funlock(lfd, LCK_UPPER))
      mdbx_panic("%s(%s) failed: err %u", __func__, "?-E(middle) >> ?-?(free)",
                 (int)GetLastError());
    return rc;
  }

  /* 7) still on ?-E (middle), try S-E (locked) */
  mdbx_jitter4testing(false);
  rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? MDBX_RESULT_FALSE
                                                        : (int)GetLastError();

  mdbx_jitter4testing(false);
  if (rc != MDBX_RESULT_FALSE)
    mdbx_error("%s, err %u", "?-E(middle) >> S-E(locked)", rc);

  /* 8) now on S-E (locked) or still on ?-E (middle),
   *    transition to S-? (used) or ?-? (free) */
  if (!funlock(lfd, LCK_UPPER))
    mdbx_panic("%s(%s) failed: err %u", __func__,
               "X-E(locked/middle) >> X-?(used/free)", (int)GetLastError());

  /* 9) now on S-? (used, DONE) or ?-? (free, FAILURE) */
  return rc;
}
29011 
/* Seizes the database for this process.
 * Returns MDBX_RESULT_TRUE when the exclusive-write state was acquired,
 * MDBX_RESULT_FALSE for the shared 'used' state, otherwise an error code. */
MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) {
  int rc;

  assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
  if (env->me_flags & MDBX_EXCLUSIVE)
    return MDBX_RESULT_TRUE /* nope since files were must be opened
                               non-shareable */
        ;

  if (env->me_lfd == INVALID_HANDLE_VALUE) {
    /* LY: without-lck mode (e.g. on read-only filesystem) */
    mdbx_jitter4testing(false);
    if (!flock(env->me_lazy_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) {
      rc = (int)GetLastError();
      mdbx_error("%s, err %u", "without-lck", rc);
      return rc;
    }
    return MDBX_RESULT_FALSE;
  }

  rc = internal_seize_lck(env->me_lfd);
  mdbx_jitter4testing(false);
  if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) {
    /* Check that another process don't operates in without-lck mode.
     * Doing such check by exclusive locking the body-part of db. Should be
     * noted:
     *  - we need an exclusive lock for do so;
     *  - we can't lock meta-pages, otherwise other process could get an error
     *    while opening db in valid (non-conflict) mode. */
    if (!flock(env->me_lazy_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) {
      rc = (int)GetLastError();
      mdbx_error("%s, err %u", "lock-against-without-lck", rc);
      mdbx_jitter4testing(false);
      lck_unlock(env);
    } else {
      mdbx_jitter4testing(false);
      /* the probe lock is released immediately: it was needed only to prove
       * that no without-lck process is operating on the db */
      if (!funlock(env->me_lazy_fd, LCK_BODY))
        mdbx_panic("%s(%s) failed: err %u", __func__,
                   "unlock-against-without-lck", (int)GetLastError());
    }
  }

  return rc;
}
29056 
/* Downgrades the locking-FSM from the exclusive-write state (E-E) to the
 * shared 'used' state (S-?), allowing other processes to open the db.
 * Returns MDBX_SUCCESS or an error code. */
MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) {
  /* Transite from exclusive-write state (E-E) to used (S-?) */
  assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
  assert(env->me_lfd != INVALID_HANDLE_VALUE);

  if (env->me_flags & MDBX_EXCLUSIVE)
    return MDBX_SUCCESS /* nope since files were must be opened non-shareable */
        ;
  /* 1) now at E-E (exclusive-write), transition to ?_E (middle) */
  if (!funlock(env->me_lfd, LCK_LOWER))
    mdbx_panic("%s(%s) failed: err %u", __func__,
               "E-E(exclusive-write) >> ?-E(middle)", (int)GetLastError());

  /* 2) now at ?-E (middle), transition to S-E (locked) */
  if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) {
    int rc = (int)GetLastError() /* 3) something went wrong, give up */;
    mdbx_error("%s, err %u", "?-E(middle) >> S-E(locked)", rc);
    return rc;
  }

  /* 4) got S-E (locked), continue transition to S-? (used) */
  if (!funlock(env->me_lfd, LCK_UPPER))
    mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> S-?(used)",
               (int)GetLastError());

  return MDBX_SUCCESS /* 5) now at S-? (used), done */;
}
29084 
/* Upgrades the locking-FSM from the shared 'used' state (S-?) to the
 * exclusive-write state (E-E). Non-blocking: returns an error (e.g. a
 * sharing violation) if any other process holds a conflicting lock. */
MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) {
  /* Transite from used state (S-?) to exclusive-write (E-E) */
  assert(env->me_lfd != INVALID_HANDLE_VALUE);

  if (env->me_flags & MDBX_EXCLUSIVE)
    return MDBX_SUCCESS /* nope since files were must be opened non-shareable */
        ;

  int rc;
  /* 1) now on S-? (used), try S-E (locked) */
  mdbx_jitter4testing(false);
  if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER)) {
    rc = (int)GetLastError() /* 2) something went wrong, give up */;
    mdbx_verbose("%s, err %u", "S-?(used) >> S-E(locked)", rc);
    return rc;
  }

  /* 3) now on S-E (locked), transition to ?-E (middle) */
  if (!funlock(env->me_lfd, LCK_LOWER))
    mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> ?-E(middle)",
               (int)GetLastError());

  /* 4) now on ?-E (middle), try E-E (exclusive-write) */
  mdbx_jitter4testing(false);
  if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) {
    rc = (int)GetLastError() /* 5) something went wrong, give up */;
    mdbx_verbose("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc);
    return rc;
  }

  return MDBX_SUCCESS /* 6) now at E-E (exclusive-write), done */;
}
29117 
mdbx_lck_init(MDBX_env * env,MDBX_env * inprocess_neighbor,int global_uniqueness_flag)29118 MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env,
29119                                      MDBX_env *inprocess_neighbor,
29120                                      int global_uniqueness_flag) {
29121   (void)env;
29122   (void)inprocess_neighbor;
29123   (void)global_uniqueness_flag;
29124   return MDBX_SUCCESS;
29125 }
29126 
/* Tears down the locking state: unmaps the dxb/lck mappings, zaps the
 * lck-file (truncate to zero) when provably the last user, then releases
 * all file-locks. */
MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env,
                                        MDBX_env *inprocess_neighbor) {
  /* LY: should unmap before releasing the locks to avoid race condition and
   * STATUS_USER_MAPPED_FILE/ERROR_USER_MAPPED_FILE */
  if (env->me_map)
    mdbx_munmap(&env->me_dxb_mmap);
  if (env->me_lck_mmap.lck) {
    /* zap the lck-file only when everything is synced to disk, no sibling
     * env within this process remains, and the exclusive lock is obtainable */
    const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages.weak == 0;
    mdbx_munmap(&env->me_lck_mmap);
    if (synced && !inprocess_neighbor && env->me_lfd != INVALID_HANDLE_VALUE &&
        mdbx_lck_upgrade(env) == MDBX_SUCCESS)
      /* this will fail if LCK is used/mmapped by other process(es) */
      mdbx_ftruncate(env->me_lfd, 0);
  }
  lck_unlock(env);
  return MDBX_SUCCESS;
}
29144 
29145 /*----------------------------------------------------------------------------*/
29146 /* reader checking (by pid) */
29147 
/* Stub: on Windows reader liveness is checked directly via
 * OpenProcess()/WaitForSingleObject() in mdbx_rpid_check(), so there is no
 * per-pid lock mark to place. */
MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) {
  (void)env;
  return MDBX_SUCCESS;
}
29152 
/* Stub: counterpart of mdbx_rpid_set() above — nothing to clear on Windows. */
MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) {
  (void)env;
  return MDBX_SUCCESS;
}
29157 
29158 /* Checks reader by pid.
29159  *
29160  * Returns:
29161  *   MDBX_RESULT_TRUE, if pid is live (unable to acquire lock)
29162  *   MDBX_RESULT_FALSE, if pid is dead (lock acquired)
29163  *   or otherwise the errcode. */
mdbx_rpid_check(MDBX_env * env,uint32_t pid)29164 MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) {
29165   (void)env;
29166   HANDLE hProcess = OpenProcess(SYNCHRONIZE, FALSE, pid);
29167   int rc;
29168   if (likely(hProcess)) {
29169     rc = WaitForSingleObject(hProcess, 0);
29170     if (unlikely(rc == (int)WAIT_FAILED))
29171       rc = (int)GetLastError();
29172     CloseHandle(hProcess);
29173   } else {
29174     rc = (int)GetLastError();
29175   }
29176 
29177   switch (rc) {
29178   case ERROR_INVALID_PARAMETER:
29179     /* pid seems invalid */
29180     return MDBX_RESULT_FALSE;
29181   case WAIT_OBJECT_0:
29182     /* process just exited */
29183     return MDBX_RESULT_FALSE;
29184   case ERROR_ACCESS_DENIED:
29185     /* The ERROR_ACCESS_DENIED would be returned for CSRSS-processes, etc.
29186      * assume pid exists */
29187     return MDBX_RESULT_TRUE;
29188   case WAIT_TIMEOUT:
29189     /* pid running */
29190     return MDBX_RESULT_TRUE;
29191   default:
29192     /* failure */
29193     return rc;
29194   }
29195 }
29196 
29197 //----------------------------------------------------------------------------
29198 // Stub for slim read-write lock
29199 // Copyright (C) 1995-2002 Brad Wilson
29200 
stub_srwlock_Init(MDBX_srwlock * srwl)29201 static void WINAPI stub_srwlock_Init(MDBX_srwlock *srwl) {
29202   srwl->readerCount = srwl->writerCount = 0;
29203 }
29204 
/* Reader acquire for the fallback spin-based SRW-lock: spins until there is
 * no writer, then registers as a reader. */
static void WINAPI stub_srwlock_AcquireShared(MDBX_srwlock *srwl) {
  while (true) {
    assert(srwl->writerCount >= 0 && srwl->readerCount >= 0);

    //  If there's a writer already, spin without unnecessarily
    //  interlocking the CPUs
    if (srwl->writerCount != 0) {
      YieldProcessor();
      continue;
    }

    //  Add to the readers list
    _InterlockedIncrement(&srwl->readerCount);

    // Check for writers again (we may have been preempted). If
    // there are no writers writing or waiting, then we're done.
    if (srwl->writerCount == 0)
      break;

    // Remove from the readers list, spin, try again
    _InterlockedDecrement(&srwl->readerCount);
    YieldProcessor();
  }
}
29229 
/* Reader release for the fallback SRW-lock: atomically deregisters one
 * reader; a waiting writer spins until readerCount drops to zero. */
static void WINAPI stub_srwlock_ReleaseShared(MDBX_srwlock *srwl) {
  assert(srwl->readerCount > 0);
  _InterlockedDecrement(&srwl->readerCount);
}
29234 
/* Writer acquire for the fallback SRW-lock: becomes the sole writer via an
 * interlocked exchange, then waits out the remaining readers. */
static void WINAPI stub_srwlock_AcquireExclusive(MDBX_srwlock *srwl) {
  while (true) {
    assert(srwl->writerCount >= 0 && srwl->readerCount >= 0);

    //  If there's a writer already, spin without unnecessarily
    //  interlocking the CPUs
    if (srwl->writerCount != 0) {
      YieldProcessor();
      continue;
    }

    // See if we can become the writer (expensive, because it inter-
    // locks the CPUs, so writing should be an infrequent process)
    if (_InterlockedExchange(&srwl->writerCount, 1) == 0)
      break;
  }

  // Now we're the writer, but there may be outstanding readers.
  // Spin until there aren't any more; new readers will wait now
  // that we're the writer.
  while (srwl->readerCount != 0) {
    assert(srwl->writerCount >= 0 && srwl->readerCount >= 0);
    YieldProcessor();
  }
}
29260 
/* Writer release for the fallback SRW-lock: clears the writer flag so that
 * spinning readers/writers may proceed. */
static void WINAPI stub_srwlock_ReleaseExclusive(MDBX_srwlock *srwl) {
  assert(srwl->writerCount == 1 && srwl->readerCount >= 0);
  srwl->writerCount = 0;
}
29265 
stub_GetTickCount64(void)29266 static uint64_t WINAPI stub_GetTickCount64(void) {
29267   LARGE_INTEGER Counter, Frequency;
29268   return (QueryPerformanceFrequency(&Frequency) &&
29269           QueryPerformanceCounter(&Counter))
29270              ? Counter.QuadPart * 1000ul / Frequency.QuadPart
29271              : 0;
29272 }
29273 
29274 /*----------------------------------------------------------------------------*/
29275 
#ifndef xMDBX_ALLOY
/* Pointers to Windows API entry points which are absent on older versions
 * of Windows and/or under Wine; resolved at runtime by mdbx_winnt_import(). */
MDBX_srwlock_function mdbx_srwlock_Init, mdbx_srwlock_AcquireShared,
    mdbx_srwlock_ReleaseShared, mdbx_srwlock_AcquireExclusive,
    mdbx_srwlock_ReleaseExclusive;

MDBX_NtExtendSection mdbx_NtExtendSection;
MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx;
MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW;
MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW;
MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle;
MDBX_NtFsControlFile mdbx_NtFsControlFile;
MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
MDBX_GetTickCount64 mdbx_GetTickCount64;
MDBX_RegGetValueA mdbx_RegGetValueA;
#endif /* xMDBX_ALLOY */
29291 
/* Resolves optional Windows/NT API entry points at runtime, installing the
 * local stubs above where an entry point is unavailable (older Windows,
 * Wine). Must run once before any of the mdbx_* function pointers is used. */
static void mdbx_winnt_import(void) {
  const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll");

#define GET_PROC_ADDR(dll, ENTRY)                                              \
  mdbx_##ENTRY = (MDBX_##ENTRY)GetProcAddress(dll, #ENTRY)

  if (GetProcAddress(hNtdll, "wine_get_version")) {
    /* under Wine: leave the ntdll pointers NULL */
    assert(mdbx_RunningUnderWine());
  } else {
    GET_PROC_ADDR(hNtdll, NtFsControlFile);
    GET_PROC_ADDR(hNtdll, NtExtendSection);
    assert(!mdbx_RunningUnderWine());
  }

  const HINSTANCE hKernel32dll = GetModuleHandleA("kernel32.dll");
  GET_PROC_ADDR(hKernel32dll, GetFileInformationByHandleEx);
  GET_PROC_ADDR(hKernel32dll, GetTickCount64);
  if (!mdbx_GetTickCount64)
    mdbx_GetTickCount64 = stub_GetTickCount64;
  if (!mdbx_RunningUnderWine()) {
    GET_PROC_ADDR(hKernel32dll, SetFileInformationByHandle);
    GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW);
    GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW);
    GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory);
  }

  const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll");
  GET_PROC_ADDR(hAdvapi32dll, RegGetValueA);
#undef GET_PROC_ADDR

  /* the SRWLock family is either wholly present (Vista+) or wholly absent,
   * so probing the Init entry point is sufficient */
  const MDBX_srwlock_function init =
      (MDBX_srwlock_function)GetProcAddress(hKernel32dll, "InitializeSRWLock");
  if (init != NULL) {
    mdbx_srwlock_Init = init;
    mdbx_srwlock_AcquireShared = (MDBX_srwlock_function)GetProcAddress(
        hKernel32dll, "AcquireSRWLockShared");
    mdbx_srwlock_ReleaseShared = (MDBX_srwlock_function)GetProcAddress(
        hKernel32dll, "ReleaseSRWLockShared");
    mdbx_srwlock_AcquireExclusive = (MDBX_srwlock_function)GetProcAddress(
        hKernel32dll, "AcquireSRWLockExclusive");
    mdbx_srwlock_ReleaseExclusive = (MDBX_srwlock_function)GetProcAddress(
        hKernel32dll, "ReleaseSRWLockExclusive");
  } else {
    mdbx_srwlock_Init = stub_srwlock_Init;
    mdbx_srwlock_AcquireShared = stub_srwlock_AcquireShared;
    mdbx_srwlock_ReleaseShared = stub_srwlock_ReleaseShared;
    mdbx_srwlock_AcquireExclusive = stub_srwlock_AcquireExclusive;
    mdbx_srwlock_ReleaseExclusive = stub_srwlock_ReleaseExclusive;
  }
}
29342 
29343 #endif /* Windows LCK-implementation */
29344 /*
29345  * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>
29346  * and other libmdbx authors: please see AUTHORS file.
29347  * All rights reserved.
29348  *
29349  * Redistribution and use in source and binary forms, with or without
29350  * modification, are permitted only as authorized by the OpenLDAP
29351  * Public License.
29352  *
29353  * A copy of this license is available in the file LICENSE in the
29354  * top-level directory of the distribution or, alternatively, at
29355  * <http://www.OpenLDAP.org/license.html>.
29356  */
29357 
29358 #if !(defined(_WIN32) || defined(_WIN64)) /* !Windows LCK-implementation */
29359 
29360 #include <sys/sem.h>
29361 
29362 /*----------------------------------------------------------------------------*/
29363 /* global constructor/destructor */
29364 
29365 #if defined(__linux__) || defined(__gnu_linux__)
29366 
29367 #include <sys/utsname.h>
29368 
29369 #ifndef xMDBX_ALLOY
29370 uint32_t mdbx_linux_kernel_version;
29371 bool mdbx_RunningOnWSL1;
29372 #endif /* xMDBX_ALLOY */
29373 
probe_for_WSL(const char * tag)29374 __cold static uint8_t probe_for_WSL(const char *tag) {
29375   const char *const WSL = strstr(tag, "WSL");
29376   if (WSL && WSL[3] >= '2' && WSL[3] <= '9')
29377     return WSL[3] - '0';
29378   const char *const wsl = strstr(tag, "wsl");
29379   if (wsl && wsl[3] >= '2' && wsl[3] <= '9')
29380     return wsl[3] - '0';
29381   if (WSL || wsl || strcasestr(tag, "Microsoft"))
29382     /* Expecting no new kernel within WSL1, either it will explicitly
29383      * marked by an appropriate WSL-version hint. */
29384     return (mdbx_linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2;
29385   return 0;
29386 }
29387 
29388 #endif /* Linux */
29389 
/* Process-wide constructor: on Linux, captures the kernel version and
 * detects WSL1 (which endangers durability), then initializes the global
 * reader-thread-handle-cache machinery. */
__cold static __attribute__((__constructor__)) void
mdbx_global_constructor(void) {
#if defined(__linux__) || defined(__gnu_linux__)
  struct utsname buffer;
  if (uname(&buffer) == 0) {
    /* pack up to 4 numeric components of the release string into one
     * big-endian-style word, e.g. "5.4.2-foo" -> 0x05040200; each component
     * is clamped to 255 */
    int i = 0;
    char *p = buffer.release;
    while (*p && i < 4) {
      if (*p >= '0' && *p <= '9') {
        long number = strtol(p, &p, 10);
        if (number > 0) {
          if (number > 255)
            number = 255;
          mdbx_linux_kernel_version += number << (24 - i * 8);
        }
        ++i;
      } else {
        ++p;
      }
    }
    /* "Official" way of detecting WSL1 but not WSL2
     * https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364
     *
     * WARNING: False negative detection of WSL1 will result in DATA LOSS!
     * So, the REQUIREMENTS for this code:
     *  1. MUST detect WSL1 without false-negatives.
     *  2. DESIRABLE detect WSL2 but without the risk of violating the first. */
    mdbx_RunningOnWSL1 = probe_for_WSL(buffer.version) == 1 ||
                         probe_for_WSL(buffer.sysname) == 1 ||
                         probe_for_WSL(buffer.release) == 1;
  }
#endif /* Linux */

  mdbx_rthc_global_init();
}
29425 
/* Process-wide destructor: releases the global reader-thread-handle-cache
 * machinery set up by mdbx_global_constructor(). */
__cold static __attribute__((__destructor__)) void
mdbx_global_destructor(void) {
  mdbx_rthc_global_dtor();
}
29430 
29431 /*----------------------------------------------------------------------------*/
29432 /* lck */
29433 
29434 /* Описание реализации блокировок для POSIX & Linux:
29435  *
29436  * lck-файл отображается в память, в нём организуется таблица читателей и
29437  * размещаются совместно используемые posix-мьютексы (futex). Посредством
29438  * этих мьютексов (см struct MDBX_lockinfo) реализуются:
29439  *  - Блокировка таблицы читателей для регистрации,
29440  *    т.е. функции mdbx_rdt_lock() и mdbx_rdt_unlock().
29441  *  - Блокировка БД для пишущих транзакций,
29442  *    т.е. функции mdbx_txn_lock() и mdbx_txn_unlock().
29443  *
29444  * Остальной функционал реализуется отдельно посредством файловых блокировок:
29445  *  - Первоначальный захват БД в режиме exclusive/shared и последующий перевод
29446  *    в операционный режим, функции mdbx_lck_seize() и mdbx_lck_downgrade().
29447  *  - Проверка присутствие процессов-читателей,
29448  *    т.е. функции mdbx_rpid_set(), mdbx_rpid_clear() и mdbx_rpid_check().
29449  *
29450  * Для блокировки файлов используется fcntl(F_SETLK), так как:
29451  *  - lockf() оперирует только эксклюзивной блокировкой и требует
29452  *    открытия файла в RW-режиме.
29453  *  - flock() не гарантирует атомарности при смене блокировок
29454  *    и оперирует только всем файлом целиком.
29455  *  - Для контроля процессов-читателей используются однобайтовые
29456  *    range-блокировки lck-файла посредством fcntl(F_SETLK). При этом
29457  *    в качестве позиции используется pid процесса-читателя.
29458  *  - Для первоначального захвата и shared/exclusive выполняется блокировка
29459  *    основного файла БД и при успехе lck-файла.
29460  *
29461  * ----------------------------------------------------------------------------
29462  * УДЕРЖИВАЕМЫЕ БЛОКИРОВКИ В ЗАВИСИМОСТИ ОТ РЕЖИМА И СОСТОЯНИЯ
29463  *
29464  * Эксклюзивный режим без lck-файла:
29465  *   = заблокирован весь dxb-файл посредством F_RDLCK или F_WRLCK,
29466  *     в зависимости от MDBX_RDONLY.
29467  *
29468  * Не-операционный режим на время пере-инициализации и разрушении lck-файла:
29469  *   = F_WRLCK блокировка первого байта lck-файла, другие процессы ждут её
29470  *     снятия при получении F_RDLCK через F_SETLKW.
29471  *   - блокировки dxb-файла могут меняться до снятие эксклюзивной блокировки
29472  *    lck-файла:
29473  *       + для НЕ-эксклюзивного режима блокировка pid-байта в dxb-файле
29474  *         посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY.
29475  *       + для ЭКСКЛЮЗИВНОГО режима блокировка pid-байта всего dxb-файла
29476  *         посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY.
29477  *
29478  * ОПЕРАЦИОННЫЙ режим с lck-файлом:
29479  *   = F_RDLCK блокировка первого байта lck-файла, другие процессы не могут
29480  *     получить F_WRLCK и таким образом видят что БД используется.
29481  *   + F_WRLCK блокировка pid-байта в clk-файле после первой транзакции чтения.
29482  *   + для НЕ-эксклюзивного режима блокировка pid-байта в dxb-файле
29483  *     посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY.
29484  *   + для ЭКСКЛЮЗИВНОГО режима блокировка pid-байта всего dxb-файла
29485  *     посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY.
29486  */
29487 
29488 #if MDBX_USE_OFDLOCKS
29489 static int op_setlk, op_setlkw, op_getlk;
choice_fcntl()29490 __cold static void choice_fcntl() {
29491   assert(!op_setlk && !op_setlkw && !op_getlk);
29492   if ((mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0
29493 #if defined(__linux__) || defined(__gnu_linux__)
29494       && mdbx_linux_kernel_version >
29495              0x030f0000 /* OFD locks are available since 3.15, but engages here
29496                            only for 3.16 and later kernels (i.e. LTS) because
29497                            of reliability reasons */
29498 #endif                  /* linux */
29499   ) {
29500     op_setlk = F_OFD_SETLK;
29501     op_setlkw = F_OFD_SETLKW;
29502     op_getlk = F_OFD_GETLK;
29503     return;
29504   }
29505   op_setlk = F_SETLK;
29506   op_setlkw = F_SETLKW;
29507   op_getlk = F_GETLK;
29508 }
29509 #else
29510 #define op_setlk F_SETLK
29511 #define op_setlkw F_SETLKW
29512 #define op_getlk F_GETLK
29513 #endif /* MDBX_USE_OFDLOCKS */
29514 
29515 #ifndef OFF_T_MAX
29516 #define OFF_T_MAX                                                              \
29517   ((sizeof(off_t) > 4 ? INT64_MAX : INT32_MAX) & ~(size_t)0xffff)
29518 #endif
29519 
/* Thin wrapper around fcntl() byte-range locking.
 *
 * cmd        - one of op_setlk/op_setlkw/op_getlk (possibly the OFD flavor);
 * lck        - F_RDLCK, F_WRLCK or F_UNLCK;
 * offset/len - the byte range to operate on.
 *
 * For op_getlk returns MDBX_RESULT_TRUE when the range is held by a live
 * owner and MDBX_RESULT_FALSE when the lock could be placed; for the set
 * commands returns MDBX_SUCCESS or an errno value. */
static int lck_op(mdbx_filehandle_t fd, int cmd, int lck, off_t offset,
                  off_t len) {
  mdbx_jitter4testing(true);
  for (;;) {
    struct flock lock_op;
    memset(&lock_op, 0, sizeof(lock_op));
    lock_op.l_type = lck;
    lock_op.l_whence = SEEK_SET;
    lock_op.l_start = offset;
    lock_op.l_len = len;
    int rc = fcntl(fd, cmd, &lock_op);
    mdbx_jitter4testing(true);
    if (rc != -1) {
      if (cmd == op_getlk) {
        /* Checks reader by pid. Returns:
         *   MDBX_RESULT_TRUE   - if pid is live (reader holds a lock).
         *   MDBX_RESULT_FALSE  - if pid is dead (a lock could be placed). */
        return (lock_op.l_type == F_UNLCK) ? MDBX_RESULT_FALSE
                                           : MDBX_RESULT_TRUE;
      }
      return MDBX_SUCCESS;
    }
    rc = errno;
#if MDBX_USE_OFDLOCKS
    if (rc == EINVAL &&
        (cmd == F_OFD_SETLK || cmd == F_OFD_SETLKW || cmd == F_OFD_GETLK)) {
      /* fallback to non-OFD locks: the kernel rejected the OFD command,
       * switch the globals so every subsequent call uses the classic ones */
      if (cmd == F_OFD_SETLK)
        cmd = F_SETLK;
      else if (cmd == F_OFD_SETLKW)
        cmd = F_SETLKW;
      else
        cmd = F_GETLK;
      op_setlk = F_SETLK;
      op_setlkw = F_SETLKW;
      op_getlk = F_GETLK;
      continue;
    }
#endif /* MDBX_USE_OFDLOCKS */
    /* retry on EINTR only for the non-blocking commands; an interrupted
     * blocking op_setlkw is propagated to the caller */
    if (rc != EINTR || cmd == op_setlkw) {
      mdbx_assert(nullptr, MDBX_IS_ERROR(rc));
      return rc;
    }
  }
}
29565 
mdbx_lockfile(mdbx_filehandle_t fd,bool wait)29566 MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait) {
29567 #if MDBX_USE_OFDLOCKS
29568   if (unlikely(op_setlk == 0))
29569     choice_fcntl();
29570 #endif /* MDBX_USE_OFDLOCKS */
29571   return lck_op(fd, wait ? op_setlkw : op_setlk, F_WRLCK, 0, OFF_T_MAX);
29572 }
29573 
mdbx_rpid_set(MDBX_env * env)29574 MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) {
29575   assert(env->me_lfd != INVALID_HANDLE_VALUE);
29576   assert(env->me_pid > 0);
29577   if (unlikely(mdbx_getpid() != env->me_pid))
29578     return MDBX_PANIC;
29579   return lck_op(env->me_lfd, op_setlk, F_WRLCK, env->me_pid, 1);
29580 }
29581 
mdbx_rpid_clear(MDBX_env * env)29582 MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) {
29583   assert(env->me_lfd != INVALID_HANDLE_VALUE);
29584   assert(env->me_pid > 0);
29585   return lck_op(env->me_lfd, op_setlk, F_UNLCK, env->me_pid, 1);
29586 }
29587 
mdbx_rpid_check(MDBX_env * env,uint32_t pid)29588 MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) {
29589   assert(env->me_lfd != INVALID_HANDLE_VALUE);
29590   assert(pid > 0);
29591   return lck_op(env->me_lfd, op_getlk, F_WRLCK, pid, 1);
29592 }
29593 
29594 /*---------------------------------------------------------------------------*/
29595 
29596 #if MDBX_LOCKING > MDBX_LOCKING_SYSV
mdbx_ipclock_stub(mdbx_ipclock_t * ipc)29597 MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc) {
29598 #if MDBX_LOCKING == MDBX_LOCKING_POSIX1988
29599   return sem_init(ipc, false, 1) ? errno : 0;
29600 #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                                \
29601     MDBX_LOCKING == MDBX_LOCKING_POSIX2008
29602   return pthread_mutex_init(ipc, nullptr);
29603 #else
29604 #error "FIXME"
29605 #endif
29606 }
29607 
/* Destroys an IPC lock primitive created by mdbx_ipclock_stub().
 * Returns 0 on success, otherwise an errno value. */
MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc) {
#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988
  return sem_destroy(ipc) ? errno : 0;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                                \
    MDBX_LOCKING == MDBX_LOCKING_POSIX2008
  return pthread_mutex_destroy(ipc);
#else
#error "FIXME"
#endif
}
29618 #endif /* MDBX_LOCKING > MDBX_LOCKING_SYSV */
29619 
check_fstat(MDBX_env * env)29620 static int check_fstat(MDBX_env *env) {
29621   struct stat st;
29622 
29623   int rc = MDBX_SUCCESS;
29624   if (fstat(env->me_lazy_fd, &st)) {
29625     rc = errno;
29626     mdbx_error("fstat(%s), err %d", "DXB", rc);
29627     return rc;
29628   }
29629 
29630   if (!S_ISREG(st.st_mode) || st.st_nlink < 1) {
29631 #ifdef EBADFD
29632     rc = EBADFD;
29633 #else
29634     rc = EPERM;
29635 #endif
29636     mdbx_error("%s %s, err %d", "DXB",
29637                (st.st_nlink < 1) ? "file was removed" : "not a regular file",
29638                rc);
29639     return rc;
29640   }
29641 
29642   if (st.st_size < (off_t)(MDBX_MIN_PAGESIZE * NUM_METAS)) {
29643     mdbx_verbose("dxb-file is too short (%u), exclusive-lock needed",
29644                  (unsigned)st.st_size);
29645     rc = MDBX_RESULT_TRUE;
29646   }
29647 
29648   //----------------------------------------------------------------------------
29649 
29650   if (fstat(env->me_lfd, &st)) {
29651     rc = errno;
29652     mdbx_error("fstat(%s), err %d", "LCK", rc);
29653     return rc;
29654   }
29655 
29656   if (!S_ISREG(st.st_mode) || st.st_nlink < 1) {
29657 #ifdef EBADFD
29658     rc = EBADFD;
29659 #else
29660     rc = EPERM;
29661 #endif
29662     mdbx_error("%s %s, err %d", "LCK",
29663                (st.st_nlink < 1) ? "file was removed" : "not a regular file",
29664                rc);
29665     return rc;
29666   }
29667 
29668   /* Checking file size for detect the situation when we got the shared lock
29669    * immediately after mdbx_lck_destroy(). */
29670   if (st.st_size < (off_t)(sizeof(MDBX_lockinfo) + sizeof(MDBX_reader))) {
29671     mdbx_verbose("lck-file is too short (%u), exclusive-lock needed",
29672                  (unsigned)st.st_size);
29673     rc = MDBX_RESULT_TRUE;
29674   }
29675 
29676   return rc;
29677 }
29678 
mdbx_lck_seize(MDBX_env * env)29679 __cold MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) {
29680   assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
29681   if (unlikely(mdbx_getpid() != env->me_pid))
29682     return MDBX_PANIC;
29683 #if MDBX_USE_OFDLOCKS
29684   if (unlikely(op_setlk == 0))
29685     choice_fcntl();
29686 #endif /* MDBX_USE_OFDLOCKS */
29687 
29688   int rc = MDBX_SUCCESS;
29689 #if defined(__linux__) || defined(__gnu_linux__)
29690   if (unlikely(mdbx_RunningOnWSL1)) {
29691     rc = ENOLCK /* No record locks available */;
29692     mdbx_error("%s, err %u",
29693                "WSL1 (Windows Subsystem for Linux) is mad and trouble-full, "
29694                "injecting failure to avoid data loss",
29695                rc);
29696     return rc;
29697   }
29698 #endif /* Linux */
29699 
29700   if (env->me_lfd == INVALID_HANDLE_VALUE) {
29701     /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
29702     rc =
29703         lck_op(env->me_lazy_fd, op_setlk,
29704                (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
29705     if (rc != MDBX_SUCCESS) {
29706       mdbx_error("%s, err %u", "without-lck", rc);
29707       mdbx_assert(env, MDBX_IS_ERROR(rc));
29708       return rc;
29709     }
29710     return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */;
29711   }
29712 #if defined(_POSIX_PRIORITY_SCHEDULING) && _POSIX_PRIORITY_SCHEDULING > 0
29713   sched_yield();
29714 #endif
29715 
29716 retry:
29717   if (rc == MDBX_RESULT_TRUE) {
29718     rc = lck_op(env->me_lfd, op_setlk, F_UNLCK, 0, 1);
29719     if (rc != MDBX_SUCCESS) {
29720       mdbx_error("%s, err %u", "unlock-before-retry", rc);
29721       mdbx_assert(env, MDBX_IS_ERROR(rc));
29722       return rc;
29723     }
29724   }
29725 
29726   /* Firstly try to get exclusive locking.  */
29727   rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1);
29728   if (rc == MDBX_SUCCESS) {
29729     rc = check_fstat(env);
29730     if (MDBX_IS_ERROR(rc))
29731       return rc;
29732 
29733   continue_dxb_exclusive:
29734     rc =
29735         lck_op(env->me_lazy_fd, op_setlk,
29736                (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
29737     if (rc == MDBX_SUCCESS)
29738       return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */;
29739 
29740     int err = check_fstat(env);
29741     if (MDBX_IS_ERROR(err))
29742       return err;
29743 
29744     /* the cause may be a collision with POSIX's file-lock recovery. */
29745     if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK ||
29746           rc == EDEADLK)) {
29747       mdbx_error("%s, err %u", "dxb-exclusive", rc);
29748       mdbx_assert(env, MDBX_IS_ERROR(rc));
29749       return rc;
29750     }
29751 
29752     /* Fallback to lck-shared */
29753   } else if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY ||
29754                rc == EWOULDBLOCK || rc == EDEADLK)) {
29755     mdbx_error("%s, err %u", "try-exclusive", rc);
29756     mdbx_assert(env, MDBX_IS_ERROR(rc));
29757     return rc;
29758   }
29759 
29760   /* Here could be one of two:
29761    *  - mdbx_lck_destroy() from the another process was hold the lock
29762    *    during a destruction.
29763    *  - either mdbx_lck_seize() from the another process was got the exclusive
29764    *    lock and doing initialization.
29765    * For distinguish these cases will use size of the lck-file later. */
29766 
29767   /* Wait for lck-shared now. */
29768   /* Here may be await during transient processes, for instance until another
29769    * competing process doesn't call lck_downgrade(). */
29770   rc = lck_op(env->me_lfd, op_setlkw, F_RDLCK, 0, 1);
29771   if (rc != MDBX_SUCCESS) {
29772     mdbx_error("%s, err %u", "try-shared", rc);
29773     mdbx_assert(env, MDBX_IS_ERROR(rc));
29774     return rc;
29775   }
29776 
29777   rc = check_fstat(env);
29778   if (rc == MDBX_RESULT_TRUE)
29779     goto retry;
29780   if (rc != MDBX_SUCCESS) {
29781     mdbx_error("%s, err %u", "lck_fstat", rc);
29782     return rc;
29783   }
29784 
29785   /* got shared, retry exclusive */
29786   rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1);
29787   if (rc == MDBX_SUCCESS)
29788     goto continue_dxb_exclusive;
29789 
29790   if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK ||
29791         rc == EDEADLK)) {
29792     mdbx_error("%s, err %u", "try-exclusive", rc);
29793     mdbx_assert(env, MDBX_IS_ERROR(rc));
29794     return rc;
29795   }
29796 
29797   /* Lock against another process operating in without-lck or exclusive mode. */
29798   rc =
29799       lck_op(env->me_lazy_fd, op_setlk,
29800              (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, env->me_pid, 1);
29801   if (rc != MDBX_SUCCESS) {
29802     mdbx_error("%s, err %u", "lock-against-without-lck", rc);
29803     mdbx_assert(env, MDBX_IS_ERROR(rc));
29804     return rc;
29805   }
29806 
29807   /* Done: return with shared locking. */
29808   return MDBX_RESULT_FALSE;
29809 }
29810 
/* Downgrades from exclusive to shared locking state.
 *
 * Unless the env is in MDBX_EXCLUSIVE mode, releases the whole-file lock on
 * the dxb (data) file in two pieces so that the single byte at offset
 * me_pid stays locked — that byte is the "lock against another process
 * operating in without-lck mode" marker taken during seize (see the
 * lck_op(..., env->me_pid, 1) call there). Finally takes a shared (read)
 * lock on the first byte of the lck-file.
 *
 * Returns MDBX_SUCCESS, MDBX_PANIC after fork(), or an errno-style code
 * from the failing fcntl/flock operation. */
MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) {
  assert(env->me_lfd != INVALID_HANDLE_VALUE);
  /* locks are per-process: never touch them from a foreign/forked pid */
  if (unlikely(mdbx_getpid() != env->me_pid))
    return MDBX_PANIC;

  int rc = MDBX_SUCCESS;
  if ((env->me_flags & MDBX_EXCLUSIVE) == 0) {
    /* unlock [0, me_pid) and [me_pid+1, OFF_T_MAX), keeping byte me_pid */
    rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, 0, env->me_pid);
    if (rc == MDBX_SUCCESS)
      rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, env->me_pid + 1,
                  OFF_T_MAX - env->me_pid - 1);
  }
  if (rc == MDBX_SUCCESS)
    rc = lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1);
  if (unlikely(rc != 0)) {
    mdbx_error("%s, err %u", "lck", rc);
    assert(MDBX_IS_ERROR(rc));
  }
  return rc;
}
29831 
/* Tears down the locking state and closes the env's file descriptors.
 *
 * If there is no in-process neighbor and this process can win exclusive
 * locks on both the lck- and dxb-files, the IPC primitives stored inside
 * the lck-file are destroyed and the lck-file is truncated to zero when
 * all pages were synced. Afterwards descriptors are closed in a strict
 * order, and — because POSIX fcntl() locks are dropped by the kernel when
 * ANY descriptor of the file is closed — the file-locks of a surviving
 * in-process neighbor env are explicitly re-acquired.
 *
 * Returns MDBX_SUCCESS, MDBX_PANIC after fork(), or the first errno
 * captured along the way; on failure a neighbor env is poisoned with
 * MDBX_FATAL_ERROR since its locks may have been lost. */
__cold MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env,
                                               MDBX_env *inprocess_neighbor) {
  /* locks are per-process: never touch them from a foreign/forked pid */
  if (unlikely(mdbx_getpid() != env->me_pid))
    return MDBX_PANIC;

  int rc = MDBX_SUCCESS;
  struct stat lck_info;
  MDBX_lockinfo *lck = env->me_lck_mmap.lck;
  if (env->me_lfd != INVALID_HANDLE_VALUE && !inprocess_neighbor && lck &&
      /* try get exclusive access */
      lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, OFF_T_MAX) == 0 &&
      /* if LCK was not removed */
      fstat(env->me_lfd, &lck_info) == 0 && lck_info.st_nlink > 0 &&
      lck_op(env->me_lazy_fd, op_setlk,
             (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
             OFF_T_MAX) == 0) {

    /* we are the last user: destroy the shared IPC locking primitives */
    mdbx_verbose("%p got exclusive, drown locks", (void *)env);
#if MDBX_LOCKING == MDBX_LOCKING_SYSV
    if (env->me_sysv_ipc.semid != -1)
      rc = semctl(env->me_sysv_ipc.semid, 2, IPC_RMID) ? errno : 0;
#else
    rc = mdbx_ipclock_destroy(&lck->mti_rlock);
    if (rc == 0)
      rc = mdbx_ipclock_destroy(&lck->mti_wlock);
#endif /* MDBX_LOCKING */

    mdbx_assert(env, rc == 0);
    if (rc == 0) {
      /* read the dirty-pages counter before unmapping invalidates `lck` */
      const bool synced = lck->mti_unsynced_pages.weak == 0;
      mdbx_munmap(&env->me_lck_mmap);
      if (synced)
        rc = ftruncate(env->me_lfd, 0) ? errno : 0;
    }

    mdbx_jitter4testing(false);
  }

  /* 1) POSIX's fcntl() locks (i.e. when op_setlk == F_SETLK) should be restored
   * after file was closed.
   *
   * 2) File locks would be released (by kernel) while the file-descriptors will
   * be closed. But to avoid false-positive EACCESS and EDEADLK from the kernel,
   * locks should be released here explicitly with properly order. */

  /* close dxb and restore lock */
  if (env->me_dsync_fd != INVALID_HANDLE_VALUE) {
    if (unlikely(close(env->me_dsync_fd) != 0) && rc == MDBX_SUCCESS)
      rc = errno;
    env->me_dsync_fd = INVALID_HANDLE_VALUE;
  }
  if (env->me_lazy_fd != INVALID_HANDLE_VALUE) {
    if (unlikely(close(env->me_lazy_fd) != 0) && rc == MDBX_SUCCESS)
      rc = errno;
    env->me_lazy_fd = INVALID_HANDLE_VALUE;
    if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) {
      /* restore file-lock: whole file for an exclusive neighbor, otherwise
       * only the single marker byte at the neighbor's pid offset */
      rc = lck_op(
          inprocess_neighbor->me_lazy_fd, F_SETLKW,
          (inprocess_neighbor->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
          (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE)
              ? 0
              : inprocess_neighbor->me_pid,
          (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) ? OFF_T_MAX : 1);
    }
  }

  /* close clk and restore locks */
  if (env->me_lfd != INVALID_HANDLE_VALUE) {
    if (unlikely(close(env->me_lfd) != 0) && rc == MDBX_SUCCESS)
      rc = errno;
    env->me_lfd = INVALID_HANDLE_VALUE;
    if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) {
      /* restore file-locks: shared byte on the lck-file, then re-register
       * the neighbor's reader pid if it had a live reader slot */
      rc = lck_op(inprocess_neighbor->me_lfd, F_SETLKW, F_RDLCK, 0, 1);
      if (rc == MDBX_SUCCESS && inprocess_neighbor->me_live_reader)
        rc = mdbx_rpid_set(inprocess_neighbor);
    }
  }

  /* a failure above may have cost the neighbor its locks — poison it */
  if (inprocess_neighbor && rc != MDBX_SUCCESS)
    inprocess_neighbor->me_flags |= MDBX_FATAL_ERROR;
  return rc;
}
29916 
29917 /*---------------------------------------------------------------------------*/
29918 
/* Initializes the IPC locking primitives kept inside the lck-file,
 * accordingly to the build-time MDBX_LOCKING selection (SysV semaphores,
 * POSIX-1988 semaphores, or POSIX-2001/2008 process-shared mutexes).
 *
 * global_uniqueness_flag == MDBX_RESULT_TRUE means this process holds the
 * lck exclusively, so the primitives are created/initialized from scratch;
 * otherwise an already-initialized set is attached (and, for SysV,
 * validated for read/write access).
 *
 * Returns MDBX_SUCCESS, an errno value, or a pthread error code. */
__cold MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env,
                                            MDBX_env *inprocess_neighbor,
                                            int global_uniqueness_flag) {
#if MDBX_LOCKING == MDBX_LOCKING_SYSV
  int semid = -1;
  /* don't initialize semaphores twice */
  (void)inprocess_neighbor;
  if (global_uniqueness_flag == MDBX_RESULT_TRUE) {
    /* mirror the data file's permission bits onto the semaphore set */
    struct stat st;
    if (fstat(env->me_lazy_fd, &st))
      return errno;
  sysv_retry_create:
    semid = semget(env->me_sysv_ipc.key, 2,
                   IPC_CREAT | IPC_EXCL |
                       (st.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO)));
    if (unlikely(semid == -1)) {
      int err = errno;
      if (err != EEXIST)
        return err;

      /* remove and re-create semaphore set */
      semid = semget(env->me_sysv_ipc.key, 2, 0);
      if (semid == -1) {
        err = errno;
        if (err != ENOENT)
          return err;
        /* someone removed it between the two semget() calls — retry */
        goto sysv_retry_create;
      }
      if (semctl(semid, 2, IPC_RMID)) {
        err = errno;
        if (err != EIDRM)
          return err;
      }
      goto sysv_retry_create;
    }

    /* both semaphores (reader & writer) start unlocked */
    unsigned short val_array[2] = {1, 1};
    if (semctl(semid, 2, SETALL, val_array))
      return errno;
  } else {
    semid = semget(env->me_sysv_ipc.key, 2, 0);
    if (semid == -1)
      return errno;

    /* check read & write access */
    struct semid_ds data[2];
    if (semctl(semid, 2, IPC_STAT, data) || semctl(semid, 2, IPC_SET, data))
      return errno;
  }

  env->me_sysv_ipc.semid = semid;
  return MDBX_SUCCESS;

#elif MDBX_LOCKING == MDBX_LOCKING_FUTEX
  (void)inprocess_neighbor;
  if (global_uniqueness_flag != MDBX_RESULT_TRUE)
    return MDBX_SUCCESS;
#error "FIXME: Not implemented"
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988

  /* don't initialize semaphores twice */
  (void)inprocess_neighbor;
  if (global_uniqueness_flag == MDBX_RESULT_TRUE) {
    /* pshared=true: semaphores live in shared memory, value 1 = unlocked */
    if (sem_init(&env->me_lck_mmap.lck->mti_rlock, true, 1))
      return errno;
    if (sem_init(&env->me_lck_mmap.lck->mti_wlock, true, 1))
      return errno;
  }
  return MDBX_SUCCESS;

#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                                \
    MDBX_LOCKING == MDBX_LOCKING_POSIX2008
  if (inprocess_neighbor)
    return MDBX_SUCCESS /* don't need any initialization for mutexes
      if LCK already opened/used inside current process */
        ;

    /* FIXME: Unfortunately, there is no other reliable way but to long testing
     * on each platform. On the other hand, behavior like FreeBSD is incorrect
     * and we can expect it to be rare. Moreover, even on FreeBSD without
     * additional in-process initialization, the probability of an problem
     * occurring is vanishingly small, and the symptom is a return of EINVAL
     * while locking a mutex. In other words, in the worst case, the problem
     * results in an EINVAL error at the start of the transaction, but NOT data
     * loss, nor database corruption, nor other fatal troubles. Thus, the code
     * below I am inclined to think the workaround for erroneous platforms (like
     * FreeBSD), rather than a defect of libmdbx. */
#if defined(__FreeBSD__)
  /* seems that shared mutexes on FreeBSD required in-process initialization */
  (void)global_uniqueness_flag;
#else
  /* shared mutexes on many other platforms (including Darwin and Linux's
   * futexes) doesn't need any addition in-process initialization */
  if (global_uniqueness_flag != MDBX_RESULT_TRUE)
    return MDBX_SUCCESS;
#endif

  pthread_mutexattr_t ma;
  int rc = pthread_mutexattr_init(&ma);
  if (rc)
    return rc;

  /* mutexes must work across processes sharing the lck mapping */
  rc = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
  if (rc)
    goto bailout;

#if MDBX_LOCKING == MDBX_LOCKING_POSIX2008
  /* robust mutexes report EOWNERDEAD instead of deadlocking when the
   * owning process dies; pick whichever spelling this libc provides */
#if defined(PTHREAD_MUTEX_ROBUST) || defined(pthread_mutexattr_setrobust)
  rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
#elif defined(PTHREAD_MUTEX_ROBUST_NP) ||                                      \
    defined(pthread_mutexattr_setrobust_np)
  rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP);
#elif _POSIX_THREAD_PROCESS_SHARED < 200809L
  rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP);
#else
  rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
#endif
  if (rc)
    goto bailout;
#endif /* MDBX_LOCKING == MDBX_LOCKING_POSIX2008 */

#if defined(_POSIX_THREAD_PRIO_INHERIT) && _POSIX_THREAD_PRIO_INHERIT >= 0 &&  \
    !defined(MDBX_SAFE4QEMU)
  /* priority inheritance is best-effort: fall back silently if unsupported */
  rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT);
  if (rc == ENOTSUP)
    rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_NONE);
  if (rc && rc != ENOTSUP)
    goto bailout;
#endif /* PTHREAD_PRIO_INHERIT */

  /* error-checking type is best-effort as well */
  rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
  if (rc && rc != ENOTSUP)
    goto bailout;

  rc = pthread_mutex_init(&env->me_lck_mmap.lck->mti_rlock, &ma);
  if (rc)
    goto bailout;
  rc = pthread_mutex_init(&env->me_lck_mmap.lck->mti_wlock, &ma);

bailout:
  pthread_mutexattr_destroy(&ma);
  return rc;
#else
#error "FIXME"
#endif /* MDBX_LOCKING > 0 */
}
30065 
/* Handles a failed IPC-lock acquire/release.
 *
 * For robust mutexes (POSIX2008) and SysV semaphores, EOWNERDEAD means we
 * now own a lock whose previous holder died: clean up dead readers, mark
 * the mutex consistent, and — if the dead owner was a writer in THIS
 * process — declare the env hosed (MDBX_PANIC). Any other error marks the
 * env with MDBX_FATAL_ERROR (except EDEADLK) and is returned as-is. */
__cold static int mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc,
                                      const int err) {
  int rc = err;
#if MDBX_LOCKING == MDBX_LOCKING_POSIX2008 || MDBX_LOCKING == MDBX_LOCKING_SYSV
  if (err == EOWNERDEAD) {
    /* We own the mutex. Clean up after dead previous owner. */

    const bool rlocked = ipc == &env->me_lck->mti_rlock;
    rc = MDBX_SUCCESS;
    if (!rlocked) {
      if (unlikely(env->me_txn)) {
        /* env is hosed if the dead thread was ours */
        env->me_flags |= MDBX_FATAL_ERROR;
        env->me_txn = NULL;
        rc = MDBX_PANIC;
      }
    }
    mdbx_warning("%clock owner died, %s", (rlocked ? 'r' : 'w'),
                 (rc ? "this process' env is hosed" : "recovering"));

    /* purge reader-table slots belonging to dead processes */
    int check_rc = mdbx_cleanup_dead_readers(env, rlocked, NULL);
    check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc;

#if MDBX_LOCKING == MDBX_LOCKING_SYSV
    rc = (rc == MDBX_SUCCESS) ? check_rc : rc;
#else
    /* mark the robust mutex consistent so it stays usable;
     * pick whichever spelling this libc provides */
#if defined(PTHREAD_MUTEX_ROBUST) || defined(pthread_mutex_consistent)
    int mreco_rc = pthread_mutex_consistent(ipc);
#elif defined(PTHREAD_MUTEX_ROBUST_NP) || defined(pthread_mutex_consistent_np)
    int mreco_rc = pthread_mutex_consistent_np(ipc);
#elif _POSIX_THREAD_PROCESS_SHARED < 200809L
    int mreco_rc = pthread_mutex_consistent_np(ipc);
#else
    int mreco_rc = pthread_mutex_consistent(ipc);
#endif
    check_rc = (mreco_rc == 0) ? check_rc : mreco_rc;

    if (unlikely(mreco_rc))
      mdbx_error("lock recovery failed, %s", mdbx_strerror(mreco_rc));

    rc = (rc == MDBX_SUCCESS) ? check_rc : rc;
    if (MDBX_IS_ERROR(rc))
      /* recovery failed: don't keep holding the (inconsistent) mutex */
      pthread_mutex_unlock(ipc);
#endif /* MDBX_LOCKING == MDBX_LOCKING_POSIX2008 */
    return rc;
  }
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001
  (void)ipc;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988
  (void)ipc;
#elif MDBX_LOCKING == MDBX_LOCKING_FUTEX
#ifdef _MSC_VER
#pragma message("warning: TODO")
#else
#warning "TODO"
#endif
  (void)ipc;
#else
#error "FIXME"
#endif /* MDBX_LOCKING */

  mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(err));
  if (rc != EDEADLK)
    env->me_flags |= MDBX_FATAL_ERROR;
  return rc;
}
30132 
/* Acquires the given IPC lock (reader-table or writer lock) via the
 * configured back-end. With dont_wait a busy lock yields MDBX_BUSY instead
 * of blocking; any other failure is funneled through mdbx_ipclock_failed(),
 * which may recover a lock orphaned by a dead owner. */
static int mdbx_ipclock_lock(MDBX_env *env, mdbx_ipclock_t *ipc,
                             const bool dont_wait) {
#if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                                  \
    MDBX_LOCKING == MDBX_LOCKING_POSIX2008
  int rc = dont_wait ? pthread_mutex_trylock(ipc) : pthread_mutex_lock(ipc);
  rc = (rc == EBUSY && dont_wait) ? MDBX_BUSY : rc;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988
  int rc = MDBX_SUCCESS;
  if (dont_wait) {
    if (sem_trywait(ipc)) {
      rc = errno;
      if (rc == EAGAIN)
        rc = MDBX_BUSY;
    }
  } else if (sem_wait(ipc))
    rc = errno;
#elif MDBX_LOCKING == MDBX_LOCKING_SYSV
  /* semaphore #0 is the writer lock, #1 the reader lock; SEM_UNDO lets the
   * kernel release it if this process dies while holding it */
  struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock),
                      .sem_op = -1,
                      .sem_flg = dont_wait ? IPC_NOWAIT | SEM_UNDO : SEM_UNDO};
  int rc;
  if (semop(env->me_sysv_ipc.semid, &op, 1)) {
    rc = errno;
    if (dont_wait && rc == EAGAIN)
      rc = MDBX_BUSY;
  } else {
    /* *ipc stores the owner's pid; nonzero here means the previous owner
     * died while holding the lock (kernel released it via SEM_UNDO) */
    rc = *ipc ? EOWNERDEAD : MDBX_SUCCESS;
    *ipc = env->me_pid;
  }
#else
#error "FIXME"
#endif /* MDBX_LOCKING */

  if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_BUSY))
    rc = mdbx_ipclock_failed(env, ipc, rc);
  return rc;
}
30170 
/* Releases the given IPC lock via the configured back-end.
 * For SysV, refuses (EPERM) unless the stored owner pid is ours, and
 * clears the owner slot before posting the semaphore. */
static int mdbx_ipclock_unlock(MDBX_env *env, mdbx_ipclock_t *ipc) {
#if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                                  \
    MDBX_LOCKING == MDBX_LOCKING_POSIX2008
  int rc = pthread_mutex_unlock(ipc);
  (void)env;
#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988
  int rc = sem_post(ipc) ? errno : MDBX_SUCCESS;
  (void)env;
#elif MDBX_LOCKING == MDBX_LOCKING_SYSV
  if (unlikely(*ipc != (pid_t)env->me_pid))
    return EPERM;
  *ipc = 0;
  struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock),
                      .sem_op = 1,
                      .sem_flg = SEM_UNDO};
  int rc = semop(env->me_sysv_ipc.semid, &op, 1) ? errno : MDBX_SUCCESS;
#else
#error "FIXME"
#endif /* MDBX_LOCKING */
  return rc;
}
30192 
mdbx_rdt_lock(MDBX_env * env)30193 MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) {
30194   mdbx_trace("%s", ">>");
30195   mdbx_jitter4testing(true);
30196   int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_rlock, false);
30197   mdbx_trace("<< rc %d", rc);
30198   return rc;
30199 }
30200 
mdbx_rdt_unlock(MDBX_env * env)30201 MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) {
30202   mdbx_trace("%s", ">>");
30203   int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_rlock);
30204   mdbx_trace("<< rc %d", rc);
30205   if (unlikely(rc != MDBX_SUCCESS))
30206     mdbx_panic("%s() failed: err %d\n", __func__, rc);
30207   mdbx_jitter4testing(true);
30208 }
30209 
/* Acquires the writer (transaction) lock. With dont_wait, a busy lock is
 * reported via a non-error code; only hard errors are propagated. */
int mdbx_txn_lock(MDBX_env *env, bool dont_wait) {
  mdbx_trace("%swait %s", dont_wait ? "dont-" : "", ">>");
  mdbx_jitter4testing(true);
  const int err = mdbx_ipclock_lock(env, &env->me_lck->mti_wlock, dont_wait);
  mdbx_trace("<< rc %d", err);
  if (MDBX_IS_ERROR(err))
    return err;
  return MDBX_SUCCESS;
}
30217 
/* Releases the writer (transaction) lock; panics on failure since losing
 * the write lock leaves the env in an unrecoverable state. */
void mdbx_txn_unlock(MDBX_env *env) {
  mdbx_trace("%s", ">>");
  const int err = mdbx_ipclock_unlock(env, &env->me_lck->mti_wlock);
  mdbx_trace("<< rc %d", err);
  if (unlikely(err != MDBX_SUCCESS))
    mdbx_panic("%s() failed: err %d\n", __func__, err);
  mdbx_jitter4testing(true);
}
30226 
30227 #else
30228 #ifdef _MSC_VER
30229 #pragma warning(disable : 4206) /* nonstandard extension used: translation     \
30230                                    unit is empty */
30231 #endif                          /* _MSC_VER (warnings) */
30232 #endif                          /* !Windows LCK-implementation */
30233