///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: bswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////
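
// The sketch below is illustrative only and not part of this header: it
// shows how the byte-buffer reads/writes and bit scans described above
// might be used. The buffer contents and values are hypothetical.
/*
#include <stdint.h>
#include "tuklib_integer.h"

static void
usage_example(void)
{
	uint8_t buf[8] = { 0x12, 0x34, 0x56, 0x78, 0x00, 0x00, 0x00, 0x00 };

	uint32_t le = read32le(buf);    // 0x78563412 on any arch
	uint32_t be = read32be(buf);    // 0x12345678 on any arch

	write16le(buf, UINT16_C(0xBEEF));   // buf[0] = 0xEF, buf[1] = 0xBE

	uint32_t high = bsr32(le);      // index of the highest set bit
	uint32_t low = ctz32(le);       // number of trailing zero bits
	(void)be;
	(void)high;
	(void)low;
}
*/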

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin functions, so we only need Windows intrinsics when using
// MSVC. GCC and Clang can set _MSC_VER on Windows, so we need to exclude
// these cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define bswap16(n) __builtin_bswap16(n)
#	define bswap32(n) __builtin_bswap32(n)
#	define bswap64(n) __builtin_bswap64(n)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define bswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define bswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define bswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define bswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define bswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define bswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif

#ifndef bswap16
#	define bswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef bswap32
#	define bswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef bswap64
#	define bswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif
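
// Worked example (illustrative only, hypothetical constants): whichever
// implementation above was selected, the bswap macros reverse the byte
// order of their argument:
//
//     bswap16(0x1122)             == 0x2211
//     bswap32(0x11223344)         == 0x44332211
//     bswap64(0x1122334455667788) == 0x8877665544332211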

// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) bswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) bswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) bswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) bswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) bswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) bswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
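
// Worked example (illustrative only, hypothetical value 0x11223344):
//
//     on a little endian target: conv32le(0x11223344) == 0x11223344
//                                conv32be(0x11223344) == 0x44332211
//     on a big endian target:    conv32be(0x11223344) == 0x11223344
//                                conv32le(0x11223344) == 0x44332211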


////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard-compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// code identical to the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example, Clang 9
// on x86-64 can detect a 32-bit load but not a 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on the stack using four 8-bit loads,
//         four 8-bit stores to stack, and finally one 32-bit load from stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only when GCC assumes
// that unaligned access is not supported or -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in the case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
    return (uint32_t)b[0]
        | ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
    uint16_t v;
    memcpy(&v, b, sizeof(v));
    return v;
}

uint32_t bytes32(const uint8_t *b)
{
    return (uint32_t)b[0]
        | ((uint32_t)b[1] << 8)
        | ((uint32_t)b[2] << 16)
        | ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
    uint32_t v;
    memcpy(&v, b, sizeof(v));
    return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
    b[0] = (uint8_t)v;
    b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
    memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
    b[0] = (uint8_t)v;
    b[1] = (uint8_t)(v >> 8);
    b[2] = (uint8_t)(v >> 16);
    b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
    memcpy(b, &v, sizeof(v));
}
*/


#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))

#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}


static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}


static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}


static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}


static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}


static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}


static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif
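
// Worked example (illustrative only, hypothetical value): whichever of the
// two branches above was compiled, write32le(buf, 0x11223344) stores the
// bytes 44 33 22 11 and write32be(buf, 0x11223344) stores 11 22 33 44;
// read32le() and read32be() then read them back as 0x11223344.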


//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif
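
// Illustrative sketch (not part of this header): the aligned_*() functions
// below are only safe when the caller guarantees the alignment itself, for
// example by reading at offsets that are multiples of the element size from
// a suitably aligned base pointer. The function name below is hypothetical.
/*
static uint32_t
sum_aligned_u32le(const uint8_t *base, size_t count) // base is 4-byte aligned
{
	uint32_t sum = 0;
	for (size_t i = 0; i < count; ++i)
		sum += aligned_read32le(base + 4 * i);

	return sum;
}
*/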


static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}


// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))


////////////////////
// Bit operations //
////////////////////

static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}


static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}

#define bsf32 ctz32
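
// Worked example (illustrative only, hypothetical value 0x00010000):
//
//     bsr32(0x00010000) == 16    (highest set bit is bit 16)
//     clz32(0x00010000) == 15    (15 zero bits above it)
//     ctz32(0x00010000) == 16    (16 zero bits below it)
//     bsf32(0x00010000) == 16    (alias for ctz32)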

#endif