/* SPDX-License-Identifier: 0BSD */

/*
 * Speed-optimized CRC64 using slicing-by-four algorithm
 *
 * This uses only i386 instructions, but it is optimized for i686 and later
 * (including e.g. Pentium II/III/IV, Athlon XP, and Core 2).
 *
 * Authors: Igor Pavlov (original CRC32 assembly code)
 *          Lasse Collin (CRC64 adaptation of the modified CRC32 code)
 *
 * This code needs lzma_crc64_table, which can be created using the
 * following C code:

uint64_t lzma_crc64_table[4][256];

void
init_table(void)
{
	// ECMA-182
	static const uint64_t poly64 = UINT64_C(0xC96C5795D7870F42);

	for (size_t s = 0; s < 4; ++s) {
		for (size_t b = 0; b < 256; ++b) {
			uint64_t r = s == 0 ? b : lzma_crc64_table[s - 1][b];

			for (size_t i = 0; i < 8; ++i) {
				if (r & 1)
					r = (r >> 1) ^ poly64;
				else
					r >>= 1;
			}

			lzma_crc64_table[s][b] = r;
		}
	}
}

 * The prototype of the CRC64 function:
 * extern uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc);
 */

/* When Intel CET is enabled, include <cet.h> in assembly code to mark
   Intel CET support. */
#ifdef __CET__
# include <cet.h>
#else
# define _CET_ENDBR
#endif

/*
 * On some systems, the functions need to be prefixed. The prefix is
 * usually an underscore.
 */
#ifndef __USER_LABEL_PREFIX__
# define __USER_LABEL_PREFIX__
#endif
#define MAKE_SYM_CAT(prefix, sym) prefix ## sym
#define MAKE_SYM(prefix, sym) MAKE_SYM_CAT(prefix, sym)
#define LZMA_CRC64 MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc64)
#define LZMA_CRC64_TABLE MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc64_table)

/*
 * Solaris assembler doesn't have .p2align, and Darwin uses .align
 * differently than GNU/Linux and Solaris.
 */
#if defined(__APPLE__) || defined(__MSDOS__)
#	define ALIGN(pow2, abs) .align pow2
#else
#	define ALIGN(pow2, abs) .align abs
#endif

	.text
	.globl	LZMA_CRC64

#if !defined(__APPLE__) && !defined(_WIN32) && !defined(__CYGWIN__) \
		&& !defined(__MSDOS__)
	.type	LZMA_CRC64, @function
#endif

	ALIGN(4, 16)
LZMA_CRC64:
	_CET_ENDBR
	/*
	 * Register usage:
	 * %eax crc LSB
	 * %edx crc MSB
	 * %esi buf
	 * %edi size or buf + size
	 * %ebx lzma_crc64_table
	 * %ebp Table index
	 * %ecx Temporary
	 */
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	pushl	%ebp
	movl	0x14(%esp), %esi /* buf */
	movl	0x18(%esp), %edi /* size */
	movl	0x1C(%esp), %eax /* crc LSB */
	movl	0x20(%esp), %edx /* crc MSB */

	/*
	 * Store the address of lzma_crc64_table to %ebx. This is needed to
	 * get position-independent code (PIC).
	 *
	 * The PIC macro is defined by libtool, while __PIC__ is defined
	 * by GCC but only on some systems. Testing for both makes it simpler
	 * to test this code without libtool, and keeps the code working also
	 * when built with libtool but using something else than GCC.
	 *
	 * I understood that libtool may define PIC on Windows even though
	 * the code in Windows DLLs is not PIC in sense that it is in ELF
	 * binaries, so we need a separate check to always use the non-PIC
	 * code on Windows.
	 */
#if (!defined(PIC) && !defined(__PIC__)) \
		|| (defined(_WIN32) || defined(__CYGWIN__))
	/* Not PIC */
	movl	$ LZMA_CRC64_TABLE, %ebx
#elif defined(__APPLE__)
	/* Mach-O */
	call	.L_get_pc
.L_pic:
	leal	.L_lzma_crc64_table$non_lazy_ptr-.L_pic(%ebx), %ebx
	movl	(%ebx), %ebx
#else
	/* ELF */
	call	.L_get_pc
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
	movl	LZMA_CRC64_TABLE@GOT(%ebx), %ebx
#endif

	/* Complement the initial value. */
	notl	%eax
	notl	%edx

.L_align:
	/*
	 * Check if there is enough input to use slicing-by-four.
	 * We need eight bytes, because the loop pre-reads four bytes.
	 */
	cmpl	$8, %edi
	jb	.L_rest

	/* Check if we have reached alignment of four bytes. */
	testl	$3, %esi
	jz	.L_slice

	/* Calculate CRC of the next input byte. */
	movzbl	(%esi), %ebp
	incl	%esi
	movzbl	%al, %ecx
	xorl	%ecx, %ebp
	shrdl	$8, %edx, %eax
	xorl	(%ebx, %ebp, 8), %eax
	shrl	$8, %edx
	xorl	4(%ebx, %ebp, 8), %edx
	decl	%edi
	jmp	.L_align

.L_slice:
	/*
	 * If we get here, there's at least eight bytes of aligned input
	 * available. Make %edi multiple of four bytes. Store the possible
	 * remainder over the "size" variable in the argument stack.
	 */
	movl	%edi, 0x18(%esp)
	andl	$-4, %edi
	subl	%edi, 0x18(%esp)

	/*
	 * Let %edi be buf + size - 4 while running the main loop. This way
	 * we can compare for equality to determine when to exit the loop.
	 */
	addl	%esi, %edi
	subl	$4, %edi

	/* Read in the first four aligned bytes. */
	movl	(%esi), %ecx

.L_loop:
	xorl	%eax, %ecx
	movzbl	%cl, %ebp
	movl	0x1800(%ebx, %ebp, 8), %eax
	xorl	%edx, %eax
	movl	0x1804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	xorl	0x1000(%ebx, %ebp, 8), %eax
	xorl	0x1004(%ebx, %ebp, 8), %edx
	shrl	$16, %ecx
	movzbl	%cl, %ebp
	xorl	0x0800(%ebx, %ebp, 8), %eax
	xorl	0x0804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	addl	$4, %esi
	xorl	(%ebx, %ebp, 8), %eax
	xorl	4(%ebx, %ebp, 8), %edx

	/* Check for end of aligned input. */
	cmpl	%edi, %esi

	/*
	 * Copy the next four input bytes to %ecx. It is slightly faster to
	 * read them here than at the top of the loop.
	 */
	movl	(%esi), %ecx
	jb	.L_loop

	/*
	 * Process the remaining four bytes, which we have already
	 * copied to %ecx.
	 */
	xorl	%eax, %ecx
	movzbl	%cl, %ebp
	movl	0x1800(%ebx, %ebp, 8), %eax
	xorl	%edx, %eax
	movl	0x1804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	xorl	0x1000(%ebx, %ebp, 8), %eax
	xorl	0x1004(%ebx, %ebp, 8), %edx
	shrl	$16, %ecx
	movzbl	%cl, %ebp
	xorl	0x0800(%ebx, %ebp, 8), %eax
	xorl	0x0804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	addl	$4, %esi
	xorl	(%ebx, %ebp, 8), %eax
	xorl	4(%ebx, %ebp, 8), %edx

	/* Copy the number of remaining bytes to %edi. */
	movl	0x18(%esp), %edi

.L_rest:
	/* Check for end of input. */
	testl	%edi, %edi
	jz	.L_return

	/* Calculate CRC of the next input byte. */
	movzbl	(%esi), %ebp
	incl	%esi
	movzbl	%al, %ecx
	xorl	%ecx, %ebp
	shrdl	$8, %edx, %eax
	xorl	(%ebx, %ebp, 8), %eax
	shrl	$8, %edx
	xorl	4(%ebx, %ebp, 8), %edx
	decl	%edi
	jmp	.L_rest

.L_return:
	/* Complement the final value. */
	notl	%eax
	notl	%edx

	popl	%ebp
	popl	%edi
	popl	%esi
	popl	%ebx
	ret

#if defined(PIC) || defined(__PIC__)
	ALIGN(4, 16)
.L_get_pc:
	movl	(%esp), %ebx
	ret
#endif

#if defined(__APPLE__) && (defined(PIC) || defined(__PIC__))
	/* Mach-O PIC */
	.section __IMPORT,__pointers,non_lazy_symbol_pointers
.L_lzma_crc64_table$non_lazy_ptr:
	.indirect_symbol LZMA_CRC64_TABLE
	.long 0

#elif defined(_WIN32) || defined(__CYGWIN__)
# ifdef DLL_EXPORT
	/* This is equivalent of __declspec(dllexport). */
	.section .drectve
	.ascii " -export:lzma_crc64"
# endif

#elif !defined(__MSDOS__)
	/* ELF */
	.size	LZMA_CRC64, .-LZMA_CRC64
#endif

/*
 * This is needed to support non-executable stack. It's ugly to
 * use __FreeBSD__ and __linux__ here, but I don't know a way to detect when
 * we are using GNU assembler.
 */
#if defined(__ELF__) && (defined(__FreeBSD__) || defined(__linux__))
	.section	.note.GNU-stack,"",@progbits
#endif