/* xref: /freebsd/contrib/xz/src/liblzma/check/crc64_x86.S (revision 3b35e7ee) */
/* SPDX-License-Identifier: 0BSD */

/*
 * Speed-optimized CRC64 using slicing-by-four algorithm
 *
 * This uses only i386 instructions, but it is optimized for i686 and later
 * (including e.g. Pentium II/III/IV, Athlon XP, and Core 2).
 *
 * Authors: Igor Pavlov (original CRC32 assembly code)
 *          Lasse Collin (CRC64 adaptation of the modified CRC32 code)
 *
 * This code needs lzma_crc64_table, which can be created using the
 * following C code:

uint64_t lzma_crc64_table[4][256];

void
init_table(void)
{
	// ECMA-182
	static const uint64_t poly64 = UINT64_C(0xC96C5795D7870F42);

	for (size_t s = 0; s < 4; ++s) {
		for (size_t b = 0; b < 256; ++b) {
			uint64_t r = s == 0 ? b : lzma_crc64_table[s - 1][b];

			for (size_t i = 0; i < 8; ++i) {
				if (r & 1)
					r = (r >> 1) ^ poly64;
				else
					r >>= 1;
			}

			lzma_crc64_table[s][b] = r;
		}
	}
}

 * The prototype of the CRC64 function:
 * extern uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc);
 */
4281ad8388SMartin Matuska
/*
 * When Intel CET is enabled, include <cet.h> in assembly code to mark
 * Intel CET support. When it is not enabled, define _CET_ENDBR as an
 * empty macro so that it can still be written unconditionally at the
 * function entry point and simply expands to nothing.
 */
#ifdef __CET__
# include <cet.h>
#else
# define _CET_ENDBR
#endif
509e6bbe47SXin LI
/*
 * On some systems, the functions need to be prefixed. The prefix is
 * usually an underscore.
 *
 * MAKE_SYM() goes through MAKE_SYM_CAT() so that __USER_LABEL_PREFIX__
 * is macro-expanded before it is pasted in front of the symbol name
 * with ##. If the compiler does not predefine __USER_LABEL_PREFIX__,
 * it is defined as empty here and the names are used unprefixed.
 */
#ifndef __USER_LABEL_PREFIX__
#	define __USER_LABEL_PREFIX__
#endif
#define MAKE_SYM_CAT(prefix, sym) prefix ## sym
#define MAKE_SYM(prefix, sym) MAKE_SYM_CAT(prefix, sym)
#define LZMA_CRC64 MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc64)
#define LZMA_CRC64_TABLE MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc64_table)
6281ad8388SMartin Matuska
/*
 * Solaris assembler doesn't have .p2align, and Darwin uses .align
 * differently than GNU/Linux and Solaris.
 *
 * ALIGN(pow2, abs) takes the same alignment written in two ways: as a
 * power-of-two exponent (pow2) and as a byte count (abs). Only the form
 * that the target's .align directive expects is emitted; the other
 * argument is discarded by the preprocessor.
 */
#if defined(__APPLE__) || defined(__MSDOS__)
#	define ALIGN(pow2, abs) .align pow2
#else
#	define ALIGN(pow2, abs) .align abs
#endif
7281ad8388SMartin Matuska
/*
 * uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc)
 *
 * 32-bit x86 implementation. The arguments are read from the stack and
 * the 64-bit result is left in %edx:%eax. %ebx, %esi, %edi, and %ebp are
 * saved on entry and restored before returning; %ecx is used as scratch.
 * The stack slot of the "size" argument is overwritten while running.
 *
 * The addressing used below expects lzma_crc64_table to consist of four
 * consecutive tables of 256 eight-byte entries (0x800 bytes per table),
 * with the low 32 bits of each entry stored first.
 */
	.text
	.globl	LZMA_CRC64

#if !defined(__APPLE__) && !defined(_WIN32) && !defined(__CYGWIN__) \
		&& !defined(__MSDOS__)
	.type	LZMA_CRC64, @function
#endif

	ALIGN(4, 16)
LZMA_CRC64:
	_CET_ENDBR
	/*
	 * Register usage:
	 * %eax crc, low 32 bits
	 * %edx crc, high 32 bits
	 * %esi buf
	 * %edi size or buf + size
	 * %ebx lzma_crc64_table
	 * %ebp Table index
	 * %ecx Temporary
	 */
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	pushl	%ebp

	/*
	 * Four registers were pushed above, so the first argument is at
	 * 0x14(%esp): 0x10 bytes of saved registers plus the return address.
	 */
	movl	0x14(%esp), %esi /* buf */
	movl	0x18(%esp), %edi /* size */
	movl	0x1C(%esp), %eax /* crc LSB */
	movl	0x20(%esp), %edx /* crc MSB */

	/*
	 * Store the address of lzma_crc64_table to %ebx. This is needed to
	 * get position-independent code (PIC).
	 *
	 * The PIC macro is defined by libtool, while __PIC__ is defined
	 * by GCC but only on some systems. Testing for both makes it simpler
	 * to test this code without libtool, and keeps the code working also
	 * when built with libtool but using something else than GCC.
	 *
	 * I understood that libtool may define PIC on Windows even though
	 * the code in Windows DLLs is not PIC in sense that it is in ELF
	 * binaries, so we need a separate check to always use the non-PIC
	 * code on Windows.
	 */
#if (!defined(PIC) && !defined(__PIC__)) \
		|| (defined(_WIN32) || defined(__CYGWIN__))
	/* Not PIC: the table address is an absolute immediate. */
	movl	$ LZMA_CRC64_TABLE, %ebx
#elif defined(__APPLE__)
	/*
	 * Mach-O: .L_get_pc leaves the address of .L_pic in %ebx. Use it
	 * to address the non-lazy pointer and load the table address
	 * from there.
	 */
	call	.L_get_pc
.L_pic:
	leal	.L_lzma_crc64_table$non_lazy_ptr-.L_pic(%ebx), %ebx
	movl	(%ebx), %ebx
#else
	/*
	 * ELF: .L_get_pc leaves the address of the addl instruction in
	 * %ebx. Turn it into the GOT address and load the table address
	 * from the GOT.
	 */
	call	.L_get_pc
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
	movl	LZMA_CRC64_TABLE@GOT(%ebx), %ebx
#endif

	/* Complement the initial value. */
	notl	%eax
	notl	%edx

.L_align:
	/*
	 * Check if there is enough input to use slicing-by-four.
	 * We need eight bytes, because the loop pre-reads four bytes.
	 */
	cmpl	$8, %edi
	jb	.L_rest

	/* Check if we have reached alignment of four bytes. */
	testl	$3, %esi
	jz	.L_slice

	/*
	 * Calculate CRC of the next input byte:
	 * index = input byte ^ lowest byte of crc
	 * crc = (crc >> 8) ^ entry number "index" of the first table
	 */
	movzbl	(%esi), %ebp
	incl	%esi
	movzbl	%al, %ecx
	xorl	%ecx, %ebp
	shrdl	$8, %edx, %eax
	xorl	(%ebx, %ebp, 8), %eax
	shrl	$8, %edx
	xorl	4(%ebx, %ebp, 8), %edx
	decl	%edi
	jmp	.L_align

.L_slice:
	/*
	 * If we get here, there's at least eight bytes of aligned input
	 * available. Make %edi multiple of four bytes. Store the possible
	 * remainder (size modulo four) over the "size" variable in the
	 * argument stack.
	 */
	movl	%edi, 0x18(%esp)
	andl	$-4, %edi
	subl	%edi, 0x18(%esp)

	/*
	 * Let %edi be buf + size - 4 while running the main loop, that is,
	 * the address of the last aligned four-byte word. The loop keeps
	 * running while %esi is below this address (unsigned comparison).
	 */
	addl	%esi, %edi
	subl	$4, %edi

	/* Read in the first four aligned bytes. */
	movl	(%esi), %ecx

.L_loop:
	/*
	 * Slicing-by-four step. %ecx holds four input bytes. XOR them with
	 * the low half of the CRC and use each byte of the result as an
	 * index: the lowest byte selects from the table at offset 0x1800,
	 * the next one from 0x1000, the next one from 0x0800, and the
	 * highest byte from the table at offset 0. The old high half of
	 * the CRC (%edx) is XORed into the new low half (%eax).
	 */
	xorl	%eax, %ecx
	movzbl	%cl, %ebp
	movl	0x1800(%ebx, %ebp, 8), %eax
	xorl	%edx, %eax
	movl	0x1804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	xorl	0x1000(%ebx, %ebp, 8), %eax
	xorl	0x1004(%ebx, %ebp, 8), %edx
	shrl	$16, %ecx
	movzbl	%cl, %ebp
	xorl	0x0800(%ebx, %ebp, 8), %eax
	xorl	0x0804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	addl	$4, %esi
	xorl	(%ebx, %ebp, 8), %eax
	xorl	4(%ebx, %ebp, 8), %edx

	/* Check for end of aligned input. */
	cmpl	%edi, %esi

	/*
	 * Load the next four input bytes to %ecx. It is slightly faster to
	 * read them here than at the top of the loop. movl doesn't modify
	 * the flags, so the jb below still tests the result of the cmpl.
	 */
	movl	(%esi), %ecx
	jb	.L_loop

	/*
	 * Process the remaining four bytes, which we have already
	 * copied to %ecx. This is the same step as in the loop body.
	 */
	xorl	%eax, %ecx
	movzbl	%cl, %ebp
	movl	0x1800(%ebx, %ebp, 8), %eax
	xorl	%edx, %eax
	movl	0x1804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	xorl	0x1000(%ebx, %ebp, 8), %eax
	xorl	0x1004(%ebx, %ebp, 8), %edx
	shrl	$16, %ecx
	movzbl	%cl, %ebp
	xorl	0x0800(%ebx, %ebp, 8), %eax
	xorl	0x0804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	addl	$4, %esi
	xorl	(%ebx, %ebp, 8), %eax
	xorl	4(%ebx, %ebp, 8), %edx

	/*
	 * Copy the number of remaining bytes (the remainder stored
	 * at .L_slice) to %edi.
	 */
	movl	0x18(%esp), %edi

.L_rest:
	/* Check for end of input. */
	testl	%edi, %edi
	jz	.L_return

	/*
	 * Calculate CRC of the next input byte. This is the same
	 * byte-at-a-time step that is used at .L_align.
	 */
	movzbl	(%esi), %ebp
	incl	%esi
	movzbl	%al, %ecx
	xorl	%ecx, %ebp
	shrdl	$8, %edx, %eax
	xorl	(%ebx, %ebp, 8), %eax
	shrl	$8, %edx
	xorl	4(%ebx, %ebp, 8), %edx
	decl	%edi
	jmp	.L_rest

.L_return:
	/* Complement the final value. */
	notl	%eax
	notl	%edx

	/* Restore the saved registers in reverse order of the pushes. */
	popl	%ebp
	popl	%edi
	popl	%esi
	popl	%ebx
	ret
26181ad8388SMartin Matuska
#if defined(PIC) || defined(__PIC__)
	/*
	 * PIC helper: copy the return address, which is the address of
	 * the instruction following the "call .L_get_pc", to %ebx.
	 * Nothing else is modified.
	 */
	ALIGN(4, 16)
.L_get_pc:
	movl	(%esp), %ebx
	ret
#endif
26881ad8388SMartin Matuska
#if defined(__APPLE__) && (defined(PIC) || defined(__PIC__))
	/*
	 * Mach-O PIC: non-lazy symbol pointer for lzma_crc64_table. The
	 * slot is emitted as zero here; presumably it gets the real
	 * address at link or load time (not visible in this file).
	 */
	.section __IMPORT,__pointers,non_lazy_symbol_pointers
.L_lzma_crc64_table$non_lazy_ptr:
	.indirect_symbol LZMA_CRC64_TABLE
	.long 0

#elif defined(_WIN32) || defined(__CYGWIN__)
#	ifdef DLL_EXPORT
	/* This is equivalent of __declspec(dllexport). */
	.section .drectve
	.ascii " -export:lzma_crc64"
#	endif

#elif !defined(__MSDOS__)
	/* ELF: record the size of the function symbol. */
	.size	LZMA_CRC64, .-LZMA_CRC64
#endif
28781ad8388SMartin Matuska
/*
 * This is needed to support non-executable stack. It's ugly to
 * use __FreeBSD__ and __linux__ here, but I don't know a way to detect when
 * we are using GNU assembler.
 */
#if defined(__ELF__) && (defined(__FreeBSD__) || defined(__linux__))
	.section	.note.GNU-stack,"",@progbits
#endif