//  Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).

#include "util/crc32c_arm64.h"

#if defined(__linux__) && defined(HAVE_ARM64_CRC)

#include <asm/hwcap.h>
#include <sys/auxv.h>
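/* Fallback for older kernel headers that may not define HWCAP_CRC32;
 * bit 7 is the AArch64 HWCAP bit for the CRC32 instructions. */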
#ifndef HWCAP_CRC32
#define HWCAP_CRC32 (1 << 7)
#endif

#ifdef HAVE_ARM64_CRYPTO
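/* The two macros below interleave three independent CRC32C streams
 * (crc0, crc1, crc2), so consecutive crc32c instructions carry no data
 * dependency on each other and can be pipelined by the CPU. */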
/* Unrolled to compute 8 * 3 = 24 bytes in parallel */
#define CRC32C24BYTES(ITR)                                    \
  crc1 = crc32c_u64(crc1, *(buf64 + BLK_LENGTH + (ITR)));     \
  crc2 = crc32c_u64(crc2, *(buf64 + BLK_LENGTH * 2 + (ITR))); \
  crc0 = crc32c_u64(crc0, *(buf64 + (ITR)));

/* Unrolled to compute 24 * 7 = 168 bytes in parallel */
#define CRC32C7X24BYTES(ITR)   \
  do {                         \
    CRC32C24BYTES((ITR)*7 + 0) \
    CRC32C24BYTES((ITR)*7 + 1) \
    CRC32C24BYTES((ITR)*7 + 2) \
    CRC32C24BYTES((ITR)*7 + 3) \
    CRC32C24BYTES((ITR)*7 + 4) \
    CRC32C24BYTES((ITR)*7 + 5) \
    CRC32C24BYTES((ITR)*7 + 6) \
  } while (0)
#endif

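/* Returns non-zero when the kernel reports, via the AT_HWCAP auxiliary
 * vector, that the CPU implements the ARMv8 CRC32 instructions. Callers
 * are expected to check this before calling crc32c_arm64() and to fall
 * back to a software CRC32C implementation otherwise. */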
uint32_t crc32c_runtime_check(void) {
  uint64_t auxv = getauxval(AT_HWCAP);
  return (auxv & HWCAP_CRC32) != 0;
}

uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
                      unsigned len) {
  const uint8_t *buf8;
  const uint64_t *buf64 = (const uint64_t *)data;
  int length = (int)len;
  crc ^= 0xffffffff;

#ifdef HAVE_ARM64_CRYPTO
/* CRC32C parallel computation
 *   The algorithm comes from the Intel whitepaper:
 *   crc-iscsi-polynomial-crc32-instruction-paper
 *
 * The input is divided into three equal-sized blocks:
 *   three parallel CRC streams (crc0, crc1, crc2) per 1024 bytes,
 *   one block = 42 (BLK_LENGTH) * 8 (step size of crc32c_u64) bytes
 */
#define BLK_LENGTH 42
  while (length >= 1024) {
    uint64_t t0, t1;
    uint32_t crc0 = 0, crc1 = 0, crc2 = 0;

    /* Folding constants for merging the parallel blocks:
     *   k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1));
     *   k1 = CRC32(x ^ (42 * 8 * 8 - 1));
     */
    uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;

    /* Prefetch the following 1 KB block to avoid cache misses */
    PREF1KL1((uint8_t *)buf64, 1024);

    /* First 8 bytes, issued early for better pipelining */
    crc0 = crc32c_u64(crc, *buf64++);

    /* 3-block parallel CRC32C computation:
     * the macros unroll the loop to process
     * 168 * 6 = 1008 bytes per iteration
     */
    CRC32C7X24BYTES(0);
    CRC32C7X24BYTES(1);
    CRC32C7X24BYTES(2);
    CRC32C7X24BYTES(3);
    CRC32C7X24BYTES(4);
    CRC32C7X24BYTES(5);
    buf64 += (BLK_LENGTH * 3);

    /* Last 8 bytes */
    crc = crc32c_u64(crc2, *buf64++);

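    /* Merge step: vmull_p64 performs a 64x64-bit carry-less (polynomial)
     * multiply. Multiplying crc0 and crc1 by the precomputed constants k0
     * and k1 folds them forward across the bytes they did not process, so
     * that after a crc32c_u64 reduction the three partial CRCs can be
     * combined with plain XORs. */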
    t0 = (uint64_t)vmull_p64(crc0, k0);
    t1 = (uint64_t)vmull_p64(crc1, k1);

    /* Merge (crc0, crc1, crc2) -> crc */
    crc1 = crc32c_u64(0, t1);
    crc ^= crc1;
    crc0 = crc32c_u64(0, t0);
    crc ^= crc0;

    length -= 1024;
  }

  if (length == 0) return crc ^ (0xffffffffU);
#endif
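  /* Handle the remaining bytes (or the whole buffer when HAVE_ARM64_CRYPTO
   * is not defined) with 8/4/2/1-byte CRC32C steps. */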
  buf8 = (const uint8_t *)buf64;
  while (length >= 8) {
    crc = crc32c_u64(crc, *(const uint64_t *)buf8);
    buf8 += 8;
    length -= 8;
  }

  /* Finishing the tail with 4/2/1-byte steps is more efficient than a
   * byte-at-a-time loop */
  if (length >= 4) {
    crc = crc32c_u32(crc, *(const uint32_t *)buf8);
    buf8 += 4;
    length -= 4;
  }

  if (length >= 2) {
    crc = crc32c_u16(crc, *(const uint16_t *)buf8);
    buf8 += 2;
    length -= 2;
  }

  if (length >= 1) crc = crc32c_u8(crc, *buf8);

  crc ^= 0xffffffff;
  return crc;
}

#endif
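
/* Illustrative usage sketch (not part of the original implementation;
 * `buf` and `buf_len` are placeholder names for the caller's data):
 *
 *   uint32_t checksum = 0;
 *   if (crc32c_runtime_check()) {
 *     checksum = crc32c_arm64(0, buf, buf_len);
 *   }
 *
 * A real caller would fall back to a software CRC32C implementation when
 * the runtime check fails. */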