1 // Copyright 2017 The CRC32C Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE.google-crc32c file. See the AUTHORS.google-crc32c file
4 // for names of contributors.
5 
6 // In a separate source file to allow this accelerated CRC32C function to be
7 // compiled with the appropriate compiler flags to enable ARM NEON CRC32C
8 // instructions.
9 
10 // This implementation is based on https://github.com/google/leveldb/pull/490.
11 //
// Adapted from https://github.com/google/crc32c for inclusion in this
// Python package.
14 
15 #if defined(IS_ARM) && (defined(__linux__) || defined(linux))
16 
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <arm_acle.h>
#include <arm_neon.h>
22 
// Bytes consumed by one pass of the bulk loop in _crc32c_hw_arm64: four
// SEGMENTBYTES-sized segments (1024 bytes) plus the one extra 64-bit word
// that is folded in while the partial CRCs are merged.
#define KBYTES 1032
// Size in bytes of each of the four segments that are processed side by side.
#define SEGMENTBYTES 256

// Returns the 64-bit word at word offset `index` inside segment `segment`,
// where segments are laid out back to back, SEGMENTBYTES apart, from `base`.
// The word is read with memcpy so `base` needs no particular alignment and
// no pointer of an incompatible type is ever dereferenced.
static inline uint64_t crc32c_segment_word(const void *base, size_t segment,
                                           size_t index) {
  uint64_t word;
  memcpy(&word,
         (const uint8_t *)base + segment * SEGMENTBYTES + index * sizeof(word),
         sizeof(word));
  return word;
}

// The three macros below read and update the variables crc0, crc1, crc2 and
// crc3, which must be visible at the point of use. Each macro expands to a
// single statement and must be followed by a semicolon at the call site.

// Advance each of the four partial CRCs by one 64-bit word: word IND of its
// own segment (8 bytes per segment, 32 bytes in total).
#define CRC32C32BYTES(P, IND)                                   \
  do {                                                          \
    crc1 = __crc32cd(crc1, crc32c_segment_word((P), 1, (IND))); \
    crc2 = __crc32cd(crc2, crc32c_segment_word((P), 2, (IND))); \
    crc3 = __crc32cd(crc3, crc32c_segment_word((P), 3, (IND))); \
    crc0 = __crc32cd(crc0, crc32c_segment_word((P), 0, (IND))); \
  } while (0)

// Advance each partial CRC by eight consecutive words, i.e. words
// IND*8 .. IND*8+7 of every segment (64 bytes per segment, 256 in total).
#define CRC32C256BYTES(P, IND)         \
  do {                                 \
    CRC32C32BYTES((P), (IND) * 8 + 0); \
    CRC32C32BYTES((P), (IND) * 8 + 1); \
    CRC32C32BYTES((P), (IND) * 8 + 2); \
    CRC32C32BYTES((P), (IND) * 8 + 3); \
    CRC32C32BYTES((P), (IND) * 8 + 4); \
    CRC32C32BYTES((P), (IND) * 8 + 5); \
    CRC32C32BYTES((P), (IND) * 8 + 6); \
    CRC32C32BYTES((P), (IND) * 8 + 7); \
  } while (0)

// Advance each partial CRC over its whole segment (SEGMENTBYTES bytes per
// segment, 4 * SEGMENTBYTES in total), then move P past all four segments.
// P must be a modifiable byte pointer.
#define CRC32C1024BYTES(P)   \
  do {                       \
    CRC32C256BYTES((P), 0);  \
    CRC32C256BYTES((P), 1);  \
    CRC32C256BYTES((P), 2);  \
    CRC32C256BYTES((P), 3);  \
    (P) += 4 * SEGMENTBYTES; \
  } while (0)
61 
62 
_crc32c_hw_arm64(uint32_t crc,const uint8_t * data,size_t size)63 uint32_t _crc32c_hw_arm64(uint32_t crc, const uint8_t *data, size_t size) {
64   int64_t length = size;
65   uint32_t crc0, crc1, crc2, crc3;
66   uint64_t t0, t1, t2;
67 
68   // k0=CRC(x^(3*SEGMENTBYTES*8)), k1=CRC(x^(2*SEGMENTBYTES*8)),
69   // k2=CRC(x^(SEGMENTBYTES*8))
70   const poly64_t k0 = 0x8d96551c, k1 = 0xbd6f81f8, k2 = 0xdcb17aa4;
71 
72   while (length >= KBYTES) {
73     crc0 = crc;
74     crc1 = 0;
75     crc2 = 0;
76     crc3 = 0;
77 
78     // Process 1024 bytes in parallel.
79     CRC32C1024BYTES(data);
80 
81     // Merge the 4 partial CRC32C values.
82     t2 = (uint64_t)vmull_p64(crc2, k2);
83     t1 = (uint64_t)vmull_p64(crc1, k1);
84     t0 = (uint64_t)vmull_p64(crc0, k0);
85     crc = __crc32cd(crc3, *(uint64_t *)data);
86     data += sizeof(uint64_t);
87     crc ^= __crc32cd(0, t2);
88     crc ^= __crc32cd(0, t1);
89     crc ^= __crc32cd(0, t0);
90 
91     length -= KBYTES;
92   }
93 
94   while (length >= 8) {
95     crc = __crc32cd(crc, *(uint64_t *)data);
96     data += 8;
97     length -= 8;
98   }
99 
100   if (length & 4) {
101     crc = __crc32cw(crc, *(uint32_t *)data);
102     data += 4;
103   }
104 
105   if (length & 2) {
106     crc = __crc32ch(crc, *(uint16_t *)data);
107     data += 2;
108   }
109 
110   if (length & 1) {
111     crc = __crc32cb(crc, *data);
112   }
113 
114   return crc;
115 }
116 
117 #endif
118