1 /* Copyright 2018 The Chromium Authors. All rights reserved.
2  * Use of this source code is governed by a BSD-style license that can be
3  * found in the Chromium source repository LICENSE file.
4  */
5 #ifndef __SLIDE_HASH__NEON__
6 #define __SLIDE_HASH__NEON__
7 
8 #include "deflate.h"
9 #include <arm_neon.h>
10 
neon_slide_hash_update(Posf * hash,const uInt hash_size,const ush w_size)11 inline static void ZLIB_INTERNAL neon_slide_hash_update(Posf *hash,
12                                                         const uInt hash_size,
13                                                         const ush w_size)
14 {
15    /* NEON 'Q' registers allow to store 128 bits, so we can load 8x16-bits
16      * values. For further details, check:
17      * ARM DHT 0002A, section 1.3.2 NEON Registers.
18      */
19     const size_t chunk = sizeof(uint16x8_t) / sizeof(uint16_t);
20     /* Unrolling the operation yielded a compression performance boost in both
21      * ARMv7 (from 11.7% to 13.4%) and ARMv8 (from 3.7% to 7.5%) for HTML4
22      * content. For full benchmarking data, check: http://crbug.com/863257.
23      */
24     const size_t stride = 2*chunk;
25     const uint16x8_t v = vdupq_n_u16(w_size);
26 
27     for (Posf *end = hash + hash_size; hash != end; hash += stride) {
28         uint16x8_t m_low = vld1q_u16(hash);
29         uint16x8_t m_high = vld1q_u16(hash + chunk);
30 
31         /* The first 'q' in vqsubq_u16 makes these subtracts saturate to zero,
32          * replacing the ternary operator expression in the original code:
33          * (m >= wsize ? m - wsize : NIL).
34          */
35         m_low = vqsubq_u16(m_low, v);
36         m_high = vqsubq_u16(m_high, v);
37 
38         vst1q_u16(hash, m_low);
39         vst1q_u16(hash + chunk, m_high);
40     }
41 }
42 
43 
neon_slide_hash(Posf * head,Posf * prev,const unsigned short w_size,const uInt hash_size)44 inline static void ZLIB_INTERNAL neon_slide_hash(Posf *head, Posf *prev,
45                                                  const unsigned short w_size,
46                                                  const uInt hash_size)
47 {
48     /*
49      * SIMD implementation for hash table rebase assumes:
50      * 1. hash chain offset (Pos) is 2 bytes.
51      * 2. hash table size is multiple of 32 bytes.
52      * #1 should be true as Pos is defined as "ush"
53      * #2 should be true as hash_bits are greater than 7
54      */
55     const size_t size = hash_size * sizeof(head[0]);
56     Assert(sizeof(Pos) == 2, "Wrong Pos size.");
57     Assert((size % sizeof(uint16x8_t) * 2) == 0, "Hash table size error.");
58 
59     neon_slide_hash_update(head, hash_size, w_size);
60 #ifndef FASTEST
61     neon_slide_hash_update(prev, w_size, w_size);
62 #endif
63 }
64 
65 #endif
66