From f554daa098526e91c6440d29b1ddc213bd01ad0f Mon Sep 17 00:00:00 2001
From: Damiano Galassi <damiog@gmail.com>
Date: Tue, 26 Jan 2021 19:40:27 +0100
Subject: [PATCH] Revert "Add aarch64 support - Part 2"

This reverts commit ec7396adaa6afd2c8aab1918cfe4bb6e384740c3.
---
 build/aarch64-linux/crosscompile.cmake  |  15 --
 build/aarch64-linux/make-Makefiles.bash |   4 -
 source/CMakeLists.txt                   |  38 +---
 source/common/CMakeLists.txt            |  35 +--
 source/common/arm/asm-primitives.cpp    | 291 ++++++++++++------------
 source/common/cpu.cpp                   |   4 -
 source/common/pixel.cpp                 |   9 -
 source/common/primitives.h              |  11 -
 source/test/CMakeLists.txt              |  16 +-
 source/test/testbench.cpp               |  16 --
 source/test/testharness.h               |   5 -
 11 files changed, 170 insertions(+), 274 deletions(-)
 delete mode 100644 build/aarch64-linux/crosscompile.cmake
 delete mode 100644 build/aarch64-linux/make-Makefiles.bash

diff --git a/build/aarch64-linux/crosscompile.cmake b/build/aarch64-linux/crosscompile.cmake
deleted file mode 100644
index 41c8217f2..000000000
--- a/build/aarch64-linux/crosscompile.cmake
+++ /dev/null
@@ -1,15 +0,0 @@
-# CMake toolchain file for cross compiling x265 for aarch64
-# This feature is only supported as experimental. Use with caution.
-# Please report bugs on bitbucket
-# Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
-
-set(CROSS_COMPILE_ARM 1)
-set(CMAKE_SYSTEM_NAME Linux)
-set(CMAKE_SYSTEM_PROCESSOR aarch64)
-
-# specify the cross compiler
-set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
-set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
-
-# specify the target environment
-SET(CMAKE_FIND_ROOT_PATH  /usr/aarch64-linux-gnu)
diff --git a/build/aarch64-linux/make-Makefiles.bash b/build/aarch64-linux/make-Makefiles.bash
deleted file mode 100644
index c9582da0a..000000000
--- a/build/aarch64-linux/make-Makefiles.bash
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-# Run this from within a bash shell
-
-cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source && ccmake ../../source
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index 95218f5dc..2ed5c24e3 100755
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -40,7 +40,7 @@ SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")
 # System architecture detection
 string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
-set(ARM_ALIASES armv6l armv7l aarch64)
+set(ARM_ALIASES armv6l armv7l)
 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
 list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
 set(POWER_ALIASES ppc64 ppc64le)
@@ -70,15 +70,9 @@ elseif(ARMMATCH GREATER "-1")
     else()
         set(CROSS_COMPILE_ARM 0)
     endif()
+    message(STATUS "Detected ARM target processor")
     set(ARM 1)
-    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
-        message(STATUS "Detected ARM64 target processor")
-        set(ARM64 1)
-        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
-    else()
-        message(STATUS "Detected ARM target processor")
-        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
-    endif()
+    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
 else()
     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
@@ -239,24 +233,14 @@ if(GCC)
         endif()
     endif()
     if(ARM AND CROSS_COMPILE_ARM)
-        if(ARM64)
-            set(ARM_ARGS -fPIC)
-        else()
-            set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
-        endif()
-        message(STATUS "cross compile arm")
+        set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
     elseif(ARM)
-        if(ARM64)
-            set(ARM_ARGS -fPIC)
+        find_package(Neon)
+        if(CPU_HAS_NEON)
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
             add_definitions(-DHAVE_NEON)
         else()
-            find_package(Neon)
-            if(CPU_HAS_NEON)
-                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
-                add_definitions(-DHAVE_NEON)
-            else()
-                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
-            endif()
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
         endif()
     endif()
     add_definitions(${ARM_ARGS})
@@ -536,11 +520,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
     # compile ARM arch asm files here
         enable_language(ASM)
         foreach(ASM ${ARM_ASMS})
-            if(ARM64)
-                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
-            else()
-                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
-            endif()
+            set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
             list(APPEND ASM_SRCS ${ASM_SRC})
             list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
             add_custom_command(
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 12b643ad5..c34064b2f 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -14,7 +14,7 @@ if(EXTRA_LIB)
 endif(EXTRA_LIB)
 
 if(ENABLE_ASSEMBLY)
-    set_source_files_properties(threading.cpp primitives.cpp pixel.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
+    set_source_files_properties(threading.cpp primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
     list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1")
 endif(ENABLE_ASSEMBLY)
 
@@ -84,33 +84,16 @@ if(ENABLE_ASSEMBLY AND X86)
 endif(ENABLE_ASSEMBLY AND X86)
 
 if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
-    if(ARM64)
-        if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
-            message(STATUS "Detected CXX compiler using -O3 optimization level")
-            add_definitions(-DAUTO_VECTORIZE=1)
-        endif()
-        set(C_SRCS asm-primitives.cpp pixel.h ipfilter8.h)
-
-        # add ARM assembly/intrinsic files here
-        set(A_SRCS asm.S mc-a.S sad-a.S pixel-util.S ipfilter8.S)
-        set(VEC_PRIMITIVES)
-
-        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
-        foreach(SRC ${C_SRCS})
-            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
-        endforeach()
-    else()
-        set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
+    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
 
-        # add ARM assembly/intrinsic files here
-        set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
-        set(VEC_PRIMITIVES)
+    # add ARM assembly/intrinsic files here
+    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
+    set(VEC_PRIMITIVES)
 
-        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
-        foreach(SRC ${C_SRCS})
-            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
-        endforeach()
-    endif()
+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
+    foreach(SRC ${C_SRCS})
+        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
+    endforeach()
     source_group(Assembly FILES ${ASM_PRIMITIVES})
 endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
 
diff --git a/source/common/arm/asm-primitives.cpp b/source/common/arm/asm-primitives.cpp
index 7f11503f9..422217845 100644
--- a/source/common/arm/asm-primitives.cpp
+++ b/source/common/arm/asm-primitives.cpp
@@ -5,7 +5,6 @@
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
  *          Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
  *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
- *          Hongbin Liu<liuhongbin1@huawei.com>
 *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -49,77 +48,77 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
         p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
 
         // addAvg
-         p.pu[LUMA_4x4].addAvg[NONALIGNED]   = PFX(addAvg_4x4_neon);
-         p.pu[LUMA_4x8].addAvg[NONALIGNED]   = PFX(addAvg_4x8_neon);
-         p.pu[LUMA_4x16].addAvg[NONALIGNED]  = PFX(addAvg_4x16_neon);
-         p.pu[LUMA_8x4].addAvg[NONALIGNED]   = PFX(addAvg_8x4_neon);
-         p.pu[LUMA_8x8].addAvg[NONALIGNED]   = PFX(addAvg_8x8_neon);
-         p.pu[LUMA_8x16].addAvg[NONALIGNED]  = PFX(addAvg_8x16_neon);
-         p.pu[LUMA_8x32].addAvg[NONALIGNED]  = PFX(addAvg_8x32_neon);
-         p.pu[LUMA_12x16].addAvg[NONALIGNED] = PFX(addAvg_12x16_neon);
-         p.pu[LUMA_16x4].addAvg[NONALIGNED]  = PFX(addAvg_16x4_neon);
-         p.pu[LUMA_16x8].addAvg[NONALIGNED]  = PFX(addAvg_16x8_neon);
-         p.pu[LUMA_16x12].addAvg[NONALIGNED] = PFX(addAvg_16x12_neon);
-         p.pu[LUMA_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_neon);
-         p.pu[LUMA_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_neon);
-         p.pu[LUMA_16x64].addAvg[NONALIGNED] = PFX(addAvg_16x64_neon);
-         p.pu[LUMA_24x32].addAvg[NONALIGNED] = PFX(addAvg_24x32_neon);
-         p.pu[LUMA_32x8].addAvg[NONALIGNED]  = PFX(addAvg_32x8_neon);
-         p.pu[LUMA_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_neon);
-         p.pu[LUMA_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_neon);
-         p.pu[LUMA_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_neon);
-         p.pu[LUMA_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_neon);
-         p.pu[LUMA_48x64].addAvg[NONALIGNED] = PFX(addAvg_48x64_neon);
-         p.pu[LUMA_64x16].addAvg[NONALIGNED] = PFX(addAvg_64x16_neon);
-         p.pu[LUMA_64x32].addAvg[NONALIGNED] = PFX(addAvg_64x32_neon);
-         p.pu[LUMA_64x48].addAvg[NONALIGNED] = PFX(addAvg_64x48_neon);
-         p.pu[LUMA_64x64].addAvg[NONALIGNED] = PFX(addAvg_64x64_neon);
+         p.pu[LUMA_4x4].addAvg   = PFX(addAvg_4x4_neon);
+         p.pu[LUMA_4x8].addAvg   = PFX(addAvg_4x8_neon);
+         p.pu[LUMA_4x16].addAvg  = PFX(addAvg_4x16_neon);
+         p.pu[LUMA_8x4].addAvg   = PFX(addAvg_8x4_neon);
+         p.pu[LUMA_8x8].addAvg   = PFX(addAvg_8x8_neon);
+         p.pu[LUMA_8x16].addAvg  = PFX(addAvg_8x16_neon);
+         p.pu[LUMA_8x32].addAvg  = PFX(addAvg_8x32_neon);
+         p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_neon);
+         p.pu[LUMA_16x4].addAvg  = PFX(addAvg_16x4_neon);
+         p.pu[LUMA_16x8].addAvg  = PFX(addAvg_16x8_neon);
+         p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_neon);
+         p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_neon);
+         p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_neon);
+         p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_neon);
+         p.pu[LUMA_24x32].addAvg = PFX(addAvg_24x32_neon);
+         p.pu[LUMA_32x8].addAvg  = PFX(addAvg_32x8_neon);
+         p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_neon);
+         p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_neon);
+         p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_neon);
+         p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_neon);
+         p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_neon);
+         p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_neon);
+         p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_neon);
+         p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_neon);
+         p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_neon);
 
         // chroma addAvg
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg[NONALIGNED]   = PFX(addAvg_4x2_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg[NONALIGNED]   = PFX(addAvg_4x4_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg[NONALIGNED]   = PFX(addAvg_4x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg[NONALIGNED]  = PFX(addAvg_4x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg[NONALIGNED]   = PFX(addAvg_6x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg[NONALIGNED]   = PFX(addAvg_8x2_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg[NONALIGNED]   = PFX(addAvg_8x4_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg[NONALIGNED]   = PFX(addAvg_8x6_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg[NONALIGNED]   = PFX(addAvg_8x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg[NONALIGNED]  = PFX(addAvg_8x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg[NONALIGNED]  = PFX(addAvg_8x32_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg[NONALIGNED] = PFX(addAvg_12x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg[NONALIGNED]  = PFX(addAvg_16x4_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg[NONALIGNED]  = PFX(addAvg_16x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg[NONALIGNED] = PFX(addAvg_16x12_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg[NONALIGNED] = PFX(addAvg_24x32_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg[NONALIGNED]  = PFX(addAvg_32x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_neon);
-
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg[NONALIGNED]   = PFX(addAvg_4x8_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg[NONALIGNED]  = PFX(addAvg_4x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg[NONALIGNED]  = PFX(addAvg_4x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg[NONALIGNED]  = PFX(addAvg_6x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg[NONALIGNED]   = PFX(addAvg_8x4_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg[NONALIGNED]   = PFX(addAvg_8x8_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg[NONALIGNED]  = PFX(addAvg_8x12_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg[NONALIGNED]  = PFX(addAvg_8x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg[NONALIGNED]  = PFX(addAvg_8x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg[NONALIGNED]  = PFX(addAvg_8x64_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg[NONALIGNED] = PFX(addAvg_12x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg[NONALIGNED]  = PFX(addAvg_16x8_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg[NONALIGNED] = PFX(addAvg_16x24_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg[NONALIGNED] = PFX(addAvg_16x64_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg[NONALIGNED] = PFX(addAvg_24x64_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg[NONALIGNED] = PFX(addAvg_32x48_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg   = PFX(addAvg_4x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg   = PFX(addAvg_4x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg   = PFX(addAvg_4x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg  = PFX(addAvg_4x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg   = PFX(addAvg_6x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg   = PFX(addAvg_8x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg   = PFX(addAvg_8x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg   = PFX(addAvg_8x6_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg   = PFX(addAvg_8x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg  = PFX(addAvg_8x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg  = PFX(addAvg_8x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = PFX(addAvg_12x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg  = PFX(addAvg_16x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg  = PFX(addAvg_16x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = PFX(addAvg_16x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = PFX(addAvg_16x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg = PFX(addAvg_24x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg  = PFX(addAvg_32x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_neon);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg   = PFX(addAvg_4x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg  = PFX(addAvg_4x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg  = PFX(addAvg_4x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg  = PFX(addAvg_6x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg   = PFX(addAvg_8x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg   = PFX(addAvg_8x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg  = PFX(addAvg_8x12_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg  = PFX(addAvg_8x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg  = PFX(addAvg_8x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg  = PFX(addAvg_8x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg = PFX(addAvg_12x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg  = PFX(addAvg_16x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = PFX(addAvg_16x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = PFX(addAvg_16x24_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = PFX(addAvg_16x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = PFX(addAvg_24x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_neon);
 
         // quant
          p.quant = PFX(quant_neon);
@@ -403,7 +402,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
         p.scale2D_64to32  = PFX(scale2D_64to32_neon);
 
         // scale1D_128to64
-        p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_neon);
+        p.scale1D_128to64 = PFX(scale1D_128to64_neon);
 
         // copy_count
         p.cu[BLOCK_4x4].copy_cnt     = PFX(copy_cnt_4_neon);
@@ -412,37 +411,37 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
         p.cu[BLOCK_32x32].copy_cnt   = PFX(copy_cnt_32_neon);
 
         // filterPixelToShort
-        p.pu[LUMA_4x4].convert_p2s[NONALIGNED]   = PFX(filterPixelToShort_4x4_neon);
-        p.pu[LUMA_4x8].convert_p2s[NONALIGNED]   = PFX(filterPixelToShort_4x8_neon);
-        p.pu[LUMA_4x16].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_4x16_neon);
-        p.pu[LUMA_8x4].convert_p2s[NONALIGNED]   = PFX(filterPixelToShort_8x4_neon);
-        p.pu[LUMA_8x8].convert_p2s[NONALIGNED]   = PFX(filterPixelToShort_8x8_neon);
-        p.pu[LUMA_8x16].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_8x16_neon);
-        p.pu[LUMA_8x32].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_8x32_neon);
-        p.pu[LUMA_12x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_12x16_neon);
-        p.pu[LUMA_16x4].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_16x4_neon);
-        p.pu[LUMA_16x8].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_16x8_neon);
-        p.pu[LUMA_16x12].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_16x12_neon);
-        p.pu[LUMA_16x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_16x16_neon);
-        p.pu[LUMA_16x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_16x32_neon);
-        p.pu[LUMA_16x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_16x64_neon);
-        p.pu[LUMA_24x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_24x32_neon);
-        p.pu[LUMA_32x8].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_32x8_neon);
-        p.pu[LUMA_32x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_neon);
-        p.pu[LUMA_32x24].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x24_neon);
-        p.pu[LUMA_32x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_neon);
-        p.pu[LUMA_32x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x64_neon);
-        p.pu[LUMA_48x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_48x64_neon);
-        p.pu[LUMA_64x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x16_neon);
-        p.pu[LUMA_64x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x32_neon);
-        p.pu[LUMA_64x48].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x48_neon);
-        p.pu[LUMA_64x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x64_neon);
+        p.pu[LUMA_4x4].convert_p2s   = PFX(filterPixelToShort_4x4_neon);
+        p.pu[LUMA_4x8].convert_p2s   = PFX(filterPixelToShort_4x8_neon);
+        p.pu[LUMA_4x16].convert_p2s  = PFX(filterPixelToShort_4x16_neon);
+        p.pu[LUMA_8x4].convert_p2s   = PFX(filterPixelToShort_8x4_neon);
+        p.pu[LUMA_8x8].convert_p2s   = PFX(filterPixelToShort_8x8_neon);
+        p.pu[LUMA_8x16].convert_p2s  = PFX(filterPixelToShort_8x16_neon);
+        p.pu[LUMA_8x32].convert_p2s  = PFX(filterPixelToShort_8x32_neon);
+        p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_neon);
+        p.pu[LUMA_16x4].convert_p2s  = PFX(filterPixelToShort_16x4_neon);
+        p.pu[LUMA_16x8].convert_p2s  = PFX(filterPixelToShort_16x8_neon);
+        p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_neon);
+        p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_neon);
+        p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_neon);
+        p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_neon);
+        p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_neon);
+        p.pu[LUMA_32x8].convert_p2s  = PFX(filterPixelToShort_32x8_neon);
+        p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_neon);
+        p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_neon);
+        p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_neon);
+        p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_neon);
+        p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_neon);
+        p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_neon);
+        p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_neon);
+        p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_neon);
+        p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_neon);
 
         // Block_fill
-        p.cu[BLOCK_4x4].blockfill_s[NONALIGNED]   = PFX(blockfill_s_4x4_neon);
-        p.cu[BLOCK_8x8].blockfill_s[NONALIGNED]   = PFX(blockfill_s_8x8_neon);
-        p.cu[BLOCK_16x16].blockfill_s[NONALIGNED] = PFX(blockfill_s_16x16_neon);
-        p.cu[BLOCK_32x32].blockfill_s[NONALIGNED] = PFX(blockfill_s_32x32_neon);
+        p.cu[BLOCK_4x4].blockfill_s   = PFX(blockfill_s_4x4_neon);
+        p.cu[BLOCK_8x8].blockfill_s   = PFX(blockfill_s_8x8_neon);
+        p.cu[BLOCK_16x16].blockfill_s = PFX(blockfill_s_16x16_neon);
+        p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_neon);
 
         // Blockcopy_ss
         p.cu[BLOCK_4x4].copy_ss   = PFX(blockcopy_ss_4x4_neon);
@@ -496,21 +495,21 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_neon);
 
         // pixel_add_ps
-        p.cu[BLOCK_4x4].add_ps[NONALIGNED]   = PFX(pixel_add_ps_4x4_neon);
-        p.cu[BLOCK_8x8].add_ps[NONALIGNED]   = PFX(pixel_add_ps_8x8_neon);
-        p.cu[BLOCK_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_neon);
-        p.cu[BLOCK_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_neon);
-        p.cu[BLOCK_64x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_64x64_neon);
+        p.cu[BLOCK_4x4].add_ps   = PFX(pixel_add_ps_4x4_neon);
+        p.cu[BLOCK_8x8].add_ps   = PFX(pixel_add_ps_8x8_neon);
+        p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_neon);
+        p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_neon);
+        p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_neon);
 
         // chroma add_ps
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[NONALIGNED]   = PFX(pixel_add_ps_4x4_neon);
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[NONALIGNED]   = PFX(pixel_add_ps_8x8_neon);
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_neon);
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_neon);
-        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[NONALIGNED]   = PFX(pixel_add_ps_4x8_neon);
-        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[NONALIGNED]  = PFX(pixel_add_ps_8x16_neon);
-        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x32_neon);
-        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x64_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps   = PFX(pixel_add_ps_4x4_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps   = PFX(pixel_add_ps_8x8_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps = PFX(pixel_add_ps_16x16_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps   = PFX(pixel_add_ps_4x8_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps  = PFX(pixel_add_ps_8x16_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps = PFX(pixel_add_ps_16x32_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_neon);
 
         // cpy2Dto1D_shr
         p.cu[BLOCK_4x4].cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_4x4_neon);
@@ -519,10 +518,10 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
         p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_neon);
 
         // ssd_s
-        p.cu[BLOCK_4x4].ssd_s[NONALIGNED]   = PFX(pixel_ssd_s_4x4_neon);
-        p.cu[BLOCK_8x8].ssd_s[NONALIGNED]   = PFX(pixel_ssd_s_8x8_neon);
-        p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_neon);
-        p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_neon);
+        p.cu[BLOCK_4x4].ssd_s   = PFX(pixel_ssd_s_4x4_neon);
+        p.cu[BLOCK_8x8].ssd_s   = PFX(pixel_ssd_s_8x8_neon);
+        p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16x16_neon);
+        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32x32_neon);
 
         // sse_ss
         p.cu[BLOCK_4x4].sse_ss   = PFX(pixel_sse_ss_4x4_neon);
@@ -549,10 +548,10 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_neon);
 
         // calc_Residual
-        p.cu[BLOCK_4x4].calcresidual[NONALIGNED]   = PFX(getResidual4_neon);
-        p.cu[BLOCK_8x8].calcresidual[NONALIGNED]   = PFX(getResidual8_neon);
-        p.cu[BLOCK_16x16].calcresidual[NONALIGNED] = PFX(getResidual16_neon);
-        p.cu[BLOCK_32x32].calcresidual[NONALIGNED] = PFX(getResidual32_neon);
+        p.cu[BLOCK_4x4].calcresidual   = PFX(getResidual4_neon);
+        p.cu[BLOCK_8x8].calcresidual   = PFX(getResidual8_neon);
+        p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_neon);
+        p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_neon);
 
         // sse_pp
         p.cu[BLOCK_4x4].sse_pp   = PFX(pixel_sse_pp_4x4_neon);
@@ -723,31 +722,31 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
         p.pu[LUMA_64x64].sad_x4 = PFX(sad_x4_64x64_neon);
 
         // pixel_avg_pp
-        p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_4x4_neon);
-        p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_4x8_neon);
-        p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_4x16_neon);
-        p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_8x4_neon);
-        p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_8x8_neon);
-        p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_8x16_neon);
-        p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_8x32_neon);
-        p.pu[LUMA_12x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_12x16_neon);
-        p.pu[LUMA_16x4].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_16x4_neon);
-        p.pu[LUMA_16x8].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_16x8_neon);
-        p.pu[LUMA_16x12].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_16x12_neon);
-        p.pu[LUMA_16x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_16x16_neon);
-        p.pu[LUMA_16x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_16x32_neon);
-        p.pu[LUMA_16x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_16x64_neon);
-        p.pu[LUMA_24x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_24x32_neon);
-        p.pu[LUMA_32x8].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_32x8_neon);
-        p.pu[LUMA_32x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_32x16_neon);
-        p.pu[LUMA_32x24].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_32x24_neon);
-        p.pu[LUMA_32x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_32x32_neon);
-        p.pu[LUMA_32x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_32x64_neon);
-        p.pu[LUMA_48x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_48x64_neon);
-        p.pu[LUMA_64x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_64x16_neon);
-        p.pu[LUMA_64x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_64x32_neon);
-        p.pu[LUMA_64x48].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_64x48_neon);
-        p.pu[LUMA_64x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_64x64_neon);
+        p.pu[LUMA_4x4].pixelavg_pp   = PFX(pixel_avg_pp_4x4_neon);
+        p.pu[LUMA_4x8].pixelavg_pp   = PFX(pixel_avg_pp_4x8_neon);
+        p.pu[LUMA_4x16].pixelavg_pp  = PFX(pixel_avg_pp_4x16_neon);
+        p.pu[LUMA_8x4].pixelavg_pp   = PFX(pixel_avg_pp_8x4_neon);
+        p.pu[LUMA_8x8].pixelavg_pp   = PFX(pixel_avg_pp_8x8_neon);
+        p.pu[LUMA_8x16].pixelavg_pp  = PFX(pixel_avg_pp_8x16_neon);
+        p.pu[LUMA_8x32].pixelavg_pp  = PFX(pixel_avg_pp_8x32_neon);
+        p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_pp_12x16_neon);
+        p.pu[LUMA_16x4].pixelavg_pp  = PFX(pixel_avg_pp_16x4_neon);
+        p.pu[LUMA_16x8].pixelavg_pp  = PFX(pixel_avg_pp_16x8_neon);
+        p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_pp_16x12_neon);
+        p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_pp_16x16_neon);
+        p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_pp_16x32_neon);
+        p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_pp_16x64_neon);
+        p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_pp_24x32_neon);
+        p.pu[LUMA_32x8].pixelavg_pp  = PFX(pixel_avg_pp_32x8_neon);
+        p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_pp_32x16_neon);
+        p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_pp_32x24_neon);
+        p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_pp_32x32_neon);
+        p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_pp_32x64_neon);
+        p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_pp_48x64_neon);
+        p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_pp_64x16_neon);
+        p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_pp_64x32_neon);
+        p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_pp_64x48_neon);
+        p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_pp_64x64_neon);
 
         // planecopy
         p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
index 2eacfe4a9..26c82ea50 100644
--- a/source/common/cpu.cpp
+++ b/source/common/cpu.cpp
@@ -5,8 +5,6 @@
  *          Laurent Aimar <fenrir@via.ecp.fr>
  *          Fiona Glaser <fiona@x264.com>
  *          Steve Borho <steve@borho.org>
- *          Hongbin Liu <liuhongbin1@huawei.com>
- *          Yimeng Su <yimeng.su@huawei.com>
 *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -369,8 +367,6 @@ uint32_t cpu_detect(bool benableavx512)
     flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
 #endif
     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
-#elif X265_ARCH_ARM64
-    flags |= X265_CPU_NEON;
 #endif // if HAVE_ARMV6
     return flags;
 }
diff --git a/source/common/pixel.cpp b/source/common/pixel.cpp
index e4f890cd5..99b84449c 100644
--- a/source/common/pixel.cpp
+++ b/source/common/pixel.cpp
@@ -5,7 +5,6 @@
  *          Mandar Gurav <mandar@multicorewareinc.com>
  *          Mahesh Pittala <mahesh@multicorewareinc.com>
  *          Min Chen <min.chen@multicorewareinc.com>
- *          Hongbin Liu<liuhongbin1@huawei.com>
 *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -266,10 +265,6 @@ int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t s
 {
     int satd = 0;
 
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
-    pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
-#endif
-
     for (int row = 0; row < h; row += 4)
         for (int col = 0; col < w; col += 4)
             satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
@@ -284,10 +279,6 @@ int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t s
 {
     int satd = 0;
 
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
-    pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
-#endif
-
     for (int row = 0; row < h; row += 4)
         for (int col = 0; col < w; col += 8)
             satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
diff --git a/source/common/primitives.h b/source/common/primitives.h
index 0b52f84de..5c64952fb 100644
--- a/source/common/primitives.h
+++ b/source/common/primitives.h
@@ -8,8 +8,6 @@
  *          Rajesh Paulraj <rajesh@multicorewareinc.com>
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
  *          Min Chen <chenm003@163.com>
- *          Hongbin Liu<liuhongbin1@huawei.com>
- *          Yimeng Su <yimeng.su@huawei.com>
 *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -469,9 +467,6 @@ void setupCPrimitives(EncoderPrimitives &p);
 void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask);
 void setupAliasPrimitives(EncoderPrimitives &p);
-#if X265_ARCH_ARM64
-void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask);
-#endif
 #if HAVE_ALTIVEC
 void setupPixelPrimitives_altivec(EncoderPrimitives &p);
 void setupDCTPrimitives_altivec(EncoderPrimitives &p);
@@ -486,10 +481,4 @@ extern const char* PFX(version_str);
 extern const char* PFX(build_info_str);
 #endif
 
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
-extern "C" {
-#include "aarch64/pixel-util.h"
-}
-#endif
-
 #endif // ifndef X265_PRIMITIVES_H
diff --git a/source/test/CMakeLists.txt b/source/test/CMakeLists.txt
index 9abaf31ff..260195f53 100644
--- a/source/test/CMakeLists.txt
+++ b/source/test/CMakeLists.txt
@@ -23,15 +23,13 @@ endif(X86)
 
 # add ARM assembly files
 if(ARM OR CROSS_COMPILE_ARM)
-    if(NOT ARM64)
-        enable_language(ASM)
-        set(NASM_SRC checkasm-arm.S)
-        add_custom_command(
-            OUTPUT checkasm-arm.obj
-            COMMAND ${CMAKE_CXX_COMPILER}
-            ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
-            DEPENDS checkasm-arm.S)
-    endif()
+    enable_language(ASM)
+    set(NASM_SRC checkasm-arm.S)
+    add_custom_command(
+        OUTPUT checkasm-arm.obj
+        COMMAND ${CMAKE_CXX_COMPILER}
+        ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
+        DEPENDS checkasm-arm.S)
 endif(ARM OR CROSS_COMPILE_ARM)
 
 # add PowerPC assembly files
diff --git a/source/test/testbench.cpp b/source/test/testbench.cpp
index 8db8c0c25..ac14f9710 100644
--- a/source/test/testbench.cpp
+++ b/source/test/testbench.cpp
@@ -5,7 +5,6 @@
  *          Mandar Gurav <mandar@multicorewareinc.com>
  *          Mahesh Pittala <mahesh@multicorewareinc.com>
  *          Min Chen <chenm003@163.com>
- *          Yimeng Su <yimeng.su@huawei.com>
 *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -209,14 +208,6 @@ int main(int argc, char *argv[])
         EncoderPrimitives asmprim;
         memset(&asmprim, 0, sizeof(asmprim));
         setupAssemblyPrimitives(asmprim, test_arch[i].flag);
-
-#if X265_ARCH_ARM64
-        /* Temporary workaround because luma_vsp assembly primitive has not been completed
-         * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
-         * Otherwise, segment fault occurs. */
-        setupAliasCPrimitives(cprim, asmprim, test_arch[i].flag);
-#endif
-
         setupAliasPrimitives(asmprim);
         memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives));
         for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
@@ -241,13 +232,6 @@ int main(int argc, char *argv[])
 #endif
     setupAssemblyPrimitives(optprim, cpuid);
 
-#if X265_ARCH_ARM64
-    /* Temporary workaround because luma_vsp assembly primitive has not been completed
-     * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
-     * Otherwise, segment fault occurs. */
-    setupAliasCPrimitives(cprim, optprim, cpuid);
-#endif
-
     /* Note that we do not setup aliases for performance tests, that would be
      * redundant. The testbench only verifies they are correctly aliased */
 
diff --git a/source/test/testharness.h b/source/test/testharness.h
index 6e680953f..771551583 100644
--- a/source/test/testharness.h
+++ b/source/test/testharness.h
@@ -3,7 +3,6 @@
 *
  * Authors: Steve Borho <steve@borho.org>
  *          Min Chen <chenm003@163.com>
- *          Yimeng Su <yimeng.su@huawei.com>
 *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -82,15 +81,11 @@ static inline uint32_t __rdtsc(void)
 #if X265_ARCH_X86
     asm volatile("rdtsc" : "=a" (a) ::"edx");
 #elif X265_ARCH_ARM
-#if X265_ARCH_ARM64
-    asm volatile("mrs %0, cntvct_el0" : "=r"(a));
-#else
     // TOD-DO: verify following inline asm to get cpu Timestamp Counter for ARM arch
     // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
 
     // TO-DO: replace clock() function with appropriate ARM cpu instructions
     a = clock();
-#endif
 #endif
     return a;
 }
-- 
2.24.3 (Apple Git-128)
