1/*!
2 * \copy
3 *     Copyright (c)  2013, Cisco Systems
4 *     All rights reserved.
5 *
6 *     Redistribution and use in source and binary forms, with or without
7 *     modification, are permitted provided that the following conditions
8 *     are met:
9 *
10 *        * Redistributions of source code must retain the above copyright
11 *          notice, this list of conditions and the following disclaimer.
12 *
13 *        * Redistributions in binary form must reproduce the above copyright
14 *          notice, this list of conditions and the following disclaimer in
15 *          the documentation and/or other materials provided with the
16 *          distribution.
17 *
18 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 *     POSSIBILITY OF SUCH DAMAGE.
30 *
31 */
32
33#ifdef HAVE_NEON_AARCH64
34#include "arm_arch64_common_macro.S"
35
// Gather the neighbor pixels of a 16x16 luma block.
//   In:  x0 = address of the current block, x1 = stride of that plane.
//   Out: v0 = the 16 pixels of the row above (top),
//        v1 = the 16 pixels of the column to the left (one byte per row).
//   Clobbers: x7.
.macro LOAD_LUMA_DATA
    sub     x7, x0, x1          // x7 -> first pixel of the row above
    ld1     {v0.16b}, [x7]      //top
    sub     x7, x0, #1          // x7 -> pixel immediately left of row 0
    ld1     {v1.b}[0], [x7], x1 // walk down the left edge, one byte per row
    ld1     {v1.b}[1], [x7], x1
    ld1     {v1.b}[2], [x7], x1
    ld1     {v1.b}[3], [x7], x1
    ld1     {v1.b}[4], [x7], x1
    ld1     {v1.b}[5], [x7], x1
    ld1     {v1.b}[6], [x7], x1
    ld1     {v1.b}[7], [x7], x1
    ld1     {v1.b}[8], [x7], x1
    ld1     {v1.b}[9], [x7], x1
    ld1     {v1.b}[10], [x7], x1
    ld1     {v1.b}[11], [x7], x1
    ld1     {v1.b}[12], [x7], x1
    ld1     {v1.b}[13], [x7], x1
    ld1     {v1.b}[14], [x7], x1
    ld1     {v1.b}[15], [x7]    //left
.endm
57
// Load four consecutive 16-byte source rows from [x2] (advancing by the
// stride in x3) and regroup them into 4x4 tiles for HDM_TRANSFORM_4X4_L0:
//   v22 = rows 0-1 of tile columns 0 (low half) and 2 (high half)
//   v24 = rows 2-3 of tile columns 0 and 2
//   v23 / v25 = the same layout for tile columns 1 and 3
//   Clobbers: v0, v1, v20, v21.
.macro LOAD_16X4_DATA
    //Load the p_enc data and save to "v22 ~ v25"--- 16X4 bytes
    ld1     {v0.16b}, [x2], x3
    ld1     {v1.16b}, [x2], x3
    ld1     {v20.16b}, [x2], x3
    ld1     {v21.16b}, [x2], x3
    trn1    v22.4s, v0.4s, v1.4s    // interleave 32-bit (4-pixel) groups
    trn2    v23.4s, v0.4s, v1.4s
    trn1    v24.4s, v20.4s, v21.4s
    trn2    v25.4s, v20.4s, v21.4s
.endm
69
// Butterfly stages computing the 4x4 Hadamard transform of the
// vertical-prediction samples held in v4/v5.
//   In:  v4, v5 = prediction samples as 8x16-bit lanes (pre-scaled by the
//        caller, see the ushll #2 at each call site).
//   Out: v6, v7 = Hadamard coefficients, element order noted below.
//   Clobbers: v4, v5.
.macro GET_16X16_V_SATD
    trn1    v6.4s, v4.4s, v5.4s
    trn2    v7.4s, v4.4s, v5.4s
    add     v4.8h, v6.8h, v7.8h
    sub     v5.8h, v6.8h, v7.8h
    trn1    v6.8h, v4.8h, v5.8h
    trn2    v7.8h, v4.8h, v5.8h
    add     v4.8h, v6.8h, v7.8h
    sub     v5.8h, v6.8h, v7.8h
    trn1    v6.4s, v4.4s, v5.4s
    trn2    v7.4s, v4.4s, v5.4s     //{0,1,3,2, 4,5,7,6} v6 {8,9,11,10, 12,13,15,14} v7
.endm
82
// Same butterfly as GET_16X16_V_SATD, but for the horizontal-prediction
// samples, writing the coefficients to v16/v17 instead of v6/v7.
//   In:  v4, v5 = pre-scaled prediction samples as 8x16-bit lanes.
//   Out: v16, v17 = Hadamard coefficients, element order noted below.
//   Clobbers: v4, v5.
.macro GET_16X16_H_SATD
    trn1    v16.4s, v4.4s, v5.4s
    trn2    v17.4s, v4.4s, v5.4s
    add     v4.8h, v16.8h, v17.8h
    sub     v5.8h, v16.8h, v17.8h
    trn1    v16.8h, v4.8h, v5.8h
    trn2    v17.8h, v4.8h, v5.8h
    add     v4.8h, v16.8h, v17.8h
    sub     v5.8h, v16.8h, v17.8h
    trn1    v16.4s, v4.4s, v5.4s
    trn2    v17.4s, v4.4s, v5.4s    //{0,1,3,2, 4,5,7,6} v16 {8,9,11,10, 12,13,15,14} v17
.endm
95
// Select the smallest of three mode costs.
//   In:  \arg0 = cost of mode 0, w1 = cost of mode 1, w2 = cost of mode 2.
//        \arg1/\arg2 = condition codes controlling tie-breaking; use the
//        SELECT_BEST_COST_PREFER_* wrappers rather than calling this directly.
//   Out: \arg0 = minimum cost, w7 = index (0/1/2) of the winning mode.
//   Clobbers: w6, flags.
.macro SELECT_BEST_COST arg0, arg1, arg2
    cmp     w1, \arg0
    csel    \arg0, \arg0, w1, \arg2    // keep current best or take mode 1
    cset    w7, \arg1                  // w7 = 1 if mode 1 is now the best
    cmp     w2, \arg0
    mov     w6, #2
    csel    \arg0, \arg0, w2, \arg2    // keep current best or take mode 2
    csel    w7, w7, w6, \arg2          // w7 = 2 if mode 2 won
.endm
105
// Pick the minimum cost; on ties the higher-numbered mode wins (ls/hi).
.macro SELECT_BEST_COST_PREFER_HIGHER arg0
    SELECT_BEST_COST \arg0, ls, hi
.endm
109
// Pick the minimum cost; on ties the lower-numbered mode wins (lo/hs).
.macro SELECT_BEST_COST_PREFER_LOWER arg0
    SELECT_BEST_COST \arg0, lo, hs
.endm
113
// Gather the neighbors of an 8x8 chroma block into one vector register.
//   In:  \arg0 = block address, x1 = stride.
//        \arg1 / \arg2 name the same vector register: \arg1 is its 8b view,
//        \arg2 its per-byte lane view (e.g. v0.8b / v0.b).
//   Out: low 8 bytes = the row above (top); bytes [8..15] = the left column,
//        one byte per row.
//   Clobbers: x9.
.macro LOAD_CHROMA_DATA arg0, arg1, arg2
    sub     x9, \arg0, x1
    ld1     {\arg1}, [x9]      //top_cb
    sub     x9, \arg0, #1
    ld1     {\arg2}[8], [x9], x1
    ld1     {\arg2}[9], [x9], x1
    ld1     {\arg2}[10], [x9], x1
    ld1     {\arg2}[11], [x9], x1
    ld1     {\arg2}[12], [x9], x1
    ld1     {\arg2}[13], [x9], x1
    ld1     {\arg2}[14], [x9], x1
    ld1     {\arg2}[15], [x9], x1 //left_cb
.endm
127
// Load four 8-byte source rows from [\arg0] (advancing by the stride in x3)
// and regroup them so HDM_TRANSFORM_4X4_L0 can process one 4x4 tile per
// register half:
//   v20 = rows 0-1 (left tile in the low half, right tile in the high half)
//   v21 = rows 2-3, same layout
//   Clobbers: v0, v1, v2.
.macro LOAD_8X4_DATA arg0
    //Load the p_enc data and save to "v20 ~ v21"--- 8X4 bytes
    ld1     {v0.8b}, [\arg0], x3       // v0.d[0] = row 0
    ld1     {v1.8b}, [\arg0], x3       // v1.d[0] = row 1
    ld1     {v0.d}[1], [\arg0], x3     // v0.d[1] = row 2
    ld1     {v1.d}[1], [\arg0], x3     // v1.d[1] = row 3
    trn1    v2.4s, v0.4s, v1.4s        // interleave 4-pixel groups of the rows
    trn2    v1.4s, v0.4s, v1.4s
    trn1    v20.2d, v2.2d, v1.2d
    trn2    v21.2d, v2.2d, v1.2d
.endm
139
// Hadamard-transform one pair of packed 4x4 source tiles and accumulate the
// SATD against three intra predictions at once.
//   \arg0, \arg1 = packed source rows (from LOAD_8X4_DATA / LOAD_16X4_DATA)
//   \arg2 = Hadamard-domain coefficients of the vertical prediction (4x16-bit)
//   \arg3 = Hadamard-domain coefficients of the horizontal prediction
//   \arg4 = DC prediction term, pre-scaled by 16 by the callers
//   \arg5 / \arg6 / \arg7 = 4x32-bit SATD accumulators: V / H / DC mode
//   \arg8 = all-zero register (zero reference for the shared |coeff| terms)
//   \arg9 = "l" or "l2": widen the low or the high half of \arg0/\arg1
//   Clobbers: v0, v1, v3, v4, v5.
.macro HDM_TRANSFORM_4X4_L0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
    //Do the vertical transform
    uadd\arg9\()   v0.8h, \arg0, \arg1
    usub\arg9\()   v1.8h, \arg0, \arg1
    trn1    v3.2d, v0.2d, v1.2d
    trn2    v1.2d, v0.2d, v1.2d
    add     v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7}
    sub     v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11}

    //Do the horizontal transform
    trn1    v0.4s, v4.4s, v5.4s
    trn2    v1.4s, v4.4s, v5.4s
    add     v4.8h, v0.8h, v1.8h
    sub     v5.8h, v0.8h, v1.8h
    trn1    v0.8h, v4.8h, v5.8h
    trn2    v1.8h, v4.8h, v5.8h
    add     v4.8h, v0.8h, v1.8h
    sub     v5.8h, v0.8h, v1.8h

    //16x16_v: |coeff - v_pred| for the prediction-bearing lanes, plus
    //plain |coeff| (vs the zero register) for the rest
    trn1    v0.2s, v4.2s, v5.2s
    trn2    v1.2s, v4.2s, v5.2s
    sabal   \arg5, v0.4h, \arg2
    sabal   \arg5, v1.4h, \arg8\().4h
    sabal2  \arg5, v4.8h, \arg8\().8h
    sabal2  \arg5, v5.8h, \arg8\().8h

    //16x16_h: build the shared |coeff| sum once in v4 and reuse it below
    ins     v3.d[0], v4.d[1]
    trn1    v0.4h, v4.4h, v3.4h
    trn2    v1.4h, v4.4h, v3.4h
    sabal   \arg6, v0.4h, \arg3
    sabdl   v4.4s, v1.4h, \arg8\().4h
    sabal   v4.4s, v5.4h, \arg8\().4h
    sabal2  v4.4s, v5.8h, \arg8\().8h
    add     \arg6, \arg6, v4.4s

    //16x16_dc_both: same shared terms, reference is the scaled DC value
    sabal   \arg7, v0.4h, \arg4
    add     \arg7, \arg7, v4.4s
.endm
181
//int32_t WelsIntra8x8Combined3Sad_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*,uint8_t*);
//
// 8x8 chroma SAD of the DC / H / V intra modes, evaluated over both chroma
// planes at once.
//   x0 = reconstructed plane #1, x1 = reconstructed stride
//   x2 = source plane #1,        x3 = source stride
//   x4 = out: winning mode index (0 = DC, 1 = H/left, 2 = V/top, from the
//        accumulator order below)
//   w5 = rate term added twice to the H and V costs
//        NOTE(review): assumed to be the lambda/mode-bit cost - confirm
//   x7 = reconstructed plane #2, [sp] = source plane #2
//        NOTE(review): plane roles (Cb/Cr) inferred from the macro comments
// Returns the best cost in w0.
WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon
    ldr     x11, [sp, #0]               // x11 = source pointer of plane #2
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x5,w5
    LOAD_CHROMA_DATA x0, v0.8b, v0.b    // v0 = {top | left} neighbors, plane #1

    // Per-quadrant DC values of plane #1:
    // v2.4s = {sum(top0-3), sum(top4-7), sum(left0-3), sum(left4-7)}
    uaddlp  v1.8h, v0.16b
    uaddlp  v2.4s, v1.8h
    ins     v3.d[0], v2.d[1]
    add     v3.2s, v2.2s, v3.2s         // top+left sums for the corner quadrants
    urshr   v2.4s, v2.4s, #2            // rounded 4-pixel averages
    urshr   v3.2s, v3.2s, #3            // rounded 8-pixel averages

    dup     v20.8b, v3.b[0]             // dc, top-left quadrant
    dup     v21.8b, v2.b[4]             // dc, top-right quadrant
    dup     v22.8b, v2.b[12]            // dc, bottom-left quadrant
    dup     v23.8b, v3.b[4]             // dc, bottom-right quadrant
    ins     v20.s[1], v21.s[0]          // v20 = DC row for the top half
    ins     v22.s[1], v23.s[0]          // v22 = DC row for the bottom half

    LOAD_CHROMA_DATA x7, v4.8b, v4.b    // same for plane #2

    uaddlp  v5.8h, v4.16b
    uaddlp  v6.4s, v5.8h
    ins     v7.d[0], v6.d[1]
    add     v7.2s, v6.2s, v7.2s
    urshr   v6.4s, v6.4s, #2
    urshr   v7.2s, v7.2s, #3

    dup     v24.8b, v7.b[0]
    dup     v25.8b, v6.b[4]
    dup     v26.8b, v6.b[12]
    dup     v27.8b, v7.b[4]
    ins     v24.s[1], v25.s[0]          // v24 = plane #2 DC row, top half
    ins     v26.s[1], v27.s[0]          // v26 = plane #2 DC row, bottom half

    sub     x9, x0, #1                  // left-column walkers for H prediction
    sub     x10, x7, #1

    // Row 0: initialize the three SAD accumulators (v29/v30/v31)
    ld1     {v3.8b}, [x2], x3
    ld1     {v5.8b}, [x11], x3

    ld1r    {v6.8b}, [x9], x1           // H prediction = left pixel replicated
    ld1r    {v7.8b}, [x10], x1

    uabdl   v29.8h, v0.8b, v3.8b
    uabal   v29.8h, v4.8b, v5.8b   //top

    uabdl   v30.8h, v6.8b, v3.8b
    uabal   v30.8h, v7.8b, v5.8b   //left

    uabdl   v31.8h, v20.8b, v3.8b
    uabal   v31.8h, v24.8b, v5.8b   //Dc
.rept 3
    // Rows 1-3 (top half: DC rows v20/v24)
    ld1     {v3.8b}, [x2], x3
    ld1     {v5.8b}, [x11], x3

    ld1r    {v6.8b}, [x9], x1
    ld1r    {v7.8b}, [x10], x1

    uabal   v29.8h, v0.8b, v3.8b
    uabal   v29.8h, v4.8b, v5.8b   //top

    uabal   v30.8h, v6.8b, v3.8b
    uabal   v30.8h, v7.8b, v5.8b   //left

    uabal   v31.8h, v20.8b, v3.8b
    uabal   v31.8h, v24.8b, v5.8b   //Dc
.endr

.rept 4
    // Rows 4-7 (bottom half: DC rows v22/v26)
    ld1     {v3.8b}, [x2], x3
    ld1     {v5.8b}, [x11], x3

    ld1r    {v6.8b}, [x9], x1
    ld1r    {v7.8b}, [x10], x1

    uabal   v29.8h, v0.8b, v3.8b
    uabal   v29.8h, v4.8b, v5.8b   //top

    uabal   v30.8h, v6.8b, v3.8b
    uabal   v30.8h, v7.8b, v5.8b   //left

    uabal   v31.8h, v22.8b, v3.8b
    uabal   v31.8h, v26.8b, v5.8b   //Dc
.endr

    // Reduce: w2 = V cost, w1 = H cost, w0 = DC cost (V/H carry the rate term)
    saddlv  s29, v29.8h
    fmov    w2, s29
    add     w2, w2, w5, lsl #1
    saddlv  s30, v30.8h
    fmov    w1, s30
    add     w1, w1, w5, lsl #1
    saddlv  s31, v31.8h
    fmov    w0, s31

    SELECT_BEST_COST_PREFER_HIGHER w0

    str     w7, [x4]                    // store the winning mode index
WELS_ASM_AARCH64_FUNC_END
284
//int32_t WelsIntra16x16Combined3Sad_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
//
// 16x16 luma SAD of the V / H / DC intra modes.
//   x0 = reconstructed luma, x1 = reconstructed stride
//   x2 = source,             x3 = source stride
//   x4 = out: winning mode index (0 = V/top, 1 = H/left, 2 = DC, from the
//        accumulator order below)
//   w5 = rate term added twice to the H and DC costs
//        NOTE(review): assumed to be the lambda/mode-bit cost - confirm
// Returns the best cost in w0.
WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra16x16Combined3Sad_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x5,w5
    LOAD_LUMA_DATA                  // v0 = top row, v1 = left column

    uaddlv    h2, v0.16b            // sum of the 16 top neighbors
    uaddlv    h3, v1.16b            // sum of the 16 left neighbors
    add       v2.8h, v2.8h, v3.8h
    uqrshrn   b2, h2, #5            // dc = (sum + 16) >> 5, saturating narrow
    dup       v2.16b, v2.b[0]   //Dc

    // Row 0: initialize the three SAD accumulators
    sub     x7, x0, #1              // left-column walker for H prediction
    ld1     {v3.16b}, [x2], x3
    ld1r    {v4.16b}, [x7], x1      // H prediction = left pixel replicated

    uabdl   v29.8h, v0.8b, v3.8b
    uabal2  v29.8h, v0.16b,v3.16b   //top

    uabdl   v30.8h, v4.8b, v3.8b
    uabal2  v30.8h, v4.16b,v3.16b   //left

    uabdl   v31.8h, v2.8b, v3.8b
    uabal2  v31.8h, v2.16b,v3.16b   //Dc
    mov     x6, #15                 // remaining rows 1..15
sad_intra_16x16_x3_opt_loop0:
    ld1     {v3.16b}, [x2], x3
    ld1r    {v4.16b}, [x7], x1

    uabal   v29.8h, v0.8b, v3.8b
    uabal2  v29.8h, v0.16b,v3.16b   //top

    uabal   v30.8h, v4.8b, v3.8b
    uabal2  v30.8h, v4.16b,v3.16b   //left

    uabal   v31.8h, v2.8b, v3.8b
    uabal2  v31.8h, v2.16b,v3.16b   //Dc
    sub     x6, x6, #1
    cbnz    x6,  sad_intra_16x16_x3_opt_loop0

    // Reduce: w0 = V cost, w1 = H cost, w2 = DC cost (H/DC carry the rate term)
    saddlv  s29, v29.8h
    fmov    w0, s29
    saddlv  s30, v30.8h
    fmov    w1, s30
    add     w1, w1, w5, lsl #1
    saddlv  s31, v31.8h
    fmov    w2, s31
    add     w2, w2, w5, lsl #1

    SELECT_BEST_COST_PREFER_LOWER w0

    str     w7, [x4]                // store the winning mode index
WELS_ASM_AARCH64_FUNC_END
339
//int32_t WelsIntra4x4Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t,int32_t);
//
// 4x4 luma SATD of the V / H / DC intra modes; also writes out the 4x4
// prediction block of the winning mode.
//   x0 = reconstructed luma, x1 = reconstructed stride
//   x2 = source,             x3 = source stride
//   x4 = out: 16-byte prediction block of the winning mode
//   x5 = out: winning mode index (0 = V, 1 = H, 2 = DC, from the
//        accumulator order below)
//   w6 / w7 / [sp] = extra cost added to the DC / H / V mode respectively
//        NOTE(review): assumed per-mode lambda terms - confirm against caller
// Returns the best cost in w0.
WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra4x4Combined3Satd_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x6,w6
    SIGN_EXTENSION x7,w7

    // Gather neighbors: v16 = {top0..3, left0..3}
    sub     x9, x0, x1
    ld1     {v16.s}[0], [x9]      //top
    sub     x9, x0, #1
    ld1     {v16.b}[4], [x9], x1
    ld1     {v16.b}[5], [x9], x1
    ld1     {v16.b}[6], [x9], x1
    ld1     {v16.b}[7], [x9], x1


    uaddlv  h2, v16.8b              // sum of the 8 neighbors
    uqrshrn b17, h2, #3             // v17.b[0] = dc pixel = (sum + 4) >> 3
    urshr   v2.4h, v2.4h, #3
    shl     v2.4h, v2.4h, #4        // v2.h[0] = dc << 4, Hadamard-domain DC term

    //Calculate the 4x4_v 4x4_h mode SATD and save to "v6, v7"
    ushll   v4.8h, v16.8b, #2       // neighbors << 2 for the transform domain
    ins     v5.d[0], v4.d[1]
    trn1    v6.2s, v4.2s, v5.2s
    trn2    v7.2s, v4.2s, v5.2s

    add     v4.4h, v6.4h, v7.4h
    sub     v5.4h, v6.4h, v7.4h
    trn1    v6.4h, v4.4h, v5.4h
    trn2    v7.4h, v4.4h, v5.4h
    add     v4.4h, v6.4h, v7.4h
    sub     v5.4h, v6.4h, v7.4h
    trn1    v6.2s, v4.2s, v5.2s
    trn2    v7.2s, v4.2s, v5.2s     //{0,1,3,2,top} v6 {0,1,3,2,left} v7

    eor     v31.16b, v31.16b, v31.16b  //Save the SATD of DC_BOTH
    eor     v30.16b, v30.16b, v30.16b  //Save the SATD of H
    eor     v29.16b, v29.16b, v29.16b  //Save the SATD of V
    eor     v28.16b, v28.16b, v28.16b  //For zero register

    //Load the p_enc data and save to "v22 ~ v23"--- 16X4 bytes
    ld1     {v22.s}[0], [x2], x3
    ld1     {v22.s}[1], [x2], x3
    ld1     {v23.s}[0], [x2], x3
    ld1     {v23.s}[1], [x2], x3

    HDM_TRANSFORM_4X4_L0 v22.8b, v23.8b, v6.4h, v7.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l

    // Reduce each accumulator (SATD = sum >> 1) and add the per-mode term
    ldr     x11, [sp, #0]
    urshr   v29.4s, v29.4s, #1
    addv    s29, v29.4s
    fmov    w0, s29
    add     w0, w0, w11             // w0 = V cost

    urshr   v30.4s, v30.4s, #1
    addv    s30, v30.4s
    fmov    w1, s30
    add     w1, w1, w7              // w1 = H cost

    urshr   v31.4s, v31.4s, #1
    addv    s31, v31.4s
    fmov    w2, s31
    add     w2, w2, w6              // w2 = DC cost

    mov     w10, w0
    SELECT_BEST_COST_PREFER_HIGHER w10

    str     w7, [x5]                // store the winning mode index

    // Emit the winning prediction block: DC?
    sub     w9, w10, w2
    cbnz    w9, satd_intra_4x4_x3_opt_jump0
    dup     v0.16b, v17.b[0]        // fill with the dc pixel
    st1     {v0.16b}, [x4]
    b       satd_intra_4x4_x3_opt_end

satd_intra_4x4_x3_opt_jump0:
    // H? each row is the corresponding left pixel replicated
    sub     w8, w10, w1
    cbnz    w8, satd_intra_4x4_x3_opt_jump1
    dup     v0.16b, v16.b[4]
    dup     v1.16b, v16.b[5]
    dup     v2.16b, v16.b[6]
    dup     v3.16b, v16.b[7]
    st4     {v0.s,v1.s,v2.s,v3.s}[0], [x4]
    b       satd_intra_4x4_x3_opt_end

satd_intra_4x4_x3_opt_jump1:
    // V: replicate the top row into all four rows
    st1     {v16.S}[0], [x4], #4
    st1     {v16.S}[0], [x4], #4
    st1     {v16.S}[0], [x4], #4
    st1     {v16.S}[0], [x4]
satd_intra_4x4_x3_opt_end:
    mov     w0, w10                 // return the best cost

WELS_ASM_AARCH64_FUNC_END
435
//int32_t WelsIntra8x8Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*,uint8_t*);
//
// 8x8 chroma SATD of the DC / H / V intra modes, evaluated over both chroma
// planes at once.
//   x0 = reconstructed plane #1, x1 = reconstructed stride
//   x2 = source plane #1,        x3 = source stride
//   x4 = out: winning mode index (0 = DC, 1 = H, 2 = V, from the
//        accumulator order below)
//   w5 = rate term added twice to the H and V costs
//        NOTE(review): assumed to be the lambda/mode-bit cost - confirm
//   x7 = reconstructed plane #2, [sp] = source plane #2
// Returns the best cost in w0.
WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra8x8Combined3Satd_AArch64_neon
    ldr     x11, [sp, #0]               // x11 = source pointer of plane #2

    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x5,w5
    LOAD_CHROMA_DATA x0, v0.8b, v0.b    // v0 = {top | left} neighbors, plane #1

    LOAD_CHROMA_DATA x7, v1.8b, v1.b    // v1 = {top | left} neighbors, plane #2

    //Calculate the 16x16_v mode SATD and save to "v6, v7"
    ushll   v4.8h, v0.8b, #2            // top neighbors << 2, both planes
    ushll   v5.8h, v1.8b, #2
    GET_16X16_V_SATD

    //Calculate the 16x16_h mode SATD and save to "v16, v17"
    ushll2  v4.8h, v0.16b, #2           // left neighbors << 2, both planes
    ushll2  v5.8h, v1.16b, #2
    GET_16X16_H_SATD

    // Per-quadrant DC sums of plane #1 (edge quadrants in v2, corners via v3)
    uaddlp  v0.8h, v0.16b
    uaddlp  v2.4s, v0.8h
    ins     v3.d[0], v2.d[1]
    add     v3.2s, v2.2s, v3.2s

    // ... and of plane #2
    uaddlp  v1.8h, v1.16b
    uaddlp  v4.4s, v1.8h
    ins     v5.d[0], v4.d[1]
    add     v5.2s, v4.2s, v5.2s

    trn2    v0.4s, v2.4s, v4.4s
    urshr   v0.4s, v0.4s, #2            // rounded 4-pixel averages
    urshr   v3.2s, v3.2s, #3            // rounded 8-pixel averages, plane #1
    urshr   v5.2s, v5.2s, #3            // rounded 8-pixel averages, plane #2

    // DC values scaled by 16 for the Hadamard domain, one quadrant per lane
    ushll   v22.2d, v0.2s, #4    //{1cb, 1cr}
    ushll2  v23.2d, v0.4s, #4    //{2cb, 2cr}
    ushll   v24.2d, v3.2s, #4   //{0cb, 3cb}
    ushll   v25.2d, v5.2s, #4   //{0cr, 3cr}

    eor     v31.16b, v31.16b, v31.16b  //Save the SATD of DC_BOTH
    eor     v30.16b, v30.16b, v30.16b  //Save the SATD of H
    eor     v29.16b, v29.16b, v29.16b  //Save the SATD of V
    eor     v28.16b, v28.16b, v28.16b  //For zero register

    // High halves of the transformed neighbors, exposed as low halves
    ins     v18.d[0], v6.d[1]
    ins     v19.d[0], v7.d[1]
    ins     v26.d[0], v16.d[1]
    ins     v27.d[0], v17.d[1]

    LOAD_8X4_DATA x2                    // plane #1, rows 0-3

    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v16.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v16.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2

    LOAD_8X4_DATA x11                   // plane #2, rows 0-3

    ins     v22.d[0], v22.d[1]          // advance to the next quadrant's DC
    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v17.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v17.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2

    LOAD_8X4_DATA x2                    // plane #1, rows 4-7

    ins     v24.d[0], v24.d[1]
    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v26.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v26.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l2

    LOAD_8X4_DATA x11                   // plane #2, rows 4-7

    ins     v23.d[0], v23.d[1]
    ins     v25.d[0], v25.d[1]
    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v27.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v27.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Reduce: w2 = V cost, w1 = H cost, w0 = DC cost (V/H carry the rate term)
    urshr   v29.4s, v29.4s, #1
    addv    s29, v29.4s
    fmov    w2, s29
    add     w2, w2, w5, lsl #1

    urshr   v30.4s, v30.4s, #1
    addv    s30, v30.4s
    fmov    w1, s30
    add     w1, w1, w5, lsl #1

    urshr   v31.4s, v31.4s, #1
    addv    s31, v31.4s
    fmov    w0, s31

    SELECT_BEST_COST_PREFER_HIGHER w0

    str     w7, [x4]                    // store the winning mode index
WELS_ASM_AARCH64_FUNC_END
529
//int32_t WelsIntra16x16Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
//
// 16x16 luma SATD of the V / H / DC intra modes.
//   x0 = reconstructed luma, x1 = reconstructed stride
//   x2 = source,             x3 = source stride
//   x4 = out: winning mode index (0 = V, 1 = H, 2 = DC, from the
//        accumulator order below)
//   w5 = rate term added twice to the H and DC costs
//        NOTE(review): assumed to be the lambda/mode-bit cost - confirm
// Returns the best cost in w0.
WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra16x16Combined3Satd_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x5,w5
    LOAD_LUMA_DATA                  // v0 = top row, v1 = left column

    uaddlv  h2, v0.16b              // sum of the 16 top neighbors
    uaddlv  h3, v1.16b              // sum of the 16 left neighbors
    add     v2.8h, v2.8h, v3.8h
    urshr   v2.4h, v2.4h, #5        // dc = (sum + 16) >> 5
    shl     v2.4h, v2.4h, #4        // v2.h[0] = dc << 4, Hadamard-domain DC term

    //Calculate the 16x16_v mode SATD and save to "v6, v7"
    ushll   v4.8h, v0.8b, #2
    ushll2  v5.8h, v0.16b, #2
    GET_16X16_V_SATD

    //Calculate the 16x16_h mode SATD and save to "v16, v17"
    ushll   v4.8h, v1.8b, #2
    ushll2  v5.8h, v1.16b, #2
    GET_16X16_H_SATD

    eor     v31.16b, v31.16b, v31.16b  //Save the SATD of DC_BOTH
    eor     v30.16b, v30.16b, v30.16b  //Save the SATD of H
    eor     v29.16b, v29.16b, v29.16b  //Save the SATD of V
    eor     v28.16b, v28.16b, v28.16b  //For zero register

    // High halves of the transformed neighbors, exposed as low halves
    ins     v18.d[0], v6.d[1]
    ins     v19.d[0], v7.d[1]
    ins     v26.d[0], v16.d[1]
    ins     v27.d[0], v17.d[1]

    LOAD_16X4_DATA                  // rows 0-3

    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2

    LOAD_16X4_DATA                  // rows 4-7

    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2

    LOAD_16X4_DATA                  // rows 8-11

    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2

    LOAD_16X4_DATA                  // rows 12-15

    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Reduce: w0 = V cost, w1 = H cost, w2 = DC cost (H/DC carry the rate term)
    urshr   v29.4s, v29.4s, #1
    addv    s29, v29.4s
    fmov    w0, s29

    urshr   v30.4s, v30.4s, #1
    addv    s30, v30.4s
    fmov    w1, s30
    add     w1, w1, w5, lsl #1

    urshr   v31.4s, v31.4s, #1
    addv    s31, v31.4s
    fmov    w2, s31
    add     w2, w2, w5, lsl #1

    SELECT_BEST_COST_PREFER_LOWER w0

    str     w7, [x4]                // store the winning mode index

WELS_ASM_AARCH64_FUNC_END
610
611#endif
612